Install & imports

In [4]:
# Install the notebook's dependencies quietly (-q). Only transformers and gradio
# are pinned to exact versions; the remaining packages resolve to whatever is current.
!pip -q install "transformers==4.43.3" sentencepiece accelerate \
                gradio==4.44.0 langdetect yake trafilatura beautifulsoup4

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/132.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m837.9/837.9 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m274.7/274.7 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import re
import math
import requests
from bs4 import BeautifulSoup

import torch
from transformers import pipeline
from langdetect import detect, LangDetectException
import yake
import trafilatura

# Device index handed to every pipeline below: 0 when torch reports CUDA, -1 otherwise.
if torch.cuda.is_available():
    DEVICE = 0
else:
    DEVICE = -1

if DEVICE == 0:
    print("Using device:", "CUDA")
else:
    print("Using device:", "CPU")

# Matches text that begins with an http:// or https:// scheme, ignoring case.
URL_RE = re.compile(r"^https?://", flags=re.IGNORECASE)

def is_url(text: str) -> bool:
    """Return True when `text`, with surrounding whitespace removed, starts with http(s)://."""
    candidate = text.strip()
    return URL_RE.match(candidate) is not None

def fetch_article(url: str) -> str:
    """Fetch a web page and return its main article text.

    Two strategies are tried in order:

    1. trafilatura download + extraction, accepted only when it yields more
       than 30 words;
    2. a plain ``requests`` download whose ``<p>`` elements are joined with
       spaces.

    Args:
        url: Address of the page. Surrounding whitespace is ignored.

    Returns:
        The extracted text, or ``''`` when the URL is blank or both strategies
        fail. This function never raises.
    """
    url = (url or "").strip()
    if not url:
        return ""

    # Strategy 1: trafilatura. TLS certificate verification is left at its
    # default (enabled) because the URL comes straight from the user.
    try:
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            extracted = trafilatura.extract(
                downloaded, include_comments=False, include_tables=False
            )
            if extracted and len(extracted.split()) > 30:
                return extracted
    except Exception:
        # Best effort: a failure here must not prevent the fallback from running.
        pass

    # Strategy 2: basic paragraph scraping.
    try:
        response = requests.get(url, timeout=10)
        # Do not present a 4xx/5xx error page as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return " ".join(p.get_text(" ", strip=True) for p in soup.find_all("p"))
    except Exception:
        # Contract: any failure (network, HTTP status, parsing) maps to ''.
        return ""

def chunk_text(text: str, max_words: int = 800):
    """Lazily yield consecutive slices of `text`, each holding at most `max_words` whitespace-separated words."""
    tokens = text.split()
    yield from (
        " ".join(tokens[start:start + max_words])
        for start in range(0, len(tokens), max_words)
    )

def clean(s: str) -> str:
    """Collapse every run of whitespace in `s` to one space and trim both ends."""
    collapsed = re.compile(r"\s+").sub(" ", s)
    return collapsed.strip()

Using device: CUDA


Build Translation, Summarization, and Keywords pipelines

In [6]:
# Source-language code -> model id used to translate that language into English.
TRANS_MODELS = dict(
    es="Helsinki-NLP/opus-mt-es-en",
    de="Helsinki-NLP/opus-mt-de-en",
    ar="Helsinki-NLP/opus-mt-ar-en",
)

# Translation pipelines built so far, keyed by language code (filled on demand).
_translators = {}

def get_translator(lang_code: str):
    """Return the translation pipeline for `lang_code`, or None if it has no entry in TRANS_MODELS.

    A pipeline is constructed the first time its language is requested and
    reused from `_translators` afterwards.
    """
    model_id = TRANS_MODELS.get(lang_code)
    if model_id is None:
        return None

    if lang_code in _translators:
        return _translators[lang_code]

    translator = pipeline("translation", model=model_id, device=DEVICE)
    _translators[lang_code] = translator
    return translator

# English summarization pipeline, built once at cell execution and placed on DEVICE.
summarizer = pipeline(task="summarization", device=DEVICE, model="sshleifer/distilbart-cnn-12-6")

# YAKE extractor for English text: single-word keywords (n=1), at most 8 per call.
kw = yake.KeywordExtractor(top=8, n=1, lan="en")

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Core processing functions

In [7]:
def detect_lang(text: str) -> str:
    """Detect the language of `text` with langdetect.

    Args:
        text: Text to classify.

    Returns:
        The code returned by ``langdetect.detect``, or ``"unknown"`` when the
        text is empty/blank or langdetect raises ``LangDetectException``.
    """
    # Nothing to classify: skip the detector entirely.
    if not text or not text.strip():
        return "unknown"

    # langdetect samples randomly, so the same short text can be classified
    # differently between runs; its README prescribes fixing the seed to get
    # repeatable results. Imported here so the function is self-contained.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

def _split_for_translation(text: str, max_words: int = 120):
    """Split `text` into pieces of at most `max_words` words, breaking after sentence-ending punctuation where possible."""
    sentences = re.split(r"(?<=[.!?؟])\s+", text.strip())
    pieces, current = [], []
    for sentence in sentences:
        words = sentence.split()
        if not words:
            continue
        # Close the running piece if this sentence would push it over budget.
        if current and len(current) + len(words) > max_words:
            pieces.append(" ".join(current))
            current = []
        # A single sentence longer than the budget is cut on word boundaries.
        while len(words) > max_words:
            pieces.append(" ".join(words[:max_words]))
            words = words[max_words:]
        current.extend(words)
    if current:
        pieces.append(" ".join(current))
    return pieces


def translate_to_english(text: str, lang_code: str) -> (str, str):
    """Translate `text` to English when a model for `lang_code` is available.

    Args:
        text: Source text.
        lang_code: Detected language code; compared case-insensitively.

    Returns:
        ``(english_text, note)`` where ``note`` is ``"already_en"`` for English
        input, ``"unsupported_lang(<code>)"`` when TRANS_MODELS has no model
        for the language (text is returned cleaned but untranslated), or the
        id of the model that produced the translation.
    """
    lang_code = (lang_code or "").lower()
    if lang_code.startswith("en"):
        return clean(text), "already_en"

    translator = get_translator(lang_code)
    if not translator:
        # Unsupported language → return as-is with note
        return clean(text), f"unsupported_lang({lang_code})"

    # Sending a whole article in one call overruns the translation model's
    # input window (OPUS-MT checkpoints are presumably limited to 512
    # positions — confirm against the model config). Translate sentence-aligned
    # segments instead; truncation=True is a safety net for dense segments.
    segments = _split_for_translation(text)
    if not segments:
        return "", TRANS_MODELS[lang_code]

    outputs = translator(segments, max_length=512, truncation=True)
    translated = []
    for item in outputs:
        # Depending on the input shape a pipeline may wrap each result in a list.
        if isinstance(item, list):
            item = item[0]
        translated.append(item["translation_text"])

    eng = clean(" ".join(translated))
    return eng, TRANS_MODELS[lang_code]

def summarize_english(text: str, min_len: int = 30, max_len: int = 180, chunk_words: int = 600) -> str:
    """Summarize English text chunk by chunk and join the partial summaries.

    Args:
        text: English text; whitespace is normalized first.
        min_len: Minimum generated length passed to the summarizer.
        max_len: Maximum generated length passed to the summarizer.
        chunk_words: Number of words per chunk sent to the summarizer.

    Returns:
        The cleaned text itself when it has fewer than 60 words; otherwise the
        space-joined summaries of all chunks. A chunk whose summarization
        raises is included verbatim.
    """
    text = clean(text)
    if len(text.split()) < 60:
        # Short text → summarization not useful
        return text

    parts = []
    for chunk in chunk_text(text, max_words=chunk_words):
        n_words = len(chunk.split())

        # A small trailing remainder would be forced to "summarize" into at
        # least `min_len` tokens — longer than the input. Keep it as is.
        if n_words <= min_len:
            parts.append(chunk)
            continue

        # Never ask for a summary longer than the chunk being summarized.
        chunk_max = max(min_len, min(max_len, n_words))

        try:
            summary = summarizer(
                chunk,
                max_length=chunk_max,
                min_length=min_len,
                do_sample=False,
                # Without truncation an input beyond the model's window raises
                # and the except branch below would emit the entire chunk.
                truncation=True,
            )[0]["summary_text"]
            parts.append(clean(summary))
        except Exception:
            # On any error, fall back to the original chunk
            parts.append(chunk)
    return clean(" ".join(parts))

def extract_keywords_english(text: str, topk: int = 8):
    """Return up to `topk` keywords from English text, best first.

    Args:
        text: English text to analyse.
        topk: Maximum number of keywords to return; values <= 0 give ``[]``.

    Returns:
        A list of keyword strings ordered by ascending YAKE score (in YAKE a
        lower score marks a more relevant keyword). ``[]`` for blank text or
        when extraction fails.
    """
    if topk <= 0 or not text or not text.strip():
        return []

    try:
        # The shared `kw` extractor is constructed with top=8, so on its own it
        # can never satisfy a larger `topk`; build a dedicated extractor then.
        shared_limit = getattr(kw, "top", 8)
        if topk <= shared_limit:
            extractor = kw
        else:
            extractor = yake.KeywordExtractor(lan="en", n=1, top=topk)

        scored = extractor.extract_keywords(text)
        # (keyword, score) pairs; ascending score puts the best keywords first.
        ranked = sorted(scored, key=lambda pair: pair[1])[:topk]
        return [keyword for keyword, _ in ranked]
    except Exception:
        # Keywords are optional output: degrade to none rather than fail the request.
        return []

One function that handles URLs or pasted text

In [8]:
def process_input(input_text: str):
    """Run the full pipeline on a URL or on pasted text.

    Steps:
    1) If URL: fetch article text
    2) Auto-detect language
    3) Translate non-English to English (es/de/ar supported)
    4) Summarize in English
    5) Extract English keywords

    Args:
        input_text: A URL starting with http(s):// or raw text. ``None`` is
            treated as empty text.

    Returns:
        A dict with the keys ``language``, ``used_model``, ``source_text``,
        ``english_text``, ``summary`` and ``keywords``. When a URL cannot be
        fetched, ``used_model`` is ``"fetch_failed"`` and the text fields are
        empty.
    """
    # is_url() ignores surrounding whitespace, so the fetcher must be handed
    # the same stripped string; also tolerate None coming from a UI.
    input_text = (input_text or "").strip()

    if is_url(input_text):
        source = fetch_article(input_text)
        if not source:
            return {
                "language": "unknown",
                "used_model": "fetch_failed",
                "source_text": "",
                "english_text": "",
                "summary": "",
                "keywords": []
            }
    else:
        source = input_text

    source = clean(source)
    lang = detect_lang(source) if source else "unknown"
    if lang == "en":
        english_text, used_model = source, "already_en"
    else:
        english_text, used_model = translate_to_english(source, lang)

    summary = summarize_english(english_text)
    keywords = extract_keywords_english(english_text)

    return {
        "language": lang,
        "used_model": used_model,
        "source_text": source,
        "english_text": english_text,
        "summary": summary,
        "keywords": keywords
    }

Quick sanity checks

In [9]:
# One short sentence per supported source language, keyed by language name.
examples = dict(
    spanish="La inflación anual en España cayó al 2,3% en junio, impulsada por la bajada de los precios de la energía.",
    german="Die Europäische Zentralbank erwägt, die Zinssätze stabil zu halten, während sie neue Daten zur Inflation bewertet.",
    arabic="أعلنت الشركة عن أرباح فصلية أعلى من المتوقع بسبب زيادة الطلب في الأسواق العالمية.",
)

# Run each sample through the pipeline and print every result field in turn.
for name, txt in examples.items():
    print("---", name, "---")
    out = process_input(txt)
    for label, field in (
        ("lang:", "language"),
        ("model:", "used_model"),
        ("english:", "english_text"),
        ("summary:", "summary"),
        ("keywords:", "keywords"),
    ):
        print(label, out[field])
    print()

--- spanish ---


generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



lang: es
model: Helsinki-NLP/opus-mt-es-en
english: Annual inflation in Spain fell to 2.3% in June, driven by the decline in energy prices.
summary: Annual inflation in Spain fell to 2.3% in June, driven by the decline in energy prices.
keywords: ['June', 'Spain', 'Annual', 'driven', 'prices', 'inflation', 'fell', 'decline']

--- german ---


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

lang: de
model: Helsinki-NLP/opus-mt-de-en
english: The European Central Bank is considering keeping interest rates stable while assessing new data on inflation.
summary: The European Central Bank is considering keeping interest rates stable while assessing new data on inflation.
keywords: ['European', 'Central', 'Bank', 'inflation', 'keeping', 'interest', 'rates', 'stable']

--- arabic ---


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

lang: ar
model: Helsinki-NLP/opus-mt-ar-en
english: The company announced higher-than-expected quarterly profits due to increased demand in world markets.
summary: The company announced higher-than-expected quarterly profits due to increased demand in world markets.
keywords: ['announced', 'quarterly', 'markets', 'company', 'profits', 'due', 'increased', 'demand']



Gradio demo

In [10]:
import gradio as gr

def gradio_fn(text_or_url):
    """Adapt process_input() to the UI: return its fields as a 6-tuple.

    The order is language, used model, source text, English text, summary and
    finally the keywords joined into one comma-separated string.
    """
    result = process_input(text_or_url)
    text_fields = ("language", "used_model", "source_text", "english_text", "summary")
    return (
        *[result[field] for field in text_fields],
        ", ".join(result["keywords"]),
    )

# Build the demo UI. Components appear on the page in the order they are created here.
with gr.Blocks(title="Multilingual News → English (Translate + Summarize)") as demo:
    gr.Markdown("# 🌐 Multilingual News → English\nTranslate (es/de/ar) ➜ Summarize ➜ Keywords")

    # Single free-form input: either a URL or raw text.
    inp = gr.Textbox(label="Paste a news URL or text (Spanish / German / Arabic / English)")

    # Read-only status fields shown side by side.
    with gr.Row():
        lang = gr.Textbox(label="Detected language", interactive=False)
        used = gr.Textbox(label="Translation model", interactive=False)

    # Result boxes, filled by gradio_fn in the same order as the `outputs` list below.
    src = gr.Textbox(label="Source text", lines=8)
    eng = gr.Textbox(label="English text", lines=8)
    summ = gr.Textbox(label="Summary (English)", lines=6)
    kwds = gr.Textbox(label="Keywords (English)", lines=2)

    btn = gr.Button("Translate & Summarize")
    # The six values returned by gradio_fn map positionally onto these six components.
    btn.click(gradio_fn, inputs=inp, outputs=[lang, used, src, eng, summ, kwds])

    # Clickable sample inputs. NOTE(review): the first entry is a placeholder URL
    # ending in "..." and will presumably not resolve to a real article.
    gr.Examples(
        examples=[
            "https://elpais.com/espana/2024-06-10/...",  # replace with any real article if you want
            "La inflación anual en España cayó al 2,3% en junio, impulsada por la bajada de los precios de la energía.",
            "Die Europäische Zentralbank erwägt, die Zinssätze stabil zu halten.",
            "أعلنت الشركة عن أرباح فصلية أعلى من المتوقع بسبب زيادة الطلب في الأسواق العالمية."
        ],
        inputs=[inp]
    )

# share=True requests a public share link (see the "Running on public URL" output
# below); anyone holding that link can submit URLs for this notebook to fetch.
demo.launch(share=True)

--------


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://ad96fa6a6e975ec8ad.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


