In [1]:
from transformers import MarianMTModel, MarianTokenizer
from newspaper import Article # pip3 install newspaper3k # pip install newspaper3k lxml_html_clean
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Allowed models: (source language, target language) -> Hugging Face checkpoint name
MODEL_MAP = {
    ("en", "de"): "Helsinki-NLP/opus-mt-en-de",
    ("de", "en"): "Helsinki-NLP/opus-mt-de-en",
    ("en", "es"): "Helsinki-NLP/opus-mt-en-es",
    ("es", "en"): "Helsinki-NLP/opus-mt-es-en",
    ("en", "ru"): "Helsinki-NLP/opus-mt-en-ru",
    ("ru", "en"): "Helsinki-NLP/opus-mt-ru-en",
    ("en", "uk"): "Helsinki-NLP/opus-mt-en-uk",
    ("uk", "en"): "Helsinki-NLP/opus-mt-uk-en",
}

# Loaded (tokenizer, model) pairs keyed by checkpoint name, so repeated calls
# to translate() for the same language pair do not reload the checkpoint.
_MARIAN_CACHE = {}


def _load_marian(model_name):
    """Return the (tokenizer, model) pair for model_name, loading it on first use."""
    if model_name not in _MARIAN_CACHE:
        _MARIAN_CACHE[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _MARIAN_CACHE[model_name]


def translate(text, src_lang, tgt_lang):
    """
    Translate text from src_lang to tgt_lang using MarianMT.
    Supported languages: en, de, es, ru, uk (only the pairs listed in MODEL_MAP).

    Parameters
    ----------
    text : str
        Text to translate. Empty or whitespace-only text yields "".
    src_lang, tgt_lang : str
        Language codes; the pair must be a key of MODEL_MAP.

    Returns
    -------
    str
        The decoded translation.

    Raises
    ------
    ValueError
        If MODEL_MAP has no entry for (src_lang, tgt_lang).
    """
    model_name = MODEL_MAP.get((src_lang, tgt_lang))
    if model_name is None:
        raise ValueError(f"No MarianMT model available for {src_lang} → {tgt_lang}")

    # Nothing to translate: skip loading the model and running generation.
    if not text or not text.strip():
        return ""

    tokenizer, model = _load_marian(model_name)

    # truncation=True matches translate_to_german() and keeps over-long inputs
    # within the tokenizer's maximum length.
    batch = tokenizer([text], return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**batch)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]


In [3]:
# One short sample sentence per supported language, keyed by language code.
texts = dict(
    en="This is a test sentence.",
    de="Dies ist ein Testsatz.",
    es="Esta es una frase de prueba.",
    uk="Це тестове речення.",
    ru="Это тестовое предложение.",
)

# Smoke test: translate English into each other language, then that
# language's own sample sentence back into English.
for other in ("de", "es", "ru", "uk"):
    for src, tgt in (("en", other), (other, "en")):
        print(f"{src.upper()} → {tgt.upper()}:", translate(texts[src], src, tgt))


EN → DE: Das ist ein Testsatz.
DE → EN: This is a test set.
EN → ES: Esta es una frase de prueba.
ES → EN: This is a test phrase.
EN → RU: Это испытательное предложение.
RU → EN: It's a test offer.
EN → UK: Це тестове речення.
UK → EN: This is a test sentence.


In [4]:
import sacrebleu

# Tiny parallel test set: each entry holds the same sentence in all five languages.
test_data = [
    {
        "en": "This is a test sentence.",
        "de": "Dies ist ein Testsatz.",
        "es": "Esta es una frase de prueba.",
        "ru": "Это тестовое предложение.",
        "uk": "Це тестове речення.",
    },
    {
        "en": "The book is on the table.",
        "de": "Das Buch liegt auf dem Tisch.",
        "es": "El libro está sobre la mesa.",
        "ru": "Книга лежит на столе.",
        "uk": "Книга лежить на столі.",
    },
]

# EN → DE: the German side is the reference, the MarianMT output the hypothesis.
references = [pair["de"] for pair in test_data]
system_outputs = [translate(pair["en"], "en", "de") for pair in test_data]

# sacrebleu takes a list of reference streams, hence the extra list around `references`.
bleu = sacrebleu.corpus_bleu(system_outputs, [references])
chrf = sacrebleu.corpus_chrf(system_outputs, [references])

print(
    "EN → DE evaluation",
    f"System outputs: {system_outputs}",
    f"References: {references}",
    f"BLEU: {bleu.score:.2f}",
    f"chrF: {chrf.score:.2f}",
    sep="\n",
)


EN → DE evaluation
System outputs: ['Das ist ein Testsatz.', 'Das Buch steht auf dem Tisch.']
References: ['Dies ist ein Testsatz.', 'Das Buch liegt auf dem Tisch.']
BLEU: 55.84
chrF: 75.56


### Use Case: Translating Spanish News Headlines (ES → DE) from El País, ABC and La Vanguardia

Extraction by keywords in headlines

In [5]:
# EXTRACT HEADLINES AND EXPORT
from newspaper import build
import pandas as pd

# --- Step 1: Spanish news sites to crawl ---
news_sites = [
    "https://elpais.com",            # El País
    "https://www.abc.es",            # ABC
    "https://www.lavanguardia.com",  # La Vanguardia
]

# Keywords used to keep only politics/democracy-related headlines
keywords = [
    "democracia",
    "elección",
    "voto",
    "gobierno",
    "política",
    "partido",
    "PP",
    "elecciones",
]

# --- Step 2: Function to extract and filter headlines ---
def get_headlines(url, lang="es", max_articles=20, keywords=None):
    """
    Collect up to max_articles unique article titles from a news site.

    Parameters
    ----------
    url : str
        Site URL handed to newspaper's build().
    lang : str
        Language code passed to build().
    max_articles : int
        Maximum number of headlines to return. At most 20 times this many
        articles are tried, to leave room for duplicates, failed downloads
        and titles that do not match.
    keywords : iterable of str, optional
        A title is kept when it contains at least one keyword. Matching is a
        case-insensitive substring test, so "elección" also matches
        "selección". None or an empty collection disables the filter.

    Returns
    -------
    list of str
        Unique titles in the order they were encountered.
    """
    # Lower-case the keywords once instead of once per title.
    needles = [k.lower() for k in (keywords or [])]

    paper = build(url, language=lang, memoize_articles=False)
    headlines = []
    seen = set()
    for article in paper.articles[:max_articles * 20]:  # fetch extra to account for duplicates/failures
        try:
            article.download()
            article.parse()
            title = article.title
        except Exception:
            # Best effort: a single article that fails to download or parse is
            # skipped. KeyboardInterrupt is deliberately not caught here.
            continue

        if title and title not in seen:
            lowered = title.lower()
            if not needles or any(needle in lowered for needle in needles):
                seen.add(title)
                headlines.append(title)
        if len(headlines) >= max_articles:
            break
    return headlines

# --- Step 3: Query every site and merge the results ---
# dict.fromkeys drops repeated titles while keeping first-seen order.
all_headlines = list(dict.fromkeys(
    title
    for site in news_sites
    for title in get_headlines(site, lang="es", max_articles=10, keywords=keywords)
))

# --- Step 4: One row per headline, each tagged with its language ---
df = pd.DataFrame({
    "language": ["es" for _ in all_headlines],
    "headline": all_headlines,
})

# --- Step 5: Write the table to CSV ---
df.to_csv("../data/headlines_democracy.csv", index=False, encoding="utf-8-sig")
print(f"Exported {len(all_headlines)} unique democracy-related Spanish headlines from multiple sites")

Exported 28 unique democracy-related Spanish headlines from multiple sites


In [6]:
# Reload the exported CSV into `df` and preview the first rows.
df = pd.read_csv("../data/headlines_democracy.csv")
df.head()

Unnamed: 0,language,headline
0,es,El PP desdeña la comisión independiente sobre ...
1,es,El Gobierno carga contra el PP por su posición...
2,es,El Gobierno eleva el crecimiento económico de ...
3,es,iPhone iOS 26: las cinco novedades estrella de...
4,es,La Guía Gastro 2025: una selección de los rest...


In [7]:
# TRANSLATE HEADLINES INTO GERMAN

# Tip: `pip install sacremoses` to prevent the tokenizer's UserWarning.

# --- Step 1: Pull the Spanish headlines out of the DataFrame as a plain list ---
spanish_headlines = df["headline"].to_list()

# --- Step 2: Translation function ---
def translate_to_german(texts, model_name="Helsinki-NLP/opus-mt-es-de", batch_size=32):
    """
    Translate a collection of texts with a MarianMT checkpoint.

    Parameters
    ----------
    texts : iterable of str
        Sentences to translate. An empty collection returns [] without
        loading the model.
    model_name : str
        Checkpoint to load; defaults to the Spanish → German model.
    batch_size : int
        Number of texts tokenized and generated per step, so memory use does
        not grow with the total number of texts.

    Returns
    -------
    list of str
        One translation per input text, in input order.

    Raises
    ------
    ValueError
        If batch_size is smaller than 1.
    """
    texts = list(texts)
    if not texts:
        return []
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    translations = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        generated = model.generate(**inputs)
        translations.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
    return translations

# --- Step 3: Translate Spanish → German ---
# All headlines are passed to the translation function in a single call.
german_translations = translate_to_german(spanish_headlines)

# --- Step 4: Add German translations to DataFrame ---
# Column assignment from a list needs exactly one translation per row of df.
df["german"] = german_translations

In [8]:
# Preview the first rows, now including the "german" translation column.
df.head()

Unnamed: 0,language,headline,german
0,es,El PP desdeña la comisión independiente sobre ...,Die PP lehnt die unabhängige Gaza-Kommission a...
1,es,El Gobierno carga contra el PP por su posición...,Die Regierung bürgt gegen die PP für ihre Posi...
2,es,El Gobierno eleva el crecimiento económico de ...,Die Regierung erhöht das Wirtschaftswachstum d...
3,es,iPhone iOS 26: las cinco novedades estrella de...,iPhone iOS 26: die fünf Star-Neuheiten des neu...
4,es,La Guía Gastro 2025: una selección de los rest...,Der Gastro-Leitfaden 2025: eine Auswahl der Li...


Evaluation against an OPUS reference corpus

In [9]:
# PARSE TMX AND PREPARE SPANISH ↔ GERMAN PAIRS
import xml.etree.ElementTree as ET
import pandas as pd

tmx_file = "../data/de-es.tmx"  # path to your TMX file

# Fully qualified name under which ElementTree exposes the xml:lang attribute.
XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"


def load_tmx_pairs(path, src_prefix="es", tgt_prefix="de"):
    """
    Read aligned sentence pairs from a TMX file.

    Parameters
    ----------
    path : str
        Location of the TMX file.
    src_prefix, tgt_prefix : str
        Lower-case prefixes of the language codes to treat as source and
        target (prefix match, so "es" also accepts "es-ES").

    Returns
    -------
    (list of str, list of str)
        Source and target sentences, index-aligned. Translation units that
        lack a non-empty segment for either language are skipped.
    """
    sources, targets = [], []
    root = ET.parse(path).getroot()
    for tu in root.iter("tu"):
        src_text = tgt_text = None
        for tuv in tu.findall("tuv"):
            seg = tuv.find("seg")
            if seg is None:
                continue
            # itertext() also collects the text around inline markup inside
            # <seg>; seg.text alone stops at the first child element.
            text = "".join(seg.itertext()).strip()
            if not text:
                continue
            # Fall back to a plain "lang" attribute when xml:lang is absent.
            lang = (tuv.attrib.get(XML_LANG) or tuv.attrib.get("lang", "")).lower()
            if lang.startswith(src_prefix):
                src_text = text
            elif lang.startswith(tgt_prefix):
                tgt_text = text
        if src_text and tgt_text:
            sources.append(src_text)
            targets.append(tgt_text)
    return sources, targets


spanish_sentences, german_sentences = load_tmx_pairs(tmx_file)

# Create DataFrame
df_opus = pd.DataFrame({
    "spanish": spanish_sentences,
    "german": german_sentences
})

print(f"Loaded {len(df_opus)} Spanish-German sentence pairs from TMX")

Loaded 171674 Spanish-German sentence pairs from TMX


In [10]:
# Preview the first Spanish–German pairs parsed from the TMX file.
df_opus.head()

Unnamed: 0,spanish,german
0,¿El oro a 10.000 dólares?,Steigt Gold auf 10.000 Dollar?
1,SAN FRANCISCO – Nunca ha resultado fácil soste...,"SAN FRANCISCO – Es war noch nie leicht, ein ra..."
2,"Últimamente, con los precios del oro más de un...",In letzter Zeit allerdings ist dies schwierige...
3,"Apenas en el pasado mes de diciembre, mis cole...",Erst letzten Dezember verfassten meine Kollege...
4,¿Y saben qué?,"Und es kam, wie es kommen musste."


In [11]:
from sentence_transformers import SentenceTransformer, util
import sacrebleu
import torch

import re  # used only to escape the keywords before they are joined into a regex

# --- Parameters ---
max_opus_sentences = 5000   # max number of OPUS sentences to use
keywords = ["democracia", "elecciones", "voto", "gobierno", "política"]  # topic filter for OPUS

# --- Step 0: Filter OPUS by keyword, then keep the first max_opus_sentences hits ---
# Escape each keyword so it is matched literally even if it contains regex metacharacters.
keyword_pattern = "|".join(re.escape(k) for k in keywords)
keyword_hits = df_opus["spanish"].str.contains(keyword_pattern, case=False, na=False)
df_opus_small = df_opus[keyword_hits].head(max_opus_sentences).reset_index(drop=True)
if df_opus_small.empty:
    # argmax over an empty similarity matrix would fail with a less helpful error.
    raise ValueError("No OPUS sentence matches the keyword filter; cannot select references.")
print(f"Using {len(df_opus_small)} OPUS Spanish-German sentence pairs for evaluation.")

# --- Step 1: Load multilingual sentence transformer ---
model = SentenceTransformer("distiluse-base-multilingual-cased-v2")

# Encode headlines and OPUS Spanish sentences
headline_embs = model.encode(df["headline"].tolist(), convert_to_tensor=True)
opus_embs = model.encode(df_opus_small["spanish"].tolist(), convert_to_tensor=True)

# Move to GPU if available
if torch.cuda.is_available():
    headline_embs = headline_embs.to('cuda')
    opus_embs = opus_embs.to('cuda')

# --- Step 2: Compute batched cosine similarities ---
scores = util.cos_sim(headline_embs, opus_embs)  # shape: (num_headlines x num_opus)
idxs = scores.argmax(dim=1)                       # best match per headline

# --- Step 3: Get corresponding German references ---
# NOTE: each "reference" is the German side of the most similar OPUS sentence,
# not a translation of the headline itself, so the scores below measure overlap
# with a retrieved neighbour rather than translation quality.
closest_refs = df_opus_small["german"].iloc[idxs.tolist()].tolist()

# --- Step 4: Compute BLEU ---
hypotheses = df["german"].tolist()  # MarianMT translations
references = [closest_refs]         # sacrebleu expects a list of lists

bleu = sacrebleu.corpus_bleu(hypotheses, references)
print(f"BLEU score: {bleu.score:.2f}")

# --- Step 5: Optional: inspect first 5 examples ---
for i in range(min(5, len(df))):
    print("Headline (ES):", df["headline"].iloc[i])
    print("MT translation (DE):", hypotheses[i])
    print("Reference (DE):", closest_refs[i])
    print("---")


Using 5000 OPUS Spanish-German sentence pairs for evaluation.
BLEU score: 0.71
Headline (ES): El PP desdeña la comisión independiente sobre Gaza: “No le compete ni a la ONU ni al presidente decidir lo que es o no es un genocidio”
MT translation (DE): Die PP lehnt die unabhängige Gaza-Kommission ab: Es ist weder Sache der UNO noch des Präsidenten zu entscheiden, was ein Völkermord ist.
Reference (DE): Sogar der ehemalige US-Vizepräsident und Nobelpreisträger Al Gore – der sich einst gerühmt hatte die entscheidende Stimme für die Unterstützung von Ethanol abgegeben zu haben – bezeichnet die Politik als „einen Fehler“.
---
Headline (ES): El Gobierno carga contra el PP por su posición sobre Gaza: “No es ignorancia, es mala fe. Les va a perseguir toda la vida”
MT translation (DE): Die Regierung bürgt gegen die PP für ihre Position gegenüber Gaza: Es ist keine Unwissenheit, es ist ein böser Glaube. Sie wird ihr Leben lang verfolgen.
Reference (DE): In der Meinung, dass die Situation in den l

In [12]:
# --- Step 4b: chrF on the same hypothesis/reference pairs ---
chrf = sacrebleu.corpus_chrf(hypotheses, references)
print(f"chrF score: {chrf.score:.2f}")

# --- Step 5: Optional: show the first five headline / translation / reference triples ---
preview = zip(df["headline"].head(5), hypotheses, closest_refs)
for headline_es, mt_de, ref_de in preview:
    print("Headline (ES):", headline_es)
    print("MT translation (DE):", mt_de)
    print("Reference (DE):", ref_de)
    print("---")

chrF score: 19.65
Headline (ES): El PP desdeña la comisión independiente sobre Gaza: “No le compete ni a la ONU ni al presidente decidir lo que es o no es un genocidio”
MT translation (DE): Die PP lehnt die unabhängige Gaza-Kommission ab: Es ist weder Sache der UNO noch des Präsidenten zu entscheiden, was ein Völkermord ist.
Reference (DE): Sogar der ehemalige US-Vizepräsident und Nobelpreisträger Al Gore – der sich einst gerühmt hatte die entscheidende Stimme für die Unterstützung von Ethanol abgegeben zu haben – bezeichnet die Politik als „einen Fehler“.
---
Headline (ES): El Gobierno carga contra el PP por su posición sobre Gaza: “No es ignorancia, es mala fe. Les va a perseguir toda la vida”
MT translation (DE): Die Regierung bürgt gegen die PP für ihre Position gegenüber Gaza: Es ist keine Unwissenheit, es ist ein böser Glaube. Sie wird ihr Leben lang verfolgen.
Reference (DE): In der Meinung, dass die Situation in den ländlichen Gebieten ohnehin nicht mehr schlimmer werden kann, 

In [13]:
from bert_score import score
# --- Step 1: Compute BERTScore ---
# Multilingual BERT is used as the scoring model; run on the GPU when one is present.
bert_device = "cuda" if torch.cuda.is_available() else "cpu"
P, R, F1 = score(
    df["german"].tolist(),        # hypotheses: MarianMT output
    closest_refs,                 # references: retrieved OPUS German sentences
    lang="de",
    model_type="bert-base-multilingual-cased",
    device=bert_device,
)

# --- Step 2: Report the mean of each score over all headlines ---
for label, values in (
    ("Precision", P),
    ("Recall", R),
    ("F1 (semantic similarity)", F1),
):
    print(f"BERTScore {label}:", values.mean().item())

# --- Step 3: Optional: per-headline view of the first five examples ---
n_preview = min(5, len(df))
rows = zip(df["headline"].iloc[:n_preview], df["german"].iloc[:n_preview], closest_refs)
for i, (headline_es, mt_de, ref_de) in enumerate(rows):
    print("Headline (ES):", headline_es)
    print("MT translation (DE):", mt_de)
    print("Reference (DE):", ref_de)
    print("Semantic similarity (F1):", F1[i].item())
    print("---")


BERTScore Precision: 0.6632214784622192
BERTScore Recall: 0.6643345952033997
BERTScore F1 (semantic similarity): 0.6634325385093689
Headline (ES): El PP desdeña la comisión independiente sobre Gaza: “No le compete ni a la ONU ni al presidente decidir lo que es o no es un genocidio”
MT translation (DE): Die PP lehnt die unabhängige Gaza-Kommission ab: Es ist weder Sache der UNO noch des Präsidenten zu entscheiden, was ein Völkermord ist.
Reference (DE): Sogar der ehemalige US-Vizepräsident und Nobelpreisträger Al Gore – der sich einst gerühmt hatte die entscheidende Stimme für die Unterstützung von Ethanol abgegeben zu haben – bezeichnet die Politik als „einen Fehler“.
Semantic similarity (F1): 0.6580751538276672
---
Headline (ES): El Gobierno carga contra el PP por su posición sobre Gaza: “No es ignorancia, es mala fe. Les va a perseguir toda la vida”
MT translation (DE): Die Regierung bürgt gegen die PP für ihre Position gegenüber Gaza: Es ist keine Unwissenheit, es ist ein böser Glau

✅ Summary:

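The MarianMT Spanish → German translations score very low on word-level overlap (BLEU 0.71) and moderately on BERTScore (F1 ≈ 0.66). These numbers must be read with care: each reference is the German side of the most similar OPUS sentence, not a translation of the headline itself, so the scores measure similarity to a retrieved neighbour rather than translation accuracy.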

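Character-level similarity (chrF 19.65) is also low, which is expected when hypothesis and reference are different sentences; headline brevity, variations in punctuation, and morphology reduce the overlap further.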

The alignment against a news-domain OPUS corpus, subcorpus News_comments, ensures that evaluation reflects realistic headline contexts rather than unrelated technical sentences.

💡 Takeaway:
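The inspected examples suggest that the MT pipeline produces understandable translations, which would make it useful for tasks like multilingual news monitoring, headline summarization, or semantic analysis. The automatic scores above cannot confirm faithfulness in meaning, however, because no true reference translations of the headlines were available.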


### Visualising Metrics

In [14]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np
from sentence_transformers import util

# --- Step 0: Corpus-level metrics ---
df_metrics = pd.DataFrame({
    "Metric": ["BLEU", "chrF", "BERTScore_F1"],
    "Score": [bleu.score, chrf.score, F1.mean().item() * 100]  # scale BERTScore F1 to %
})

# --- Step 1: Bar chart for corpus-level metrics ---
fig_bar = px.bar(
    df_metrics, x="Metric", y="Score",
    text="Score",
    color="Metric",
    title="Corpus-level MT Evaluation Metrics",
    range_y=[0,100],
    color_discrete_sequence=px.colors.sequential.Viridis
)
fig_bar.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig_bar.show()

# --- Step 2: Histogram of BERTScore F1 per headline ---
bert_f1_pct = F1.detach().cpu().numpy() * 100
fig_hist = px.histogram(
    x=bert_f1_pct,
    nbins=10,
    title="Distribution of BERTScore F1 per Headline",
    labels={"x":"BERTScore F1 (%)", "y":"Number of Headlines"},
    color_discrete_sequence=["skyblue"]
)
fig_hist.update_layout(bargap=0.2)
fig_hist.show()

# --- Step 3: Scatter plot: max cosine similarity vs BERTScore F1 per headline ---
headline_embs_cpu = headline_embs.detach().cpu()
opus_embs_cpu = opus_embs.detach().cpu()
# One batched similarity matrix (headlines x OPUS sentences) and a row-wise max,
# instead of one cos_sim call per headline in a Python loop.
cos_scores = util.cos_sim(headline_embs_cpu, opus_embs_cpu).max(dim=1).values.tolist()

fig_scatter = px.scatter(
    x=cos_scores,
    y=bert_f1_pct,
    labels={"x":"Max Cosine Similarity to OPUS Spanish Sentences", "y":"BERTScore F1 (%)"},
    title="Semantic Similarity vs BERTScore F1 per Headline",
    hover_data={"Headline": df["headline"].tolist()}
)
fig_scatter.show()
