In [None]:
pip install evaluate transformers bert-score sacrebleu rouge-score --quiet

In [None]:
# Sample Evaluation Script

In [None]:
from transformers import pipeline
import evaluate
import time
import pandas as pd

# End-to-end evaluation of an English -> French translation pipeline:
# translate a few sample sentences, time every call, then score the output
# against the reference translations with BLEU, ROUGE-L, TER and BERTScore.

# 1. Sample translation data: English source paired with a French reference
data = [
    {
        "source": "The weather is nice today.",
        "reference": "Il fait beau aujourd'hui."
    },
    {
        "source": "Welcome to our customer service.",
        "reference": "Bienvenue à notre service client."
    },
    {
        "source": "I would like to book a flight to Paris.",
        "reference": "Je voudrais réserver un vol pour Paris."
    }
]

# 2. Load a translation pipeline (Helsinki-NLP for en-fr)
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr")

predictions = []
latencies = []

# 3. Run translation with latency measurement
for item in data:
    # perf_counter() is a monotonic, high-resolution clock intended for timing
    # short intervals; time.time() follows the wall clock and can jump.
    start_time = time.perf_counter()
    translated = translator(item["source"], max_length=128)[0]["translation_text"]
    elapsed_seconds = time.perf_counter() - start_time
    latency = round(elapsed_seconds * 1000, 2)  # in ms

    predictions.append(translated)
    latencies.append(latency)

# 4. Extract references
references = [item["reference"] for item in data]
# The sacreBLEU-based metrics (BLEU, TER) take a list of references per
# prediction, so wrap each single reference in its own list once and reuse it.
reference_lists = [[ref] for ref in references]

# 5. Load evaluation metrics
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
ter = evaluate.load("ter")
bertscore = evaluate.load("bertscore")

# 6. Compute all metrics
bleu_score = bleu.compute(predictions=predictions, references=reference_lists)
# NOTE(review): ROUGE is run with its default tokenizer; confirm it treats
# accented French characters the way you expect before relying on this number.
rouge_score = rouge.compute(predictions=predictions, references=references)
ter_score = ter.compute(predictions=predictions, references=reference_lists)
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="fr")

# 7. Output summary
# BERTScore returns one F1 value per sentence, so it is averaged here; the
# other metrics already return a single corpus-level number.
summary = {
    "BLEU": bleu_score["score"],
    "ROUGE-L": rouge_score["rougeL"],
    "TER": ter_score["score"],
    "BERTScore (F1 avg)": sum(bertscore_result["f1"]) / len(bertscore_result["f1"]),
    "Avg Latency (ms)": sum(latencies) / len(latencies)
}

print("\n📊 Evaluation Summary:")
for k, v in summary.items():
    print(f"{k}: {v:.2f}")

# 8. Optional: Show results in a table (one row per input sentence)
df = pd.DataFrame({
    "Source": [item["source"] for item in data],
    "Reference": references,
    "Prediction": predictions,
    "Latency (ms)": latencies
})

# "\n" (not "\P", which is an invalid escape that printed a literal backslash)
print("\nPer-Sentence Results:")
print(df)


In [None]:
# sample output 

In [None]:
# A sample output:
#
# Evaluation Summary:
# BLEU: 41.23
# ROUGE-L: 0.59
# TER: 0.31
# BERTScore (F1 avg): 0.87
# Avg Latency (ms): 128.45
#
# Per-Sentence Results:
#                                     Source                                Reference                            Prediction  Latency (ms)
# 0               The weather is nice today.                Il fait beau aujourd'hui.    Le temps est agréable aujourd'hui.        130.24
# 1         Welcome to our customer service.        Bienvenue à notre service client.  Bienvenue dans notre service client.        127.14
# 2  I would like to book a flight to Paris.  Je voudrais réserver un vol pour Paris.  Je souhaite réserver un vol à Paris.        128.34


In [None]:
# Next steps to explore for more effective performance monitoring:
#
# 1. Add support for multiple languages.
#
# 2. Visualize metrics with matplotlib or seaborn.
#
# 3. Integrate with wandb or MLflow for tracking.