# Explore BLEU Score Computation

Compute BLEU scores for translation quality assessment.

We have:
- `en_translation`: French text generated from the English source (EN → FR)
- `fr_translation`: English text generated from the French source (FR → EN)
- `fr_text`: Reference French
- `en_text`: Reference English

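We'll compute sentence-level BLEU in both directions (hypothesis vs. reference):
- **EN→FR BLEU**: Compare `en_translation` (hypothesis) with `fr_text` (reference)
- **FR→EN BLEU**: Compare `fr_translation` (hypothesis) with `en_text` (reference)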

## 1. Install and Import Libraries

In [None]:
# Install sacrebleu if needed (standard BLEU implementation)
# !pip install sacrebleu

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import pandas as pd
from sacrebleu import sentence_bleu

# Apply seaborn's "whitegrid" style to the plots drawn later in this notebook.
sns.set_style("whitegrid")
print("✓ Libraries imported")

## 2. Load TDA Results

Load results that contain both translations and original texts.

In [None]:
# Read the pickled TDA results; each record carries the generated
# translations alongside the reference texts.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
data_path = Path("../data/tda_results/tda_results_last_layer_filtered.pkl")

print(f"Loading data from {data_path}...")
with data_path.open('rb') as handle:
    results = pickle.load(handle)

print(f"✓ Loaded {len(results)} sentence pairs")
print()

# Show the available fields and one example record.
first_record = results[0]
print("Data structure:")
print(f"Keys: {list(first_record.keys())}")
print()
print("Sample:")
print(f"EN text:         {first_record['en_text']}")
print(f"FR text:         {first_record['fr_text']}")
print(f"EN→FR (generated): {first_record['en_translation']}")
print(f"FR→EN (generated): {first_record['fr_translation']}")

## 3. Compute BLEU Scores for Sample Examples

In [None]:
def compute_bleu_scores(en_text, fr_text, en_translation, fr_translation):
    """
    Score one sentence pair with sentence-level BLEU in both directions.

    Args:
        en_text: Reference English text (reference for FR→EN)
        fr_text: Reference French text (reference for EN→FR)
        en_translation: French generated from the English text (EN→FR hypothesis)
        fr_translation: English generated from the French text (FR→EN hypothesis)

    Returns:
        dict with keys 'bleu_en_fr', 'bleu_fr_en' and 'bleu_avg', where
        'bleu_avg' is the arithmetic mean of the two directional scores
    """
    # Each direction pairs a generated hypothesis with its single reference.
    hypothesis_and_reference = {
        'bleu_en_fr': (en_translation, fr_text),
        'bleu_fr_en': (fr_translation, en_text),
    }

    scores = {
        key: sentence_bleu(hypothesis, [reference]).score
        for key, (hypothesis, reference) in hypothesis_and_reference.items()
    }
    scores['bleu_avg'] = (scores['bleu_en_fr'] + scores['bleu_fr_en']) / 2

    return scores


print("✓ Function defined")

In [None]:
# Smoke-test the scorer on the first few sentence pairs and print each one
# for manual inspection. The count is clamped to the dataset size so that a
# results file with fewer than 10 pairs does not raise IndexError.
print("Testing BLEU computation on first 10 examples:")
print("=" * 80)

for i in range(min(10, len(results))):
    example = results[i]
    scores = compute_bleu_scores(
        en_text=example['en_text'],
        fr_text=example['fr_text'],
        en_translation=example['en_translation'],
        fr_translation=example['fr_translation']
    )

    print(f"\n[{i}] EN→FR: {scores['bleu_en_fr']:.2f}, FR→EN: {scores['bleu_fr_en']:.2f}, Avg: {scores['bleu_avg']:.2f}")
    # Only the first 70 characters of each reference text are shown.
    print(f"    EN: {example['en_text'][:70]}...")
    print(f"    FR: {example['fr_text'][:70]}...")

## 4. Compute BLEU for All Examples

In [None]:
# Score every sentence pair and collect one row per pair.
total_pairs = len(results)
print(f"Computing BLEU scores for {total_pairs} sentence pairs...")

bleu_results = []
for done, record in enumerate(results, start=1):
    # 'idx' is the pair's position in `results`, so rows can be traced back.
    row = {'idx': done - 1}
    row.update(
        compute_bleu_scores(
            en_text=record['en_text'],
            fr_text=record['fr_text'],
            en_translation=record['en_translation'],
            fr_translation=record['fr_translation'],
        )
    )
    bleu_results.append(row)

    # Progress message every 500 pairs.
    if done % 500 == 0:
        print(f"  Processed {done}/{total_pairs}")

print(f"✓ Computed BLEU scores for all {len(bleu_results)} pairs")

# One DataFrame row per sentence pair.
df_bleu = pd.DataFrame(bleu_results)
print("\nDataFrame:")
print(df_bleu.head())

## 5. Summary Statistics

In [None]:
# Print min / max / mean / median / std for each BLEU column.
banner = "=" * 70
print(banner)
print("BLEU SCORE STATISTICS")
print(banner)
print()

# (section heading, DataFrame column), in display order.
stat_sections = [
    ("EN→FR BLEU:", 'bleu_en_fr'),
    ("FR→EN BLEU:", 'bleu_fr_en'),
    ("Average BLEU:", 'bleu_avg'),
]

for position, (heading, column) in enumerate(stat_sections):
    # A blank line separates sections; none is printed after the last one.
    if position > 0:
        print()
    column_values = df_bleu[column]
    print(heading)
    print(f"  Min:    {column_values.min():.2f}")
    print(f"  Max:    {column_values.max():.2f}")
    print(f"  Mean:   {column_values.mean():.2f}")
    print(f"  Median: {column_values.median():.2f}")
    print(f"  Std:    {column_values.std():.2f}")

## 6. Visualize BLEU Score Distribution

In [None]:
# One histogram per BLEU column, side by side, each with its mean marked.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (DataFrame column, bar colour, panel title), left to right.
panels = [
    ('bleu_en_fr', 'blue', 'EN→FR BLEU Distribution'),
    ('bleu_fr_en', 'green', 'FR→EN BLEU Distribution'),
    ('bleu_avg', 'purple', 'Average BLEU Distribution'),
]

for ax, (column, colour, title) in zip(axes, panels):
    values = df_bleu[column]
    mean_value = values.mean()

    ax.hist(values, bins=50, alpha=0.7, color=colour, edgecolor='black')
    # Dashed red line at the column mean, labelled in the legend.
    ax.axvline(mean_value, color='red', linestyle='--',
               label=f'Mean: {mean_value:.2f}')
    ax.set_xlabel('BLEU Score')
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Best and Worst Translations

In [None]:
# Rank all pairs by average BLEU, highest first.
df_bleu_sorted = df_bleu.sort_values('bleu_avg', ascending=False)


def _print_ranked_pairs(ranked_rows):
    """Print scores, references and generated texts for each row of `ranked_rows`."""
    for rank, row in enumerate(ranked_rows.itertuples(index=False), start=1):
        idx = int(row.idx)
        example = results[idx]

        print(f"\n[{rank}] Pair {idx}: Avg BLEU = {row.bleu_avg:.2f} (EN→FR: {row.bleu_en_fr:.2f}, FR→EN: {row.bleu_fr_en:.2f})")
        print(f"    EN: {example['en_text']}")
        print(f"    FR: {example['fr_text']}")
        print(f"    Generated FR: {example['en_translation']}")
        print(f"    Generated EN: {example['fr_translation']}")


# head()/tail() return fewer rows when the DataFrame is short, so a dataset
# with fewer than 5 pairs no longer raises IndexError.
print("=" * 70)
print("BEST TRANSLATIONS (Highest BLEU)")
print("=" * 70)
_print_ranked_pairs(df_bleu_sorted.head(5))

print("\n" + "=" * 70)
print("WORST TRANSLATIONS (Lowest BLEU)")
print("=" * 70)
# Reverse the tail so the lowest-scoring pair is listed first.
_print_ranked_pairs(df_bleu_sorted.tail(5).iloc[::-1])

## 8. Correlation Between EN→FR and FR→EN BLEU

In [None]:
# Scatter EN→FR against FR→EN scores, with the y=x diagonal for reference.
en_fr_scores = df_bleu['bleu_en_fr']
fr_en_scores = df_bleu['bleu_fr_en']

corr_fig, corr_ax = plt.subplots(figsize=(8, 8))
corr_ax.scatter(en_fr_scores, fr_en_scores, alpha=0.3, s=10)
corr_ax.set_xlabel('EN→FR BLEU')
corr_ax.set_ylabel('FR→EN BLEU')
corr_ax.set_title('Correlation Between Translation Directions')
corr_ax.plot([0, 100], [0, 100], 'r--', alpha=0.5, label='y=x')
corr_ax.grid(alpha=0.3)
corr_ax.legend()
corr_fig.tight_layout()
plt.show()

# Correlation between the two directional scores (pandas Series.corr default method).
corr = en_fr_scores.corr(fr_en_scores)
print(f"Correlation between EN→FR and FR→EN BLEU: r = {corr:.4f}")