# Explore TDA-BLEU Correlation

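Combine topological similarity (measured by Wasserstein distance, where a lower distance means more similar topology) with translation quality (BLEU scores, where a higher score means a better translation) to test our hypothesis: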

**Does topological similarity between English and French attention patterns predict translation quality?**

## 1. Import Libraries

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import pandas as pd
from scipy import stats
from scipy.stats import spearmanr, pearsonr

# Plot defaults applied to the figures created after this cell.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ Libraries imported")

## 2. Load TDA Results

In [None]:
# Read the pickled TDA records and flatten the fields used below into a DataFrame.
tda_path = Path("../data/tda_results/tda_results_last_layer_filtered.pkl")

print(f"Loading TDA results from {tda_path}...")
# NOTE: unpickling can execute arbitrary code; only load files you produced or trust.
with tda_path.open('rb') as handle:
    tda_results = pickle.load(handle)

print(f"✓ Loaded {len(tda_results)} TDA results")

# Fields copied from every record, in the column order of the resulting table.
tda_columns = [
    'idx',
    'wasserstein_distance',
    'wasserstein_h0',
    'wasserstein_h1',
    'en_num_tokens',
    'fr_num_tokens',
    'en_h0_features',
    'en_h1_features',
    'fr_h0_features',
    'fr_h1_features',
]

# Indexing with rec[col] (not .get) keeps the KeyError if a record lacks a field.
df_tda = pd.DataFrame([
    {col: rec[col] for col in tda_columns}
    for rec in tda_results
])

print(f"\nTDA DataFrame shape: {df_tda.shape}")
print(df_tda.head())

## 3. Load BLEU Scores

In [None]:
# Read the BLEU score table from CSV and show a quick preview.
bleu_path = Path("../data/bleu_scores.csv")

print(f"Loading BLEU scores from {bleu_path}...")
df_bleu = pd.read_csv(bleu_path)

# One print call, newline-separated, emits the same three outputs in order.
print(
    f"✓ Loaded {len(df_bleu)} BLEU scores",
    f"\nBLEU DataFrame shape: {df_bleu.shape}",
    df_bleu.head(),
    sep="\n",
)

## 4. Merge Datasets

In [None]:
# Merge on idx. This is an inner join: only pairs present in BOTH tables are
# kept, so the coverage is reported below instead of being lost silently.
df = pd.merge(df_tda, df_bleu, on='idx')

# Pairs that exist in only one of the two tables are dropped by the inner join.
tda_only = int((~df_tda['idx'].isin(df_bleu['idx'])).sum())
bleu_only = int((~df_bleu['idx'].isin(df_tda['idx'])).sum())
# A repeated key on either side multiplies rows in the merged table.
dup_keys = int(df_tda['idx'].duplicated().sum() + df_bleu['idx'].duplicated().sum())

print(f"✓ Merged DataFrame shape: {df.shape}")
if tda_only or bleu_only:
    print(f"  ⚠ Dropped by inner join: {tda_only} TDA-only and {bleu_only} BLEU-only pairs")
if dup_keys:
    print(f"  ⚠ {dup_keys} duplicate 'idx' values found; duplicated pairs are over-weighted in the statistics below")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
print(df.head())

## 5. Summary Statistics

In [None]:
# Mean and standard deviation of the Wasserstein distance and of each BLEU column.
banner = "=" * 70
print(banner)
print("SUMMARY STATISTICS")
print(banner)
print()

wdist = df['wasserstein_distance']
print("Wasserstein Distance:")
# Labels carry their own trailing padding so the values line up.
for stat_label, stat_value in (("Mean:", wdist.mean()), ("Std: ", wdist.std())):
    print(f"  {stat_label} {stat_value:.6f}")
print()

print("BLEU Scores:")
bleu_columns = (
    ("EN→FR", 'bleu_en_fr'),
    ("FR→EN", 'bleu_fr_en'),
    ("Avg", 'bleu_avg'),
)
for label, column in bleu_columns:
    scores = df[column]
    # Left-pad the label to 5 characters so "Avg" aligns with the arrow labels.
    print(f"  {label:<5} - Mean: {scores.mean():.2f}, Std: {scores.std():.2f}")

## 6. Correlation Analysis

In [None]:
# Compute correlations between Wasserstein distance and BLEU scores.
# The pearson_* / p_pearson_* names are reused by the scatter-plot cell.
print("=" * 70)
print("CORRELATION ANALYSIS: Wasserstein Distance vs BLEU")
print("=" * 70)
print()

# Pearson correlation (linear relationship)
pearson_en_fr, p_pearson_en_fr = pearsonr(df['wasserstein_distance'], df['bleu_en_fr'])
pearson_fr_en, p_pearson_fr_en = pearsonr(df['wasserstein_distance'], df['bleu_fr_en'])
pearson_avg, p_pearson_avg = pearsonr(df['wasserstein_distance'], df['bleu_avg'])

print("Pearson Correlation (linear):")
print(f"  Wasserstein vs EN→FR BLEU: r = {pearson_en_fr:.4f}, p = {p_pearson_en_fr:.2e}")
print(f"  Wasserstein vs FR→EN BLEU: r = {pearson_fr_en:.4f}, p = {p_pearson_fr_en:.2e}")
print(f"  Wasserstein vs Avg BLEU:   r = {pearson_avg:.4f}, p = {p_pearson_avg:.2e}")
print()

# Spearman correlation (monotonic relationship)
spearman_en_fr, p_spearman_en_fr = spearmanr(df['wasserstein_distance'], df['bleu_en_fr'])
spearman_fr_en, p_spearman_fr_en = spearmanr(df['wasserstein_distance'], df['bleu_fr_en'])
spearman_avg, p_spearman_avg = spearmanr(df['wasserstein_distance'], df['bleu_avg'])

print("Spearman Correlation (monotonic):")
print(f"  Wasserstein vs EN→FR BLEU: ρ = {spearman_en_fr:.4f}, p = {p_spearman_en_fr:.2e}")
print(f"  Wasserstein vs FR→EN BLEU: ρ = {spearman_fr_en:.4f}, p = {p_spearman_fr_en:.2e}")
print(f"  Wasserstein vs Avg BLEU:   ρ = {spearman_avg:.4f}, p = {p_spearman_avg:.2e}")
print()

# Interpretation (based on the Pearson r against average BLEU)
alpha = 0.05  # conventional significance threshold

print("Interpretation:")
if np.isnan(pearson_avg):
    # A NaN coefficient (e.g. constant or NaN-containing input) fails every
    # "<" comparison below and would otherwise be labelled "strong and positive".
    print("  Correlation is undefined (constant or missing input); no conclusion can be drawn.")
else:
    magnitude = abs(pearson_avg)
    if magnitude < 0.1:
        strength = "negligible"
    elif magnitude < 0.3:
        strength = "weak"
    elif magnitude < 0.5:
        strength = "moderate"
    else:
        strength = "strong"

    direction = "negative" if pearson_avg < 0 else "positive"
    print(f"  Overall correlation is {strength} and {direction}.")

    # Only state a direction when the coefficient is distinguishable from zero.
    if p_pearson_avg >= alpha:
        print(f"  → Not statistically significant at α = {alpha} (p = {p_pearson_avg:.2e}); the data do not support a directional claim.")
    elif pearson_avg < 0:
        print(f"  → Lower Wasserstein distance (more similar topology) is associated with higher BLEU (better translation).")
    else:
        print(f"  → Higher Wasserstein distance (more different topology) is associated with higher BLEU (better translation).")

## 7. Scatter Plots: Wasserstein Distance vs BLEU

In [None]:
# Three side-by-side scatter panels of Wasserstein distance against each BLEU
# variant, each with a least-squares straight line as a visual trend guide.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# One entry per panel: (BLEU column, display name, Pearson r, p-value).
panels = [
    ('bleu_en_fr', 'EN→FR BLEU', pearson_en_fr, p_pearson_en_fr),
    ('bleu_fr_en', 'FR→EN BLEU', pearson_fr_en, p_pearson_fr_en),
    ('bleu_avg', 'Average BLEU', pearson_avg, p_pearson_avg),
]
distances = df['wasserstein_distance']

for ax, (column, name, r_val, p_val) in zip(axes, panels):
    scores = df[column]

    ax.scatter(distances, scores, alpha=0.3, s=10)
    ax.set_xlabel('Wasserstein Distance\n(Lower = More Topologically Similar)')
    ax.set_ylabel(f'{name} Score')
    ax.set_title(f'Wasserstein vs {name}\nr = {r_val:.3f}, p = {p_val:.2e}')
    ax.grid(alpha=0.3)

    # Degree-1 polynomial fit, evaluated at the observed x values.
    trend = np.poly1d(np.polyfit(distances, scores, 1))
    ax.plot(distances, trend(distances), "r--", alpha=0.5, label='Trend')
    ax.legend()

plt.tight_layout()
plt.show()

## 8. H0 vs H1 Contribution

In [None]:
# Correlate the H0 and H1 Wasserstein components with average BLEU separately,
# then plot each component against average BLEU with a linear trend line.
banner = "=" * 70
print(banner)
print("H0 vs H1 CONTRIBUTION")
print(banner)
print()

avg_bleu = df['bleu_avg']

# H0 correlations
pearson_h0, p_h0 = pearsonr(df['wasserstein_h0'], avg_bleu)
print("H0 (Connected Components) vs Avg BLEU:")
print(f"  Pearson r = {pearson_h0:.4f}, p = {p_h0:.2e}")
print()

# H1 correlations
pearson_h1, p_h1 = pearsonr(df['wasserstein_h1'], avg_bleu)
print("H1 (Loops/Holes) vs Avg BLEU:")
print(f"  Pearson r = {pearson_h1:.4f}, p = {p_h1:.2e}")
print()

# Visualize: one panel per homology dimension.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

components = (
    ('H0', 'wasserstein_h0', pearson_h0, p_h0),
    ('H1', 'wasserstein_h1', pearson_h1, p_h1),
)
for ax, (dim, column, r_val, p_val) in zip(axes, components):
    component = df[column]

    ax.scatter(component, avg_bleu, alpha=0.3, s=10)
    ax.set_xlabel(f'{dim} Wasserstein Distance')
    ax.set_ylabel('Average BLEU Score')
    ax.set_title(f'{dim} vs BLEU\nr = {r_val:.3f}, p = {p_val:.2e}')
    ax.grid(alpha=0.3)

    # Degree-1 polynomial fit drawn as a dashed red guide (no legend entry).
    trend = np.poly1d(np.polyfit(component, avg_bleu, 1))
    ax.plot(component, trend(component), "r--", alpha=0.5)

plt.tight_layout()
plt.show()

## 9. Binned Analysis

In [None]:
# Bin by Wasserstein distance (equal-count quantile bins, lowest distance first)
# and compare mean BLEU scores across the bins.
bin_labels = ['Very Similar', 'Similar', 'Moderate', 'Dissimilar', 'Very Dissimilar']
# NOTE: pd.qcut raises ValueError if heavy ties make the quantile edges non-unique.
df['w_bin'] = pd.qcut(df['wasserstein_distance'], q=len(bin_labels), labels=bin_labels)

print("=" * 70)
print("BINNED ANALYSIS: Mean BLEU by Topological Similarity")
print("=" * 70)
print()

# 'w_bin' is categorical: pass observed=False explicitly to keep the current
# behaviour (all categories listed) and avoid the pandas FutureWarning about
# the changing default. The grouping is built once and reused below.
by_bin = df.groupby('w_bin', observed=False)

binned_stats = by_bin.agg({
    'bleu_avg': ['mean', 'std', 'count'],
    'wasserstein_distance': ['mean', 'std']
})

print(binned_stats)
print()

# Visualize: bar height = mean BLEU per bin, error bar = standard deviation.
fig, ax = plt.subplots(figsize=(10, 6))

bin_means = by_bin['bleu_avg'].mean()
bin_stds = by_bin['bleu_avg'].std()

ax.bar(range(len(bin_means)), bin_means, yerr=bin_stds, capsize=5, alpha=0.7, edgecolor='black')
ax.set_xticks(range(len(bin_means)))
ax.set_xticklabels(bin_means.index, rotation=45, ha='right')
ax.set_xlabel('Topological Similarity (Wasserstein Distance Bins)')
ax.set_ylabel('Mean BLEU Score')
ax.set_title('Translation Quality by Topological Similarity')
ax.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 10. Examples: High Similarity vs Low Similarity

In [None]:
# Sort by Wasserstein distance (ascending: most topologically similar first)
df_sorted = df.sort_values('wasserstein_distance')

# Each TDA record carries its own 'idx'. Look records up by that key rather
# than by list position: positions only equal 'idx' if no pair was filtered out.
tda_by_idx = {int(rec['idx']): rec for rec in tda_results}

n_examples = 5  # pairs shown per section; fewer are shown if df is smaller


def _print_examples(title, rows):
    """Print a banner followed by W, BLEU and the EN/FR texts for each row.

    title: heading printed between the two banner lines.
    rows:  DataFrame slice with 'idx', 'wasserstein_distance' and 'bleu_avg'
           columns, in the order the examples should be listed.
    """
    print("=" * 70)
    print(title)
    print("=" * 70)
    for rank, (_, row) in enumerate(rows.iterrows(), start=1):
        pair_idx = int(row['idx'])
        print(f"\n[{rank}] Pair {pair_idx}: W = {row['wasserstein_distance']:.4f}, BLEU = {row['bleu_avg']:.2f}")

        # Get original text from TDA results
        original = tda_by_idx[pair_idx]
        print(f"    EN: {original['en_text']}")
        print(f"    FR: {original['fr_text']}")


_print_examples(
    "HIGH TOPOLOGICAL SIMILARITY (Low Wasserstein Distance)",
    df_sorted.head(n_examples),
)

print()  # blank line between the two sections

# Reverse the tail so the largest distance is listed first.
_print_examples(
    "LOW TOPOLOGICAL SIMILARITY (High Wasserstein Distance)",
    df_sorted.tail(n_examples).iloc[::-1],
)

## Summary

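**Key Findings (what this notebook measures):**
- Pearson and Spearman correlation between topological similarity (Wasserstein distance; lower = more similar) and translation quality (BLEU; higher = better)
- Separate contributions of H0 (connected components) and H1 (loops/holes) to that relationship
- Binned analysis showing how mean BLEU changes across five quantile bins of Wasserstein distance, from most to least similar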

**Hypothesis Test:**
Does lower Wasserstein distance (more topologically similar attention patterns) predict higher BLEU scores (better translation quality)?