# Explore TDA-BLEU Correlation

Combine a measure of topological similarity (Wasserstein distance, where a lower distance means more similar topology) with translation quality (BLEU scores) to test our hypothesis:

**Does topological similarity between English and French attention patterns predict translation quality?**

## 1. Import Libraries

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import pandas as pd
from scipy import stats
from scipy.stats import spearmanr, pearsonr
import warnings

# The TDA tooling warns about infinite ("non-finite") death times in
# persistence diagrams; mute only that message and leave other warnings on.
warnings.filterwarnings(action='ignore', message='.*non-finite death times.*')

# Notebook-wide plotting defaults: seaborn grid theme, wide default canvas.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ Libraries imported")

## 2. Load TDA Results

In [None]:
# Pickled list of per-pair TDA records produced earlier in the pipeline.
tda_path = Path("../data/tda_results_fr_en/tda_results_last_layer_filtered.pkl")
print(f"Loading TDA results from {tda_path}...")

# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# only load pickles generated by this project's own pipeline.
with tda_path.open('rb') as handle:
    tda_results = pickle.load(handle)

print(f"✓ Loaded {len(tda_results)} TDA results")

# Record fields copied into the analysis table, in column order. The text
# fields of each record are intentionally left out; they are read straight
# from `tda_results` when example pairs are printed.
tda_fields = (
    'idx',
    'wasserstein_distance',
    'wasserstein_h0',
    'wasserstein_h1',
    'en_num_tokens',
    'fr_num_tokens',
    'en_h0_features',
    'en_h1_features',
    'fr_h0_features',
    'fr_h1_features',
)
df_tda = pd.DataFrame(
    [{field: record[field] for field in tda_fields} for record in tda_results]
)

print(f"\nTDA DataFrame shape: {df_tda.shape}")
print(df_tda.head())

## 3. Load BLEU Scores

In [None]:
# CSV of BLEU scores; later cells read the 'idx', 'bleu_en_fr', 'bleu_fr_en'
# and 'bleu_avg' columns from it.
bleu_path = Path("../data/bleu_scores_fr_en.csv")
print(f"Loading BLEU scores from {bleu_path}...")

df_bleu = pd.read_csv(filepath_or_buffer=bleu_path)

print(f"✓ Loaded {df_bleu.shape[0]} BLEU scores")
print(f"\nBLEU DataFrame shape: {df_bleu.shape}")
print(df_bleu.head())

## 4. Merge Datasets

In [None]:
# Join TDA metrics with BLEU scores on the shared pair identifier 'idx'.
#
# The join is an inner join, so pairs present in only one table are dropped.
# That loss, and any duplicated keys (which would multiply rows and inflate the
# sample size behind every statistic below), are reported instead of passing
# silently.
for table_name, table in (("TDA", df_tda), ("BLEU", df_bleu)):
    n_duplicated = int(table['idx'].duplicated().sum())
    if n_duplicated:
        print(f"⚠ {table_name} table has {n_duplicated} duplicated idx value(s); "
              f"merged rows will be multiplied")

df = pd.merge(df_tda, df_bleu, on='idx', how='inner')

n_tda_unmatched = int((~df_tda['idx'].isin(df_bleu['idx'])).sum())
n_bleu_unmatched = int((~df_bleu['idx'].isin(df_tda['idx'])).sum())

print(f"✓ Merged DataFrame shape: {df.shape}")
print(f"  Dropped by the inner join: {n_tda_unmatched} TDA row(s) without BLEU, "
      f"{n_bleu_unmatched} BLEU row(s) without TDA")

# Every later cell needs at least a few rows; fail here with a clear message
# rather than deep inside a correlation call.
if df.empty:
    raise ValueError("Merging on 'idx' produced no rows; check that both files use the same pair identifiers")

print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
print(df.head())

## 5. Summary Statistics

In [None]:
banner = "=" * 70
print(banner)
print("SUMMARY STATISTICS")
print(banner)
print()

# Spread of the overall Wasserstein distance across all merged pairs.
w_dist = df['wasserstein_distance']
print("Wasserstein Distance:")
print(f"  Mean: {w_dist.mean():.6f}")
print(f"  Std:  {w_dist.std():.6f}")
print()

# One line per BLEU column; labels are padded so the output columns line up.
print("BLEU Scores:")
for label, column in (("EN→FR", 'bleu_en_fr'), ("FR→EN", 'bleu_fr_en'), ("Avg  ", 'bleu_avg')):
    scores = df[column]
    print(f"  {label} - Mean: {scores.mean():.2f}, Std: {scores.std():.2f}")

## 6. Correlation Analysis

In [None]:
# Correlate the overall Wasserstein distance with each BLEU column.
# The resulting r / p names are reused by the plotting and partial-correlation
# cells further down, so they are kept as flat module-level variables.
print("=" * 70)
print("CORRELATION ANALYSIS: Wasserstein Distance vs BLEU")
print("=" * 70)
print()

# Pearson correlation (linear relationship)
pearson_en_fr, p_pearson_en_fr = pearsonr(df['wasserstein_distance'], df['bleu_en_fr'])
pearson_fr_en, p_pearson_fr_en = pearsonr(df['wasserstein_distance'], df['bleu_fr_en'])
pearson_avg, p_pearson_avg = pearsonr(df['wasserstein_distance'], df['bleu_avg'])

# Spearman correlation (monotonic relationship)
spearman_en_fr, p_spearman_en_fr = spearmanr(df['wasserstein_distance'], df['bleu_en_fr'])
spearman_fr_en, p_spearman_fr_en = spearmanr(df['wasserstein_distance'], df['bleu_fr_en'])
spearman_avg, p_spearman_avg = spearmanr(df['wasserstein_distance'], df['bleu_avg'])

# (section title, coefficient symbol, rows of (label, coefficient, p-value))
correlation_tables = [
    ("Pearson Correlation (linear):", "r", [
        ("EN→FR", pearson_en_fr, p_pearson_en_fr),
        ("FR→EN", pearson_fr_en, p_pearson_fr_en),
        ("Avg", pearson_avg, p_pearson_avg),
    ]),
    ("Spearman Correlation (monotonic):", "ρ", [
        ("EN→FR", spearman_en_fr, p_spearman_en_fr),
        ("FR→EN", spearman_fr_en, p_spearman_fr_en),
        ("Avg", spearman_avg, p_spearman_avg),
    ]),
]
for table_title, symbol, table_rows in correlation_tables:
    print(table_title)
    for label, coefficient, p_value in table_rows:
        # Pad "<label> BLEU:" to the width of the longest label so values align.
        print(f"  Wasserstein vs {label + ' BLEU:':<11} {symbol} = {coefficient:.4f}, p = {p_value:.2e}")
    print()

# Interpretation of the Pearson correlation with average BLEU.
significance_level = 0.05
print("Interpretation:")
if not np.isfinite(pearson_avg):
    # A NaN coefficient (e.g. a constant column) fails every "<" comparison and
    # would otherwise be reported as "strong and positive".
    strength = direction = "undefined"
    print("  Overall correlation is undefined (constant or non-finite input); no conclusion can be drawn.")
else:
    abs_r = abs(pearson_avg)
    if abs_r < 0.1:
        strength = "negligible"
    elif abs_r < 0.3:
        strength = "weak"
    elif abs_r < 0.5:
        strength = "moderate"
    else:
        strength = "strong"

    direction = "negative" if pearson_avg < 0 else "positive"
    print(f"  Overall correlation is {strength} and {direction}.")

    if p_pearson_avg >= significance_level:
        # Do not describe an association the test does not support.
        print(f"  → Not statistically significant at α = {significance_level} "
              f"(p = {p_pearson_avg:.2e}); the data do not support an association between "
              f"Wasserstein distance and BLEU.")
    elif pearson_avg < 0:
        print(f"  → Lower Wasserstein distance (more similar topology) is associated with higher BLEU (better translation).")
    else:
        print(f"  → Higher Wasserstein distance (more different topology) is associated with higher BLEU (better translation).")

## 7. Scatter Plots: Wasserstein Distance vs BLEU

In [None]:
# One panel per BLEU column: raw scatter, least-squares trend line, and the
# Pearson statistics computed in the correlation cell.
# (BLEU column, y-axis label, title prefix, Pearson r, p-value)
scatter_panels = [
    ('bleu_en_fr', 'EN→FR BLEU Score', 'Wasserstein vs EN→FR BLEU', pearson_en_fr, p_pearson_en_fr),
    ('bleu_fr_en', 'FR→EN BLEU Score', 'Wasserstein vs FR→EN BLEU', pearson_fr_en, p_pearson_fr_en),
    ('bleu_avg', 'Average BLEU Score', 'Wasserstein vs Average BLEU', pearson_avg, p_pearson_avg),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

w_values = df['wasserstein_distance']
# A straight line is fully described by its two end points. Drawing it through
# every (unsorted) x value makes the path backtrack over itself, which smears
# the dashed style, so only the extremes are used.
trend_x = np.array([w_values.min(), w_values.max()])

for ax, (column, y_label, title, r_value, p_value) in zip(axes, scatter_panels):
    ax.scatter(w_values, df[column], alpha=0.3, s=10)
    ax.set_xlabel('Wasserstein Distance\n(Lower = More Topologically Similar)')
    ax.set_ylabel(y_label)
    ax.set_title(f'{title}\nr = {r_value:.3f}, p = {p_value:.2e}')
    ax.grid(alpha=0.3)

    # Degree-1 least-squares fit; polyfit returns (slope, intercept).
    slope, intercept = np.polyfit(w_values, df[column], 1)
    ax.plot(trend_x, slope * trend_x + intercept, "r--", alpha=0.5, label='Trend')
    ax.legend()

plt.tight_layout()
plt.show()

## 8. H0 vs H1 Contribution

In [None]:
# Analyze the H0 and H1 Wasserstein components separately against average BLEU.
print("=" * 70)
print("H0 vs H1 CONTRIBUTION")
print("=" * 70)
print()

pearson_h0, p_h0 = pearsonr(df['wasserstein_h0'], df['bleu_avg'])
pearson_h1, p_h1 = pearsonr(df['wasserstein_h1'], df['bleu_avg'])

# (distance column, homology dimension, what it captures, Pearson r, p-value)
homology_panels = [
    ('wasserstein_h0', 'H0', 'Connected Components', pearson_h0, p_h0),
    ('wasserstein_h1', 'H1', 'Loops/Holes', pearson_h1, p_h1),
]

for _, dimension, meaning, r_value, p_value in homology_panels:
    print(f"{dimension} ({meaning}) vs Avg BLEU:")
    print(f"  Pearson r = {r_value:.4f}, p = {p_value:.2e}")
    print()

# Visualize: one scatter panel per homology dimension with a linear trend.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, (column, dimension, _, r_value, p_value) in zip(axes, homology_panels):
    distances = df[column]
    ax.scatter(distances, df['bleu_avg'], alpha=0.3, s=10)
    ax.set_xlabel(f'{dimension} Wasserstein Distance')
    ax.set_ylabel('Average BLEU Score')
    ax.set_title(f'{dimension} vs BLEU\nr = {r_value:.3f}, p = {p_value:.2e}')
    ax.grid(alpha=0.3)

    # Draw the fitted line between the two extreme x values only; a path
    # through every unsorted point backtracks and smears the dashed style.
    slope, intercept = np.polyfit(distances, df['bleu_avg'], 1)
    trend_x = np.array([distances.min(), distances.max()])
    ax.plot(trend_x, slope * trend_x + intercept, "r--", alpha=0.5)

plt.tight_layout()
plt.show()

## 8.5. BLEU vs Token Count Correlation

In [None]:
# Analyze correlation between BLEU scores and token counts. Token count is
# examined here because it is a candidate confounder of the Wasserstein-BLEU
# relationship.
print("=" * 70)
print("CORRELATION ANALYSIS: Token Count vs BLEU")
print("=" * 70)
print()

# Compute correlations (flat names kept for any later use)
pearson_en_tokens_en_fr, p_en_en_fr = pearsonr(df['en_num_tokens'], df['bleu_en_fr'])
pearson_en_tokens_fr_en, p_en_fr_en = pearsonr(df['en_num_tokens'], df['bleu_fr_en'])
pearson_en_tokens_avg, p_en_avg = pearsonr(df['en_num_tokens'], df['bleu_avg'])

pearson_fr_tokens_en_fr, p_fr_en_fr = pearsonr(df['fr_num_tokens'], df['bleu_en_fr'])
pearson_fr_tokens_fr_en, p_fr_fr_en = pearsonr(df['fr_num_tokens'], df['bleu_fr_en'])
pearson_fr_tokens_avg, p_fr_avg = pearsonr(df['fr_num_tokens'], df['bleu_avg'])

# One entry per figure row:
# (token column, short name, long name, scatter color, line format, cells)
# where each cell is (BLEU column, short label, y-axis label, Pearson r, p-value).
token_rows = [
    ('en_num_tokens', 'EN', 'English', 'blue', 'b-', [
        ('bleu_en_fr', 'EN→FR', 'EN→FR BLEU Score', pearson_en_tokens_en_fr, p_en_en_fr),
        ('bleu_fr_en', 'FR→EN', 'FR→EN BLEU Score', pearson_en_tokens_fr_en, p_en_fr_en),
        ('bleu_avg', 'Avg', 'Average BLEU Score', pearson_en_tokens_avg, p_en_avg),
    ]),
    ('fr_num_tokens', 'FR', 'French', 'green', 'g-', [
        ('bleu_en_fr', 'EN→FR', 'EN→FR BLEU Score', pearson_fr_tokens_en_fr, p_fr_en_fr),
        ('bleu_fr_en', 'FR→EN', 'FR→EN BLEU Score', pearson_fr_tokens_fr_en, p_fr_fr_en),
        ('bleu_avg', 'Avg', 'Average BLEU Score', pearson_fr_tokens_avg, p_fr_avg),
    ]),
]

for _, short_name, long_name, _, _, cells in token_rows:
    print(f"{long_name} Token Count vs BLEU:")
    for _, bleu_label, _, r_value, p_value in cells:
        # Pad "<label> BLEU:" to the longest label so the values line up.
        print(f"  {short_name} tokens vs {bleu_label + ' BLEU:':<11} r = {r_value:.4f}, p = {p_value:.2e}")
    print()

# Visualize: rows = token language, columns = BLEU metric.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

for ax_row, (token_column, short_name, long_name, color, line_format, cells) in zip(axes, token_rows):
    tokens = df[token_column]
    # The regression line is drawn between the smallest and largest token count.
    x_ends = np.array([tokens.min(), tokens.max()])

    for ax, (bleu_column, bleu_label, y_label, r_value, p_value) in zip(ax_row, cells):
        ax.scatter(tokens, df[bleu_column], alpha=0.3, s=10, color=color)
        fit = stats.linregress(tokens, df[bleu_column])
        ax.plot(x_ends, fit.slope * x_ends + fit.intercept, line_format, linewidth=2, alpha=0.8)
        ax.set_xlabel(f'{long_name} Token Count')
        ax.set_ylabel(y_label)
        ax.set_title(f'{short_name} Tokens vs {bleu_label} BLEU\nr = {r_value:.3f}, p = {p_value:.2e}')
        ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 8.6. Partial Correlation: Wasserstein vs BLEU (Controlling for Token Count)

Since both Wasserstein distance and BLEU are correlated with token counts, we need to compute **partial correlation** to determine if the Wasserstein-BLEU relationship is genuine or spurious.

**Method**: Use linear regression residuals
1. Regress Wasserstein distance on token counts → get residuals
2. Regress BLEU on token counts → get residuals  
3. Correlate the residuals (= partial correlation, controlling for token count effect)

In [None]:
from sklearn.linear_model import LinearRegression

# Covariates to control for: English and French token counts, one column each.
X_tokens = df[['en_num_tokens', 'fr_num_tokens']].to_numpy()

section_rule = "=" * 70
print(section_rule)
print("PARTIAL CORRELATION: Wasserstein vs BLEU (Controlling for Token Counts)")
print(section_rule)
print()

# Function to compute partial correlation via residuals
def partial_correlation_residuals(y1, y2, X):
    """
    Compute the partial correlation between y1 and y2, controlling for X.

    Method: regress both y1 and y2 on X (ordinary least squares with an
    intercept), then take the Pearson correlation of the two residual vectors.

    Parameters
    ----------
    y1, y2 : array-like of length n
        The two variables whose association is of interest.
    X : array-like of shape (n,) or (n, k)
        Covariates to control for. A 1-D input is treated as a single covariate.

    Returns
    -------
    r : float
        Partial correlation coefficient.
    p : float
        Two-sided p-value from a Student t test with ``n - 2 - k`` degrees of
        freedom, where ``k`` is the number of linearly independent covariates.
        Plain ``pearsonr`` on the residuals would assume ``n - 2`` and ignore
        the degrees of freedom spent fitting the covariates. NaN when there
        are no degrees of freedom left or ``r`` is undefined.
    residuals1, residuals2 : numpy.ndarray of shape (n,)
        Residuals of y1 and y2 after removing the linear effect of X.

    Raises
    ------
    ValueError
        If y1, y2 and X do not have the same number of observations.
    """
    y1 = np.asarray(y1, dtype=float).ravel()
    y2 = np.asarray(y2, dtype=float).ravel()
    covariates = np.asarray(X, dtype=float)
    if covariates.ndim == 1:
        covariates = covariates.reshape(-1, 1)

    n_obs = covariates.shape[0]
    if not (y1.shape[0] == y2.shape[0] == n_obs):
        raise ValueError(
            f"y1, y2 and X must have the same number of rows "
            f"(got {y1.shape[0]}, {y2.shape[0]} and {n_obs})"
        )

    # Fit both regressions at once: same design matrix, two response columns.
    design = np.column_stack([np.ones(n_obs), covariates])
    responses = np.column_stack([y1, y2])
    coefficients, _, design_rank, _ = np.linalg.lstsq(design, responses, rcond=None)
    fitted = design @ coefficients
    residuals1 = y1 - fitted[:, 0]
    residuals2 = y2 - fitted[:, 1]

    # Correlate residuals
    r, _ = pearsonr(residuals1, residuals2)
    r = float(r)

    # design_rank counts the intercept plus the independent covariates, so
    # n - 2 - k == n - 1 - design_rank.
    dof = n_obs - 1 - int(design_rank)
    if dof <= 0 or not np.isfinite(r):
        p = float('nan')
    elif abs(r) >= 1.0:
        p = 0.0
    else:
        t_stat = r * np.sqrt(dof / (1.0 - r * r))
        p = float(2.0 * stats.t.sf(abs(t_stat), dof))

    return r, p, residuals1, residuals2

# Compute partial correlations for each BLEU metric
print("Partial Correlation (controlling for EN and FR token counts):")
print()

# (result key, display label, BLEU column, raw Pearson r, raw p-value)
partial_specs = [
    ('en_fr', 'EN→FR', 'bleu_en_fr', pearson_en_fr, p_pearson_en_fr),
    ('fr_en', 'FR→EN', 'bleu_fr_en', pearson_fr_en, p_pearson_fr_en),
    ('avg', 'Average', 'bleu_avg', pearson_avg, p_pearson_avg),
]

# key -> (r_partial, p_partial, Wasserstein residuals, BLEU residuals)
partial_results = {}
for key, label, column, r_raw, p_raw in partial_specs:
    partial_results[key] = partial_correlation_residuals(
        df['wasserstein_distance'], df[column], X_tokens
    )
    r_part, p_part = partial_results[key][:2]
    print(f"Wasserstein vs {label} BLEU:")
    print(f"  Original correlation:  r = {r_raw:.4f}, p = {p_raw:.2e}")
    print(f"  Partial correlation:   r = {r_part:.4f}, p = {p_part:.2e}")
    print()

# Flat names kept so other cells can keep referring to them.
r_partial_en_fr, p_partial_en_fr, w_resid_en_fr, bleu_resid_en_fr = partial_results['en_fr']
r_partial_fr_en, p_partial_fr_en, w_resid_fr_en, bleu_resid_fr_en = partial_results['fr_en']
r_partial_avg, p_partial_avg, w_resid_avg, bleu_resid_avg = partial_results['avg']

# Interpretation
print("Interpretation:")
change_en_fr = abs(r_partial_en_fr) - abs(pearson_en_fr)
change_fr_en = abs(r_partial_fr_en) - abs(pearson_fr_en)
change_avg = abs(r_partial_avg) - abs(pearson_avg)

print(f"  EN→FR: Correlation changed by {change_en_fr:+.4f} after controlling for token counts")
print(f"  FR→EN: Correlation changed by {change_fr_en:+.4f} after controlling for token counts")
print(f"  Avg:   Correlation changed by {change_avg:+.4f} after controlling for token counts")
print()

# A small change in |r| alone does not make a relationship "genuine": the
# partial correlation must also be defined, significant, and keep its sign.
significance_level = 0.05
if not np.isfinite(r_partial_avg):
    print("  → Partial correlation is undefined (constant or non-finite residuals); no conclusion can be drawn.")
elif not (p_partial_avg < significance_level):
    print(f"  → After controlling for token counts the Wasserstein-BLEU correlation is not "
          f"statistically significant (p = {p_partial_avg:.2e}, α = {significance_level}); "
          f"there is no evidence of a relationship independent of token count.")
elif r_partial_avg * pearson_avg < 0:
    print("  → The correlation changes sign once token counts are controlled for; "
          "the raw Wasserstein-BLEU correlation is driven by token count.")
elif abs(change_avg) < 0.05:
    print("  → Token count has minimal confounding effect. The Wasserstein-BLEU relationship is genuine.")
elif abs(r_partial_avg) < abs(pearson_avg) * 0.5:
    print("  → Token count is a major confounder. Much of the Wasserstein-BLEU correlation is explained by token count.")
else:
    print("  → Token count has some confounding effect, but the Wasserstein-BLEU relationship persists.")

# Visualize partial correlation (residuals plot), one panel per BLEU metric.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, (key, label, _, _, _) in zip(axes, partial_specs):
    r_part, p_part, w_resid, bleu_resid = partial_results[key]
    ax.scatter(w_resid, bleu_resid, alpha=0.3, s=10)

    # Trend line between the two extreme residuals only; a path through every
    # unsorted point backtracks and smears the dashed style.
    slope, intercept = np.polyfit(w_resid, bleu_resid, 1)
    trend_x = np.array([np.min(w_resid), np.max(w_resid)])
    ax.plot(trend_x, slope * trend_x + intercept, "r--", alpha=0.5)

    # Zero lines mark where a residual equals the token-count prediction.
    ax.axhline(0, color='gray', linestyle='--', alpha=0.3)
    ax.axvline(0, color='gray', linestyle='--', alpha=0.3)
    ax.set_xlabel('Wasserstein Distance\n(residuals after removing token count effect)')
    ax.set_ylabel(f'{label} BLEU\n(residuals after removing token count effect)')
    ax.set_title(f'Partial Correlation: Wasserstein vs {label} BLEU\nr_partial = {r_part:.3f}, p = {p_part:.2e}')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Binned Analysis

In [None]:
# Bin by Wasserstein distance (quantile bins, so each bin holds roughly the
# same number of pairs) and compare mean BLEU scores across bins.
bin_labels = ['Very Similar', 'Similar', 'Moderate', 'Dissimilar', 'Very Dissimilar']
df['w_bin'] = pd.qcut(df['wasserstein_distance'], q=len(bin_labels), labels=bin_labels)

print("=" * 70)
print("BINNED ANALYSIS: Mean BLEU by Topological Similarity")
print("=" * 70)
print()

# 'w_bin' is categorical: pass `observed` explicitly so the result does not
# depend on the pandas default (which is changing) and no FutureWarning is
# emitted. observed=False keeps every labelled bin in the table.
binned_stats = df.groupby('w_bin', observed=False).agg({
    'bleu_avg': ['mean', 'std', 'count'],
    'wasserstein_distance': ['mean', 'std']
})

print(binned_stats)
print()

# Reuse the aggregated table instead of grouping again for the plot.
bin_means = binned_stats[('bleu_avg', 'mean')]
bin_stds = binned_stats[('bleu_avg', 'std')]

# Visualize: bar height = mean BLEU per bin, error bar = one standard deviation.
fig, ax = plt.subplots(figsize=(10, 6))

bar_positions = range(len(bin_means))
ax.bar(bar_positions, bin_means, yerr=bin_stds, capsize=5, alpha=0.7, edgecolor='black')
ax.set_xticks(bar_positions)
ax.set_xticklabels(bin_means.index, rotation=45, ha='right')
ax.set_xlabel('Topological Similarity (Wasserstein Distance Bins)')
ax.set_ylabel('Mean BLEU Score')
ax.set_title('Translation Quality by Topological Similarity')
ax.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 10. Examples: High Similarity vs Low Similarity

In [None]:
# Sort by Wasserstein distance (ascending: most similar pairs first)
df_sorted = df.sort_values('wasserstein_distance')

# Look records up by their 'idx' value. Using idx as a position in the
# `tda_results` list is only correct when the list is complete and ordered by
# idx; the source file name suggests it is a filtered subset, in which case
# positional access returns the wrong pair or raises IndexError.
tda_by_idx = {int(record['idx']): record for record in tda_results}

n_examples = 5
# head()/tail() return fewer rows when the table is short, so small datasets
# no longer raise. The tail is reversed to list the largest distance first.
example_sections = [
    ("HIGH TOPOLOGICAL SIMILARITY (Low Wasserstein Distance)", df_sorted.head(n_examples)),
    ("LOW TOPOLOGICAL SIMILARITY (High Wasserstein Distance)", df_sorted.tail(n_examples).iloc[::-1]),
]

for section_number, (heading, example_rows) in enumerate(example_sections):
    # Blank line before every section except the first.
    print(("\n" if section_number else "") + "=" * 70)
    print(heading)
    print("=" * 70)
    for rank, (_, row) in enumerate(example_rows.iterrows(), start=1):
        pair_idx = int(row['idx'])
        print(f"\n[{rank}] Pair {pair_idx}: W = {row['wasserstein_distance']:.4f}, BLEU = {row['bleu_avg']:.2f}")

        # Get original text from TDA results
        original = tda_by_idx[pair_idx]
        print(f"    EN: {original['en_text']}")
        print(f"    FR: {original['fr_text']}")

## Summary

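**Analyses covered (read the findings from the outputs above):**
- Correlation between topological similarity (Wasserstein distance) and translation quality (BLEU)
- H0 vs H1 contribution to the relationship
- Token count as a potential confounder, and the partial correlation after controlling for it
- Binned analysis showing the trend across similarity levels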

**Hypothesis Test:**
Does lower Wasserstein distance (more topologically similar attention patterns) predict higher BLEU scores (better translation quality)?