# Analyze TDA Results

Load and analyze persistent homology and Wasserstein distance results.

**For Google Colab:**
1. Mount Google Drive (run cell below)
2. Set `ROOT_DIR` to the path of your `code_fr_en` project folder (the folder that contains this notebook)

**For local execution:** Skip the Google Drive cell and start from the "Verify Working Directory" section.

---

**Note:** The TDA analysis uses attention from the last encoder layer only (layer 23/24).

In [None]:
# Colab-only setup: mount Google Drive and switch into the project folder.
# Outside Colab the google.colab import fails and only a notice is printed.
try:
    from google.colab import drive
    drive.mount('/content/drive')

    # IMPORTANT: point ROOT_DIR at your code_fr_en directory,
    # i.e. the folder where THIS notebook is located.
    ROOT_DIR = "/content/drive/MyDrive/UofT/CSC2517/term_paper/code_fr_en"

    import os
    os.chdir(ROOT_DIR)
    working_dir = os.getcwd()
    print(f"✓ Changed to: {working_dir}")
except ImportError:
    print("Not running on Colab, using local environment")

## Verify Working Directory

In [None]:
# Report the working directory and list the TDA result files that are present.
import os
from pathlib import Path

print(f"Current directory: {os.getcwd()}")

# Location of the TDA results, relative to the working directory
tda_dir = "../data/tda_results_fr_en"
if not os.path.exists(tda_dir):
    print(f"✗ TDA results directory NOT found: {tda_dir}")
else:
    print(f"✓ TDA results directory exists: {tda_dir}")
    # Result pickles in sorted order, each shown with its size in MB
    result_files = sorted(Path(tda_dir).glob("tda_results_*.pkl"))
    print(f"  Found {len(result_files)} result file(s):")
    for result_file in result_files:
        size_mb = result_file.stat().st_size / (1024**2)
        print(f"    - {result_file.name} ({size_mb:.1f} MB)")

## Configuration

**Modify this variable to load different result files:**

In [None]:
# ============================================================================
# CONFIGURATION - Change this to analyze different results
# ============================================================================

FILTER_SPECIAL = True   # True to use filtered results, False for unfiltered

# ============================================================================

# Derive the result filename from the configuration flag
if FILTER_SPECIAL:
    filter_str = "filtered"
else:
    filter_str = "unfiltered"
FILENAME = "tda_results_last_layer_" + filter_str + ".pkl"

# Echo the active configuration
print("Configuration:")
print("  Special tokens filtered: " + str(FILTER_SPECIAL))
print("  File: " + FILENAME)

## 1. Import Libraries

In [None]:
# Install TDA libraries if not already installed
!pip install ripser persim

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import pandas as pd
from scipy import stats
import warnings

# Silence the warning about infinite (non-finite) death times in
# persistence diagrams
warnings.filterwarnings(action='ignore', message='.*non-finite death times.*')

# Persistence-diagram plotting helper
from persim import plot_diagrams

# Global plot appearance: seaborn grid theme and default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ Libraries imported")

## 2. Load TDA Results

In [None]:
# Load the pickled TDA results and copy their per-pair fields into a DataFrame.
data_path = Path(f"../data/tda_results_fr_en/{FILENAME}")

# Fail early with an actionable message instead of a bare stat() error
if not data_path.is_file():
    raise FileNotFoundError(
        f"TDA results file not found: {data_path} "
        f"(working directory: {Path.cwd()}). "
        "Check FILTER_SPECIAL and the working directory."
    )

print(f"Loading results from {data_path}...")
print(f"File size: {data_path.stat().st_size / (1024**2):.1f} MB")
print()

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load result files that you generated yourself or otherwise trust.
with open(data_path, 'rb') as f:
    results = pickle.load(f)

print(f"✓ Loaded {len(results)} sentence pairs")
print()

# Per-pair fields copied from each result record into the DataFrame
SUMMARY_COLUMNS = [
    'idx',
    'en_text',
    'fr_text',
    'wasserstein_distance',
    'wasserstein_h0',
    'wasserstein_h1',
    'en_num_tokens',
    'fr_num_tokens',
    'en_h0_features',
    'en_h1_features',
    'fr_h0_features',
    'fr_h1_features',
]

# Convert to DataFrame for easier analysis. The default RangeIndex is kept, so
# row i of df corresponds to results[i]. Passing columns= keeps the expected
# schema even when the results list is empty.
df = pd.DataFrame(
    [{col: r[col] for col in SUMMARY_COLUMNS} for r in results],
    columns=SUMMARY_COLUMNS,
)

print("DataFrame created:")
print(df.head())

## 3. Summary Statistics

In [None]:
def _print_count_stats(label, counts):
    """Print min, max, mean and median of one count column on a single line."""
    print(f"  {label} - Min: {counts.min()}, Max: {counts.max()}, "
          f"Mean: {counts.mean():.1f}, Median: {counts.median():.1f}")


banner = "=" * 70
print(banner)
print("SUMMARY STATISTICS")
print(banner)
print()

# Total Wasserstein distance: one statistic per line, values column-aligned
total_w = df['wasserstein_distance']
print("Wasserstein Distance (Total):")
for stat_label, stat_value in (
    ("Min:", total_w.min()),
    ("Max:", total_w.max()),
    ("Mean:", total_w.mean()),
    ("Median:", total_w.median()),
    ("Std:", total_w.std()),
):
    print(f"  {stat_label:<8}{stat_value:.6f}")
print()

# Per-dimension Wasserstein distance: one line per homology dimension
print("Wasserstein Distance by Dimension:")
for dim in ("h0", "h1"):
    dim_w = df[f'wasserstein_{dim}']
    print(f"  {dim.upper()} - Min: {dim_w.min():.6f}, Max: {dim_w.max():.6f}, "
          f"Mean: {dim_w.mean():.6f}, Median: {dim_w.median():.6f}")
print()

# Count-style columns, reported for English and French side by side
for heading, suffix in (
    ("Token Counts:", "num_tokens"),
    ("H0 Features (β₀ - Connected Components):", "h0_features"),
):
    print(heading)
    _print_count_stats("English", df[f'en_{suffix}'])
    _print_count_stats("French ", df[f'fr_{suffix}'])
    print()

print("H1 Features (β₁ - Loops/Holes):")
_print_count_stats("English", df['en_h1_features'])
_print_count_stats("French ", df['fr_h1_features'])
# Number of rows with at least one H1 feature, per language
en_with_h1 = (df['en_h1_features'] > 0).sum()
fr_with_h1 = (df['fr_h1_features'] > 0).sum()
print(f"  Pairs with H1 features: {en_with_h1} EN, {fr_with_h1} FR")

## 4. Visualize Wasserstein Distance Distribution

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# One histogram per panel:
# (column, bar color, x-axis label, title, also draw the median line?)
panels = [
    ('wasserstein_distance', 'blue', 'Wasserstein Distance',
     'Total Wasserstein Distance\n(Lower = More Topologically Similar)', True),
    ('wasserstein_h0', 'orange', 'Wasserstein Distance (H0)',
     'H0 Component (Connected Components)', False),
    ('wasserstein_h1', 'purple', 'Wasserstein Distance (H1)',
     'H1 Component (Loops/Holes)', False),
]

for ax, (column, bar_color, x_label, title, with_median) in zip(axes, panels):
    values = df[column]
    ax.hist(values, bins=50, alpha=0.7, color=bar_color, edgecolor='black')

    # Dashed vertical marker at the mean (and the median on the total panel)
    mean_value = values.mean()
    ax.axvline(mean_value, color='red', linestyle='--',
               label=f'Mean: {mean_value:.4f}')
    if with_median:
        median_value = values.median()
        ax.axvline(median_value, color='green', linestyle='--',
                   label=f'Median: {median_value:.4f}')

    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Ratio of each dimension's mean distance to the mean total distance
mean_total = df['wasserstein_distance'].mean()
for dim in ('h0', 'h1'):
    share = df[f'wasserstein_{dim}'].mean() / mean_total * 100
    print(f"{dim.upper()} contributes {share:.1f}% to total distance")

## 5. Analyze Topological Features

In [None]:
def _finish_panel(ax, x_label, y_label, title):
    """Apply axis labels, title, legend and a light grid to one panel."""
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()
    ax.grid(alpha=0.3)


fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Series drawn on every panel: (column prefix, legend label, color)
languages = (('en', 'English', 'blue'), ('fr', 'French', 'green'))

# H0 feature counts
h0_ax = axes[0, 0]
for prefix, lang_label, lang_color in languages:
    h0_ax.hist(df[f'{prefix}_h0_features'], bins=30, alpha=0.5,
               color=lang_color, label=lang_label, edgecolor='black')
_finish_panel(h0_ax, 'Number of H0 Features', 'Frequency',
              'H0 Feature Count Distribution (Connected Components)')

# H1 feature counts: shared unit-width bins covering both languages
h1_ax = axes[0, 1]
h1_upper = max(df['en_h1_features'].max(), df['fr_h1_features'].max())
h1_bins = range(0, h1_upper + 2)
for prefix, lang_label, lang_color in languages:
    h1_ax.hist(df[f'{prefix}_h1_features'], bins=h1_bins, alpha=0.5,
               color=lang_color, label=lang_label, edgecolor='black',
               align='left')
_finish_panel(h1_ax, 'Number of H1 Features', 'Frequency',
              'H1 Feature Count Distribution (Loops/Holes)')

# Token count distribution
token_ax = axes[1, 0]
for prefix, lang_label, lang_color in languages:
    token_ax.hist(df[f'{prefix}_num_tokens'], bins=30, alpha=0.5,
                  color=lang_color, label=lang_label, edgecolor='black')
_finish_panel(token_ax, 'Number of Tokens', 'Frequency',
              'Token Count Distribution')

# Wasserstein vs token count: both scatters first, then both fit lines,
# so the artists are added in the same order as before
scatter_ax = axes[1, 1]
w_dist = df['wasserstein_distance']
for prefix, lang_label, lang_color in languages:
    scatter_ax.scatter(df[f'{prefix}_num_tokens'], w_dist, alpha=0.3, s=10,
                       color=lang_color, label=f'{lang_label} tokens')

# Best fit lines (linear regression), drawn across each language's token range
for prefix, _, lang_color in languages:
    tokens = df[f'{prefix}_num_tokens']
    fit = stats.linregress(tokens, w_dist)
    fit_x = np.array([tokens.min(), tokens.max()])
    fit_y = fit.slope * fit_x + fit.intercept
    scatter_ax.plot(fit_x, fit_y, color=lang_color, linewidth=2, alpha=0.8,
                    label=f'{prefix.upper()} fit (r={fit.rvalue:.3f})')

_finish_panel(scatter_ax, 'Number of Tokens', 'Wasserstein Distance',
              'Wasserstein Distance vs Token Count')

plt.tight_layout()
plt.show()

## 6. Most vs Least Topologically Similar Pairs

In [None]:
# Sort by Wasserstein distance (ascending: most similar pairs first)
df_sorted = df.sort_values('wasserstein_distance')

# Number of pairs to list at each extreme
TOP_K = 5


def _print_ranked_pairs(title, pairs):
    """Print a banner, then one ranked entry for each row of `pairs`.

    Rows are printed in the order given, numbered from 1. Iterating the rows
    directly (instead of indexing a fixed range) means a frame with fewer
    than TOP_K rows lists what is available rather than raising IndexError.
    """
    print("=" * 70)
    print(title)
    print("=" * 70)
    for rank, (_, row) in enumerate(pairs.iterrows(), start=1):
        print(f"\n[{rank}] Pair {row['idx']}: W-dist = {row['wasserstein_distance']:.6f} (H0: {row['wasserstein_h0']:.4f}, H1: {row['wasserstein_h1']:.4f})")
        print(f"    EN ({row['en_num_tokens']} tokens): {row['en_text']}")
        print(f"    FR ({row['fr_num_tokens']} tokens): {row['fr_text']}")


_print_ranked_pairs(
    "MOST TOPOLOGICALLY SIMILAR PAIRS (Lowest Wasserstein Distance)",
    df_sorted.head(TOP_K),
)

print()
# tail() is still in ascending order; reverse it so the largest distance is first
_print_ranked_pairs(
    "LEAST TOPOLOGICALLY SIMILAR PAIRS (Highest Wasserstein Distance)",
    df_sorted.tail(TOP_K).iloc[::-1],
)

## 7. Visualize Persistence Diagrams for Extreme Cases

In [None]:
# Most similar pair: side-by-side persistence diagrams (English, French).
# df was built from `results` in order with the default index, so a row's index
# label is its position in `results`. The stored 'idx' field is not used for
# the lookup because it is not guaranteed to be a list position.
most_similar_idx = df_sorted.index[0]
most_similar = results[most_similar_idx]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# plot_diagrams is called after making each subplot the current axes
plt.sca(axes[0])
plot_diagrams(most_similar['en_diagrams'], show=False)
axes[0].set_title('English')

plt.sca(axes[1])
plot_diagrams(most_similar['fr_diagrams'], show=False)
axes[1].set_title('French')

# Title shows the distance and the first 60 characters of each sentence
fig.suptitle(f"Most Topologically Similar Pair (W-dist: {most_similar['wasserstein_distance']:.6f})\nEN: {most_similar['en_text'][:60]}...\nFR: {most_similar['fr_text'][:60]}...", fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
# Least similar pair: side-by-side persistence diagrams (English, French).
# df was built from `results` in order with the default index, so a row's index
# label is its position in `results`. The stored 'idx' field is not used for
# the lookup because it is not guaranteed to be a list position.
least_similar_idx = df_sorted.index[-1]
least_similar = results[least_similar_idx]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# plot_diagrams is called after making each subplot the current axes
plt.sca(axes[0])
plot_diagrams(least_similar['en_diagrams'], show=False)
axes[0].set_title('English')

plt.sca(axes[1])
plot_diagrams(least_similar['fr_diagrams'], show=False)
axes[1].set_title('French')

# Title shows the distance and the first 60 characters of each sentence
fig.suptitle(f"Least Topologically Similar Pair (W-dist: {least_similar['wasserstein_distance']:.6f})\nEN: {least_similar['en_text'][:60]}...\nFR: {least_similar['fr_text'][:60]}...", fontsize=10)
plt.tight_layout()
plt.show()

## 8. Correlation Analysis

In [None]:
# Correlations between token counts, feature counts and Wasserstein distance
print("Correlation Analysis:")
print("=" * 70)
print()

w_dist = df['wasserstein_distance']

# Token count vs Wasserstein distance
corr_en_tokens = df['en_num_tokens'].corr(w_dist)
corr_fr_tokens = df['fr_num_tokens'].corr(w_dist)
print(
    "Token count vs Wasserstein distance:",
    f"  English tokens: r = {corr_en_tokens:.4f}",
    f"  French tokens:  r = {corr_fr_tokens:.4f}",
    "",
    sep="\n",
)

# H0 features vs Wasserstein
corr_en_h0 = df['en_h0_features'].corr(w_dist)
corr_fr_h0 = df['fr_h0_features'].corr(w_dist)
print(
    "H0 features vs Wasserstein distance:",
    f"  English H0: r = {corr_en_h0:.4f}",
    f"  French H0:  r = {corr_fr_h0:.4f}",
    "",
    sep="\n",
)

# Cross-language feature correlation
corr_h0_cross = df['en_h0_features'].corr(df['fr_h0_features'])
corr_h1_cross = df['en_h1_features'].corr(df['fr_h1_features'])
print(
    "Cross-language feature correlation:",
    f"  H0 (EN vs FR): r = {corr_h0_cross:.4f}",
    f"  H1 (EN vs FR): r = {corr_h1_cross:.4f}",
    "",
    sep="\n",
)

# Bucket the cross-language H0 correlation into a qualitative label
if corr_h0_cross > 0.7:
    similarity_level = 'high'
elif corr_h0_cross > 0.4:
    similarity_level = 'moderate'
else:
    similarity_level = 'low'

print("Interpretation:")
print(f"  - Cross-language H0 correlation of {corr_h0_cross:.2f} suggests {similarity_level} structural similarity")
print(f"  - Mean Wasserstein distance: {w_dist.mean():.4f}")

## Summary

✅ **TDA Results Analysis Complete!**

**What we analyzed:**
- Wasserstein distance distributions (total, H0, H1)
- Topological feature counts (β₀, β₁)
- Correlation between token count and topological similarity
- Cross-language structural similarity
- Most/least topologically similar pairs

**Next steps:**
1. Compute BLEU scores for translation quality (notebooks 12–13)
2. Correlate topological similarity with translation quality (notebook 14)
3. Statistical significance testing