# WMT17 EN-ZH Dataset Exploration
Load and explore the 2,000-example validation subset of the WMT17 Chinese–English (zh-en) dataset.

In [None]:
# Import all required libraries
from datasets import load_from_disk
import pandas as pd
import random
import matplotlib.pyplot as plt
import pickle

## 1. Load the saved dataset

In [None]:
# Load the previously saved dataset from disk and report its size and schema
dataset = load_from_disk("../data/wmt17_zh_en_validation_2000")

num_pairs = len(dataset)
print(f"Loaded {num_pairs} sentence pairs")
print(f"\nFeatures: {dataset.features}")
print(f"Column names: {dataset.column_names}")

## 2. Convert to convenient formats

In [None]:
# Flatten each record's nested 'translation' dict into a flat
# {'english', 'chinese'} dict; a plain list is the easiest form to iterate.
data_list = [
    {
        'english': record['translation']['en'],
        'chinese': record['translation']['zh'],
    }
    for record in dataset
]

print(f"Created list with {len(data_list)} sentence pairs")

In [None]:
# Build a pandas DataFrame from the list of pairs (handy for analysis)
df = pd.DataFrame(data=data_list)
print(f"DataFrame shape: {df.shape}")
df.head(n=5)  # trailing expression so the notebook renders the preview

## 3. Explore the data

In [None]:
# Show random examples
# Cap the sample size at the number of available pairs: random.sample raises
# ValueError when asked for more items than the population holds.
num_examples = min(5, len(data_list))

print("="*80)
print("Random Examples:")
print("="*80)

# Sample indices (not items) so each example can be labelled with its position.
for i in random.sample(range(len(data_list)), num_examples):
    pair = data_list[i]
    print(f"\nExample {i}:")
    print(f"ZH: {pair['chinese']}")
    print(f"EN: {pair['english']}")

In [None]:
# Add per-sentence length columns and summarise them
english_words = df['english'].str.split()
df['en_length'] = english_words.str.len()  # number of whitespace-separated tokens
df['zh_length'] = df['chinese'].str.len()  # Chinese: character count since no spaces

print("Sentence Length Statistics:")
for heading, column in (
    ("\nEnglish (words):", 'en_length'),
    ("\nChinese (characters):", 'zh_length'),
):
    print(heading)
    print(df[column].describe())

In [None]:
# Plot side-by-side histograms of the two length distributions
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# One entry per panel: (column, bar colour, x-axis label, title)
panels = [
    ('en_length', 'blue', 'Sentence Length (words)', 'English Sentence Lengths'),
    ('zh_length', 'red', 'Sentence Length (characters)', 'Chinese Sentence Lengths'),
]
for ax, (column, color, xlabel, title) in zip(axes, panels):
    ax.hist(df[column], bins=30, alpha=0.7, color=color, edgecolor='black')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Scatter English word counts against Chinese character counts
en_lengths = df['en_length']
zh_lengths = df['zh_length']

plt.figure(figsize=(8, 6))
plt.scatter(en_lengths, zh_lengths, alpha=0.5)
plt.title('English vs Chinese Sentence Lengths')
plt.xlabel('English Length (words)')
plt.ylabel('Chinese Length (characters)')
plt.grid(alpha=0.3)
plt.show()

# Correlation between the two length columns (Series.corr with its default method)
correlation = en_lengths.corr(zh_lengths)
print(f"\nCorrelation between EN and ZH lengths: {correlation:.3f}")

## 4. Export convenient formats for later use

In [None]:
# Save the list of sentence pairs as a pickle for quick loading later.
# The path is defined once so the file written and the messages printed
# below cannot drift apart.
pickle_path = '../data/sentence_pairs_zh_en.pkl'
with open(pickle_path, 'wb') as f:
    pickle.dump(data_list, f)

# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code embedded in the data.
print(f"✓ Saved sentence pairs as pickle: {pickle_path}")
print(f"\nTo load later: data_list = pickle.load(open('{pickle_path}', 'rb'))")

## Summary

**Available formats:**
- `dataset`: Original HuggingFace Dataset object
- `data_list`: List of dictionaries `[{'english': ..., 'chinese': ...}, ...]`
- `df`: Pandas DataFrame with columns ['english', 'chinese', 'en_length', 'zh_length']

**Exported files:**
- `../data/sentence_pairs_zh_en.pkl`: pickled `data_list` for quick loading in Python

**Next steps:**
1. Load NLLB-600M model
2. Extract attention maps
3. Build attention graphs
4. Compute persistent homology