# NLLB-600M Model Exploration
Load the saved model and test French → English translation.

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

## 1. Load the saved model

In [None]:
# Location of the locally saved NLLB checkpoint (tokenizer + model weights)
model_dir = "../models/nllb-600M"

# Tokenizer first, reporting progress before and after the load
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_dir)
print("✓ Tokenizer loaded")

# Then the seq2seq model from the same directory
print("\nLoading model...")
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
print("✓ Model loaded")

# Static summary lines; the figures here are hard-coded, not measured
print("\nModel: NLLB-200-distilled-600M", "Parameters: ~600M", sep="\n")

## 2. Test French → English translation

In [None]:
# NLLB uses language codes: fra_Latn (French), eng_Latn (English)
test_sentence = "Bonjour, comment allez-vous?"
print(f"Input (French): {test_sentence}")

# Tell the tokenizer which language the input text is written in
tokenizer.src_lang = "fra_Latn"

# Tokenize into PyTorch tensors
inputs = tokenizer(test_sentence, return_tensors="pt")

# Resolve the target-language token through the vocabulary.
# `tokenizer.lang_code_to_id` is deprecated and missing from recent
# transformers releases; `convert_tokens_to_ids` works on old and new versions.
eng_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")

# Generate translation, forcing the decoder to start with the English code
translated_tokens = model.generate(
    **inputs,
    forced_bos_token_id=eng_token_id,
    max_length=50,
)

# Decode the first (only) sequence, dropping special tokens
translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
print(f"Output (English): {translation}")

## 3. Test with multiple sentences

In [None]:
# French sentences to translate one at a time
test_sentences = [
    "Bonjour, comment allez-vous?",
    "Je suis étudiant à l'université.",
    "Le chat est sur la table.",
    "Quelle heure est-il?",
    "J'aime apprendre les langues.",
]

print("=" * 80)
print("French → English Translations")
print("=" * 80)

tokenizer.src_lang = "fra_Latn"

# Loop-invariant: look up the English language token once.
# `tokenizer.lang_code_to_id` is deprecated and missing from recent
# transformers releases; `convert_tokens_to_ids` works on old and new versions.
eng_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")

for i, sentence in enumerate(test_sentences, 1):
    inputs = tokenizer(sentence, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        forced_bos_token_id=eng_token_id,
        max_length=50,
    )
    translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    print(f"\n{i}. FR: {sentence}")
    print(f"   EN: {translation}")

## 4. Inspect model architecture

In [None]:
# Print selected fields of the loaded model's configuration
print("Model Configuration:")
cfg = model.config

# (label, config attribute) pairs, printed in this order; each attribute is
# read only when its line is printed
config_fields = (
    ("Number of encoder layers", "encoder_layers"),
    ("Number of decoder layers", "decoder_layers"),
    ("Number of attention heads", "encoder_attention_heads"),
    ("Hidden size", "d_model"),
    ("Vocabulary size", "vocab_size"),
)
for label, attr_name in config_fields:
    print(f"  {label}: {getattr(cfg, attr_name)}")

print("\nModel has encoder-decoder architecture for sequence-to-sequence translation")

In [None]:
# Tally parameter counts in a single pass over the model's parameters
total_params = 0
trainable_params = 0
for param in model.parameters():
    n_elements = param.numel()
    total_params += n_elements
    if param.requires_grad:
        # Only parameters that receive gradients count as trainable
        trainable_params += n_elements

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"\nParameters (in millions): {total_params / 1e6:.1f}M")

## 5. Test with dataset examples

In [None]:
# Load some examples from our saved dataset
from datasets import load_from_disk

dataset = load_from_disk("../data/wmt14_fr-en_validation_2000")
print(f"Loaded {len(dataset)} sentence pairs\n")

# Test on the first 3 examples (fewer if the dataset is smaller)
print("=" * 80)
print("Testing on WMT14 dataset examples")
print("=" * 80)

tokenizer.src_lang = "fra_Latn"

# Loop-invariant: look up the English language token once.
# `tokenizer.lang_code_to_id` is deprecated and missing from recent
# transformers releases; `convert_tokens_to_ids` works on old and new versions.
eng_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")

# Cap at the dataset size so a short dataset does not raise IndexError
num_examples = min(3, len(dataset))

for i in range(num_examples):
    example = dataset[i]["translation"]
    french = example["fr"]
    english_ref = example["en"]

    # Translate
    inputs = tokenizer(french, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        forced_bos_token_id=eng_token_id,
        max_length=100,
    )
    translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    print(f"\nExample {i+1}:")
    print(f"FR: {french}")
    print(f"EN (reference): {english_ref}")
    print(f"EN (translated): {translation}")

## Summary

**Model loaded successfully:**
- NLLB-200-distilled-600M (~600M parameters)
- French → English translation working (the English → French direction is not exercised in this notebook)
- Ready for attention extraction

**Next steps:**
1. Extract attention weights from encoder and decoder
2. Build attention graphs
3. Compute persistent homology