# Neural Machine Translation with Attention
## English to Portuguese Translation

This notebook demonstrates training and inference for a neural machine translation model that uses an LSTM with an attention mechanism.

## 0. Setup Python Path

In [None]:
import sys
import os

# Add parent directory (project root) to Python path.
# The path is resolved to an absolute one so that imports keep working even if
# the working directory changes later, and it is only appended when missing so
# that re-running this cell does not accumulate duplicate sys.path entries.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

print(f"Current directory: {os.getcwd()}")
print("✅ Path configured")

## 1. Setup and Imports

In [None]:
import os
# Silence TensorFlow's native (C++) log output. The variable is assigned before
# `import tensorflow` below so that it is already set when TensorFlow loads.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Third-party numerical, deep-learning and plotting libraries
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Project modules: the model, the data pipeline, the masked loss/accuracy
# functions and the inference helpers used throughout this notebook.
# Presumably resolved through the project root added to sys.path in the setup cell.
from models.translator import Translator
from utils.data_loader import prepare_datasets, MAX_VOCAB_SIZE
from utils.metrics import masked_loss, masked_acc
import inference

# Report the library versions in use so that runs can be reproduced
print(f"✅ TensorFlow version: {tf.__version__}")
print(f"✅ NumPy version: {np.__version__}")
print("✅ All imports successful!")

## 2. Configuration

In [None]:
# Hyperparameters shared by the model-building and training cells
UNITS = 256             # passed to the Translator constructor
EPOCHS = 20             # upper bound on training epochs
STEPS_PER_EPOCH = 500   # batches drawn per epoch

# Echo the effective configuration, one setting per line
print(
    f"Vocabulary Size: {MAX_VOCAB_SIZE}",
    f"LSTM Units: {UNITS}",
    f"Training Epochs: {EPOCHS}",
    f"Steps per Epoch: {STEPS_PER_EPOCH}",
    sep="\n",
)

## 3. Load and Explore Data

In [None]:
# Build the train/validation datasets together with both text vectorizers
print("Loading data...")
(
    train_data,
    val_data,
    english_vectorizer,
    portuguese_vectorizer,
) = prepare_datasets()

# The inference module keeps its own vectorizer state; set it up now so the
# translation cells further down can call it directly
inference.initialize_vectorizers()

# Confirm the load and report how large each vocabulary is
print("\n✅ Data loaded successfully!")
print(
    f"English vocabulary size: {english_vectorizer.vocabulary_size()}",
    f"Portuguese vocabulary size: {portuguese_vectorizer.vocabulary_size()}",
    sep="\n",
)

In [None]:
# Peek at the first ten vocabulary entries of each language
print(
    "First 10 English words:",
    english_vectorizer.get_vocabulary()[:10],
    sep="\n",
)
print(
    "\nFirst 10 Portuguese words:",
    portuguese_vectorizer.get_vocabulary()[:10],
    sep="\n",
)

In [None]:
# Pull a single batch from the training set and describe it.
# Each element is unpacked as ((context, target_in), target_out).
for (context, target_in), target_out in train_data.take(1):
    # Tensor shapes for the whole batch
    print(
        f"Batch size: {context.shape[0]}",
        f"Context shape (English): {context.shape}",
        f"Target input shape (Portuguese): {target_in.shape}",
        f"Target output shape (Portuguese): {target_out.shape}",
        sep="\n",
    )

    # Raw token ids of the first example in the batch
    print(
        "\nFirst example:",
        f"English tokens: {context[0].numpy()}",
        f"Portuguese input tokens: {target_in[0].numpy()}",
        f"Portuguese output tokens: {target_out[0].numpy()}",
        sep="\n",
    )

## 4. Build Model

In [None]:
# Instantiate the translator with the configured vocabulary size and unit count
translator = Translator(MAX_VOCAB_SIZE, UNITS)

# Compile with Adam; the masked loss is both the training objective and a
# tracked metric, alongside masked accuracy
tracked_metrics = [masked_acc, masked_loss]
translator.compile(optimizer="adam", loss=masked_loss, metrics=tracked_metrics)

print("✅ Model created and compiled successfully!")

## 5. Train Model

**Note:** Training will take approximately 5-10 minutes per epoch depending on your hardware.

In [None]:
# Train the model
# Stop when validation loss has not improved for 3 consecutive epochs, and roll
# the model back to the weights of its best epoch. Without
# `restore_best_weights=True` the model that gets saved and evaluated below
# would keep the weights of the last (already degraded) epoch instead.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

history = translator.fit(
    train_data.repeat(),  # repeat so STEPS_PER_EPOCH batches are always available
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=val_data,
    validation_steps=50,  # NOTE(review): assumes val_data yields at least 50 batches — confirm
    callbacks=[early_stopping],
)

print("\n✅ Training complete!")

## 6. Visualize Training

In [None]:
# Draw loss and accuracy curves side by side, one panel per metric
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(14, 5))

# (axes, history key, quantity name, panel title)
panels = [
    (loss_ax, 'loss', 'Loss', 'Model Loss Over Time'),
    (acc_ax, 'masked_acc', 'Accuracy', 'Model Accuracy Over Time'),
]

for ax, key, quantity, title in panels:
    # Training curve first, then the matching `val_`-prefixed validation curve
    ax.plot(history.history[key], label=f'Training {quantity}', linewidth=2)
    ax.plot(history.history[f'val_{key}'], label=f'Validation {quantity}', linewidth=2)
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel(quantity, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Report the value each tracked quantity had after the last recorded epoch
final = {
    key: history.history[key][-1]
    for key in ('loss', 'masked_acc', 'val_loss', 'val_masked_acc')
}
print(
    f"\nFinal Training Loss: {final['loss']:.4f}",
    f"Final Training Accuracy: {final['masked_acc']:.4f}",
    f"Final Validation Loss: {final['val_loss']:.4f}",
    f"Final Validation Accuracy: {final['val_masked_acc']:.4f}",
    sep="\n",
)

## 7. Save Model

In [None]:
import os

# Destination of the trained weights, relative to the notebook's working directory.
# NOTE(review): Keras 3 reportedly requires weight files to end in `.weights.h5`;
# confirm against the installed TensorFlow/Keras version before upgrading.
weights_path = '../checkpoints/translator_weights.h5'

# Make sure the target directory exists (no error if it already does)
os.makedirs(os.path.dirname(weights_path), exist_ok=True)

# Persist the weights only (not the full model)
translator.save_weights(weights_path)
print("✅ Model weights saved to checkpoints/translator_weights.h5")

## 8. Test Translations

### 8.1 Greedy Decoding (Temperature = 0)

In [None]:
# English inputs used to spot-check greedy translations
test_sentences = [
    "I love languages",
    "How are you?",
    "Good morning",
    "Thank you very much",
    "Where is the bathroom?",
    "I am learning Portuguese",
    "The weather is beautiful today"
]

banner = "=" * 70
print(banner, "GREEDY DECODING (Temperature = 0.0)", banner, sep="\n")

# Temperature 0.0 is used for greedy decoding; the second return value is
# printed as the confidence score and the third is not needed here
for sentence in test_sentences:
    translation, logit, _ = inference.translate(translator, sentence, temperature=0.0)
    print(
        f"\nEN: {sentence}",
        f"PT: {translation}",
        f"Confidence: {logit:.3f}",
        sep="\n",
    )

### 8.2 Sampling with Different Temperatures

In [None]:
# Translate one sentence at several temperatures to compare the outputs
test_sentence = "I love languages"
temperatures = [0.0, 0.3, 0.6, 1.0]

banner = "=" * 70
print(banner, f"TEMPERATURE SAMPLING: '{test_sentence}'", banner, sep="\n")

for temp in temperatures:
    translation, logit, _ = inference.translate(translator, test_sentence, temperature=temp)
    print(f"\nTemp {temp:.1f}: {translation}", f"Logit: {logit:.3f}", sep="\n")

### 8.3 Multiple Samples at Same Temperature

In [None]:
# Draw several translations at one temperature to show how much they vary
test_sentence = "I love languages"
num_samples = 5
temperature = 0.6

banner = "=" * 70
print(
    banner,
    f"MULTIPLE SAMPLES (Temperature = {temperature})",
    f"Input: '{test_sentence}'",
    banner,
    sep="\n",
)

for i in range(num_samples):
    translation, logit, _ = inference.translate(translator, test_sentence, temperature=temperature)
    # Samples are numbered from 1 in the output
    print(f"\nSample {i + 1}: {translation}", f"Logit: {logit:.3f}", sep="\n")

### 8.4 Minimum Bayes Risk (MBR) Decoding

In [None]:
# MBR decoding
test_sentence = "I love languages"

print("=" * 70)
print(f"MBR DECODING: '{test_sentence}'")
print("=" * 70)

translation, candidates = inference.mbr_decode(
    translator, 
    test_sentence, 
    n_samples=10, 
    temperature=0.6
)

print("\nCandidate Translations:")
for i, candidate in enumerate(candidates, 1):
    print(f"{i:2d}. {candidate}")

print(f"\n{'='*70}")
print(f"✅ SELECTED TRANSLATION: {translation}")
print(f"{'='*70}")

### 8.5 Compare Greedy vs MBR

In [None]:
# Run both decoding strategies on the same inputs for a side-by-side view
test_sentences_comparison = [
    "I love programming",
    "Machine learning is fascinating",
    "Hello, how are you today?"
]

banner = "=" * 70
print(banner, "COMPARISON: Greedy vs MBR Decoding", banner, sep="\n")

for sentence in test_sentences_comparison:
    print(f"\nInput: {sentence}", "-" * 70, sep="\n")

    # Greedy decoding (temperature 0.0); only the translated text is kept
    greedy_translation, _, _ = inference.translate(translator, sentence, temperature=0.0)
    print(f"Greedy:  {greedy_translation}")

    # MBR decoding over 8 samples drawn at temperature 0.6
    mbr_translation, _ = inference.mbr_decode(translator, sentence, n_samples=8, temperature=0.6)
    print(f"MBR:     {mbr_translation}")

## 9. Interactive Translation

Change the sentence below and re-run the cell to translate your own sentences!

In [None]:
# Interactive cell: edit the sentence below, then re-run to translate it
YOUR_SENTENCE = "I love programming"  # ← Change this!

banner = "=" * 70
print(banner)
translation, logit, _ = inference.translate(translator, YOUR_SENTENCE, temperature=0.0)
print(
    f"English:    {YOUR_SENTENCE}",
    f"Portuguese: {translation}",
    f"Confidence: {logit:.3f}",
    banner,
    sep="\n",
)

## 10. Model Architecture Summary

In [None]:
# Print the layer summaries of the translator's two sub-models
banner = "=" * 70

print(banner, "ENCODER ARCHITECTURE", banner, sep="\n")
translator.encoder.summary()

print("\n" + banner, "DECODER ARCHITECTURE", banner, sep="\n")
translator.decoder.summary()

## 11. Translation Quality Analysis

In [None]:
# Group sample inputs by length to see how greedy output changes with sentence size
short_sentences = [
    "Hello",
    "Thank you",
    "Good night"
]

medium_sentences = [
    "I love learning new languages",
    "The weather is nice today",
    "Where can I find a restaurant?"
]

long_sentences = [
    "I am very excited to learn Portuguese because it is a beautiful language",
    "Machine translation has improved significantly with the advent of neural networks",
    "Could you please tell me where I can find the nearest train station?"
]

banner = "=" * 70
print(banner, "TRANSLATION QUALITY BY SENTENCE LENGTH", banner, sep="\n")

# Category label -> sentences, iterated in insertion order (short to long)
length_buckets = {
    "SHORT": short_sentences,
    "MEDIUM": medium_sentences,
    "LONG": long_sentences,
}

for category, sentences in length_buckets.items():
    print(f"\n{category} SENTENCES:", "-" * 70, sep="\n")
    for sentence in sentences:
        translation, logit, _ = inference.translate(translator, sentence, temperature=0.0)
        print(
            f"\nEN: {sentence}",
            f"PT: {translation}",
            f"Confidence: {logit:.3f}",
            sep="\n",
        )

## 12. Summary and Next Steps

### Model Performance
- The model uses a bidirectional LSTM encoder with an attention mechanism
- Training was performed with early stopping to prevent overfitting
- Both greedy and MBR decoding strategies are available

### Possible Improvements
1. **Increase training data**: Use larger datasets for better generalization
2. **Transformer architecture**: Replace LSTM with Transformer for better performance
3. **Beam search**: Implement beam search decoding as an alternative to greedy/MBR
4. **Fine-tuning**: Train for more epochs or adjust learning rate schedule
5. **Evaluation metrics**: Implement BLEU score for quantitative evaluation

---
**End of Notebook**