# Model Training - Fine-tune NLLB with LoRA

This notebook fine-tunes the NLLB-200-distilled-600M model for idiom-aware English-Sinhala translation using LoRA (Low-Rank Adaptation).

## Training Pipeline:
1. Load configuration
2. Setup base NLLB model and tokenizer
3. Add special tokens (`<IDIOM>`, `</IDIOM>`)
4. Apply LoRA adapters
5. Prepare training dataset
6. Train with progress tracking
7. Save checkpoints and final model
8. Visualize training metrics

In [None]:
import sys
sys.path.append('..')

from src.trainer import (
    setup_model_and_tokenizer,
    apply_lora,
    prepare_dataset,
    train_model,
    save_checkpoint,
    load_config
)
import torch
import yaml
import matplotlib.pyplot as plt
import json
from pathlib import Path

# Report the runtime environment: PyTorch version and whether CUDA is usable.
print("✓ Imports successful")
print(f"PyTorch version: {torch.__version__}")

cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    # Device index 0 is the one queried for its name.
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

## 1. Load Configuration

In [None]:
# Read the training configuration and echo the values used by later cells.
config_path = '../config/training_config.yaml'
config = load_config(config_path)

print("=== Training Configuration ===")
model_cfg = config['model']
print(f"Base model: {model_cfg['base_model']}")
print(f"Source language: {model_cfg['source_lang']}")
print(f"Target language: {model_cfg['target_lang']}")

print(f"\nLoRA settings:")
lora_cfg = config['lora']
print(f"  r: {lora_cfg['r']}")
print(f"  lora_alpha: {lora_cfg['lora_alpha']}")
print(f"  lora_dropout: {lora_cfg['lora_dropout']}")

print(f"\nTraining settings:")
training_cfg = config['training']
print(f"  Learning rate: {training_cfg['learning_rate']}")
print(f"  Epochs: {training_cfg['num_epochs']}")
print(f"  Batch size: {training_cfg['batch_size']}")

## 2. Setup Model and Tokenizer

In [None]:
# Collect the idiom start/end marker tokens from the `special_tokens` config section.
token_cfg = config['special_tokens']
special_tokens = [token_cfg[key] for key in ('idiom_start', 'idiom_end')]

# Load the base model and tokenizer, passing the marker tokens along.
base_model_name = config['model']['base_model']
model, tokenizer = setup_model_and_tokenizer(
    model_name=base_model_name,
    special_tokens=special_tokens,
)

print(f"\n✓ Model and tokenizer loaded")
vocab_size = len(tokenizer)
print(f"Vocabulary size: {vocab_size}")

## 3. Apply LoRA Adapters

In [None]:
# Hand the model and the `lora` config section to apply_lora; the returned
# object replaces `model` for the rest of the notebook.
lora_settings = config['lora']
model = apply_lora(model, lora_settings)

print("\n✓ LoRA adapters applied successfully")

## 4. Prepare Training Dataset

In [None]:
# Build the training dataset from the file referenced by config['data']['augmented_json'],
# using the language codes and max length taken from the config.
dataset_kwargs = {
    'data_path': config['data']['augmented_json'],
    'tokenizer': tokenizer,
    'src_lang': config['model']['source_lang'],
    'tgt_lang': config['model']['target_lang'],
    'max_length': config['training']['max_length'],
}
train_dataset = prepare_dataset(**dataset_kwargs)

print(f"\n✓ Training dataset prepared")
example_count = len(train_dataset)
print(f"Number of training examples: {example_count}")

## 5. Train the Model

In [None]:
# Ensure the checkpoint directory exists; its path is handed to train_model as output_dir.
output_dir = Path(config['paths']['checkpoints'])
output_dir.mkdir(parents=True, exist_ok=True)

# Announce the run and the rough duration to expect.
print("\nStarting training...")
print("This may take a while depending on your hardware.")
print("On CPU: ~2-4 hours | On GPU: ~30-60 minutes\n")

# Merge the `training` and `settings` sections into one flat dict.
# `settings` is applied last, so it wins when a key appears in both.
run_config = dict(config['training'])
run_config.update(config['settings'])

trained_model, trainer = train_model(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    config=run_config,
    output_dir=str(output_dir),
)

print("\n✓ Training completed!")

## 6. Save Final Model

In [None]:
# Persist the trained model and tokenizer at the location given by config['paths']['final_model'].
paths_cfg = config['paths']
final_model_path = paths_cfg['final_model']
save_checkpoint(trained_model, tokenizer, final_model_path)

print(f"\n✓ Final model saved to {final_model_path}")

## 7. Visualize Training Metrics

In [None]:
# Plot the logged training loss (and learning rate, when logged) and save the
# figure under ../outputs.
history = trainer.state.log_history

# Only log entries that carry a 'loss' key contribute to the loss curve;
# each value is paired with the step recorded in the same entry.
train_loss = [entry['loss'] for entry in history if 'loss' in entry]
steps = [entry['step'] for entry in history if 'loss' in entry]

# Same idea for the learning-rate curve.
learning_rates = [entry['learning_rate'] for entry in history if 'learning_rate' in entry]
lr_steps = [entry['step'] for entry in history if 'learning_rate' in entry]

plt.figure(figsize=(12, 5))

# Left panel: training loss
plt.subplot(1, 2, 1)
plt.plot(steps, train_loss, marker='o', linestyle='-', linewidth=2, markersize=4)
plt.title('Training Loss Over Time', fontsize=12, fontweight='bold')
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)

# Right panel: learning rate, drawn only when at least one value was logged
if learning_rates:
    plt.subplot(1, 2, 2)
    plt.plot(lr_steps, learning_rates, marker='o', linestyle='-', linewidth=2, markersize=4, color='orange')
    plt.title('Learning Rate Schedule', fontsize=12, fontweight='bold')
    plt.xlabel('Training Steps')
    plt.ylabel('Learning Rate')
    plt.grid(True, alpha=0.3)

plt.tight_layout()

# savefig does not create missing directories, so make sure ../outputs exists
# first; otherwise a fresh checkout fails with FileNotFoundError after training.
metrics_path = Path('../outputs/training_metrics.png')
metrics_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(metrics_path, dpi=150, bbox_inches='tight')
plt.show()

print("✓ Training metrics visualized and saved to outputs/training_metrics.png")

## 8. Training Summary

In [None]:
# Recap the run: model, dataset size, key hyperparameters, and the logged loss.
banner = "=" * 60

print(banner)
print("TRAINING SUMMARY")
print(banner)
print(f"\nModel: {config['model']['base_model']}")
print(f"Training method: LoRA (Low-Rank Adaptation)")

print(f"\nDataset:")
print(f"  Training examples: {len(train_dataset)}")
training_cfg = config['training']
print(f"  Max sequence length: {training_cfg['max_length']}")

print(f"\nTraining configuration:")
print(f"  Epochs: {training_cfg['num_epochs']}")
print(f"  Batch size: {training_cfg['batch_size']}")
print(f"  Learning rate: {training_cfg['learning_rate']}")
print(f"  LoRA rank (r): {config['lora']['r']}")

print(f"\nFinal metrics:")
# Loss lines are printed only when at least one loss value was collected.
if train_loss:
    final_loss = train_loss[-1]
    best_loss = min(train_loss)
    print(f"  Final training loss: {final_loss:.4f}")
    print(f"  Best training loss: {best_loss:.4f}")

print(f"\nModel saved to: {final_model_path}")
print("\n" + banner)

## Summary

Model training completed successfully!

- ✅ NLLB-200-distilled-600M base model loaded
- ✅ Special tokens added (`<IDIOM>`, `</IDIOM>`)
- ✅ LoRA adapters applied
- ✅ Model trained on augmented dataset
- ✅ Checkpoints saved
- ✅ Final model saved to `models/final/`

**Next Step**: Run `04_inference_test.ipynb` to test the model on the 50 test examples.