In [1]:
import shutil
import os

# Reset the training output directories so the next run starts from scratch.
#
# The paths are anchored to the project root instead of the current working
# directory: this cell runs before the cell that chdir()s out of `notebooks/`,
# so a bare relative path would clean (and re-create) `notebooks/models/...`
# rather than the project's own `models/` tree.
def _remove_tree(path):
    """Recursively delete `path`; return True only if nothing is left at `path`.

    Deletion stays best-effort (errors are ignored, e.g. files locked by
    another process), but the caller learns whether it actually succeeded.
    """
    shutil.rmtree(path, ignore_errors=True)
    return not os.path.exists(path)


project_root = os.getcwd()
if os.path.basename(project_root) == 'notebooks':
    project_root = os.path.dirname(project_root)

# Delete old checkpoints
checkpoint_dir = os.path.join(project_root, 'models', 'checkpoints')
if os.path.exists(checkpoint_dir):
    if _remove_tree(checkpoint_dir):
        print("üóëÔ∏è  Deleted old checkpoints")
    else:
        print(f"WARNING: could not fully delete {checkpoint_dir}; stale checkpoints may remain")

os.makedirs(checkpoint_dir, exist_ok=True)
print("‚úÖ Clean checkpoint directory ready")

# Also delete final model if it exists
final_model = os.path.join(project_root, 'models', 'final')
if os.path.exists(final_model):
    if _remove_tree(final_model):
        print("üóëÔ∏è  Deleted old final model")
    else:
        print(f"WARNING: could not fully delete {final_model}; an old final model may remain")

print("\n‚úÖ Ready for fresh training!")

üóëÔ∏è  Deleted old checkpoints
‚úÖ Clean checkpoint directory ready

‚úÖ Ready for fresh training!


In [2]:
import os
import sys

# When the kernel was started inside `notebooks/`, move up one level so the
# relative project paths listed below resolve.
current_dir = os.getcwd()
if os.path.basename(current_dir) == 'notebooks':
    os.chdir(os.path.dirname(current_dir))
    print(f"‚úÖ Changed to: {os.getcwd()}")

# Files the rest of the notebook reads or imports
required_files = [
    'config/training_config.yaml',
    'data/processed/augmented_train.json',
    'src/trainer.py',
    'src/inference.py',
    'src/evaluation.py'
]

print("\n=== Verifying Files ===")
for file_path in required_files:
    if os.path.exists(file_path):
        print(f"‚úÖ {file_path}")
        continue
    # Stop at the first missing file: report it, then abort the cell.
    print(f"‚ùå {file_path}")
    raise FileNotFoundError(f"{file_path} not found!")

print("\n‚úÖ All files verified!")

‚úÖ Changed to: d:\MINESTUDY\Research\idiom3.0\idiom3.0

=== Verifying Files ===
‚úÖ config/training_config.yaml
‚úÖ data/processed/augmented_train.json
‚úÖ src/trainer.py
‚úÖ src/inference.py
‚úÖ src/evaluation.py

‚úÖ All files verified!


In [3]:
import sys
sys.path.append('src')

from src.trainer import (
    setup_model_and_tokenizer,
    apply_lora,
    prepare_dataset,
    train_model,
    save_checkpoint
)
from src.inference import translate_with_idioms, extract_idioms
from src.evaluation import evaluate_model, print_evaluation_report, save_predictions

import torch
import yaml
import json
from pathlib import Path
import shutil

# Report the PyTorch build and whether CUDA is usable in this kernel.
print("‚úÖ Imports successful!")
torch_version = torch.__version__
cuda_available = torch.cuda.is_available()
print(f"PyTorch: {torch_version}")
print(f"CUDA: {cuda_available}")

‚úÖ Imports successful!
PyTorch: 2.10.0+cpu
CUDA: False


In [4]:
# Load the training configuration used by every later cell.
#
# The encoding is pinned to UTF-8: without it `open` falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in the YAML.
with open('config/training_config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# safe_load returns None for an empty document; fail with a clear message
# instead of an opaque "NoneType is not subscriptable" further down.
if not isinstance(config, dict):
    raise ValueError(
        "config/training_config.yaml did not parse to a mapping "
        f"(got {type(config).__name__})"
    )

print("‚úÖ Config loaded!")
print(f"Model: {config['model']['base_model']}")
print(f"Epochs: {config['training']['num_epochs']}")
print(f"Batch size: {config['training']['batch_size']}")
print(f"Learning rate: {config['training']['learning_rate']}")

‚úÖ Config loaded!
Model: facebook/nllb-200-distilled-600M
Epochs: 10
Batch size: 4
Learning rate: 0.0003


In [None]:
banner = "="*80
print(banner)
print("LOADING MODEL")
print(banner)

# Load the base model named in the config. No special tokens are passed
# (see the original note), and the size check below relies on that.
model, tokenizer = setup_model_and_tokenizer(
    model_name=config['model']['base_model'],
    special_tokens=None  # CRITICAL: No special tokens!
)

vocab_size = len(tokenizer)
print(f"\n‚úì Model loaded")
print(f"‚úì Vocab size: {vocab_size}")

# VERIFY - Must be 256204!
assert vocab_size == 256204, f"ERROR: Vocab is {vocab_size}, should be 256204!"
print("‚úÖ Vocabulary size correct!")

# Report language-code support when the tokenizer exposes the mapping
if hasattr(tokenizer, 'lang_code_to_id'):
    lang_ids = tokenizer.lang_code_to_id
    print(f"‚úì {len(lang_ids)} languages supported")
    for lang_code in ('eng_Latn', 'sin_Sinh'):
        print(f"‚úì {lang_code}: {lang_ids.get(lang_code, 'NOT FOUND')}")

LOADING MODEL
Loading model: facebook/nllb-200-distilled-600M


In [None]:
import sys
sys.path.append('src')

from src.trainer import setup_model_and_tokenizer

# Stand-alone load by literal checkpoint name, then compare the tokenizer
# length with the expected vocabulary size.
model, tokenizer = setup_model_and_tokenizer(
    model_name="facebook/nllb-200-distilled-600M",
    special_tokens=None,
)

vocab_size = len(tokenizer)
print(f"Vocabulary size: {vocab_size}")
print(f"Should be: 256204")
print(f"Match: {vocab_size == 256204}")

Loading model: facebook/nllb-200-distilled-600M
‚úì Loaded NllbTokenizer
  Tokenizer type: NllbTokenizer


Loading weights:   0%|          | 0/512 [00:00<?, ?it/s]



‚úì Vocabulary size: 256204 (unchanged)
‚úì Manually added 202 language codes
‚úì Source language: eng_Latn
‚úì Model loaded successfully
Vocabulary size: 256204
Should be: 256204
Match: True


In [None]:
# Probe: tokenize a sentence that contains <IDIOM> markup and decode it back.
test_text = "He <IDIOM>kicked the bucket</IDIOM> yesterday."

tokenizer.src_lang = "eng_Latn"
tokens = tokenizer(test_text, return_tensors="pt")
first_sequence = tokens['input_ids'][0]

print(f"Input: {test_text}")
print(f"Token IDs: {first_sequence[:15]}...")
print(f"Decoded: {tokenizer.decode(first_sequence)}")
# The closing message is printed unconditionally; nothing above checks it.
print("\n‚úÖ Idiom tags are tokenized as regular text!")

Input: He <IDIOM>kicked the bucket</IDIOM> yesterday.
Token IDs: tensor([256047,   1808,     45, 231189,  13646, 248123,    307,  46163,    349,
           548,  19054,  68213, 231189,  13646, 248123])...
Decoded: eng_Latn He <IDIOM>kicked the bucket</IDIOM> yesterday.</s>

‚úÖ Idiom tags are tokenized as regular text!


In [None]:
# Pass the loaded model through apply_lora using the `lora` config section.
# `model` is rebound to the result, so running this cell a second time feeds
# the already-processed model back into apply_lora.
lora_settings = config['lora']
model = apply_lora(model, lora_settings)
print("‚úÖ LoRA applied!")

trainable params: 2,359,296 || all params: 1,404,497,920 || trainable%: 0.1680
‚úì LoRA adapters applied
‚úÖ LoRA applied!


In [None]:
# Start from an empty checkpoint directory: remove any existing tree
# (best-effort, errors ignored), then make sure the directory exists.
checkpoint_dir = 'models/checkpoints'
had_old_checkpoints = os.path.exists(checkpoint_dir)
if had_old_checkpoints:
    shutil.rmtree(checkpoint_dir, ignore_errors=True)
    print("üóëÔ∏è  Old checkpoints deleted")

os.makedirs(checkpoint_dir, exist_ok=True)
print(f"‚úÖ Clean directory: {checkpoint_dir}")

üóëÔ∏è  Old checkpoints deleted
‚úÖ Clean directory: models/checkpoints


In [None]:
# Build the tokenized training set from the augmented JSON file.
model_cfg = config['model']
train_dataset = prepare_dataset(
    data_path=config['data']['augmented_json'],
    tokenizer=tokenizer,
    src_lang=model_cfg['source_lang'],
    tgt_lang=model_cfg['target_lang'],
    max_length=config['training']['max_length'],
)

print(f"‚úÖ Dataset prepared: {len(train_dataset)} examples")

# Show sample
# NOTE(review): in the recorded run the labels begin 256153, 256047, and 256047
# is also the first input id (it decodes to eng_Latn in the tokenization probe).
# Confirm the target side is not being prefixed with the source-language code.
first_example = train_dataset[0]
print(f"\nSample:")
print(f"  Input IDs: {first_example['input_ids'][:10]}...")
print(f"  Labels: {first_example['labels'][:10]}...")

‚úì Loaded 920 examples from data/processed/augmented_train.json


Map:   0%|          | 0/920 [00:00<?, ? examples/s]

‚úÖ Dataset prepared: 920 examples

Sample:
  Input IDs: [256047, 1617, 153219, 502, 158826, 349, 32639, 811, 138239, 349]...
  Labels: [256153, 256047, 46010, 30936, 5743, 46328, 171036, 91412, 38954, 2123]...


In [None]:
import sys

# Diagnostic only: compare the tokenizer's vocabulary size with the number of
# rows in the base model's shared embedding table.
#
# The freshly loaded objects are bound to `probe_*` names on purpose. Rebinding
# `model` / `tokenizer` here would replace the LoRA-processed model and the
# tokenizer used to build `train_dataset` with a plain base checkpoint, and the
# training cell below would then train that instead.
if 'src' not in sys.path:
    sys.path.append('src')

# Force reload: the module is imported as `src.trainer`, so that cache entry
# has to be dropped too. Removing only 'trainer' leaves it cached.
for module_name in ('trainer', 'src.trainer'):
    sys.modules.pop(module_name, None)

from src.trainer import setup_model_and_tokenizer

probe_model, probe_tokenizer = setup_model_and_tokenizer(
    model_name="facebook/nllb-200-distilled-600M"
)

vocab_size = len(probe_tokenizer)
embedding_rows = probe_model.model.shared.num_embeddings

# NOTE(review): the recorded run printed 256204 vs 256206 (MATCH: False).
# Confirm the two extra embedding rows are expected for this checkpoint.
print("="*60)
print(f"Model class: {type(probe_model).__name__}")
print(f"Tokenizer vocab: {vocab_size}")
print(f"Model embeddings: {embedding_rows}")
print(f"MATCH: {vocab_size == embedding_rows}")
print("="*60)

# Drop the probe copy so a second full model is not kept alive during training.
del probe_model, probe_tokenizer
print("="*60)

Loading model: facebook/nllb-200-distilled-600M
‚úì Loaded NllbTokenizer
  Tokenizer type: NllbTokenizer


Loading weights:   0%|          | 0/512 [00:00<?, ?it/s]



‚úì Vocabulary size: 256204 (unchanged)
‚úì Manually added 202 language codes
‚úì Source language: eng_Latn
‚úì Model loaded successfully
Model class: M2M100ForConditionalGeneration
Tokenizer vocab: 256204
Model embeddings: 256206
MATCH: False


In [None]:
from pathlib import Path

# Make sure the checkpoint directory from the config exists before training.
output_dir = Path(config['paths']['checkpoints'])
output_dir.mkdir(parents=True, exist_ok=True)

banner = "="*80
print("\n" + banner)
print("STARTING TRAINING")
print(banner)
for line in (
    "\nExpected loss:",
    "  Initial: ~2-3 ‚úÖ",
    "  After 50 steps: ~1.8",
    "  Final: ~1.0-1.5",
    "\n‚ö†Ô∏è  If loss > 10, there's an error!",
):
    print(line)
print(banner + "\n")

# `settings` is merged after `training`, so its keys win on any overlap.
training_settings = {**config['training'], **config['settings']}
trained_model, trainer = train_model(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    config=training_settings,
    output_dir=str(output_dir),
)

print("\n‚úÖ TRAINING COMPLETE!")


STARTING TRAINING

Expected loss:
  Initial: ~2-3 ‚úÖ
  After 50 steps: ~1.8
  Final: ~1.0-1.5

‚ö†Ô∏è  If loss > 10, there's an error!

Starting training...


  super().__init__(loader)


In [None]:
# Persist the trained model and tokenizer to the configured final-model path.
final_model_path = config['paths']['final_model']
save_checkpoint(trained_model, tokenizer, final_model_path)

print(f"‚úÖ Model saved to: {final_model_path}")

# Verify: count the entries directly inside the output directory, if it exists
saved_dir = Path(final_model_path)
if saved_dir.exists():
    files = list(saved_dir.glob('*'))
    print(f"   Saved {len(files)} files")

In [None]:
banner = "="*80
print(banner)
print("TEST TRANSLATIONS")
print(banner + "\n")

# Hand-picked English sentences, each with one idiom wrapped in <IDIOM> tags
test_sentences = [
    "He <IDIOM>kicked the bucket</IDIOM> yesterday.",
    "She has a <IDIOM>green thumb</IDIOM> for gardening.",
    "It's <IDIOM>raining cats and dogs</IDIOM> outside.",
    "Don't <IDIOM>cry over spilled milk</IDIOM>.",
    "Let's <IDIOM>break the ice</IDIOM> with conversation."
]

model_cfg = config['model']
translations = translate_with_idioms(
    model=trained_model,
    tokenizer=tokenizer,
    source_texts=test_sentences,
    src_lang=model_cfg['source_lang'],
    tgt_lang=model_cfg['target_lang'],
    num_beams=5,
)

for i, (src, tgt) in enumerate(zip(test_sentences, translations), 1):
    print(f"{i}. EN: {src}")
    print(f"   SI: {tgt}")

    # Extract from both sides first, then print whichever lists are non-empty
    idioms_by_side = [
        ("Source", extract_idioms(src)),
        ("Target", extract_idioms(tgt)),
    ]
    for side, found in idioms_by_side:
        if found:
            print(f"   {side} idioms: {found}")
    print()

print(banner)

In [None]:
banner = "="*80
print(banner)
print("MODEL EVALUATION")
print(banner + "\n")

# Load test data
# NOTE(review): this is the same file that is passed to prepare_dataset for
# training, so the metrics below are measured on examples the model was
# trained on. Confirm whether a held-out split should be used instead.
eval_path = config['data']['augmented_json']
with open(eval_path, 'r', encoding='utf-8') as f:
    all_data = json.load(f)

# Use subset for quick evaluation
test_data = all_data[:100]
print(f"Evaluating on {len(test_data)} examples...\n")

# Evaluate
model_cfg = config['model']
metrics, predictions = evaluate_model(
    model=trained_model,
    tokenizer=tokenizer,
    test_data=test_data,
    src_lang=model_cfg['source_lang'],
    tgt_lang=model_cfg['target_lang'],
)

# Print report
print_evaluation_report(metrics)

# Save predictions alongside their references and sources
os.makedirs('outputs', exist_ok=True)
reference_texts = [ex['target_si'] for ex in test_data]
english_sources = [ex['source_en'] for ex in test_data]
save_predictions(
    predictions=predictions,
    references=reference_texts,
    source_texts=english_sources,
    output_path='outputs/predictions.json',
)

print("\n‚úÖ Evaluation complete!")
print("   Results saved to: outputs/predictions.json")

In [None]:
# Load and display some predictions
# Read back the saved predictions and print the first few records.
with open('outputs/predictions.json', 'r', encoding='utf-8') as f:
    results = json.load(f)

banner = "="*80
print(banner)
print("EXAMPLE PREDICTIONS")
print(banner + "\n")

# Idiom lists are shown only when non-empty
idiom_fields = (
    ('source_idioms', 'Source idioms'),
    ('predicted_idioms', 'Predicted idioms'),
    ('reference_idioms', 'Reference idioms'),
)

# Show first 5 examples
for i, result in enumerate(results[:5], 1):
    print(f"Example {i}:")
    print(f"  Source: {result['source']}")
    print(f"  Prediction: {result['prediction']}")
    print(f"  Reference: {result['reference']}")

    for key, label in idiom_fields:
        if result[key]:
            print(f"  {label}: {result[key]}")
    print()

print(banner)

In [None]:
# Calculate summary statistics from predictions
def _has_idiom_match(predicted_idioms, reference_idioms):
    """Return True if any non-blank predicted idiom occurs inside a reference idiom.

    Matching is by substring (`pred in ref`). Blank predictions are skipped
    because the empty string is a substring of every string and would
    otherwise count as a correct translation.
    """
    return any(
        pred in ref
        for pred in predicted_idioms
        if pred.strip()
        for ref in reference_idioms
    )


# Calculate summary statistics from the saved predictions
with open('outputs/predictions.json', 'r', encoding='utf-8') as f:
    results = json.load(f)

total_examples = len(results)
examples_with_idioms = sum(1 for r in results if r['source_idioms'])
# Counted per example: an example scores once if any of its predicted idioms
# matches any of its reference idioms.
idioms_correctly_translated = sum(
    1 for r in results
    if r['source_idioms'] and r['predicted_idioms']
    and _has_idiom_match(r['predicted_idioms'], r['reference_idioms'])
)

print("="*80)
print("SUMMARY STATISTICS")
print("="*80 + "\n")

print(f"Total examples evaluated: {total_examples}")
print(f"Examples with idioms: {examples_with_idioms}")
print(f"Idioms with correct translation: {idioms_correctly_translated}")

# Skip the ratio when no example has a source idiom (avoids dividing by zero)
if examples_with_idioms > 0:
    idiom_acc = (idioms_correctly_translated / examples_with_idioms) * 100
    print(f"\nIdiom translation accuracy: {idiom_acc:.2f}%")

print("\n" + "="*80)