In [15]:
import os

# FORCE change to project root
#
# The notebook may be started either from the project root or from its
# `notebooks/` sub-folder. Every later cell uses paths relative to the project
# root, so normalise the working directory here and fail fast if the expected
# files are not reachable.
current = os.getcwd()
print(f"Current directory: {current}")

# If we're in notebooks, go up one level.
# Compare the *last path component* only: a substring test on the whole path
# would also match unrelated ancestors (e.g. ".../my_notebooks_repo/project")
# and move the kernel out of the real project root.
if os.path.basename(current) == 'notebooks':
    project_root = os.path.dirname(current)
    os.chdir(project_root)
    print(f"‚úÖ Changed to project root: {os.getcwd()}")
else:
    # Define project_root on this path too so it exists whichever branch ran.
    project_root = current
    print(f"‚úÖ Already in project root")

# Verify: each required file must be reachable relative to the working directory.
required_files = [
    ('config/training_config.yaml', "ERROR: Not in project root!"),
    ('data/processed/augmented_train.json', "ERROR: Data file missing!"),
    ('src/trainer.py', "ERROR: trainer.py missing!"),
]
for required_path, failure_message in required_files:
    assert os.path.exists(required_path), failure_message

print("\n‚úÖ All files verified! Safe to proceed.")

Current directory: d:\MINESTUDY\Research\idiom3.0\idiom3.0
‚úÖ Already in project root

‚úÖ All files verified! Safe to proceed.


In [16]:
# ==========================================
# FORCE RELOAD - Run this after editing .py files
# ==========================================
import sys
import importlib


def clear_module_cache(module_names, package='src'):
    """Drop cached project modules so the next import re-reads them from disk.

    The notebook imports its code as ``src.<name>`` (e.g. ``from src.trainer
    import ...``), so the entry in ``sys.modules`` is keyed ``src.trainer``,
    not ``trainer``. Both spellings are cleared so the cell works however the
    module was imported.

    Args:
        module_names: iterable of bare module names (without package prefix).
        package: package prefix the modules are imported under.

    Returns:
        List of the ``sys.modules`` keys that were actually removed.
    """
    removed = []
    for name in module_names:
        for cached_name in (name, f"{package}.{name}"):
            if cached_name in sys.modules:
                del sys.modules[cached_name]
                removed.append(cached_name)
    # Let the import system notice files created/changed while the kernel ran.
    importlib.invalidate_caches()
    return removed


# Remove cached modules
modules_to_reload = [
    'trainer',
    'data_processor',
    'augmentation',
    'inference',
    'evaluation'
]

cleared_modules = clear_module_cache(modules_to_reload)

print("‚úì Cleared module cache - changes will be loaded")

‚úì Cleared module cache - changes will be loaded


In [17]:
# ==========================================
# FORCE RELOAD - Run this after editing .py files
# ==========================================
import sys
import importlib

# Remove cached modules
modules_to_reload = [
    'trainer',
    'data_processor',
    'augmentation',
    'inference',
    'evaluation'
]

for module in modules_to_reload:
    # The trainer is imported as `src.trainer`, so its sys.modules key carries
    # the `src.` prefix; clearing only the bare name leaves the stale module
    # cached. The other modules are presumably imported the same way, so both
    # spellings are removed for each of them.
    for cached_name in (module, f"src.{module}"):
        sys.modules.pop(cached_name, None)

# Let the import system notice files created/changed while the kernel ran.
importlib.invalidate_caches()

print("‚úì Cleared module cache - changes will be loaded")

‚úì Cleared module cache - changes will be loaded


In [18]:
import os
import sys

# Make the project root importable so `src.*` resolves.
# An absolute path is used instead of '..': a relative entry is re-resolved
# against the *current* working directory on every import, so once the kernel
# sits in the project root '..' would point one level too high. The membership
# check keeps re-runs of this cell from piling up duplicate entries.
working_dir = os.getcwd()
if os.path.basename(working_dir) == 'notebooks':
    project_root_dir = os.path.dirname(working_dir)
else:
    project_root_dir = working_dir
if project_root_dir not in sys.path:
    sys.path.append(project_root_dir)

from src.trainer import (
    setup_model_and_tokenizer,
    apply_lora,
    prepare_dataset,
    train_model,
    save_checkpoint,
    load_config
)
import torch
import yaml
import matplotlib.pyplot as plt
import json
from pathlib import Path

# Report the runtime environment; the CUDA device name is only queried when a
# device is actually available.
print("‚úì Imports successful")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

‚úì Imports successful
PyTorch version: 2.10.0+cpu
CUDA available: False


In [19]:
import yaml
import os

# Get the correct path (go up one level from notebooks folder)
if os.path.basename(os.getcwd()) == 'notebooks':
    config_path = '../config/training_config.yaml'
else:
    config_path = 'config/training_config.yaml'

# Load configuration.
# Read as UTF-8 explicitly: without `encoding=` Python falls back to the
# platform default (a legacy code page on Windows), which can corrupt or
# reject non-ASCII characters in the YAML file.
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# PyYAML follows YAML 1.1, where a float written without a decimal point in
# the mantissa (e.g. `3e-4`) is loaded as a *string*. Coerce it here so every
# consumer of `config` receives a real float; float() raises ValueError on a
# value that is not numeric, which surfaces a bad config early.
learning_rate = config['training']['learning_rate']
if isinstance(learning_rate, str):
    config['training']['learning_rate'] = float(learning_rate)

print("\n=== Training Configuration ===")
print(f"Base model: {config['model']['base_model']}")
print(f"Source language: {config['model']['source_lang']}")
print(f"Target language: {config['model']['target_lang']}")
print(f"Learning rate: {config['training']['learning_rate']}")
print(f"Batch size: {config['training']['batch_size']}")
print(f"Number of epochs: {config['training']['num_epochs']}")


=== Training Configuration ===
Base model: facebook/nllb-200-distilled-600M
Source language: eng_Latn
Target language: sin_Sinh
Learning rate: 3e-4
Batch size: 4
Number of epochs: 10


In [20]:
# Idiom marker tokens from the config (kept for compatibility, not used)
special_tokens = [
    config['special_tokens'][token_key]
    for token_key in ('idiom_start', 'idiom_end')
]

# Instantiate the base model together with its tokenizer
base_model_name = config['model']['base_model']
model, tokenizer = setup_model_and_tokenizer(
    model_name=base_model_name,
    special_tokens=special_tokens,
)

print("\n‚úì Model and tokenizer loaded")
print(f"Vocabulary size: {len(tokenizer)}")

Loading model: facebook/nllb-200-distilled-600M


‚úì Loaded NllbTokenizer
  Tokenizer type: NllbTokenizer


Loading weights:   0%|          | 0/512 [00:00<?, ?it/s]



‚úì Vocabulary size: 256204 (unchanged)
‚úì Manually added 202 language codes
‚úì Source language: eng_Latn
‚úì Model loaded successfully

‚úì Model and tokenizer loaded
Vocabulary size: 256204


In [21]:
# ============================================================
# TOKENIZER DIAGNOSTIC - Check if tokenizer loaded correctly
# ============================================================
# Runs six read-mostly checks on `tokenizer` and then prints a single verdict.
# Every check that fails is recorded in `diagnostic_failures` so the verdict
# cannot say "safe to proceed" after one of the checks above printed a failure.

# Vocabulary size of the unmodified tokenizer; a different size means tokens
# were added or removed.
EXPECTED_VOCAB_SIZE = 256204

diagnostic_failures = []

print("="*80)
print("TOKENIZER DIAGNOSTIC")
print("="*80)

# 1. Check tokenizer type
print(f"\n1. TOKENIZER TYPE:")
print(f"   Type: {type(tokenizer)}")
print(f"   Class: {tokenizer.__class__.__name__}")

# 2. Check for language code support
print(f"\n2. LANGUAGE CODE SUPPORT:")
if hasattr(tokenizer, 'lang_code_to_id'):
    print(f"   ‚úì Has lang_code_to_id")
    print(f"   Available languages: {len(tokenizer.lang_code_to_id)}")
    print(f"   Sample languages: {list(tokenizer.lang_code_to_id.keys())[:5]}")

    # Check the languages this run is configured for (rather than hard-coded
    # codes) so the diagnostic stays valid if the config changes.
    for lang_code in (config['model']['source_lang'], config['model']['target_lang']):
        if lang_code in tokenizer.lang_code_to_id:
            print(f"   ‚úì {lang_code} found (ID: {tokenizer.lang_code_to_id[lang_code]})")
        else:
            print(f"   ‚úó {lang_code} NOT FOUND!")
            diagnostic_failures.append(f"language code {lang_code} not found in tokenizer")
else:
    print("   ‚úó Missing lang_code_to_id attribute!")
    print("   ‚ö†Ô∏è  This will cause high training loss!")

# 3. Check src_lang support
print(f"\n3. SOURCE LANGUAGE ATTRIBUTE:")
if hasattr(tokenizer, 'src_lang'):
    print(f"   ‚úì Has src_lang: {tokenizer.src_lang}")
else:
    print("   ‚úó Missing src_lang attribute!")
    diagnostic_failures.append("tokenizer has no src_lang attribute")

# 4. Check special tokens
print(f"\n4. SPECIAL TOKENS:")
print(f"   {tokenizer.special_tokens_map}")

# 5. Check vocabulary size
print(f"\n5. VOCABULARY:")
print(f"   Total vocab size: {len(tokenizer)}")
print(f"   Expected: {EXPECTED_VOCAB_SIZE} (original NLLB)")

# 6. Quick test tokenization
# NOTE: this assigns tokenizer.src_lang, so the tokenizer is left configured
# for the source language from the config after this cell runs.
print(f"\n6. QUICK TOKENIZATION TEST:")
try:
    test_text = "Hello world"
    tokenizer.src_lang = config['model']['source_lang']
    test_tokens = tokenizer(test_text, return_tensors="pt")
    print(f"   ‚úì Tokenization works")
    print(f"   Sample: '{test_text}' ‚Üí {test_tokens['input_ids'][0][:5]}...")
except Exception as e:
    # Broad on purpose: this is a diagnostic, any failure is reported, not raised.
    print(f"   ‚úó Tokenization failed: {e}")
    diagnostic_failures.append(f"test tokenization failed: {e}")

print("\n" + "="*80)
print("END OF DIAGNOSTIC")
print("="*80 + "\n")

# Decision helper: the two original checks keep their priority and messages;
# any other recorded failure is reported before the success verdict.
if not hasattr(tokenizer, 'lang_code_to_id'):
    print("‚ùå ERROR: Tokenizer missing lang_code_to_id!")
    print("   Action: Update src/trainer.py with the corrected version")
    print("="*80)
elif len(tokenizer) != EXPECTED_VOCAB_SIZE:
    print("‚ö†Ô∏è  WARNING: Vocabulary size is wrong!")
    print(f"   Got: {len(tokenizer)}, Expected: {EXPECTED_VOCAB_SIZE}")
    print("   This will cause high loss - special tokens were added!")
    print("="*80)
elif diagnostic_failures:
    print("ERROR: Tokenizer diagnostic found problems - do NOT start training yet:")
    for failure in diagnostic_failures:
        print(f"   - {failure}")
    print("="*80)
else:
    print("‚úÖ Tokenizer looks good! Safe to proceed with training.")
    print("="*80)

TOKENIZER DIAGNOSTIC

1. TOKENIZER TYPE:
   Type: <class 'transformers.models.nllb.tokenization_nllb.NllbTokenizer'>
   Class: NllbTokenizer

2. LANGUAGE CODE SUPPORT:
   ‚úì Has lang_code_to_id
   Available languages: 202
   Sample languages: ['nus_Latn', 'ory_Orya', 'cat_Latn', 'shn_Mymr', 'glg_Latn']
   ‚úì eng_Latn found (ID: 256047)
   ‚úì sin_Sinh found (ID: 256153)

3. SOURCE LANGUAGE ATTRIBUTE:
   ‚úì Has src_lang: eng_Latn

4. SPECIAL TOKENS:
   {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}

5. VOCABULARY:
   Total vocab size: 256204
   Expected: 256204 (original NLLB)

6. QUICK TOKENIZATION TEST:
   ‚úì Tokenization works
   Sample: 'Hello world' ‚Üí tensor([256047,  94124,  15697,      2])...

END OF DIAGNOSTIC

‚úÖ Tokenizer looks good! Safe to proceed with training.


In [22]:
# Wrap the loaded model with LoRA adapters, using the `lora` section of the config.
# NOTE(review): `model` is rebound to the wrapped result, so re-running this cell
# passes an already-wrapped model back into apply_lora - confirm apply_lora
# tolerates that, or reload the model before re-running.
lora_settings = config['lora']
model = apply_lora(model, lora_settings)

print("\n‚úì LoRA adapters applied successfully")

trainable params: 4,718,592 || all params: 1,406,857,216 || trainable%: 0.3354
‚úì LoRA adapters applied

‚úì LoRA adapters applied successfully


In [23]:
# Clean up old checkpoints before training
import shutil
import os


def reset_checkpoint_dir(path):
    """Delete the checkpoint directory `path` if present and recreate it empty.

    Deletion stays best-effort (`ignore_errors=True`, e.g. for files locked by
    another process), but the outcome is verified afterwards instead of being
    assumed: if anything survived, a warning is printed and the "clean
    directory" message is withheld.

    Args:
        path: directory to reset.

    Returns:
        True if `path` exists and is empty afterwards, False if stale entries
        could not be removed.
    """
    if os.path.exists(path):
        print(f"üóëÔ∏è  Deleting old checkpoints from {path}...")
        shutil.rmtree(path, ignore_errors=True)
        # rmtree removes the root last, so a surviving root means at least
        # one entry could not be deleted.
        if os.path.exists(path):
            print(f"WARNING: could not fully delete {path}; stale files may remain")
        else:
            print("‚úì Old checkpoints deleted")
    else:
        print("‚úì No old checkpoints to delete")

    # Recreate empty directory
    os.makedirs(path, exist_ok=True)
    is_clean = not os.listdir(path)
    if is_clean:
        print(f"‚úì Clean checkpoint directory ready: {path}")
    else:
        print(f"WARNING: checkpoint directory is not empty: {path}")
    return is_clean


# NOTE(review): this path is hard-coded, while the training cell takes its
# output directory from config['paths']['checkpoints'] - confirm they match.
checkpoint_dir = 'models/checkpoints'
checkpoint_dir_is_clean = reset_checkpoint_dir(checkpoint_dir)

üóëÔ∏è  Deleting old checkpoints from models/checkpoints...
‚úì Old checkpoints deleted
‚úì Clean checkpoint directory ready: models/checkpoints


In [24]:
# Build the training dataset from the augmented JSON file named in the config.
# Language codes and the maximum length also come from the config.
dataset_kwargs = {
    'data_path': config['data']['augmented_json'],
    'tokenizer': tokenizer,
    'src_lang': config['model']['source_lang'],
    'tgt_lang': config['model']['target_lang'],
    'max_length': config['training']['max_length'],
}
train_dataset = prepare_dataset(**dataset_kwargs)

print("\n‚úì Training dataset prepared")
print(f"Number of training examples: {len(train_dataset)}")

‚úì Loaded 920 examples from data/processed/augmented_train.json


Map:   0%|          | 0/920 [00:00<?, ? examples/s]


‚úì Training dataset prepared
Number of training examples: 920


In [25]:
# Ensure the checkpoint directory named in the config exists
output_dir = Path(config['paths']['checkpoints'])
output_dir.mkdir(parents=True, exist_ok=True)

separator = "=" * 80

# Announce the run, with rough duration estimates
print("\n".join([
    "\n" + separator,
    "STARTING TRAINING",
    separator,
    "This may take a while depending on your hardware:",
    "  ‚Ä¢ CPU: ~2-4 hours",
    "  ‚Ä¢ GPU: ~30-60 minutes",
    separator + "\n",
]))

# Merge the two config sections into one flat dict; on a key collision the
# value from `settings` wins because it is applied last.
training_settings = dict(config['training'])
training_settings.update(config['settings'])

trained_model, trainer = train_model(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    config=training_settings,
    output_dir=str(output_dir),
)

print("\n".join([
    "\n" + separator,
    "‚úì TRAINING COMPLETED!",
    separator,
]))


STARTING TRAINING
This may take a while depending on your hardware:
  ‚Ä¢ CPU: ~2-4 hours
  ‚Ä¢ GPU: ~30-60 minutes

‚úì Using forced_bos_token_id: 256153 for sin_Sinh
‚úì Set model.generation_config.forced_bos_token_id = 256153
‚úì Set model.config.forced_bos_token_id = 256153
Starting training...


  super().__init__(loader)


Step,Training Loss
10,23.409413
20,23.23819


KeyboardInterrupt: 