In [None]:
import sys, os

# Put the parent of the current working directory on sys.path (at most once)
# so the `utils.training_utils` import that follows can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from utils.training_utils import evaluate_bleu, train_one_epoch


In [1]:

import os, sys, json
from pathlib import Path
import torch
import random
import numpy as np

# Allocator / tokenizer environment knobs.
# max_split_size_mb limits how large a cached block the CUDA caching allocator
# will split, which is meant to reduce fragmentation on small GPUs.
# NOTE(review): this is set after `import torch`; it must still be in place
# before the first CUDA allocation — confirm nothing above touches the GPU.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Reproducibility: seed every RNG this notebook imports. The value mirrors the
# `seed=42` passed to TrainConfig further down.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)  # seeds the CPU generator and the CUDA generators

# Throughput knobs. cudnn.benchmark lets cuDNN autotune kernels and may select
# different algorithms between runs, so it favours speed over bit-exact
# reproducibility; 'high' allows reduced-precision float32 matmuls.
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision('high')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

# Project root: prefer ~/Desktop/HindiToEnglishMT, otherwise fall back to the
# parent of the current working directory.
home = Path.home()
desktop = home / "Desktop"
base_dir = desktop / "HindiToEnglishMT"
if not base_dir.exists():
    base_dir = Path.cwd().resolve().parent
print("Base dir:", base_dir)

# Make the helper modules in <base_dir>/utils importable by bare name.
# Guarded so that re-running this cell does not pile up duplicate entries.
utils_dir = str(base_dir / "utils")
if utils_dir not in sys.path:
    sys.path.append(utils_dir)
from training_utils import TrainConfig, TokenBatchDataset, build_adaptive_dataloader, train_one_epoch, WarmupInverseSqrtScheduler
from model_utils import Seq2SeqTransformer
import sentencepiece as spm


Using device: cuda
GPU: NVIDIA GeForce RTX 3050 Laptop GPU
Base dir: C:\Users\ashwi\OneDrive\Desktop\HindiToEnglishMT


In [2]:

# Locations of the pre-tokenized training and validation splits
processed_dir = base_dir.joinpath("data", "processed")
train_path = processed_dir.joinpath("train_tokenized.jsonl")
val_path = processed_dir.joinpath("val_tokenized.jsonl")

def load_jsonl(path):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON document per line
            (anything accepted by ``open``).

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Whitespace-only lines (e.g. a stray blank line at the end of the
        # file) carry no record; skip them rather than crash in json.loads.
        return [json.loads(line) for line in f if line.strip()]

# Read both splits fully into memory and report their sizes
train_data, val_data = load_jsonl(train_path), load_jsonl(val_path)
print("Train examples:", len(train_data), "Val examples:", len(val_data))

# SentencePiece tokenizer plus the special-token ids and vocabulary size it reports
spm_model_path = base_dir / "models" / "vocab" / "hi_en_unigram.model"
sp = spm.SentencePieceProcessor(model_file=str(spm_model_path))
pad_id, bos_id, eos_id = sp.pad_id(), sp.bos_id(), sp.eos_id()
vocab_size = sp.vocab_size()
print("Vocab size:", vocab_size)


Train examples: 1603359 Val examples: 520
Vocab size: 32000


In [3]:

# Build datasets
def _split_columns(examples):
    """Return (list of "src" fields, list of "tgt" fields) for the given records."""
    sources = [record["src"] for record in examples]
    targets = [record["tgt"] for record in examples]
    return sources, targets


train_src, train_tgt = _split_columns(train_data)
val_src, val_tgt = _split_columns(val_data)

train_ds = TokenBatchDataset(train_src, train_tgt)
val_ds = TokenBatchDataset(val_src, val_tgt)

# Hyper-parameters for the from-scratch run.
# NOTE(review): the recorded run below hit "CUDA error: out of memory" on the
# RTX 3050 Laptop GPU with these values; lowering max_tokens_per_microbatch
# (and raising grad_accum_steps to compensate) may be needed — confirm.
cfg = TrainConfig(
    # model size
    d_model=384,
    num_heads=6,
    num_encoder_layers=6,
    num_decoder_layers=6,
    d_ff=1024,
    dropout=0.15,
    # optimisation schedule
    lr=3e-4,
    weight_decay=0.01,
    warmup_steps=2000,
    max_epochs=30,
    patience=4,
    label_smoothing=0.1,
    # sequence / batch limits
    max_src_len=128,
    max_tgt_len=128,
    max_tokens_per_microbatch=2048,
    grad_accum_steps=8,
    clip_grad_norm=1.0,
    # decoding and seeding
    beam_size=4,
    length_penalty=0.6,
    seed=42,
)
print(cfg)


TrainConfig(d_model=384, num_heads=6, num_encoder_layers=6, num_decoder_layers=6, d_ff=1024, dropout=0.15, lr=0.0003, weight_decay=0.01, warmup_steps=2000, max_epochs=30, patience=4, label_smoothing=0.1, max_src_len=128, max_tgt_len=128, max_tokens_per_microbatch=2048, grad_accum_steps=8, clip_grad_norm=1.0, beam_size=4, length_penalty=0.6, seed=42)


In [4]:

# Build model, optimizer, LR scheduler and AMP gradient scaler
from torch.optim import AdamW
# `torch.cuda.amp.GradScaler(...)` is deprecated (it emits a FutureWarning);
# the supported spelling is `torch.amp.GradScaler("cuda", ...)`.
from torch.amp import GradScaler

# CUDA-only features (fused optimizer kernels, loss scaling) are switched on
# only when the model actually lives on a GPU.
use_cuda = device.type == "cuda"

model = Seq2SeqTransformer(
    vocab_size=vocab_size,
    d_model=cfg.d_model,
    num_heads=cfg.num_heads,
    num_encoder_layers=cfg.num_encoder_layers,
    num_decoder_layers=cfg.num_decoder_layers,
    d_ff=cfg.d_ff,
    dropout=cfg.dropout,
    pad_id=pad_id,
).to(device)

optimizer = AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay, fused=use_cuda)
scheduler = WarmupInverseSqrtScheduler(optimizer, warmup_steps=cfg.warmup_steps)
# With enabled=False the scaler's scale/step/update calls become pass-throughs,
# so the same training code also runs on CPU.
scaler = GradScaler("cuda", enabled=use_cuda)


  scaler = GradScaler(enabled=(device.type=="cuda"))


In [5]:

# Pre-compute micro-batches for both splits (sized via cfg to avoid OOMs).
# Training batches are shuffled; the collate function returned with them is
# the one reused later for both training and validation.
train_batches, collate_fn = build_adaptive_dataloader(
    train_ds, pad_id, cfg, shuffle=True
)
# Validation is built with shuffling disabled; its collate function is discarded.
val_batches, _ = build_adaptive_dataloader(
    val_ds, pad_id, cfg, shuffle=False
)
print(
    "Train micro-batches:", len(train_batches),
    "Val micro-batches:", len(val_batches),
)


Train micro-batches: 30209 Val micro-batches: 13


In [9]:

# Training loop with early stopping based on validation BLEU (greedy for speed during val)
#
# `evaluate_bleu` is imported from the same top-level `training_utils` module
# that the setup cell already loaded (via <base_dir>/utils on sys.path).
# Importing it as `utils.training_utils` only worked after running a separate
# sys.path cell out of order, and loaded a second copy of the module.
from training_utils import evaluate_bleu
import copy  # no longer needed by this cell; kept in case later cells rely on the name

best_bleu = -1.0          # sentinel below any score the loop prints, so epoch 1 always saves
best_state = None         # CPU snapshot of the best weights seen so far
epochs_no_improve = 0     # consecutive epochs without a BLEU improvement

save_dir = base_dir / "models" / "from_scratch"
save_dir.mkdir(parents=True, exist_ok=True)
vocab_path = base_dir / "models" / "vocab" / "hi_en_unigram.model"

for epoch in range(1, cfg.max_epochs + 1):
    print(f"\n=== Epoch {epoch}/{cfg.max_epochs} ===")
    train_loss = train_one_epoch(model, optimizer, scheduler, scaler, train_batches, collate_fn, device, pad_id, cfg)
    print(f"Train loss: {train_loss:.4f}")

    # Validation BLEU
    bleu = evaluate_bleu(model, val_batches, collate_fn, device, pad_id, sp, bos_id, eos_id, cfg)
    print(f"Validation BLEU: {bleu:.2f}")

    if bleu > best_bleu:
        best_bleu = bleu
        # Snapshot the weights on the CPU. A deepcopy of the state dict would
        # keep a second full copy of the model in GPU memory, which is wasteful
        # on a card that is already running out of memory. CPU tensors also
        # make the checkpoint loadable on machines without CUDA.
        best_state = {
            name: tensor.detach().to("cpu", copy=True)
            for name, tensor in model.state_dict().items()
        }
        torch.save(best_state, save_dir / "best_model.pt")
        # Store the hyper-parameters and tokenizer location next to the weights.
        torch.save({"cfg": cfg.__dict__, "vocab_path": str(vocab_path)},
                   save_dir / "training_meta.pt")
        epochs_no_improve = 0
        print("Saved new best model.")
    else:
        epochs_no_improve += 1
        print(f"No improvement for {epochs_no_improve} epoch(s).")

    # Stop once BLEU has failed to improve for `patience` epochs in a row.
    if epochs_no_improve >= cfg.patience:
        print("Early stopping.")
        break

print("Best val BLEU:", best_bleu)



=== Epoch 1/30 ===


Train micro-batches:   0%|          | 0/30209 [00:00<?, ?it/s]

  with autocast():
Train micro-batches:   0%|          | 19/30209 [00:38<16:59:54,  2.03s/it]


AcceleratorError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [8]:
import sys, os

# Register the parent of the current working directory on the import path
# (only if it is not already listed) so the `utils` package can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Now you can import
from utils.training_utils import evaluate_bleu


In [None]:
import torch

def gpu_mem_report(device=0):
    """Print how much CUDA memory PyTorch currently holds on one device.

    Two figures are printed on a single line, each converted from bytes by
    dividing by 1024**2 and rounded to one decimal place: the value of
    ``torch.cuda.memory_allocated`` and the value of
    ``torch.cuda.memory_reserved``.

    Args:
        device: CUDA device to inspect, passed straight through to the
            ``torch.cuda`` memory queries. Defaults to 0, the device this
            helper originally reported on.

    Returns:
        None. The report is written to stdout.
    """
    bytes_per_mb = 1024 ** 2
    allocated = round(torch.cuda.memory_allocated(device) / bytes_per_mb, 1)
    reserved = round(torch.cuda.memory_reserved(device) / bytes_per_mb, 1)
    print(f"GPU memory allocated: {allocated} MB | reserved: {reserved} MB")
