In [None]:
import time, pandas as pd, numpy as np, random, torch
import torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import sentencepiece as spm, nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from sklearn.model_selection import train_test_split

#  1. SETUP
# Fetch the NLTK 'punkt' resource without printing download progress.
nltk.download('punkt', quiet=True)

# Use the GPU when one is visible, otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')

# Seed every random number generator this script touches with one value.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if has_cuda:
    torch.cuda.manual_seed(42)

#  2. LOAD & TOKENIZER
# Two tab-separated columns; the column names are supplied here via `names`.
# Malformed lines are skipped and rows with a missing side are dropped.
df = pd.read_csv('/kaggle/input/filtered/Filtered_data.tsv', sep='\t',
                 names=['asm','eng'], on_bad_lines='skip').dropna()
# 80/20 train/test split, then 10% of the training part becomes validation.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df  = train_test_split(train_df, test_size=0.1, random_state=42)

# Write both languages of the TRAINING split only into one corpus file, so the
# tokenizer learns a joint vocabulary without seeing validation/test text.
with open('all.txt','w',encoding='utf-8') as f:
    for t in pd.concat([train_df['asm'], train_df['eng']]):
        # str() keeps this in line with TranslationDataset (.astype(str)):
        # a cell that pandas parsed as a number has no .lower().
        f.write(str(t).lower().strip() + '\n')

# Joint BPE model; the special-piece ids are pinned so PAD/SOS/EOS/UNK below
# are 0/1/2/3.
spm.SentencePieceTrainer.Train(
    '--input=all.txt --model_prefix=spm --vocab_size=16000 '
    '--character_coverage=1.0 --model_type=bpe '
    '--pad_id=0 --pad_piece=<pad> --bos_id=1 --bos_piece=<s> '
    '--eos_id=2 --eos_piece=</s> --unk_id=3 --unk_piece=<unk>'
)
sp = spm.SentencePieceProcessor(); sp.Load('spm.model')
PAD,SOS,EOS,UNK = sp.pad_id(), sp.bos_id(), sp.eos_id(), sp.unk_id()
VOCAB_SIZE = sp.GetPieceSize()

#  3. DATASET & DATALOADERS 
class TranslationDataset(Dataset):
    """Sentence pairs from the 'asm'/'eng' columns, served as id tensors."""

    def __init__(self, df, max_len=100):
        # Cast to str so every stored sentence supports .lower() later on.
        self.src, self.trg = (df[col].astype(str).tolist() for col in ('asm', 'eng'))
        self.max = max_len

    def __len__(self):
        return len(self.src)

    def _encode(self, text):
        """Lower-case and tokenize `text`, truncate, then wrap in SOS/EOS."""
        pieces = sp.EncodeAsIds(text.lower())
        # Two slots are reserved for SOS and EOS, hence max - 2 content ids.
        return torch.LongTensor([SOS, *pieces[:self.max - 2], EOS])

    def __getitem__(self, i):
        """Return the (source, target) LongTensor pair for row `i`."""
        return self._encode(self.src[i]), self._encode(self.trg[i])

def collate_fn(batch):
    """Pad a list of (src, trg) tensor pairs into two batch-first tensors.

    Shorter sequences are right-padded with PAD up to the longest sequence
    on their own side of the batch.
    """
    src_seqs, trg_seqs = zip(*batch)
    padded = [pad_sequence(seqs, batch_first=True, padding_value=PAD)
              for seqs in (src_seqs, trg_seqs)]
    return tuple(padded)

BATCH_SIZE = 32
# Only the training split is shuffled; validation and test keep their order.
train_loader = DataLoader(
    TranslationDataset(train_df),
    batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn,
)
val_loader = DataLoader(
    TranslationDataset(val_df),
    batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn,
)
test_loader = DataLoader(
    TranslationDataset(test_df),
    batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn,
)

#  4. MODEL ARCH
EMB_DIM = 400
HID_DIM = 1024
ENC_LAYERS = 4
DEC_LAYERS = 4
DROPOUT = 0.1

class Encoder(nn.Module):
    """Bidirectional multi-layer GRU encoder.

    Constructor arguments, in order: vocabulary size, embedding width,
    GRU hidden width, number of GRU layers, dropout probability.
    """

    def __init__(self, vs, ed, hd, nl, drop):
        super().__init__()
        self.embedding = nn.Embedding(vs, ed, padding_idx=PAD)
        self.gru = nn.GRU(
            input_size=ed, hidden_size=hd, num_layers=nl,
            bidirectional=True, dropout=drop, batch_first=True,
        )
        # Maps the concatenated forward+backward state (2*hd) back to hd.
        self.fc = nn.Linear(2 * hd, hd)
        self.do = nn.Dropout(drop)
        self.nl = nl
        self.hd = hd

    def forward(self, src):
        """Return (GRU outputs for every position, one bridged state per layer)."""
        batch = src.size(0)
        outputs, final = self.gru(self.do(self.embedding(src)))
        # Unflatten the leading (layers * directions) axis into two axes.
        final = final.view(self.nl, 2, batch, self.hd)
        fwd, bwd = final[:, 0], final[:, 1]
        merged = torch.cat((fwd, bwd), dim=2)
        return outputs, torch.tanh(self.fc(merged))

class Attention(nn.Module):
    """Additive attention of the last decoder layer over the encoder outputs."""

    def __init__(self, hd):
        super().__init__()
        # Scorer input is [decoder state (hd) ; encoder output (2*hd)].
        self.attn = nn.Linear(3 * hd, hd)
        self.v = nn.Linear(hd, 1, bias=False)

    def forward(self, dec_h, enc_o, mask):
        """Return softmax weights over source positions, one row per batch item.

        Positions where `mask` equals 0 receive a very large negative score
        before the softmax, so their weight is effectively zero.
        """
        src_len = enc_o.size(1)
        # Repeat the last layer's state along the source-length axis.
        query = dec_h[-1].unsqueeze(1).expand(-1, src_len, -1)
        energy = torch.tanh(self.attn(torch.cat((query, enc_o), dim=2)))
        scores = self.v(energy).squeeze(2)
        scores = scores.masked_fill(mask == 0, -1e10)
        return torch.softmax(scores, dim=1)

class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Constructor arguments, in order: vocabulary size, embedding width,
    encoder hidden width, decoder hidden width, number of GRU layers,
    dropout probability, attention module.
    """

    def __init__(self, vs, ed, eh, dh, nl, drop, attn):
        super().__init__()
        self.emb = nn.Embedding(vs, ed, padding_idx=PAD)
        self.attn = attn
        enc_width = 2 * eh  # width of one encoder output vector
        self.gru = nn.GRU(ed + enc_width, dh, nl, dropout=drop, batch_first=True)
        self.fc = nn.Linear(dh + enc_width + ed, vs)
        self.do = nn.Dropout(drop)

    def forward(self, in_tok, h, enc_o, mask):
        """Consume one token per batch item; return (vocab logits, new state)."""
        token_emb = self.do(self.emb(in_tok).unsqueeze(1))
        weights = self.attn(h, enc_o, mask).unsqueeze(1)
        # Attention-weighted sum of the encoder outputs.
        context = torch.bmm(weights, enc_o)
        step_out, new_h = self.gru(torch.cat([token_emb, context], dim=2), h)
        features = torch.cat(
            [step_out.squeeze(1), context.squeeze(1), token_emb.squeeze(1)], 1
        )
        return self.fc(features), new_h

class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that unrolls the decoder over the target."""

    def __init__(self, e, d):
        super().__init__()
        self.enc = e
        self.dec = d

    def forward(self, src, trg, tf):
        """Return logits of shape (batch, trg_len, vocab).

        `tf` is the per-step probability of feeding the gold token instead of
        the model's own argmax. Row 0 of the time axis is left at zero
        because decoding starts from the token at position 0.
        """
        batch, steps = trg.size()
        vocab = self.dec.emb.num_embeddings
        logits = torch.zeros(batch, steps, vocab, device=src.device)
        enc_o, h = self.enc(src)
        mask = src != PAD
        inp = trg[:, 0]
        for t in range(1, steps):
            pred, h = self.dec(inp, h, enc_o, mask)
            logits[:, t] = pred
            # One coin flip per time step, shared by the whole batch.
            use_gold = random.random() < tf
            inp = trg[:, t] if use_gold else pred.argmax(1)
        return logits

# share embeddings
enc = Encoder(VOCAB_SIZE, EMB_DIM, HID_DIM, ENC_LAYERS, DROPOUT)
attention = Attention(HID_DIM)
dec = Decoder(VOCAB_SIZE, EMB_DIM, HID_DIM, HID_DIM, DEC_LAYERS, DROPOUT, attention)
# Both sides use the same SentencePiece vocabulary, so one table serves both.
dec.emb.weight = enc.embedding.weight
model = Seq2Seq(enc, dec).to(device)

#  5. TRAINING SETUP
NUM_EPOCHS = 30
PATIENCE = 10
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss(ignore_index=PAD, label_smoothing=0.1)
from torch.optim.lr_scheduler import OneCycleLR
# The scheduler is stepped once per batch, so it needs epochs * batches steps.
total_steps = NUM_EPOCHS * len(train_loader)
scheduler = OneCycleLR(
    optimizer,
    max_lr=5e-4,
    total_steps=total_steps,
    pct_start=0.2,
    anneal_strategy='cos',
)
smooth = SmoothingFunction().method1

#  6. DECODING & EVALUATION HELPERS
def beam_search(src, k=5, max_len=50):
    """Decode one source sentence with beam search.

    Args:
        src: Source id tensor holding a single sentence (callers pass a
            one-row slice such as ``src[j:j+1]``).
        k: Beam width. Clamped to the vocabulary size when expanding.
        max_len: Maximum number of decoding steps.

    Returns:
        The token ids of the highest-scoring hypothesis as a list, without
        the leading SOS and without a trailing EOS.

    Puts the global ``model`` into eval mode as a side effect. Hypotheses are
    ranked by raw summed log-probability (no length normalisation).
    """
    model.eval()
    with torch.no_grad():
        enc_o, h = model.enc(src)
        mask = (src != PAD)
        # Each beam entry is (token ids so far, summed log-prob, decoder state).
        beams = [([SOS], 0.0, h)]
        for _ in range(max_len):
            candidates = []
            for seq, score, state in beams:
                if seq[-1] == EOS:
                    # Finished hypotheses are carried forward unchanged.
                    candidates.append((seq, score, state))
                    continue
                inp = torch.tensor([seq[-1]], device=src.device)
                out, new_state = model.dec(inp, state, enc_o, mask)
                log_probs = torch.log_softmax(out, 1).squeeze(0)
                # topk raises if asked for more entries than exist.
                width = min(k, log_probs.size(-1))
                top_lp, top_id = log_probs.topk(width)
                for lp, tok in zip(top_lp.tolist(), top_id.tolist()):
                    candidates.append((seq + [tok], score + lp, new_state))
            beams = sorted(candidates, key=lambda b: -b[1])[:k]
            if all(s[-1] == EOS for s, _, _ in beams):
                break
        best = beams[0][0][1:]  # drop the leading SOS
        # Strip EOS only when it is really there: a hypothesis that ran into
        # max_len ends in an ordinary token, which must be kept.
        if best and best[-1] == EOS:
            best = best[:-1]
        return best

def compute_val_loss():
    """Return the mean per-batch criterion value over ``val_loader``.

    The model is called with a teacher-forcing ratio of 0.0, so the decoder
    is fed its own argmax predictions at every step.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for src, trg in val_loader:
            src = src.to(device)
            trg = trg.to(device)
            logits = model(src, trg, 0.0)
            # Time step 0 holds no prediction; compare steps 1..T-1 only.
            flat_logits = logits[:, 1:, :].reshape(-1, VOCAB_SIZE)
            flat_gold = trg[:, 1:].reshape(-1)
            batch_losses.append(criterion(flat_logits, flat_gold).item())
    return sum(batch_losses) / len(val_loader)

def calc_bleu(quick=True):
    """Return smoothed corpus BLEU (0-100) on a small slice of ``val_loader``.

    Only the first few rows of the first few batches are decoded with beam
    search: 2 batches x 3 rows when `quick`, otherwise 10 batches x 5 rows.
    """
    max_batches, samples_per = (2, 3) if quick else (10, 5)
    refs, hyps = [], []
    model.eval()
    with torch.no_grad():
        for batch_no, (src, trg) in enumerate(val_loader):
            if batch_no >= max_batches:
                break
            src, trg = src.to(device), trg.to(device)
            n_rows = min(src.size(0), samples_per)
            for row in range(n_rows):
                pred_ids = beam_search(src[row:row+1], k=5, max_len=trg.size(1))
                hyps.append(sp.DecodeIds(pred_ids).split())
                ref_ids = trg[row].tolist()
                if EOS in ref_ids:
                    # Keep what lies between the leading SOS and the first EOS.
                    ref_ids = ref_ids[1:ref_ids.index(EOS)]
                refs.append([sp.DecodeIds(ref_ids).split()])
    score = corpus_bleu(refs, hyps, smoothing_function=smooth)
    return score * 100

#  6b. Teacher‑forcing schedule 
def get_tf_ratio(ep):
    """Teacher-forcing probability for epoch `ep`.

    Stays at 1.0 through epoch 5, then falls linearly towards 0 at
    NUM_EPOCHS, with a floor of 0.1.
    """
    warmup = 5
    if ep > warmup:
        decayed = 1.0 - (ep - warmup) / (NUM_EPOCHS - warmup)
        return max(0.1, decayed)
    return 1.0

#  7. TRAINING LOOP w combined early stop
# An epoch counts as an improvement when EITHER validation BLEU rises or
# validation loss falls; each metric keeps its own checkpoint file.
print("Training started—see epoch markers below.")
best_bleu = 0.0
best_val_loss = float('inf')
no_imp = 0

for ep in range(1, NUM_EPOCHS + 1):
    start = time.time()
    print(f"\n>>> Epoch {ep}/{NUM_EPOCHS}", flush=True)

    # --- TRAINING PASS ---
    model.train()
    tf_ratio = get_tf_ratio(ep)
    train_loss = 0.0
    for b, (src, trg) in enumerate(train_loader, 1):
        src = src.to(device)
        trg = trg.to(device)
        optimizer.zero_grad()
        logits = model(src, trg, tf_ratio)
        # Time step 0 holds no prediction; score steps 1..T-1 only.
        loss = criterion(logits[:, 1:, :].reshape(-1, VOCAB_SIZE),
                         trg[:, 1:].reshape(-1))
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # the LR schedule advances once per batch
        train_loss += loss.item()
        if b % 20 == 0:
            # Progress dot every 20 batches.
            print(".", end="", flush=True)

    # --- VALIDATION METRICS ---
    val_bleu = calc_bleu(quick=True)
    val_loss = compute_val_loss()
    epoch_time = (time.time() - start) / 60
    avg_train = train_loss / len(train_loader)
    lr = optimizer.param_groups[0]['lr']

    summary = (f"\nEpoch {ep} done in {epoch_time:.1f}m | "
               f"TrainLoss {avg_train:.3f} | ValLoss {val_loss:.3f} | "
               f"BLEU {val_bleu:.2f} | TF {tf_ratio:.2f} | LR {lr:.5f}")
    print(summary)

    # --- CHECKPOINTING ---
    # Small margins keep float noise from counting as progress.
    bleu_improved = val_bleu > best_bleu + 1e-4
    loss_improved = val_loss < best_val_loss - 1e-3
    improved = bleu_improved or loss_improved

    if bleu_improved:
        best_bleu = val_bleu
        torch.save(model.state_dict(), 'best_beam_bleu.pt')
        print("  → New best BLEU saved")

    if loss_improved:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_beam_loss.pt')
        print("  → New best ValLoss saved")

    # --- EARLY STOPPING ---
    if improved:
        no_imp = 0
        continue

    no_imp += 1
    print(f"  No improvement for {no_imp}/{PATIENCE} epochs")
    if no_imp >= PATIENCE:
        print("Early stopping")
        break

Training started—see epoch markers below.

>>> Epoch 1/30
........
Epoch 1 done in 3.2m | TrainLoss 7.781 | ValLoss 7.229 | BLEU 0.37 | TF 1.00 | LR 0.00005
  → New best BLEU saved
  → New best ValLoss saved

>>> Epoch 2/30
........
Epoch 2 done in 3.3m | TrainLoss 6.977 | ValLoss 7.437 | BLEU 0.61 | TF 1.00 | LR 0.00014
  → New best BLEU saved

>>> Epoch 3/30
........
Epoch 3 done in 3.3m | TrainLoss 6.545 | ValLoss 7.891 | BLEU 0.28 | TF 1.00 | LR 0.00026
  No improvement for 1/10 epochs

>>> Epoch 4/30
........
Epoch 4 done in 3.3m | TrainLoss 5.923 | ValLoss 8.068 | BLEU 0.44 | TF 1.00 | LR 0.00038
  No improvement for 2/10 epochs

>>> Epoch 5/30
........
Epoch 5 done in 3.2m | TrainLoss 5.258 | ValLoss 8.356 | BLEU 4.05 | TF 1.00 | LR 0.00047
  → New best BLEU saved

>>> Epoch 6/30
........
Epoch 6 done in 3.2m | TrainLoss 4.594 | ValLoss 8.182 | BLEU 1.03 | TF 0.96 | LR 0.00050
  No improvement for 1/10 epochs

>>> Epoch 7/30
........
Epoch 7 done in 3.3m | TrainLoss 3.937 | ValL

In [None]:
#  8. FULL EVAL AFTER TRAINING
print("\nFinal evaluation with full BLEU on validation and test:")

# The training loop writes 'best_beam_bleu.pt' and 'best_beam_loss.pt'; it
# never writes 'best_beam.pt'. Loading that name picks up a leftover file from
# an earlier run with a different architecture and fails with state_dict size
# mismatches. The best-BLEU weights are restored BEFORE any metric is
# computed, so validation and test BLEU describe the same model.
model.load_state_dict(torch.load('best_beam_bleu.pt', map_location=device))

val_full_bleu = calc_bleu(quick=False)
print(f"Validation BLEU (full): {val_full_bleu:.2f}")

# Test BLEU over the first 5 rows of the first 10 test batches.
refs,hyps=[],[]
with torch.no_grad():
    for i,(src,trg) in enumerate(test_loader):
        if i >= 10: break
        src,trg=src.to(device),trg.to(device)
        for j in range(min(src.size(0),5)):
            ps = beam_search(src[j:j+1],k=5,max_len=trg.size(1))
            hyps.append(sp.DecodeIds(ps).split())
            r = trg[j].tolist()
            # Keep what lies between the leading SOS and the first EOS.
            if EOS in r: r = r[1:r.index(EOS)]
            refs.append([sp.DecodeIds(r).split()])
test_bleu = corpus_bleu(refs,hyps,smoothing_function=smooth) * 100
print(f"Test BLEU: {test_bleu:.2f}")

print("\nSample translations:")
for _ in range(3):
    idx = random.randrange(len(test_df))
    s = test_df.iloc[idx]['asm']; r = test_df.iloc[idx]['eng']
    # str() mirrors TranslationDataset's .astype(str): a raw dataframe cell
    # is not guaranteed to be a string.
    ids = [SOS] + sp.EncodeAsIds(str(s).lower()) + [EOS]
    ps = beam_search(torch.tensor([ids],device=device),k=5,max_len=len(ids))
    print("SRC:", s)
    print("REF:", r)
    print("PRED:", sp.DecodeIds(ps))
    print("-"*50)

print("Training complete!")


Final evaluation with full BLEU on validation and test:
Validation BLEU (full): 3.40


RuntimeError: Error(s) in loading state_dict for Seq2Seq:
	Missing key(s) in state_dict: "enc.gru.weight_ih_l3", "enc.gru.weight_hh_l3", "enc.gru.bias_ih_l3", "enc.gru.bias_hh_l3", "enc.gru.weight_ih_l3_reverse", "enc.gru.weight_hh_l3_reverse", "enc.gru.bias_ih_l3_reverse", "enc.gru.bias_hh_l3_reverse", "dec.gru.weight_ih_l3", "dec.gru.weight_hh_l3", "dec.gru.bias_ih_l3", "dec.gru.bias_hh_l3". 
	size mismatch for enc.embedding.weight: copying a param with shape torch.Size([16000, 300]) from checkpoint, the shape in current model is torch.Size([16000, 400]).
	size mismatch for enc.gru.weight_ih_l0: copying a param with shape torch.Size([2304, 300]) from checkpoint, the shape in current model is torch.Size([3072, 400]).
	size mismatch for enc.gru.weight_hh_l0: copying a param with shape torch.Size([2304, 768]) from checkpoint, the shape in current model is torch.Size([3072, 1024]).
	size mismatch for enc.gru.bias_ih_l0: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for enc.gru.bias_hh_l0: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for enc.gru.weight_ih_l0_reverse: copying a param with shape torch.Size([2304, 300]) from checkpoint, the shape in current model is torch.Size([3072, 400]).
	size mismatch for enc.gru.weight_hh_l0_reverse: copying a param with shape torch.Size([2304, 768]) from checkpoint, the shape in current model is torch.Size([3072, 1024]).
	size mismatch for enc.gru.bias_ih_l0_reverse: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for enc.gru.bias_hh_l0_reverse: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for enc.gru.weight_ih_l1: copying a param with shape torch.Size([2304, 1536]) from checkpoint, the shape in current model is torch.Size([3072, 2048]).
	size mismatch for enc.gru.weight_hh_l1: copying a param with shape torch.Size([2304, 768]) from checkpoint, the shape in current model is torch.Size([3072, 1024]).
	size mismatch for enc.gru.bias_ih_l1: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for enc.gru.bias_hh_l1: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for enc.gru.weight_ih_l1_reverse: copying a param with shape torch.Size([2304, 1536]) from checkpoint, the shape in current model is torch.Size([3072, 2048]).
	size mismatch for enc.gru.weight_hh_l1_reverse: copying a param with shape torch.Size([2304, 768]) from checkpoint, the shape in current model is torch.Size([3072, 1024]).
	size mismatch for enc.gru.bias_ih_l1_reverse: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for enc.gru.bias_hh_l1_reverse: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for enc.gru.weight_ih_l2: copying a param with shape torch.Size([2304, 1536]) from checkpoint, the shape in current model is torch.Size([3072, 2048]).
	size mismatch for enc.gru.weight_hh_l2: copying a param with shape torch.Size([2304, 768]) from checkpoint, the shape in current model is torch.Size([3072, 1024]).
	size mismatch for enc.gru.bias_ih_l2: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for enc.gru.bias_hh_l2: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for enc.gru.weight_ih_l2_reverse: copying a param with shape torch.Size([2304, 1536]) from checkpoint, the shape in current model is torch.Size([3072, 2048]).
	size mismatch for enc.gru.weight_hh_l2_reverse: copying a param with shape torch.Size([2304, 768]) from checkpoint, the shape in current model is torch.Size([3072, 1024]).
	size mismatch for enc.gru.bias_ih_l2_reverse: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for enc.gru.bias_hh_l2_reverse: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for enc.fc.weight: copying a param with shape torch.Size([768, 1536]) from checkpoint, the shape in current model is torch.Size([1024, 2048]).
	size mismatch for enc.fc.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([1024]).
	size mismatch for dec.emb.weight: copying a param with shape torch.Size([16000, 300]) from checkpoint, the shape in current model is torch.Size([16000, 400]).
	size mismatch for dec.attn.attn.weight: copying a param with shape torch.Size([768, 2304]) from checkpoint, the shape in current model is torch.Size([1024, 3072]).
	size mismatch for dec.attn.attn.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([1024]).
	size mismatch for dec.attn.v.weight: copying a param with shape torch.Size([1, 768]) from checkpoint, the shape in current model is torch.Size([1, 1024]).
	size mismatch for dec.gru.weight_ih_l0: copying a param with shape torch.Size([2304, 1836]) from checkpoint, the shape in current model is torch.Size([3072, 2448]).
	size mismatch for dec.gru.weight_hh_l0: copying a param with shape torch.Size([2304, 768]) from checkpoint, the shape in current model is torch.Size([3072, 1024]).
	size mismatch for dec.gru.bias_ih_l0: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for dec.gru.bias_hh_l0: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for dec.gru.weight_ih_l1: copying a param with shape torch.Size([2304, 768]) from checkpoint, the shape in current model is torch.Size([3072, 1024]).
	size mismatch for dec.gru.weight_hh_l1: copying a param with shape torch.Size([2304, 768]) from checkpoint, the shape in current model is torch.Size([3072, 1024]).
	size mismatch for dec.gru.bias_ih_l1: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for dec.gru.bias_hh_l1: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for dec.gru.weight_ih_l2: copying a param with shape torch.Size([2304, 768]) from checkpoint, the shape in current model is torch.Size([3072, 1024]).
	size mismatch for dec.gru.weight_hh_l2: copying a param with shape torch.Size([2304, 768]) from checkpoint, the shape in current model is torch.Size([3072, 1024]).
	size mismatch for dec.gru.bias_ih_l2: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for dec.gru.bias_hh_l2: copying a param with shape torch.Size([2304]) from checkpoint, the shape in current model is torch.Size([3072]).
	size mismatch for dec.fc.weight: copying a param with shape torch.Size([16000, 2604]) from checkpoint, the shape in current model is torch.Size([16000, 3472]).

In [None]:
# ==============================================================================
# === 1. IMPORTS AND SETUP ===
# ==============================================================================
import pandas as pd
import numpy as np
import random
import time
import sentencepiece as spm
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# Pick the compute device and pin every RNG so runs are repeatable
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print(f"Using device: {device}")

SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)
    # Give up cuDNN autotuning in exchange for deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# ==============================================================================
# === 2. DATA LOADING & TOKENIZER TUNING ===
# ==============================================================================
print("\n--- Loading Data and Training Tokenizer ---")
# Two tab-separated columns, named here; malformed lines are skipped.
df = pd.read_csv(
    '/kaggle/input/filtered/Filtered_data.tsv',
    sep='\t',
    on_bad_lines='skip',
    names=["Assamese sentence", "English sentence"],
)
df.dropna(inplace=True)

# 80/20 train/test, then 10% of the training part is held out for validation.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=SEED)

# Report the size of each split
print(f"Data split: {len(train_df)} training, {len(val_df)} validation, {len(test_df)} test pairs.")

# Both languages of the training split go into one file -> joint vocabulary
bpe_corpus = pd.concat([train_df['Assamese sentence'], train_df['English sentence']])
with open('all_text_for_bpe.txt', 'w', encoding='utf-8') as f:
    f.writelines(str(text).strip().lower() + '\n' for text in bpe_corpus)

# Shared BPE model; special-piece ids are pinned to 0..3
spm.SentencePieceTrainer.Train(
    '--input=all_text_for_bpe.txt --model_prefix=spm_bpe --vocab_size=8000 '
    '--character_coverage=1.0 --model_type=bpe --pad_id=0 --pad_piece=<pad> '
    '--bos_id=1 --bos_piece=<s> --eos_id=2 --eos_piece=</s> --unk_id=3 --unk_piece=<unk>'
)

# Load the tokenizer that was just trained
sp = spm.SentencePieceProcessor()
sp.Load('spm_bpe.model')

# Special token indices, read back from the model
PAD_IDX = sp.pad_id()
SOS_IDX = sp.bos_id()
EOS_IDX = sp.eos_id()
VOCAB_SIZE = sp.GetPieceSize()
print(f"Joint Vocabulary Size: {VOCAB_SIZE}")

# Peek at the first learned pieces (ids 0-3 are the special pieces)
print("\n--- Sample Vocabulary Tokens ---")
sample_tokens = list(map(sp.IdToPiece, range(4, 25)))
print(f"Sample tokens: {sample_tokens}")


# ==============================================================================
# === 3. DATASET AND DATALOADERS ===
# ==============================================================================
class TranslationDataset(Dataset):
    """Assamese/English sentence pairs served as SentencePiece id tensors."""

    def __init__(self, df, sp_model, max_len=100):
        self.sp_model = sp_model
        self.max_len = max_len
        # Cast to str so every stored sentence supports .lower()/.strip().
        self.src_sents = df['Assamese sentence'].astype(str).tolist()
        self.trg_sents = df['English sentence'].astype(str).tolist()

    def __len__(self):
        return len(self.src_sents)

    def _to_tensor(self, sentence):
        """Normalise, tokenize and truncate `sentence`, then add SOS/EOS."""
        ids = self.sp_model.EncodeAsIds(sentence.lower().strip())
        # Two slots are reserved for SOS and EOS.
        return torch.LongTensor([SOS_IDX, *ids[:self.max_len - 2], EOS_IDX])

    def __getitem__(self, idx):
        """Return the (source, target) LongTensor pair for row `idx`."""
        src_tensor = self._to_tensor(self.src_sents[idx])
        trg_tensor = self._to_tensor(self.trg_sents[idx])
        return src_tensor, trg_tensor

def collate_fn(batch):
    """Right-pad the sources and targets of a batch with PAD_IDX.

    Returns a (src, trg) pair of batch-first tensors, each padded to the
    longest sequence on its own side.
    """
    def _pad(seqs):
        return pad_sequence(seqs, batch_first=True, padding_value=PAD_IDX)

    src_seqs, trg_seqs = zip(*batch)
    return _pad(src_seqs), _pad(trg_seqs)

BATCH_SIZE = 64
# Only the training split is shuffled; validation and test keep their order.
train_loader = DataLoader(
    TranslationDataset(train_df, sp),
    batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn,
)
val_loader = DataLoader(
    TranslationDataset(val_df, sp),
    batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn,
)
test_loader = DataLoader(
    TranslationDataset(test_df, sp),
    batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn,
)
print(f"\nDataLoaders created with batch size {BATCH_SIZE}.")

# ==============================================================================
# === 4. MODEL ARCHITECTURE ===
# ==============================================================================
class Encoder(nn.Module):
    """Bidirectional multi-layer GRU encoder.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_dim: Embedding width.
        hid_dim: GRU hidden width per direction.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability (on embeddings, and between GRU layers
            when there is more than one layer).
    """

    def __init__(self, vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=PAD_IDX)
        self.rnn = nn.GRU(emb_dim, hid_dim, num_layers=n_layers,
                          bidirectional=True, dropout=dropout if n_layers > 1 else 0, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        # Maps the concatenated forward+backward state back to hid_dim.
        self.fc_hidden = nn.Linear(hid_dim * 2, hid_dim)

    def forward(self, src):
        """Return (GRU outputs for every position, one bridged state per layer)."""
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        s = hidden.shape
        # Unflatten the leading (layers * directions) axis into two axes.
        hidden = hidden.view(self.rnn.num_layers, 2, s[1], s[2])
        hidden_cat = torch.cat((hidden[:, 0, :, :], hidden[:, 1, :, :]), dim=2)
        decoder_hidden = torch.tanh(self.fc_hidden(hidden_cat))
        return outputs, decoder_hidden

class Attention(nn.Module):
    """Additive attention of the last decoder layer over the encoder outputs."""

    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear((hid_dim * 2) + hid_dim, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs, mask=None):
        """Return softmax weights over source positions, one row per batch item.

        Args:
            hidden: Decoder state; only the last layer (``hidden[-1]``) is used.
            encoder_outputs: Encoder outputs, batch-first.
            mask: Optional (batch, src_len) tensor that is non-zero/True at
                real tokens and zero/False at padding. Masked positions get
                effectively zero weight. With ``None`` every position takes
                part in the softmax.
        """
        top_hidden = hidden[-1]
        src_len = encoder_outputs.shape[1]
        # expand() broadcasts the state along the source axis without copying.
        top_hidden = top_hidden.unsqueeze(1).expand(-1, src_len, -1)
        energy = torch.tanh(self.attn(torch.cat((top_hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        if mask is not None:
            # Without this, padded source positions in a batch would receive
            # attention mass and leak into the context vector.
            attention = attention.masked_fill(mask == 0, torch.finfo(attention.dtype).min)
        return torch.softmax(attention, dim=1)

class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs."""

    def __init__(self, vocab_size, emb_dim, hid_dim, n_layers, dropout, attention):
        super().__init__()
        self.attention = attention
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=PAD_IDX)
        self.rnn = nn.GRU((hid_dim * 2) + emb_dim, hid_dim, num_layers=n_layers,
                          dropout=dropout if n_layers > 1 else 0, batch_first=True)
        self.fc_out = nn.Linear((hid_dim * 2) + hid_dim + emb_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs, mask=None):
        """Consume one token per batch item; return (vocab logits, new state).

        `mask` is forwarded to the attention module; leaving it as ``None``
        attends over every source position (fine for a single unpadded
        sentence).
        """
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden, encoder_outputs, mask).unsqueeze(1)
        # Attention-weighted sum of the encoder outputs.
        context = torch.bmm(a, encoder_outputs)
        rnn_input = torch.cat((embedded, context), dim=2)
        output, new_hidden = self.rnn(rnn_input, hidden)
        embedded, output, context = embedded.squeeze(1), output.squeeze(1), context.squeeze(1)
        prediction = self.fc_out(torch.cat((output, context, embedded), dim=1))
        return prediction, new_hidden

class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that unrolls the decoder over the target."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder, self.decoder, self.device = encoder, decoder, device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Return logits of shape (batch, trg_len, vocab).

        `teacher_forcing_ratio` is the per-step probability of feeding the
        gold token instead of the model's own argmax. Row 0 of the time axis
        stays at zero because decoding starts from the token at position 0.
        """
        batch_size, trg_len = trg.shape
        trg_vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size, device=self.device)
        encoder_outputs, hidden = self.encoder(src)
        # Mark real source tokens so attention ignores padding. The pad id is
        # read from the encoder's embedding, which was built with it.
        pad_idx = self.encoder.embedding.padding_idx
        mask = (src != pad_idx) if pad_idx is not None else None
        input = trg[:, 0]
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs, mask)
            outputs[:, t] = output
            # One coin flip per time step, shared by the whole batch.
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1
        return outputs

# ==============================================================================
# === 5. TRAINING & EVALUATION FUNCTIONS ===
# ==============================================================================
def train_epoch(model, dataloader, optimizer, criterion, clip):
    """Run one optimisation pass over `dataloader`; return the mean batch loss.

    The model is called with its default teacher-forcing ratio and gradients
    are clipped to a total norm of `clip` before every optimizer step.
    """
    model.train()
    running = 0
    for src, trg in dataloader:
        src = src.to(device)
        trg = trg.to(device)
        optimizer.zero_grad()
        logits = model(src, trg)
        vocab = logits.shape[-1]
        # Time step 0 holds no prediction; score steps 1..T-1 only.
        loss = criterion(logits[:, 1:].reshape(-1, vocab),
                         trg[:, 1:].reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running += loss.item()
    return running / len(dataloader)

def evaluate(model, dataloader, criterion):
    """Return (mean batch loss, corpus BLEU in 0-100) over `dataloader`.

    The model is run with teacher forcing disabled; hypotheses are the
    per-step argmax of its logits, cut at the first EOS.
    """
    def _until_eos(ids):
        # Everything before the first EOS, or the whole list if there is none.
        return ids[:ids.index(EOS_IDX)] if EOS_IDX in ids else ids

    model.eval()
    total_loss = 0
    references, hypotheses = [], []
    smoothing = SmoothingFunction().method1
    with torch.no_grad():
        for src, trg in dataloader:
            src, trg = src.to(device), trg.to(device)
            logits = model(src, trg, 0)  # no teacher forcing
            vocab = logits.shape[-1]
            total_loss += criterion(logits[:, 1:].reshape(-1, vocab),
                                    trg[:, 1:].reshape(-1)).item()
            greedy = logits.argmax(2)
            # Skip time step 0 on both sides (the SOS slot).
            for hyp_row, ref_row in zip(greedy[:, 1:].tolist(), trg[:, 1:].tolist()):
                hypotheses.append(sp.decode_ids(_until_eos(hyp_row)).split())
                references.append([sp.decode_ids(_until_eos(ref_row)).split()])
    bleu = corpus_bleu(references, hypotheses, smoothing_function=smoothing)
    return total_loss / len(dataloader), bleu * 100

# ==============================================================================
# === 6. TUNED HYPERPARAMETERS, INSTANTIATION & TRAINING ===
# ==============================================================================
print("\n--- Initializing Tuned Model and Training ---")
# Hyperparameters for this small corpus
EMB_DIM = 256
HID_DIM = 512
ENC_LAYERS = 2
DEC_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
CLIP = 1.0
NUM_EPOCHS = 50
PATIENCE = 7

# Build the model
attn = Attention(HID_DIM)
enc = Encoder(VOCAB_SIZE, EMB_DIM, HID_DIM, ENC_LAYERS, ENC_DROPOUT)
dec = Decoder(VOCAB_SIZE, EMB_DIM, HID_DIM, DEC_LAYERS, DEC_DROPOUT, attn)
model = Seq2Seq(enc, dec, device).to(device)
print(f"Model has {sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters.")

# Optimizer, loss and scheduler
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=0.1)
# mode='max' because the monitored quantity is validation BLEU.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=2, verbose=True,
)

# Training loop with early stopping on validation BLEU
best_bleu = -1.0
epochs_no_improve = 0
for epoch in range(1, NUM_EPOCHS + 1):
    start_time = time.time()
    train_loss = train_epoch(model, train_loader, optimizer, criterion, CLIP)
    valid_loss, valid_bleu = evaluate(model, val_loader, criterion)
    end_time = time.time()

    scheduler.step(valid_bleu)
    improved = valid_bleu > best_bleu
    if not improved:
        epochs_no_improve += 1
    else:
        # Any strict increase resets the patience counter and saves weights.
        best_bleu = valid_bleu
        epochs_no_improve = 0
        torch.save(model.state_dict(), 'best-model.pt')
        print(f" New best BLEU score: {valid_bleu:.2f}. Model saved.")

    print(f'Epoch: {epoch:02} | Time: {end_time - start_time:.0f}s | Train Loss: {train_loss:.3f} | '
          f'Val. Loss: {valid_loss:.3f} | Val. BLEU: {valid_bleu:.2f} | Patience: {epochs_no_improve}/{PATIENCE}')

    if epochs_no_improve >= PATIENCE:
        print('Early stopping triggered!')
        break
print(f"\nTraining finished. Best validation BLEU: {best_bleu:.2f}")

# ==============================================================================
# === 7. INFERENCE AND FINAL TESTING ===
# ==============================================================================
print("\n--- Loading Best Model and Testing Translations ---")
# Restore the checkpoint with the best validation BLEU
best_state = torch.load('best-model.pt')
model.load_state_dict(best_state)

def translate_sentence(sentence, model):
    """Greedily translate one raw sentence and return the decoded string.

    Decoding stops at the first EOS or after 100 steps, whichever comes
    first. Puts `model` into eval mode as a side effect.
    """
    model.eval()
    src_ids = [SOS_IDX, *sp.encode_as_ids(sentence.lower().strip()), EOS_IDX]
    src_tensor = torch.LongTensor(src_ids).unsqueeze(0).to(device)
    decoded = [SOS_IDX]
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor)
        for _ in range(100):
            # Feed only the most recent token; the state carries the history.
            step_in = torch.LongTensor([decoded[-1]]).to(device)
            logits, hidden = model.decoder(step_in, hidden, encoder_outputs)
            next_id = logits.argmax(1).item()
            decoded.append(next_id)
            if next_id == EOS_IDX:
                break
    return sp.decode(decoded).strip()

# Spot-check a few hand-picked Assamese sentences
sample_sentences = [
    "তেওঁ আজি বিদ্যালয়লৈ গ'ল।",
    "বইখন টেবুলৰ ওপৰত আছে।",
    "মই তোমাক ভাল পাওঁ।",
]
for sentence in sample_sentences:
    translation = translate_sentence(sentence, model)
    print(f"Source:      {sentence}")
    print(f"Translation: {translation}")
    print("-" * 20)

# Score the restored model on the held-out test split
test_metrics = evaluate(model, test_loader, criterion)
test_loss, test_bleu = test_metrics
print(f'\n Final Test BLEU on unseen data: {test_bleu:.2f} ')

Using device: cuda

--- Loading Data and Training Tokenizer ---
Data split: 5536 training, 616 validation, 1538 test pairs.
Joint Vocabulary Size: 8000

--- Sample Vocabulary Tokens ---
Sample tokens: ['▁t', 'he', '▁a', 'in', '▁the', '▁ক', 'য়', 'াৰ', '▁ব', '▁প', '▁s', '▁o', '▁স', 're', '্ৰ', '▁b', 'er', 'ha', '▁c', 'en', 'on']

DataLoaders created with batch size 64.

--- Initializing Tuned Model and Training ---
Model has 31,173,952 trainable parameters.




⭐️ New best BLEU score: 0.01. Model saved.
Epoch: 01 | Time: 69s | Train Loss: 7.517 | Val. Loss: 7.004 | Val. BLEU: 0.01 | Patience: 0/7
⭐️ New best BLEU score: 0.01. Model saved.
Epoch: 02 | Time: 69s | Train Loss: 6.989 | Val. Loss: 6.977 | Val. BLEU: 0.01 | Patience: 0/7
⭐️ New best BLEU score: 0.01. Model saved.
Epoch: 03 | Time: 69s | Train Loss: 6.932 | Val. Loss: 6.962 | Val. BLEU: 0.01 | Patience: 0/7
Epoch: 04 | Time: 68s | Train Loss: 6.877 | Val. Loss: 6.959 | Val. BLEU: 0.01 | Patience: 1/7
Epoch: 05 | Time: 68s | Train Loss: 6.824 | Val. Loss: 6.932 | Val. BLEU: 0.01 | Patience: 2/7
⭐️ New best BLEU score: 0.02. Model saved.
Epoch: 06 | Time: 69s | Train Loss: 6.779 | Val. Loss: 6.932 | Val. BLEU: 0.02 | Patience: 0/7
⭐️ New best BLEU score: 0.02. Model saved.
Epoch: 07 | Time: 70s | Train Loss: 6.735 | Val. Loss: 6.994 | Val. BLEU: 0.02 | Patience: 0/7
⭐️ New best BLEU score: 0.28. Model saved.
Epoch: 08 | Time: 69s | Train Loss: 6.693 | Val. Loss: 6.960 | Val. BLEU: 0.2