In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import sentencepiece as spm  
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import numpy as np

import random
import math
import time
import os


if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using Device: CUDA (NVIDIA GPU)")
elif torch.backends.mps.is_available():
    device = torch.device('mps')
    print("Using Device: MPS (Apple Silicon M3 GPU)")
else:
    device = torch.device('cpu')
    print("Using Device: CPU (Slow)")

Using Device: MPS (Apple Silicon M3 GPU)


In [3]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
en_filepath = 'en-ne.txt/OpenSubtitles.en-ne.en'
ne_filepath = 'en-ne.txt/OpenSubtitles.en-ne.ne'

if not os.path.exists('nepali_bpe.model'):
    print("Training Nepali Tokenizer...")
    spm.SentencePieceTrainer.train(
        input=ne_filepath, 
        model_prefix='nepali_bpe', 
        vocab_size=8000, 
        model_type='bpe',
        pad_id=0, bos_id=1, eos_id=2, unk_id=3, 
        character_coverage=1.0
    )
else:
    print("Nepali tokenizer already exists.")

if not os.path.exists('english_bpe.model'):
    print("Training English Tokenizer...")
    spm.SentencePieceTrainer.train(
        input=en_filepath, 
        model_prefix='english_bpe', 
        vocab_size=8000, 
        model_type='bpe',
        pad_id=0, bos_id=1, eos_id=2, unk_id=3, 
        character_coverage=1.0
    )
else:
    print("English tokenizer already exists.")

Nepali tokenizer already exists.
English tokenizer already exists.


In [5]:
sp_en = spm.SentencePieceProcessor(model_file='english_bpe.model')
sp_ne = spm.SentencePieceProcessor(model_file='nepali_bpe.model')

def read_file(filename):
    print(f"Reading {filename}...")
    with open(filename, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

try:
    english_sentences = read_file(en_filepath)
    nepali_sentences = read_file(ne_filepath)
    print(f"Successfully loaded {len(english_sentences)} sentence pairs.")
except FileNotFoundError:
    print("ERROR: File not found. Check if you are in the 'Translation_Model' folder.")

class EnNeDataset(Dataset):
    def __init__(self, en_sentences, ne_sentences, sp_en, sp_ne, max_len=128):
        self.en_sentences = en_sentences
        self.ne_sentences = ne_sentences
        self.sp_en = sp_en
        self.sp_ne = sp_ne
        self.max_len = max_len
        
    def __len__(self):
        return len(self.en_sentences)
    
    def __getitem__(self, idx):
        en_text = self.en_sentences[idx]
        ne_text = self.ne_sentences[idx]
        
        en_encoded = [1] + self.sp_en.encode_as_ids(en_text) + [2]
        ne_encoded = [1] + self.sp_ne.encode_as_ids(ne_text) + [2]
        
        if len(en_encoded) > self.max_len: en_encoded = en_encoded[:self.max_len]
        if len(ne_encoded) > self.max_len: ne_encoded = ne_encoded[:self.max_len]
            
        return torch.tensor(en_encoded), torch.tensor(ne_encoded)

def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_padded = pad_sequence(src_batch, batch_first=True, padding_value=0)
    trg_padded = pad_sequence(trg_batch, batch_first=True, padding_value=0)
    return src_padded, trg_padded

Reading en-ne.txt/OpenSubtitles.en-ne.en...
Reading en-ne.txt/OpenSubtitles.en-ne.ne...
Successfully loaded 149006 sentence pairs.


In [6]:
BATCH_SIZE = 32

full_dataset = EnNeDataset(english_sentences, nepali_sentences, sp_en, sp_ne)

train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_data, val_data = torch.utils.data.random_split(full_dataset, [train_size, val_size])

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print(f"Ready for training!")
print(f"Training Batches: {len(train_loader)}")
print(f"Validation Batches: {len(valid_loader)}")

src_sample, trg_sample = next(iter(train_loader))
print(f"Sample Batch Shape - Source: {src_sample.shape}, Target: {trg_sample.shape}")

Ready for training!
Training Batches: 4191
Validation Batches: 466
Sample Batch Shape - Source: torch.Size([32, 16]), Target: torch.Size([32, 15])


<b>ENCODER</b>

In [7]:
class Encoder(nn.Module):
    def __init__(self,
                 inp_dim,
                 hid_dim,
                 pf_dim,
                 n_layers,
                 n_heads,
                 drop_out,
                 device,
                 block_size = 100):
        super().__init__()
        self.tok_embedding = nn.Embedding(inp_dim,hid_dim)
        self.pos_embedding = nn.Embedding(block_size,hid_dim)
        self.layers = nn.ModuleList([EncoderLayer(hid_dim,
                                                  n_heads,
                                                  pf_dim,
                                                  drop_out,
                                                  device
                                                  ) for _ in range(n_layers)])
        self.dropout = nn.Dropout(drop_out)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        self.device = device


    def forward(self, src , src_mask):
        batch_size = src.shape[0] 
        src_len = src.shape[1]

        # src = [batch_size , src_len]
        #src_mask = [.., 1 , 1 , ...]

        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)

        # pos = [batch_size , src_len]

        src = self.dropout(self.tok_embedding(src) * self.scale + self.pos_embedding(pos)) 

        for layer in self.layers:
            src = layer(src, src_mask)

        #src = [batch size, src len, hid dim]

        return src

In [8]:
class EncoderLayer(nn.Module):
    def __init__(self,
                hid_dim, 
                 n_heads, 
                 pf_dim,  
                 drop_out, 
                 device ):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attentation = MultiHeadSelfAttentation(hid_dim, n_heads , drop_out , device)
        self.feedforward = FeedForward(hid_dim, pf_dim, drop_out)
        self.dropout = nn.Dropout(drop_out)

    def forward(self, src , src_mask):

        # src = [batch_size , src_len , hid_dim]
        #src_mask = [.., 1 , 1 , src_len]

        _src, _ = self.self_attentation(src, src, src , src_mask)
        src = self.self_attn_layer_norm(src + self.dropout(_src))

        # src = [batch_size , src_len , hid_dim]
         
        _src = self.feedforward(src)
        src = self.ff_layer_norm(src + self.dropout(_src))

        # src = [batch_size , src_len , hid_dim]

        return src

In [9]:
class MultiHeadSelfAttentation(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()

        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads

        self.key = nn.Linear(hid_dim, hid_dim , bias=False)
        self.query = nn.Linear(hid_dim, hid_dim , bias=False)
        self.value = nn.Linear(hid_dim, hid_dim , bias=False)

        self.proj = nn.Linear(hid_dim, hid_dim , bias=False)
        #self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self, q ,k , v , mask = None):
        batch_size = q.shape[0]

        K = self.key(k) 
        Q = self.query(q) 
        V = self.value(v) # [batch , block(len) , hid_dim]

        K = K.view(batch_size , -1 , self.n_heads , self.head_dim).permute(0, 2 ,1 , 3) 
        Q = Q.view(batch_size , -1 , self.n_heads , self.head_dim).permute(0, 2 ,1 , 3)
        V = V.view(batch_size , -1 , self.n_heads , self.head_dim).permute(0, 2 ,1 , 3) #[batch , n_heads , block , head_size]

        wei = Q @ K.permute(0 , 1, 3, 2) * self.scale #* self.hid_dim**-0.5
        if mask is not None :
            # tril = torch.tril(torch.ones(q.shape[1], q.shape[1]))
            wei = wei.masked_fill(mask == 0 , float('-inf'))

        wei = F.softmax(wei , dim=-1) 
        wei = self.dropout(wei)
        x = wei @ V # V = [batch , n_heads , block , head_size] , wei = [batch size, n_heads, trg_len, src_len]

        #x = [batch , n_heads , block , head_size]

        x = x.permute(0, 2, 1, 3).contiguous()

        #x = [batch , block , n_heads , head_size]

        x = x.view(batch_size, -1, self.hid_dim)

        #X = [batch , block(len) , hid_dim = n_heads * head_size]


        x = self.proj(x)
        #X = [batch , block(len) , hid_dim]
        return x, wei

In [10]:
class FeedForward(nn.Module):
    def __init__(self , hid_dim , pf_dim , dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(hid_dim,pf_dim), #[batch , block(len) , pf_dim]
            nn.ReLU(), #avoid vanishing grading problem 
            nn.Linear(pf_dim, hid_dim), # [batch , block(len) , hid_dim]
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

<b>Decoder</b>

In [11]:
class Decoder(nn.Module):
    def __init__(self,
                 out_dim,
                 hid_dim,
                 pf_dim,
                 n_layers,
                 n_heads,
                 drop_out,
                 device,
                 block_size = 100):
        super().__init__()
        self.tok_embedding = nn.Embedding(out_dim,hid_dim)
        self.pos_embedding = nn.Embedding(block_size,hid_dim)
        self.layers = nn.ModuleList([DecoderLayer(hid_dim,
                                                  n_heads,
                                                  pf_dim,
                                                  drop_out,
                                                  device
                                                  ) for _ in range(n_layers)])
        self.dropout = nn.Dropout(drop_out)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        self.proj = nn.Linear(hid_dim, out_dim)

        self.device = device


    def forward(self, trg , src_enc , trg_mask , src_mask):

        #trg = [B , T]
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]

        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device) #[B,T]

        trg = self.dropout(self.tok_embedding(trg) * self.scale + self.pos_embedding(pos)) #[B,T,hid_dim]

        for layer in self.layers:
            trg, attetation = layer(trg , src_enc , trg_mask , src_mask)
        output = self.proj(trg) #[B,T,out_dim]

        return output, attetation

In [12]:
class DecoderLayer(nn.Module):
    def __init__(self,
                hid_dim, 
                 n_heads, 
                 pf_dim,  
                 drop_out, 
                 device ):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attentation = MultiHeadSelfAttentation(hid_dim, n_heads , drop_out , device)
        self.enc_attentation = MultiHeadSelfAttentation(hid_dim, n_heads , drop_out , device)
        self.feedforward = FeedForward(hid_dim, pf_dim, drop_out)
        self.dropout = nn.Dropout(drop_out)
        self.device = device

    def forward(self, trg , src_enc , trg_mask , src_mask):

        #trg = [batch size, trg_len, hid dim]]
        #src_enc = [batch size, src_len, hid dim]]
        #trg_mask = [batch_size , 1 , trg_len , block_size]

        _trg, _ = self.self_attentation(trg, trg , trg ,trg_mask)
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))


        _trg, attention = self.enc_attentation(trg, src_enc, src_enc, src_mask)

        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
         
        _trg = self.feedforward(trg)
        trg = self.ff_layer_norm(trg + self.dropout(_trg))

        #trg = [batch size, trg_len, hid dim]
        #attention = [batch size, n_heads, trg_len, src_len]

        return trg , attention

<b>Seq2Seq</b>

In [13]:
class Seq2Seq(nn.Module):
    def __init__(self,
                 encoder,
                 decoder,
                 src_pad_idx,
                 trg_pad_idx,
                 device):
        super().__init__()
        self.device = device
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

    def make_src_mask(self, src):
    #src = [batch_size , block_size]
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2) # shape = [batch_size , 1 , 1, block_size ]
        return src_mask
    
    def make_trg_mask(self, trg):
        #trg = [batch_size, block_size]
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)

        #trg_pad_mask = [batch_size, 1 ,1 , block_size]
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool() #[trg_len, trg_len]

        trg_mask = trg_pad_mask & trg_sub_mask #[batch_size , 1 , trg_len , block_size]
        return trg_mask
    
    def forward(self,src , trg ):
        src_mask = self.make_src_mask(src) #[batch_size, 1 ,1 , block_size]

        src_enc = self.encoder(src, src_mask) #[batch_size, block(len), hid_dim]

        trg_mask = self.make_trg_mask(trg) #[batch_size , 1 , trg_len , block_size]

        output , attentation = self.decoder(trg ,src_enc , trg_mask, src_mask) 

        return output, attentation

In [14]:
INPUT_DIM = sp_en.get_piece_size()
OUTPUT_DIM = sp_ne.get_piece_size()
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, HID_DIM, ENC_PF_DIM, ENC_LAYERS, ENC_HEADS, ENC_DROPOUT, device , block_size=200)
dec = Decoder(OUTPUT_DIM, HID_DIM, DEC_PF_DIM, DEC_LAYERS, DEC_HEADS, DEC_DROPOUT, device, block_size=200)

SRC_PAD_IDX = 0
TRG_PAD_IDX = 0

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [15]:
print(f"The model has {sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters")

The model has 10,198,848 trainable parameters


In [16]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [17]:
model.apply(initialize_weights);

In [18]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
LEARNING_RATE = 0.0001
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)

In [19]:
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [20]:
def train(model, iterator , optimizer , criterion , clip):
    model.train()
    epoch_loss = 0 

    for i, (src,trg) in enumerate(iterator):
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()

        output , _ = model(src , trg[:, :-1]) 
        output_dim = output.shape[-1]
        
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
        
        if (i + 1) % 100 == 0:
            print(f"  Batch {i+1}/{len(iterator)} | Loss: {loss.item():.4f}")

    
    return epoch_loss / len(iterator)

In [21]:
def evaluate(model , iterator , criterion):
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
        for i, (src, trg) in enumerate(iterator):
            src = src.to(device)
            trg = trg.to(device)
            
            output, _ = model(src, trg[:, :-1])
            
            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)
            
            loss = criterion(output, trg)
            epoch_loss += loss.item()
            
    
    return epoch_loss / len(iterator)

In [22]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS = 50
CLIP = 1
best_valid_loss = float('inf')

if not os.path.exists('models'):
    os.makedirs('models')

print("Saving initial model weights...")
torch.save(model.state_dict(), 'models/model_init.pt')
print("✅ Initial model saved to 'models/model_init.pt'")



print("\n" + "="*50)
print("Starting Training...")
print("="*50 + "\n")

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    
    valid_loss = evaluate(model, valid_loader, criterion)
    scheduler.step(valid_loss)
    
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-model.pt')
        save_msg = "✅ Model Saved! (New Best)"
    else:
        save_msg = ""
    
    print(f'\nEpoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f} | {save_msg}')
    print("-"*50)

print("\n" + "="*50)
print("Training Complete!")
print(f"Best Validation Loss: {best_valid_loss:.3f}")
print("="*50 + "\n")

In [None]:
checkpoint = {
    'epoch': N_EPOCHS,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': best_valid_loss,
    'hyperparameters': {
        'input_dim': INPUT_DIM,
        'output_dim': OUTPUT_DIM,
        'hid_dim': HID_DIM,
        'enc_layers': ENC_LAYERS,
        'dec_layers': DEC_LAYERS,
        'enc_heads': ENC_HEADS,
        'dec_heads': DEC_HEADS,
        'enc_dropout': ENC_DROPOUT,
        'dec_dropout': DEC_DROPOUT,
        'device': str(device)
    }
}

torch.save(checkpoint, 'nmt-model-checkpoint.pth')

print("✅ Complete checkpoint saved to 'nmt-model-checkpoint.pth'")
print("   (Includes Model, Optimizer, and Hyperparameters)")
print(f"   Ready to resume from Epoch {N_EPOCHS}")

In [26]:
import torch

model.load_state_dict(torch.load('best-model.pt'))
model.eval() 

def translate_sentence(sentence, model, device, max_len=50):
    model.eval()
        
    if isinstance(sentence, str):
        tokens = [1] + sp_en.encode_as_ids(sentence) + [2]
    else:
        tokens = [1] + sentence + [2]
        
    src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device) 
    
    src_mask = model.make_src_mask(src_tensor)
    
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    trg_indices = [1] 

    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indices).unsqueeze(0).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)
        
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        
        pred_token = output.argmax(2)[:,-1].item()
        
        trg_indices.append(pred_token)
        
        if pred_token == 2:
            break
            
    trg_tokens = [t for t in trg_indices if t not in [1, 2]]
    translated_text = sp_ne.decode_ids(trg_tokens)
    
    return translated_text

print("\n--- Translation Test ---")

sentences = [
    "Hello how are you?",
    "I am going home.",
    "This is a book.",
    "Can you please help me ?",
    "Are you serious ?"
]

for s in sentences:
    translation = translate_sentence(s, model, device)
    print(f"English: {s}")
    print(f"Nepali:  {translation}")
    print("-" * 30)


--- Translation Test ---
English: Hello how are you?
Nepali:  नमस्कार, तपाईं कसरी हुनुहुन्छ?
------------------------------
English: I am going home.
Nepali:  म घर जाँदैछु ।
------------------------------
English: This is a book.
Nepali:  यो पुस्तक हो ।
------------------------------
English: Can you please help me ?
Nepali:  के तिमी मलाई मद्दत गर्न सक्छौ ?
------------------------------
English: Are you serious ?
Nepali:  के तिमी गम्भीर छौ ?
------------------------------
