In [53]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torch import optim
import random
import pandas as pd
from sentencepiece import SentencePieceProcessor
from model import *

In [54]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [55]:
INPUT_SIZE = 8000  # Size of the English vocabulary
OUTPUT_SIZE = 8000  # Size of the Nepali vocabulary
EMBED_SIZE = 128
HIDDEN_SIZE = 512
N_LAYERS = 1
DROPOUT = 0.5
BATCH_SIZE = 8
LEARNING_RATE = 0.001
EPOCHS = 50
TEACHER_FORCING_RATIO = 0.5


In [56]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

In [57]:
class TranslationDataset(Dataset):
    def __init__(self, cleaned_file_path):
        self.pairs = self.load_data(cleaned_file_path)

    def load_data(self, cleaned_file_path):
        df = pd.read_excel(cleaned_file_path)[:2]
        df = df.dropna()
        english_sentences = df['english_sent'].tolist()
        nepali_sentences = df['nepali_sent'].tolist()

        english_tokenizer = SentencePieceProcessor(model_file='english_sp.model')
        nepali_tokenizer = SentencePieceProcessor(model_file='nepali_sp.model')

        pairs = []
        for english_sentence, nepali_sentence in zip(english_sentences, nepali_sentences):
            english_indices = self.process_sentence(english_sentence, english_tokenizer)
            nepali_indices = self.process_sentence(nepali_sentence, nepali_tokenizer)
            pairs.append((english_indices, nepali_indices))
        
        return pairs

    def process_sentence(self, sentence, tokenizer):
        tokens = tokenizer.encode(sentence, out_type=int)
        return [1] + tokens + [2]

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.pairs[idx]


In [58]:
def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_lens = [len(src) for src in src_batch]
    trg_lens = [len(trg) for trg in trg_batch]
    src_padded = torch.nn.utils.rnn.pad_sequence([torch.tensor(seq) for seq in src_batch], padding_value=0)
    print(src_padded)
    trg_padded = torch.nn.utils.rnn.pad_sequence([torch.tensor(seq) for seq in trg_batch], padding_value=0)
    return src_padded, trg_padded, src_lens, trg_lens


In [59]:

# Initialize model
encoder = Encoder(INPUT_SIZE, EMBED_SIZE, HIDDEN_SIZE, N_LAYERS, DROPOUT).to(DEVICE)
decoder = Decoder(EMBED_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, N_LAYERS, DROPOUT).to(DEVICE)
model = Seq2Seq(encoder, decoder).to(DEVICE)
print(model)

Seq2Seq(
  (encoder): Encoder(
    (embed): Embedding(8000, 128)
    (gru): GRU(128, 512, dropout=0.5, bidirectional=True)
  )
  (decoder): Decoder(
    (embed): Embedding(8000, 128)
    (attention): Attention(
      (attn): Linear(in_features=1024, out_features=512, bias=True)
    )
    (gru): GRU(640, 512, dropout=0.5)
    (out): Linear(in_features=1024, out_features=8000, bias=True)
  )
)




In [60]:
# Optimizer and Loss
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [61]:
# # Training loop
# def train_model(dataset):
    
#     dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
#     print("Data loaded...")
#     print("Training started...")
#     model.train()
#     for epoch in range(EPOCHS):
#         epoch_loss = 0
#         for src, trg, _, _ in dataloader:
#             src, trg = src.to(DEVICE), trg.to(DEVICE)
#             optimizer.zero_grad()
#             output = model(src, trg, TEACHER_FORCING_RATIO)
#             output_dim = output.shape[-1]
#             output = output[1:].view(-1, output_dim)
#             trg = trg[1:].view(-1)
#             loss = criterion(output, trg)
#             loss.backward()
#             optimizer.step()
#             # print(f"Loss: {loss.item()}")
#             epoch_loss += loss.item()
#         print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(dataloader):.4f}")

from torch.utils.tensorboard import SummaryWriter

# Initialize TensorBoard writer
writer = SummaryWriter(log_dir="runs/translation_experiment")

def train_model(dataset):
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
    print("Data loaded...")
    print("Training started...")
    model.train()
    for epoch in range(EPOCHS):
        epoch_loss = 0
        for src, trg, _, _ in dataloader:
            src, trg = src.to(DEVICE), trg.to(DEVICE)
            optimizer.zero_grad()
            output = model(src, trg, TEACHER_FORCING_RATIO)
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            loss = criterion(output, trg)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        # Log loss to TensorBoard
        avg_loss = epoch_loss / len(dataloader)
        writer.add_scalar("Loss/Train", avg_loss, epoch)
        print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}")
    
    writer.close()


In [62]:
# Save model
# torch.save(model.state_dict(), "SavedModels/seq2seq_model2.pth")

In [112]:
def translate_sentence(sentence, model, english_tokenizer, nepali_tokenizer, device ,BATCH_SIZE):
    # Tokenize and encode the input sentence
    tokens = english_tokenizer.encode(sentence, out_type=int)
    english_indices = [1] + tokens + [2]  # Add <sos> and <eos> tokens
    nepali_indices = [1]  # <sos> token
    
    pairs = []
    pairs.append((english_indices, nepali_indices))
    dataloader = DataLoader(pairs, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    a = 0
    for src, trg, _, _ in dataloader:
        a += 1
        
        src = src.to(device)
        trg = trg.to(device)
        output = model(src, trg, 0)
        print(output)
        # print(output.shape)
    print(a)
    # return english_indices
    # src_lens = [len(src) for src in english_indices]
    # return src_lens

    # Pass through the model
    # model.eval()
    # with torch.no_grad():
    #     encoder_outputs, hidden = model.encoder(input_tensor)
    #     decoder_input = torch.tensor([1]).to(device)  # <sos> token
    #     decoder_hidden = hidden[:N_LAYERS]

    #     translated_sentence = []
    #     for _ in range(50):  # Max translation length
    #         output, decoder_hidden, _ = model.decoder(decoder_input, decoder_hidden, encoder_outputs)
    #         top1 = output.argmax(1)
    #         translated_sentence.append(top1.item())
    #         if top1.item() == 2:  # <eos> token
    #             break
    #         decoder_input = top1.unsqueeze(0)

    # # Decode the token indices to words
    # translated_words = nepali_tokenizer.decode(translated_sentence[:-1])  # Remove <eos> token
    # return translated_words


In [113]:
# Load tokenizers
english_tokenizer = SentencePieceProcessor(model_file='english_sp.model')
nepali_tokenizer = SentencePieceProcessor(model_file='nepali_sp.model')

# Load the model
model = Seq2Seq(encoder, decoder).to(DEVICE)
model.load_state_dict(torch.load("SavedModels/model.pth"))

# Translate a sentence
sentence = "Hello, how are you?"
translation = translate_sentence(sentence, model, english_tokenizer, nepali_tokenizer, DEVICE, BATCH_SIZE)

tensor([[  1],
        [ 83],
        [434],
        [117],
        [  4],
        [420],
        [ 33],
        [ 17],
        [ 84],
        [  2]])
tensor([[[0., 0., 0.,  ..., 0., 0., 0.]]], device='cuda:0')
1


In [114]:
def translate_sentence(sentence, model, english_tokenizer, nepali_tokenizer, device, max_length=50):
    """
    Translate a single English sentence to Nepali
    """
    # Set model to evaluation mode
    model.eval()
    
    # Process input sentence
    tokens = [1] + english_tokenizer.encode(sentence, out_type=int) + [2]
    src_tensor = torch.LongTensor(tokens).unsqueeze(1).to(device)
    
    # Get encoder outputs
    encoder_outputs, hidden = model.encoder(src_tensor)
    hidden = hidden[:model.decoder.gru.num_layers]
    
    # Initialize decoder input
    trg_indexes = [1]  # Start with SOS token
    
    # Initialize attention for visualization
    attentions = torch.zeros(max_length, len(tokens))
    
    for i in range(max_length):
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
        
        with torch.no_grad():
            output, hidden, attention = model.decoder(trg_tensor, hidden, encoder_outputs)
        
        # Get predicted token
        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)
        
        # Save attention for visualization
        attentions[i] = attention.squeeze()
        
        # Break if we predict EOS token
        if pred_token == 2:
            break
    
    # Convert tokens back to words
    translated_tokens = trg_indexes[1:-1]  # Remove SOS and EOS tokens
    translated_sentence = nepali_tokenizer.decode(translated_tokens)
    
    return translated_sentence, attentions[:len(trg_indexes)-1, :]


In [118]:
# english_tokenizer, nepali_tokenizer = load_tokenizers()
# model = load_model('best_model.pt', DEVICE)

test_sentence = "David died in war"
translated, attention = translate_sentence(
    test_sentence, 
    model, 
    english_tokenizer, 
    nepali_tokenizer, 
    DEVICE
)
print(f"English: {test_sentence}")
print(f"Nepali: {translated}")

English: David died in war
Nepali: दाऊद दाऊद बेला म साह्रै युद्धमा। त्यस समय युद्धमा
