In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torch import optim
import random
import pandas as pd
from sentencepiece import SentencePieceProcessor
from model import *

In [2]:
import os

# Ask the PyTorch CUDA allocator for expandable segments through its
# configuration environment variable.
# NOTE(review): this cell runs after `import torch`; confirm the allocator
# still picks the setting up, or move it above the torch import.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

In [3]:
# Hyperparameters for the English -> Nepali seq2seq run.
# An earlier, larger configuration (vocab 10000, embed 256, hidden 512,
# dropout 0.5, batch size 4) was scaled down; the reduced sizes below are
# there for memory constraints.

# Vocabulary sizes: both match the SentencePiece tokenizer vocabulary size.
INPUT_SIZE = OUTPUT_SIZE = 8000

# Model dimensions (reduced for memory constraints).
EMBED_SIZE = 128
HIDDEN_SIZE = 256
N_LAYERS = 1  # layer count handed to both Encoder and Decoder
DROPOUT = 0.3  # regularisation against overfitting

# Optimisation settings.
BATCH_SIZE = 2  # kept small for memory constraints
LEARNING_RATE = 1e-3  # standard learning rate
EPOCHS = 10  # initial testing
TEACHER_FORCING_RATIO = 0.5  # balanced teacher forcing


In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
DEVICE

device(type='cuda')

In [5]:
class TranslationDataset(Dataset):
    """Parallel English/Nepali sentence pairs encoded as SentencePiece ids.

    Each item is a tuple ``(english_ids, nepali_ids)`` of plain Python lists.
    Every list starts with ``START_ID`` and ends with ``END_ID``.

    Args:
        cleaned_file_path: Path to an Excel workbook containing the
            ``english_sent`` and ``nepali_sent`` columns.
        english_model_file: Optional SentencePiece model file for the English
            side. Defaults to ``'english_sp.model'``.
        nepali_model_file: Optional SentencePiece model file for the Nepali
            side. Defaults to ``'nepali_sp.model'``.
    """

    # Marker ids wrapped around every encoded sentence.
    # NOTE(review): presumably the BOS/EOS ids of both tokenizer models —
    # confirm against how the SentencePiece models were trained.
    START_ID = 1
    END_ID = 2

    # Workbook columns holding the two sides of each sentence pair.
    ENGLISH_COLUMN = 'english_sent'
    NEPALI_COLUMN = 'nepali_sent'

    # Default tokenizer model files; instances may override them in __init__.
    english_model_file = 'english_sp.model'
    nepali_model_file = 'nepali_sp.model'

    def __init__(self, cleaned_file_path, english_model_file=None, nepali_model_file=None):
        if english_model_file is not None:
            self.english_model_file = english_model_file
        if nepali_model_file is not None:
            self.nepali_model_file = nepali_model_file
        self.pairs = self.load_data(cleaned_file_path)

    def load_data(self, cleaned_file_path):
        """Read the workbook and return a list of ``(english_ids, nepali_ids)``.

        Rows are dropped only when one of the two sentence columns is missing,
        so a blank cell in an unrelated column does not discard a valid pair.

        Raises:
            KeyError: If either sentence column is absent from the workbook.
        """
        df = pd.read_excel(cleaned_file_path)
        df = df.dropna(subset=[self.ENGLISH_COLUMN, self.NEPALI_COLUMN])
        english_sentences = df[self.ENGLISH_COLUMN].tolist()
        nepali_sentences = df[self.NEPALI_COLUMN].tolist()

        english_tokenizer = SentencePieceProcessor(model_file=self.english_model_file)
        nepali_tokenizer = SentencePieceProcessor(model_file=self.nepali_model_file)

        return [
            (
                self.process_sentence(english_sentence, english_tokenizer),
                self.process_sentence(nepali_sentence, nepali_tokenizer),
            )
            for english_sentence, nepali_sentence in zip(english_sentences, nepali_sentences)
        ]

    def process_sentence(self, sentence, tokenizer):
        """Encode ``sentence`` with ``tokenizer`` and wrap it in the marker ids.

        The value is converted with ``str`` first because spreadsheet cells can
        come back as numbers rather than text.
        """
        tokens = tokenizer.encode(str(sentence), out_type=int)
        return [self.START_ID, *tokens, self.END_ID]

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return the ``(english_ids, nepali_ids)`` pair at ``idx``."""
        return self.pairs[idx]


In [6]:
def collate_fn(batch):
    """Batch ``(source_ids, target_ids)`` pairs into zero-padded tensors.

    Returns ``(src_padded, trg_padded, src_lens, trg_lens)``. The padded
    tensors come from ``pad_sequence`` with its default layout, so the
    sequence dimension is first and the batch dimension second; the length
    lists hold each sequence's length before padding.
    """
    sources, targets = zip(*batch)

    def _pad(sequences):
        # Shorter sequences are filled with 0 up to the longest one in the batch.
        as_tensors = [torch.tensor(ids) for ids in sequences]
        return torch.nn.utils.rnn.pad_sequence(as_tensors, padding_value=0)

    src_lens = list(map(len, sources))
    trg_lens = list(map(len, targets))
    return _pad(sources), _pad(targets), src_lens, trg_lens


In [7]:

# Build the encoder and decoder, wrap them in Seq2Seq, and move each to DEVICE.
# NOTE(review): INPUT_SIZE is the first Encoder argument while OUTPUT_SIZE is
# the third Decoder argument — confirm this ordering against model.py.
encoder = Encoder(INPUT_SIZE, EMBED_SIZE, HIDDEN_SIZE, N_LAYERS, DROPOUT)
encoder = encoder.to(DEVICE)
decoder = Decoder(EMBED_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, N_LAYERS, DROPOUT)
decoder = decoder.to(DEVICE)
model = Seq2Seq(encoder, decoder).to(DEVICE)



In [8]:
# Loss and optimizer.
# Targets equal to 0 are ignored by the loss; 0 is the padding_value that
# collate_fn uses when padding batches.
# NOTE(review): confirm that id 0 is not also a real token id (e.g. <unk>) in
# the SentencePiece models, otherwise those targets are skipped as well.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [9]:
from torch.utils.tensorboard import SummaryWriter

# Shared TensorBoard writer: train_model logs the per-epoch training loss to
# it under "Loss/Train" and closes it when training ends.
writer = SummaryWriter("runs/translation_experiment")

def train_model(dataset):
    """Train the notebook-level ``model`` on ``dataset`` and log the loss.

    Relies on names defined in other cells: ``model``, ``optimizer``,
    ``criterion``, ``writer``, ``collate_fn``, ``DEVICE``, ``BATCH_SIZE``,
    ``EPOCHS`` and ``TEACHER_FORCING_RATIO``. After every epoch the mean batch
    loss is printed and written to TensorBoard under ``"Loss/Train"``.

    Args:
        dataset: Dataset with a length whose items are
            ``(source_ids, target_ids)`` pairs that ``collate_fn`` can batch.

    Raises:
        ValueError: If ``dataset`` contains no items.
    """
    # Fail early with a clear message instead of an opaque sampler or
    # division-by-zero error further down.
    if len(dataset) == 0:
        raise ValueError("train_model requires a non-empty dataset")

    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
    num_batches = len(dataloader)
    print("Data loaded...")
    print("Training started...")
    model.train()
    try:
        for epoch in range(EPOCHS):
            epoch_loss = 0.0
            for src, trg, _, _ in dataloader:
                src, trg = src.to(DEVICE), trg.to(DEVICE)
                optimizer.zero_grad()
                output = model(src, trg, TEACHER_FORCING_RATIO)
                output_dim = output.shape[-1]
                # Drop the first time step from predictions and targets (every
                # target begins with the start marker added by the dataset),
                # then flatten for the loss. reshape does not require the
                # tensors to be contiguous, unlike view.
                output = output[1:].reshape(-1, output_dim)
                trg = trg[1:].reshape(-1)
                loss = criterion(output, trg)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()

            # Log loss to TensorBoard
            avg_loss = epoch_loss / num_batches
            writer.add_scalar("Loss/Train", avg_loss, epoch)
            print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}")
    finally:
        # Close the writer even when training raises or is interrupted from
        # the notebook, so already-logged epochs are flushed to disk.
        writer.close()


In [10]:
# Read and tokenize the cleaned parallel corpus; the result is passed to train_model.
dataset = TranslationDataset(cleaned_file_path="Dataset/english-nepali-cleaned.xlsx")

In [None]:
# Run the full training loop on the loaded dataset.
train_model(dataset=dataset)

Data loaded...
Training started...
