In [1]:
# Basic packages for the project
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import copy


In [2]:
# Packages for data generator & preparation
from torchtext.data import Field, TabularDataset, BucketIterator
import spacy
import sys
from indicnlp import common
from indicnlp.tokenize import indic_tokenize


In [3]:
# spaCy pipeline whose tokenizer splits the English side of the corpus
spacy_eng = spacy.load("en_core_web_sm")


def tokenize_eng(text):
    """Tokenize ``text`` with the spaCy tokenizer and return the lowercased token strings."""
    lowered_tokens = []
    for token in spacy_eng.tokenizer(text):
        lowered_tokens.append(token.text.lower())
    return lowered_tokens


In [4]:
# Hindi tokenizer backed by Indic NLP
def tokenize_hindi(text):
    """Return the tokens that ``indic_tokenize.trivial_tokenize`` yields for ``text`` as a list."""
    return list(indic_tokenize.trivial_tokenize(text))


In [5]:
# Torchtext fields for the two sides of the parallel corpus. Both wrap each
# sequence in <sos>/<eos> markers; only the English field asks for lowercasing.
english_txt = Field(
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    tokenize=tokenize_eng,
)
hindi_txt = Field(
    init_token="<sos>",
    eos_token="<eos>",
    tokenize=tokenize_hindi,
)


In [6]:
# (column name, Field) pairs handed to the CSV loader: English first, Hindi second
data_fields = [
    ('eng_text', english_txt),
    ('hindi_text', hindi_txt),
]

# Read train.csv / val.csv from the current working directory
train_dt, val_dt = TabularDataset.splits(
    path='./',
    format='csv',
    fields=data_fields,
    train='train.csv',
    validation='val.csv',
)


In [7]:
# Fit both vocabularies on the training split only, with identical size and
# frequency limits for the source and target languages.
for text_field in (english_txt, hindi_txt):
    text_field.build_vocab(train_dt, max_size=10000, min_freq=2)


In [8]:
# Training configuration
save_model = True
num_epochs = 1
learning_rate = 5e-4
batch_size = 96

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU, and
# report which one was picked.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU Available:", True)
    print("GPU Device Name:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU Available:", False)
    print("GPU Device Name:", "No GPU Found")


GPU Available: True
GPU Device Name: NVIDIA GeForce RTX 3060 Laptop GPU


In [9]:
# Batch iterators for both splits; examples are keyed by English length so
# that BucketIterator can group similarly sized sentences together.
train_iter = BucketIterator(
    train_dt,
    batch_size=batch_size,
    shuffle=True,
    sort_key=lambda example: len(example.eng_text),
)
val_iter = BucketIterator(
    val_dt,
    batch_size=batch_size,
    shuffle=True,
    sort_key=lambda example: len(example.eng_text),
)


In [10]:
from transformer import Transformer

# Model hyperparameters
src_vocab_size = len(english_txt.vocab)
trg_vocab_size = len(hindi_txt.vocab)
embedding_size = 512
num_heads = 8
num_layers = 3
dropout = 0.10
max_len = 10000
forward_expansion = 4

# Look both padding indices up in their vocabularies instead of hard-coding
# them. The target index used to be a literal 0, which with torchtext's default
# special-token order is <unk>, not <pad>, and disagreed with the index the
# loss function ignores.
src_pad_idx = english_txt.vocab.stoi["<pad>"]
trg_pad_idx = hindi_txt.vocab.stoi["<pad>"]

# Initialize the Transformer model and move it to the selected device
model = Transformer(
    src_vocab_size=src_vocab_size,
    trg_vocab_size=trg_vocab_size,
    src_pad_idx=src_pad_idx,
    trg_pad_idx=trg_pad_idx,
    embed_size=embedding_size,
    num_layers=num_layers,
    forward_expansion=forward_expansion,
    heads=num_heads,
    dropout=dropout,
    device=device,
    max_len=max_len
).to(device)

# Vocabulary sizes include the special tokens added by the fields
print(src_vocab_size)
print(trg_vocab_size)


10004
10004


In [11]:
# Loss: cross-entropy over target tokens, skipping Hindi padding positions
pad_idx = hindi_txt.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Adam optimizer plus a plateau scheduler that multiplies the learning rate
# by 0.1 once the monitored loss has stopped improving for 10 scheduler steps.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    factor=0.1,
    patience=10,
    verbose=True,
)


In [12]:
# Per-epoch history used for checkpoint selection and for the final plot
loss_tracker = []
train_losses = []
val_losses = []
learning_rates = []


def _batch_loss(batch):
    """Run one teacher-forced forward pass on ``batch`` and return its cross-entropy loss."""
    # Swap the last two axes before calling the model
    # (assumes the fields use torchtext's default batch_first=False -- TODO confirm).
    src = batch.eng_text.permute(-1, -2).to(device)
    trg = batch.hindi_text.permute(-1, -2).to(device)

    # The decoder is fed every target token except the last one and is scored
    # against the same sequence shifted left by one position.
    logits = model(src, trg[:, :-1])
    # No constant offset is applied: the raw cross-entropy is what gets logged,
    # scheduled on and used to pick the best checkpoint.
    return criterion(logits.reshape(-1, trg_vocab_size), trg[:, 1:].reshape(-1))


for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    losses = []
    loop = tqdm(train_iter, total=len(train_iter))

    for batch in loop:
        optimizer.zero_grad()
        loss = _batch_loss(batch)

        # Backward pass with gradient-norm clipping, then the parameter update
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        losses.append(loss.item())
        loop.set_postfix(loss=loss.item())

    # Mean training loss for this epoch
    train_mean_loss = sum(losses) / len(losses)
    train_losses.append(train_mean_loss)

    # Record the learning rate in effect for this epoch. A separate name is
    # used so the `learning_rate` hyperparameter is not overwritten.
    current_lr = optimizer.param_groups[0]['lr']
    learning_rates.append(current_lr)

    # The plateau scheduler monitors the mean training loss
    scheduler.step(train_mean_loss)

    # ---- Validation pass ----
    model.eval()
    val_losses_epoch = []
    with torch.no_grad():
        for val_batch in tqdm(val_iter, total=len(val_iter)):
            val_losses_epoch.append(_batch_loss(val_batch).item())

    val_mean_loss = sum(val_losses_epoch) / len(val_losses_epoch)
    val_losses.append(val_mean_loss)

    # Save a checkpoint whenever this epoch has the lowest validation loss so far
    loss_tracker.append(val_mean_loss)
    if save_model and val_mean_loss == np.min(loss_tracker):
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        torch.save(checkpoint, 'best_model.pth')

    print(f"Epoch [{epoch + 1}/{num_epochs}]: train_loss={train_mean_loss}; val_loss={val_mean_loss}; "
          f"learning_rate={current_lr}")

# Plot training and validation loss. The x-axis follows the number of epochs
# actually recorded, so the plot still works if `num_epochs` was edited afterwards.
epochs_run = range(1, len(train_losses) + 1)
plt.figure(figsize=(12, 6))
plt.plot(epochs_run, train_losses, label='Training Loss')
plt.plot(epochs_run, val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Over Epochs')
plt.legend()
plt.show()


  0%|                                                                                        | 0/13768 [00:01<?, ?it/s]


RuntimeError: mat1 dim 1 must match mat2 dim 0

In [None]:
# NOTE(review): this cell repeats the tokenizer definitions from cells In [3]
# and In [4] above; the later definitions shadow the earlier ones.

# spaCy pipeline used to split English text
spacy_eng = spacy.load("en_core_web_sm")


def tokenize_eng(text):
    """Tokenize ``text`` with the spaCy tokenizer and return the lowercased token strings."""
    lowered_tokens = []
    for token in spacy_eng.tokenizer(text):
        lowered_tokens.append(token.text.lower())
    return lowered_tokens


def tokenize_hindi(text):
    """Return the tokens that ``indic_tokenize.trivial_tokenize`` yields for ``text`` as a list."""
    return list(indic_tokenize.trivial_tokenize(text))

In [None]:
# Define fields for English and Hindi text
english_txt = Field(tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>")
hindi_txt = Field(tokenize=tokenize_hindi, init_token="<sos>", eos_token="<eos>")

# Load training and validation datasets.
# The path is relative, matching the loader in cell In [6], instead of a
# machine-specific absolute directory. The vocabularies rebuilt from this split
# must come from the same train.csv the model was trained on, otherwise token
# indices will not line up with the trained embeddings.
data_fields = [('eng_text', english_txt), ('hindi_text', hindi_txt)]
train_dt, val_dt = TabularDataset.splits(
    path='./',
    train='train.csv',
    validation='val.csv',
    format='csv',
    fields=data_fields,
)


In [None]:
def beam_search(sentence, model, src_field, src_tokenizer, trg_field, trg_vcb_sz, k, max_ts=50, device="cpu"):
    """Translate ``sentence`` with beam search and return the best hypothesis as a string.

    Args:
        sentence: Raw source-language text.
        model: Callable ``model(src, trg)`` taking ``(batch, src_len)`` and
            ``(batch, trg_len)`` index tensors and returning logits of shape
            ``(batch, trg_len, vocab)``.
        src_field: Source field exposing ``init_token``, ``eos_token`` and ``vocab.stoi``.
        src_tokenizer: Callable that splits ``sentence`` into tokens.
        trg_field: Target field exposing ``init_token``, ``eos_token``,
            ``vocab.stoi`` and ``vocab.itos``.
        trg_vcb_sz: Kept for backward compatibility only. The vocabulary size is
            read from the model output, so a stale value can no longer corrupt
            the beam/token index mapping.
        k: Beam width (number of hypotheses kept per step); must be >= 1.
        max_ts: Maximum number of decoding steps.
        device: Device on which the index tensors are created.

    Returns:
        The target tokens of the highest-scoring finished hypothesis joined by
        spaces (without ``<sos>``/``<eos>``). If no hypothesis emitted ``<eos>``
        within ``max_ts`` steps, the best unfinished hypothesis is returned.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Wrap the tokenized sentence in the source field's start/end markers
    src_tokens = [src_field.init_token, *src_tokenizer(sentence), src_field.eos_token]
    src_tok = torch.tensor(
        [src_field.vocab.stoi[token] for token in src_tokens], dtype=torch.long, device=device
    ).unsqueeze(0)

    sos_idx = trg_field.vocab.stoi[trg_field.init_token]
    eos_idx = trg_field.vocab.stoi[trg_field.eos_token]

    # Start from a single hypothesis holding only <sos> with log-probability 0.
    # The first step then needs no special case and any k >= 1 works.
    live_seqs = torch.tensor([[sos_idx]], dtype=torch.long, device=device)
    live_scores = torch.zeros(1, device=device)
    # (score, sequence) pairs; a list keeps hypotheses whose scores tie
    finished = []

    # Decode in eval mode and restore the previous mode afterwards
    was_training = bool(getattr(model, "training", False))
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_ts):
                n_live = live_seqs.shape[0]
                logits = model(src_tok.expand(n_live, -1), live_seqs)[:, -1, :]
                log_probs = torch.log_softmax(logits, dim=-1)
                vocab_size = log_probs.shape[-1]

                # Score every (hypothesis, next-token) pair and keep the best k
                candidates = (live_scores.unsqueeze(1) + log_probs).reshape(-1)
                top = torch.topk(candidates, k=min(k, candidates.numel()))
                # Flat index -> (row in live_seqs, token id)
                beam_idx = top.indices // vocab_size
                token_idx = top.indices % vocab_size

                live_seqs = torch.cat([live_seqs[beam_idx], token_idx.unsqueeze(1)], dim=1)
                live_scores = top.values

                # Hypotheses that just produced <eos> leave the beam
                ended = token_idx == eos_idx
                if ended.any():
                    finished.extend(zip(live_scores[ended].tolist(), live_seqs[ended]))
                    live_seqs = live_seqs[~ended]
                    live_scores = live_scores[~ended]

                if len(finished) >= k or live_seqs.shape[0] == 0:
                    break
    finally:
        if was_training:
            model.train()

    if finished:
        best_translation = max(finished, key=lambda scored: scored[0])[1]
    else:
        # topk returns candidates in descending order, so row 0 is the best live one
        best_translation = live_seqs[0]

    # Drop the leading <sos> and any <eos> before mapping indices back to tokens
    return " ".join(trg_field.vocab.itos[idx] for idx in best_translation[1:].tolist() if idx != eos_idx)


In [None]:
# Rebuild both vocabularies from the training split with the same size and
# frequency limits for the source and target languages.
for text_field in (english_txt, hindi_txt):
    text_field.build_vocab(train_dt, max_size=10000, min_freq=2)

In [None]:
# Sample English sentence to translate
sample_sentence = "Indian government takes good care of the minorities"

# Run beam search to get the translation
translated_text = beam_search(
    sentence=sample_sentence,
    model=model,  # Make sure 'model' is your trained model
    src_field=english_txt,
    src_tokenizer=tokenize_eng,
    trg_field=hindi_txt,
    # Use the real target vocabulary size: it includes the special tokens, so
    # it is larger than the max_size=10000 cap passed to build_vocab.
    trg_vcb_sz=len(hindi_txt.vocab),
    k=5,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

# Display the translated text
print("English Sentence:", sample_sentence)
print("Translated Hindi Sentence:", translated_text)
