# IMPORTS

In [21]:
# Import necessary libraries
import numpy as np
import csv
import torch
import pandas as pd
import torch.nn as nn
import os
import heapq
from tqdm import tqdm
import torch.optim as optim
import random
import math
import torch.nn.functional as F
import warnings
import wandb
from torch.nn.utils import clip_grad_norm_
warnings.filterwarnings("ignore")

In [22]:
# Check if CUDA is available
val=torch.cuda.is_available()
if val == 1:
    
 # If CUDA is available, use it as the device    
    device= torch.device('cuda')
else:
    
# If GPU is also unavailable, default to CPU
    device = torch.device('gpu')
    

In [23]:
# !wandb login f63b589f2c31fd6562d752168e172d22870ab562

# PREPROCESSING

In [24]:
# This function loads and preprocesses the dataset for training a sequence-to-sequence model.
def loadData(params):
    
    # Define path to dataset based on configuration
    data_path = params['dataset_path']
    
    # Open data files for training, validation, and testing
    tr_data = csv.reader(open(data_path + '/hin/hin_train.csv',encoding='utf8'))
    vl_data = csv.reader(open(data_path + '/hin/hin_valid.csv',encoding='utf8'))
    tt_data = csv.reader(open(data_path + '/hin/hin_test.csv',encoding='utf8'))
    
    # Initialize empty lists to store data
    tr_translations = []
    tt_words=[]
    vl_translations = []
    vl_words=[]
    tr_words =[]
    tt_translations = []
    pad=''
    start='$'
    end ='&'
    
    # Load training data
    train_data_list = list(tr_data)
    train_len = len(train_data_list)
    i = 0
    while i < train_len:
        pair = train_data_list[i]
        tr_words.append(pair[0] + end)
        tr_translations.append(start + pair[1] + end)
        i += 1  
    
    # Load validation data 
    i=0
    val_data_list = list(vl_data)
    val_len = len(val_data_list)
    while i < val_len :
        pair=val_data_list[i]
        vl_words.append(pair[0]+end)
        vl_translations.append(start+pair[1]+end)
        i+=1
    
    # Load validation data   
    i=0
    test_data_list = list(tt_data)
    test_len = len(test_data_list)
    while i < test_len :
        pair=test_data_list[i]
        tt_words.append(pair[0]+end)
        tt_translations.append(start+pair[1]+end)
        i+=1   
        
 
    # Convert lists to NumPy arrays for better performance
    tt_words =np.array(tt_words)
    tr_translations = np.array(tr_translations)
    vl_translations =np.array(vl_translations)
    tr_words = np.array(tr_words)
    tt_translations = np.array(tt_translations)
    vl_words =np.array(vl_words)
    
    
    # Build input and output vocabularies
    output_vocab,input_vocab = set() , set()
    
    # Add characters from train_words to input_vocab
    i = 0
    word_len=len(tr_words)
    while i < word_len :
        word = tr_words[i]
        char_index = 0
        while char_index < len(word):
            character = word[char_index]
            input_vocab.add(character)
            char_index += 1
        i += 1         
    
    # Add characters from val_words to input_vocab
    i = 0
    word_len=len(vl_words)
    while i < word_len :
        word = vl_words[i]
        char_index = 0
        while char_index < len(word):
            character = word[char_index]
            input_vocab.add(character)
            char_index += 1
        i += 1
        
    # Add characters from test_words to input_vocab
    i = 0
    word_len=len(tt_words)
    while i < word_len :
        word = tt_words[i]
        char_index = 0
        while char_index < len(word):
            character = word[char_index]
            input_vocab.add(character)
            char_index += 1
        i += 1
    
    # Add characters from train_translations, val_translations, and test_translations to output_vocab
    i = 0
    word_len=len(tr_translations)
    while i < word_len :
        word = tr_translations[i]
        char_index = 0
        while char_index < len(word):
            character = word[char_index]
            output_vocab.add(character)
            char_index += 1
        i += 1
        
    i = 0
    word_len=len(vl_translations)
    while i < word_len :
        word = vl_translations[i]
        char_index = 0
        while char_index < len(word):
            character = word[char_index]
            output_vocab.add(character)
            char_index += 1
        i += 1
        
    i = 0
    word_len=len(tt_translations)
    while i < word_len :
        word = tt_translations[i]
        char_index = 0
        while char_index < len(word):
            character = word[char_index]
            output_vocab.add(character)
            char_index += 1
        i += 1
    # Remove special tokens from output_vocab
    output_vocab.remove(start)
    input_vocab.remove(end)
    output_vocab.remove(end)
    
    # Sort vocabularies and add special tokens
    output_vocab= [pad, start, end] + list(sorted(output_vocab))
    input_vocab = [pad, start, end] + list(sorted(input_vocab))
            
    # Create index mappings for vocabularies
    output_index,input_index = {char: idx for idx, char in enumerate(output_vocab)},{char: idx for idx, char in enumerate(input_vocab)}
    output_index_rev,input_index_rev = {idx: char for char, idx in output_index.items()},{idx: char for char, idx in input_index.items()}
    
    # Determine maximum sequence length
    max_len = max(max([len(word) for word in np.hstack((tr_words, tt_words, vl_words))]), max([len(word) for word in np.hstack((tr_translations, vl_translations, tt_translations))]))
    
    # Prepare preprocessed data dictionary
    preprocessed_data = {
        'SOS' : start,
        'EOS' : end,
        'PAD' : pad,
        'train_words' : tr_words,
        'train_translations' : tr_translations,
        'val_words' : vl_words,
        'val_translations' : vl_translations,
        'test_words' : tt_words,
        'test_translations' : tt_translations,
        'max_enc_len' : max([len(word) for word in np.hstack((tr_words, tt_words, vl_words))]),
        'max_dec_len' : max([len(word) for word in np.hstack((tr_translations, vl_translations, tt_translations))]),
        'max_len' : max_len,
        'input_index' : input_index,
        'output_index' : output_index,
        'input_index_rev' : input_index_rev,
        'output_index_rev' : output_index_rev
    }
    return preprocessed_data

In [25]:
def create_tensor(preprocessed_data):
    
    # Extract max sequence length and the number of training examples
    prop_data=preprocessed_data['max_len']
    leng=len(preprocessed_data['train_words'])
    d_type='int64'
    
    # Initialize arrays for data
    input_data = np.zeros((prop_data,leng), dtype = d_type)
    output_data = np.zeros((prop_data,leng), dtype = d_type)
    leng=len(preprocessed_data['val_words'])
    vl_input_data = np.zeros((prop_data,leng), dtype = d_type)
    vl_output_data = np.zeros((prop_data,leng), dtype = d_type)
    leng=len(preprocessed_data['test_words'])
    tt_input_data = np.zeros((prop_data,leng), dtype = d_type)
    tt_output_data = np.zeros((prop_data,leng), dtype = d_type)
    
    # Fill in training data arrays
    idx = 0
    while idx < len(preprocessed_data['train_words']):
        w = preprocessed_data['train_words'][idx]
        t = preprocessed_data['train_translations'][idx]

        i = 0
        while i < len(w):
            char = w[i]
            input_data[i, idx] = preprocessed_data['input_index'][char]
            i += 1

        i = 0
        while i < len(t):
            char = t[i]
            output_data[i, idx] = preprocessed_data['output_index'][char]
            i += 1
        idx += 1            
        

    # Fill in validation data arrays        
    idx = 0
    while idx < len(preprocessed_data['val_words']):
        w = preprocessed_data['val_words'][idx]
        t = preprocessed_data['val_translations'][idx]

        i = 0
        while i < len(w):
            char = w[i]
            vl_input_data[i, idx] = preprocessed_data['input_index'][char]
            i += 1

        i = 0
        while i < len(t):
            char = t[i]
            vl_output_data[i, idx] = preprocessed_data['output_index'][char]
            i += 1
        idx += 1            
        
    # Fill in test data arrays        
    idx = 0
    while idx < len(preprocessed_data['test_words']):
        w = preprocessed_data['test_words'][idx]
        t = preprocessed_data['test_translations'][idx]

        i = 0
        while i < len(w):
            char = w[i]
            tt_input_data[i, idx] = preprocessed_data['input_index'][char]
            i += 1

        i = 0
        while i < len(t):
            char = t[i]
            tt_output_data[i, idx] = preprocessed_data['output_index'][char]
            i += 1
        idx += 1            
        
            
    # Convert NumPy arrays to PyTorch tensors        
    output_data=torch.tensor(output_data, dtype = torch.int64)
    input_data = torch.tensor(input_data,dtype = torch.int64)
    vl_output_data=torch.tensor(vl_output_data, dtype = torch.int64)
    vl_input_data = torch.tensor(vl_input_data,dtype = torch.int64)
    tt_output_data=torch.tensor(tt_output_data, dtype = torch.int64)
    tt_input_data= torch.tensor(tt_input_data,dtype = torch.int64)
    
    #Store tensors in a dictionary
    tensors = {
        'input_data' : input_data,
        'output_data' : output_data,
        'val_input_data' : vl_input_data,
        'val_output_data' : vl_output_data, 
        'test_input_data' : tt_input_data,
        'test_output_data' : tt_output_data
    }
    return tensors

# ENCODER RG

In [26]:
# Encoder module for a sequence-to-sequence model.
class Encoder(nn.Module): 
    
    #Initializes the Encoder module.
    def __init__(self, params, preprocessed_data):
        super(Encoder, self).__init__()
        
        # Extract parameters
        self.cell_type = params['cell_type']
        self.dropout = nn.Dropout(params['dropout'])
        
        # Embedding layer
        self.embedding = nn.Embedding(len(preprocessed_data['input_index']), params['embedding_size'])
        
        # RNN or GRU cell based on cell_type
        if self.cell_type == 'RNN':
            self.cell = nn.RNN(params['embedding_size'], params['hidden_size'], params['num_layers_enc'], dropout = params['dropout'], bidirectional = params['bi_dir'])
        elif self.cell_type == 'GRU':
            self.cell = nn.GRU(params['embedding_size'], params['hidden_size'], params['num_layers_enc'], dropout = params['dropout'], bidirectional = params['bi_dir'])
    #Forward pass of the Encoder    
    def forward(self, x):
        
        # Embedding layer
        drop_par = self.embedding(x)
        # Pass through RNN/GRU cell
        _ , hidden = self.cell(self.dropout(drop_par))
        
        # Return hidden state
        return hidden

# DECODER RG

In [27]:
#Decoder module for a sequence-to-sequence model.
class Decoder(nn.Module):
    
    #Initializes the Decoder module
    def __init__(self, params, preprocessed_data):
        super(Decoder, self).__init__()
        
        # Extract parameters
        self.cell_type = params['cell_type']
        self.dropout = nn.Dropout(params['dropout'])
        self.embedding = nn.Embedding(len(preprocessed_data['output_index']), params['embedding_size'])
        
        # RNN or GRU cell based on cell_type
        if self.cell_type == 'RNN':
            self.cell = nn.RNN(params['embedding_size'], params['hidden_size'], params['num_layers_dec'], dropout = params['dropout'], bidirectional = params['bi_dir'])
        elif self.cell_type == 'GRU':
            self.cell = nn.GRU(params['embedding_size'], params['hidden_size'], params['num_layers_dec'], dropout = params['dropout'], bidirectional = params['bi_dir'])
        
        # Fully connected layer for output prediction
        self.fc = nn.Linear(params['hidden_size'] * 2 if params['bi_dir'] == True else params['hidden_size'], len(preprocessed_data['output_index']))
    
    #Forward pass of the Decoder.
    def forward(self, x, hidden, cell):
        
        # Embedding layer
        emb = self.embedding(x.unsqueeze(0))
        outputs, hidden = self.cell(self.dropout(emb), hidden)
        
        # Predictions with fully connected layer
        pred = self.fc(outputs).squeeze(0)
        
        # Return predictions and updated hidden state
        return pred, hidden

# SEQ 2 SEQ RG

In [28]:
#Sequence-to-sequence model consisting of an Encoder and a Decoder.
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, params,  preprocessed_data):
        #Initializes the Seq2Seq model
        super(Seq2Seq, self).__init__()
        
        # Extract parameters
        self.cell_type = params['cell_type']
        self.decoder, self.encoder  = decoder, encoder
        self.output_index_len = len(preprocessed_data['output_index'])
        self.tfr = params['teacher_fr']
    
    #Forward pass of the Seq2Seq model
    def forward(self, source, target):
        
        # Extract batch size and target sequence length
        bs, target_len = source.shape[1], target.shape[0]
        x = target[0]
        outputs = torch.zeros(target_len, bs, self.output_index_len).to(device)
        
        # Encode the source sequence to obtain the initial hidden state
        hidden = self.encoder(source)
        
        # Iterate over each step in the target sequence
        for t in range(1, target_len):
            output, hidden = self.decoder(x, hidden, None)
            
            # Store the decoder output in the outputs tensor
            outputs[t], best_guess = output, output.argmax(1)
            
            # Determine the next input (x) for the decoder
            x = best_guess if random.random() >= self.tfr else target[t]
            
        # Return the predicted outputs from the decoder    
        return outputs

# ENCODER LSTM

In [29]:
# Encoder module using LSTM (Long Short-Term Memory) cells.
class Encoder_LSTM(nn.Module): 
    #Initializes the Encoder_LSTM module.
    def __init__(self, params, preprocessed_data):
        super(Encoder_LSTM, self).__init__()
        # Initialize dropout layer
        self.dropout = nn.Dropout(params['dropout'])
        
        # Initialize embedding layer
        self.embedding = nn.Embedding(len(preprocessed_data['input_index']), params['embedding_size'])
        
        # Initialize LSTM cell
        self.cell = nn.LSTM(params['embedding_size'], params['hidden_size'], params['num_layers_enc'], dropout = params['dropout'], bidirectional = params['bi_dir'])
    
    #Forward pass of the Encoder_LSTM    
    def forward(self, x):
        # Embedding layer
        drop_par = self.embedding(x)
        # applying dropout to the embedding layer
        outputs , (hidden, cell) = self.cell(self.dropout(drop_par))
        
        # Return hidden and cell states
        return hidden, cell

# DECODER LSTM

In [30]:
#Decoder module using LSTM (Long Short-Term Memory) cells.
class Decoder_LSTM(nn.Module):
    
    #Initializes the Decoder_LSTM module
    def __init__(self, params, preprocessed_data):
        super(Decoder_LSTM, self).__init__()
        
        # Dropout layer to randomly zero out input elements to prevent overfitting
        self.dropout = nn.Dropout(params['dropout'])
        
        # Embedding layer to convert output tokens into dense vectors
        self.embedding = nn.Embedding(len(preprocessed_data['output_index']), params['embedding_size'])
        
        # LSTM cell for sequence decoding
        self.cell =  nn.LSTM(params['embedding_size'], params['hidden_size'], params['num_layers_dec'], dropout = params['dropout'], bidirectional = params['bi_dir'])
        self.fc = nn.Linear(params['hidden_size'] *  2 if params['bi_dir'] == True else params['hidden_size'], len(preprocessed_data['output_index']))
    
    #Forward pass of the Decoder_LSTM.
    def forward(self, x, hidden, cell):
        
        # Embedding layer: maps input token to dense vector
        emb = self.embedding(x.unsqueeze(0))
        
        # Pass the embedded and dropout-processed input through the LSTM cell
        outputs , (hidden, cell) = self.cell(self.dropout(emb), (hidden, cell))
        
        # Predictions with fully connected layer
        pred  = self.fc(outputs).squeeze(0)
        
        # Apply log softmax activation to obtain output probabilities
        pred = F.log_softmax(pred, dim = 1)
        
        # Return predicted output probabilities, updated hidden, and cell states
        return pred, hidden, cell

# SEQ 2 SEQ LSTM

In [31]:
#Sequence-to-sequence model using LSTM cells for both encoding and decoding.
class Seq2Seq_LSTM(nn.Module):
    def __init__(self, encoder, decoder, params,  preprocessed_data):
        super(Seq2Seq_LSTM, self).__init__()
        
        # Store references to encoder, decoder, and other attributes
        self.cell_type = params['cell_type']
        self.decoder, self.encoder  = decoder, encoder
        self.output_index_len = len(preprocessed_data['output_index'])
        self.tfr = params['teacher_fr']
    
    #Forward pass of the Seq2Seq_LSTM model.
    def forward(self, source, target):
        
        # Extract batch size and target sequence length
        batch_size, target_len = source.shape[1], target.shape[0]
        
        # Initial input to the decoder (start token)
        x = target[0]
        outputs = torch.zeros(target_len, batch_size, self.output_index_len).to(device)
        
        # Encode the source sequence to obtain initial hidden and cell state
        hidden, cell = self.encoder(source)
        
        # Iterate over each step in the target sequence
        for t in range(1, target_len):
            
            # Pass input (x), hidden, and cell states to the decoder
            output, hidden, cell = self.decoder(x, hidden, cell)
            
            # Store the decoder output in the outputs tensor
            outputs[t], best_guess = output, output.argmax(1)
            
            # Determine the next input (x) for the decoder using teacher forcing strategy
            x = best_guess if random.random() >= self.tfr else target[t]
        
        # Return the predicted outputs from the decoder for each time step
        return outputs

# GET OPTIMIZERS

In [32]:
# Function to get the optimizer based on specified parameters
def get_optim(model, params):
    # Extract the optimizer type from params and convert to lowercase
    val = params['optimizer'].lower()
    
    if  val== 'sgd':
        # Use Stochastic Gradient Descent (SGD) optimizer
        opt = optim.SGD(model.parameters(), lr = params['learning_rate'], momentum = 0.9)
    
    if val == 'adagrad':
        # Use adagrad optimizer
        opt = optim.Adagrad(model.parameters(), lr = params['learning_rate'], lr_decay = 0, weight_decay = 0, initial_accumulator_value = 0, eps = 1e-10)
    
    if val == 'adam':
        # Use adam optimizer
        opt = optim.Adam(model.parameters(), lr = params['learning_rate'], betas = (0.9, 0.999), eps = 1e-8)
    
    if val == 'rmsprop':
        # Use rmsprop optimizer
        opt = optim.RMSprop(model.parameters(), lr = params['learning_rate'], alpha = 0.99, eps = 1e-8)
    
    return opt

# BEAM SEARCH

In [33]:
#Beam search function to generate predictions using a sequence-to-sequence model
def beam_search(model, word, preprocessed_data, bw, lp, ct):
    
    # Prepare input data tensor for the model
    val=preprocessed_data['max_len']+1
    data = np.zeros((val, 1), dtype=np.int32)
    
    # Map input character to index
    idx = 0
    while idx < len(word):
        char = word[ idx ]
        data[ idx , 0 ] = preprocessed_data['input_index'][char]
        idx += 1
    
    # Append EOS token index to indicate end of input sequence
    data[idx , 0] = preprocessed_data['input_index'][preprocessed_data['EOS']]
    
    # Convert input data to torch tensor and move to appropriate device
    data = torch.tensor(data, dtype=torch.int32).to(device)
    
    # Encode the input sequence to obtain initial hidden state
    with torch.no_grad():
        # For RNN encoder, obtain only hidden state
        val = ct !='LSTM' 
        if  val :
            hidden = model.encoder(data)
        else:
           # For LSTM encoder, obtain both hidden and cell states 
           hidden, cell = model.encoder(data)
    hidden_par = hidden.unsqueeze(0)
    
    # Reshape the SOS token index for initializing the sequence
    out_reshape = np.array(preprocessed_data['output_index'][preprocessed_data['SOS']]).reshape(1,)
    initial_seq = torch.tensor(out_reshape).to(device)
    
    # Initialize beam with the initial sequence and its score
    beam = [(0.0, initial_seq, hidden_par)]
    
    # Beam search loop to generate sequences
    i = 0
    leng=len(preprocessed_data['output_index'])
    while i < leng:
        candidates = []
        index = 0
        while index < len(beam):
            score, seq, hidden = beam[index]
            
            # Check if sequence ends with EOS token
            val=seq[-1].item() == preprocessed_data['output_index'][preprocessed_data['EOS']]
            if val:
                candidates.append((score, seq, hidden))
                index+=1
                continue
            
            # Prepare input token for the decoder based on the last token of the sequence
            reshape_last = np.array(seq[-1].item()).reshape(1,)
            hdn = hidden.squeeze(0)
            x = torch.tensor(reshape_last).to(device)
            
            # Decode the input token to get output probabilities and updated hidden state
            val= ct == 'LSTM'
            if val!=1:
                output ,  hidden = model.decoder(x, hdn, None)
            else:
                output, hidden , cell = model.decoder(x, hdn, cell)
            val=F.softmax(output, dim=1)
            
            # Apply softmax to obtain probabilities over output tokens
            topk_probs , topk_tokens = torch.topk(val, k=bw)
            
            # Generate candidate sequences based on top-k tokens
            ii = 0
            while ii < len(topk_probs[0]):
                prob = topk_probs[0][ii]
                token = topk_tokens[0][ii]
                new_seq = torch.cat((seq, token.unsqueeze(0)), dim=0)
                
                # Calculate length normalization factor (penalty) for the new sequence
                ln_ns = len(new_seq)
                ln_pf = ((ln_ns - 1) / 5)
                candidate_score = score + torch.log(prob).item() / (ln_pf ** lp)
                
                # Append candidate (score, sequence, hidden state) to candidates list
                candidates.append((candidate_score, new_seq, hidden.unsqueeze(0)))
                ii += 1
            index += 1
        # Select top beam width candidates based on scores    
        beam = heapq.nlargest(bw, candidates, key=lambda x: x[0])
        i += 1
    m = max(beam, key=lambda x: x[0]) 
    _, best_sequence, _ = m
    
    # Convert predicted sequence tokens to characters and concatenate them
    pred = ''.join([preprocessed_data['output_index_rev'][token.item()] for token in best_sequence[1:]])
    
    # Return the predicted sequence (excluding the EOS token)
    return pred[:-1]

# TRAINING FUNCTION

In [34]:
# Function to train the model
def train(model, crit, optimizer, preprocessed_data, tensors, params):
    val=1
    bs='batch_size'
    # Split the input and output data into batches
    tr_result = torch.split(tensors['output_data'], params[bs], dim = val)
    tr_data=torch.split(tensors['input_data'], params[bs], dim = val)
    vl_result= torch.split(tensors['val_output_data'], params[bs], dim=val)
    vl_data=torch.split(tensors['val_input_data'], params[bs], dim=val)
    
    # Loop through epochs
    epoch = 0
    while epoch < params['num_epochs'] :
        epoch +=1
        
        # Initialize counters for metrics
        correct_prediction,total_loss,total_words = 0,0,0
        model.train()
        leng=len(tr_data)
        
        # Use tqdm for progress visualization during training
        val='Training'
        with tqdm(total = leng, desc = val) as pbar:
            index = 0
            lenn = len(tr_data)
            
            # Loop through each batch in training data
            while index < lenn:
                # Move input and target data to device (e.g., GPU)
                y = tr_result[index]
                x = tr_data[index] 
                inp_data = x.to(device)
                target= y.to(device) 
                optimizer.zero_grad()
                output = model(inp_data, target)
                
                # Reshape target and output for loss calculation
                target = target.reshape(-1)
                output = output.reshape(-1, output.shape[2])
                
                # Create a mask to ignore padding tokens
                pad_mask = (target != preprocessed_data['output_index'][preprocessed_data['PAD']])
                output = output[pad_mask]
                target = target[pad_mask]
                
                # Compute loss and perform backpropagation
                loss = crit(output, target)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
                optimizer.step()
                
                # Update metrics
                total_loss = total_loss +loss.item()
                total_words = total_words + target.size(0)
                correct_prediction = correct_prediction + torch.sum(torch.argmax(output, dim=1) == target).item()

                index += 1
                pbar.update(1)
        
        # Calculate training accuracy and loss
        cal=correct_prediction / total_words
        train_accuracy = cal * 100
        len_train=len(tr_data)
        train_loss = total_loss / len_train
        model.eval()
        
        # Evaluate model on validation data
        with torch.no_grad():
            val_total_words,val_correct_pred,val_total_loss = 0,0,0
            with tqdm(total = len(vl_data), desc = 'Validation') as pbar:
                
                index = 0
                lenn=len(vl_data)
                # Loop through each batch in validation data
                while index < lenn:
                    
                    y_val =vl_result[index]
                    x_val= vl_data[index]
                    # Move validation input and target data to device
                    inp_data_val = x_val.to(device)
                    target_val=y_val.to(device)
                    
                    # Forward pass through the model for validation
                    output_val = model(inp_data_val, target_val)
                    target_val = target_val.reshape(-1)
                    output_val = output_val.reshape(-1, output_val.shape[2])
                    
                    # Create mask to ignore padding tokens
                    pad_mask = (target_val != preprocessed_data['output_index'][preprocessed_data['PAD']])
                    output_val = output_val[pad_mask]
                    target_val = target_val[pad_mask]
                    
                    # Calculate validation loss and metrics
                    val_loss = crit(output_val, target_val)
                    val_total_loss = val_total_loss+ val_loss.item()
                    val_total_words = val_total_words+ target_val.size(0)
                    val_correct_pred = val_correct_pred+ torch.sum(torch.argmax(output_val, dim=1) == target_val).item()
                    index += 1
                    pbar.update(1)
            # Calculate validation accuracy and loss        
            cal=val_correct_pred / val_total_words        
            val_accuracy = cal * 100
            lengg=len(vl_data)
            val_loss = val_total_loss / lengg
            
            # Evaluate model using beam search and calculate word-level accuracy
            correct_prediction = 0
            total_words = len(preprocessed_data['val_words'])
            with tqdm(total = total_words, desc = 'Beam') as pbar_:
                index = 0
                # Loop through each word in validation set for beam search evaluation
                while index < len(preprocessed_data['val_words']):
                    word, translation = preprocessed_data['val_words'][index], preprocessed_data['val_translations'][index]
                    ans = beam_search(model, word, preprocessed_data, params['beam_width'], params['length_penalty'], params['cell_type'])
                    val= translation[1:-1]
                    # Check if beam search translation matches reference translation
                    if ans == val:
                        correct_prediction = correct_prediction +1

                    index += 1
                    pbar_.update(1)
        # Calculate word-level accuracy using beam search            
        cal=correct_prediction / total_words
        val_accuracy_beam = cal * 100
        
        # Print and log results
        print(f'''Epoch : {epoch}
              Train Accuracy Char Level : {train_accuracy:.4f}, Train Loss : {train_loss:.4f}
              Validation Accuracy Char Level : {val_accuracy:.4f}, Validation Loss : {val_loss:.4f}
              Validation Accuracy Word Level : {val_accuracy_beam:.4f},  Correctly predicted : {correct_prediction}/{total_words}''')
        if params['w_log']:
            wandb.log(
                    {
                        'epoch': epoch,
                        'training_loss' : train_loss,
                        'training_accuracy_char' : train_accuracy,
                        'validation_loss' : val_loss,
                        'validation_accuracy_char' : val_accuracy,
                        'validation_accuracy_word' : val_accuracy_beam,
                        'correctly_predicted' : correct_prediction
                    }
                )
    
    # Return the trained model and validation accuracies
    return model, val_accuracy, val_accuracy_beam

# HYPERPARAMETERS

In [35]:
#providing the parameters for the function
params = {
#     'dataset_path' : r'C:\Users\gragh\OneDrive\Desktop\Codes\CS6910 DL\Assignment 3\DataSet\aksharantar_sampled',
    'dataset_path' : '/kaggle/input/dl-ass-3/aksharantar_sampled',
    'embedding_size': 256,
    'hidden_size': 512,
    'num_layers_enc': 3,
    'num_layers_dec': 3,
    'cell_type': 'LSTM',
    'dropout': 0.3,
    'optimizer' : 'adagrad',
    'learning_rate': 0.01,
    'batch_size': 32,
    'num_epochs': 10,
    'teacher_fr' : 0.7,
    'length_penalty' : 0.6,
    'beam_width': 1,
    'bi_dir' : True,
    'w_log' : 0
}

# QUESTION 1 : TRAINING MODEL

In [36]:
# Load preprocessed data based on specified parameters
preprocessed_data = loadData(params)

# Create tensors from the preprocessed data
tensors = create_tensor(preprocessed_data)

# Initialize the model based on the cell type specified in parameters
if params['cell_type'] == 'LSTM':
    # Use LSTM-based encoder, decoder, and Seq2Seq model
    encoder = Encoder_LSTM(params, preprocessed_data).to(device)
    decoder = Decoder_LSTM(params, preprocessed_data).to(device)
    model = Seq2Seq_LSTM(encoder, decoder, params, preprocessed_data).to(device) 
else:
    #Use RNN-based encoder, decoder, and Seq2Seq model 
    encoder = Encoder(params, preprocessed_data).to(device)
    decoder = Decoder(params, preprocessed_data).to(device)
    model = Seq2Seq(encoder, decoder, params, preprocessed_data).to(device)  
# print(model)

# Define the criterion (loss function) for training
crit = nn.CrossEntropyLoss(ignore_index = 0)

# Get the optimizer based on specified parameters
opt = get_optim(model,params)

# Initialize Weights & Biases (wandb) if logging is enabled
if params['w_log']:
    # Set the name of the run based on the model and training parameters
    wandb.init(project = 'DL-Assignment-3')
    wandb.run.name = f"c:{params['cell_type']}_e:{params['num_epochs']}_es:{params['embedding_size']}_hs:{params['hidden_size']}_nle:{params['num_layers_enc']}_nld:{params['num_layers_dec']}_o:{params['optimizer']}_lr:{params['learning_rate']}_bs:{params['batch_size']}_tf:{params['teacher_fr']}_lp:{params['length_penalty']}_b:{params['bi_dir']}_bw:{params['beam_width']}"
    
# Train the model using the defined criterion, optimizer, and other parameters
trained_model, _, _ = train(model, crit, opt, preprocessed_data, tensors, params)

# Finish Weights & Biases logging if enabled
if params['w_log']:
    wandb.finish()

Training: 100%|██████████| 1600/1600 [02:38<00:00, 10.11it/s]
Validation: 100%|██████████| 128/128 [00:04<00:00, 31.49it/s]
Beam: 100%|██████████| 4096/4096 [00:47<00:00, 85.98it/s]


Epoch : 1
              Train Accuracy Char Level : 55.3069, Train Loss : 1.6294
              Validation Accuracy Char Level : 70.4199, Validation Loss : 1.0789
              Validation Accuracy Word Level : 28.3203,  Correctly predicted : 1160/4096


Training: 100%|██████████| 1600/1600 [02:37<00:00, 10.17it/s]
Validation: 100%|██████████| 128/128 [00:04<00:00, 31.52it/s]
Beam: 100%|██████████| 4096/4096 [00:47<00:00, 86.23it/s]


Epoch : 2
              Train Accuracy Char Level : 72.5094, Train Loss : 0.9928
              Validation Accuracy Char Level : 72.9877, Validation Loss : 1.0035
              Validation Accuracy Word Level : 34.3506,  Correctly predicted : 1407/4096


Training: 100%|██████████| 1600/1600 [02:37<00:00, 10.19it/s]
Validation: 100%|██████████| 128/128 [00:04<00:00, 31.65it/s]
Beam: 100%|██████████| 4096/4096 [00:47<00:00, 86.50it/s]


Epoch : 3
              Train Accuracy Char Level : 75.7889, Train Loss : 0.8885
              Validation Accuracy Char Level : 74.5673, Validation Loss : 0.9567
              Validation Accuracy Word Level : 37.8906,  Correctly predicted : 1552/4096


Training: 100%|██████████| 1600/1600 [02:37<00:00, 10.15it/s]
Validation: 100%|██████████| 128/128 [00:04<00:00, 31.45it/s]
Beam: 100%|██████████| 4096/4096 [00:47<00:00, 86.55it/s]


Epoch : 4
              Train Accuracy Char Level : 77.8222, Train Loss : 0.8238
              Validation Accuracy Char Level : 75.2813, Validation Loss : 0.9337
              Validation Accuracy Word Level : 39.9414,  Correctly predicted : 1636/4096


Training: 100%|██████████| 1600/1600 [02:36<00:00, 10.20it/s]
Validation: 100%|██████████| 128/128 [00:04<00:00, 31.78it/s]
Beam: 100%|██████████| 4096/4096 [00:47<00:00, 86.65it/s]


Epoch : 5
              Train Accuracy Char Level : 79.2086, Train Loss : 0.7818
              Validation Accuracy Char Level : 75.3128, Validation Loss : 0.9476
              Validation Accuracy Word Level : 39.9658,  Correctly predicted : 1637/4096


Training: 100%|██████████| 1600/1600 [02:37<00:00, 10.15it/s]
Validation: 100%|██████████| 128/128 [00:04<00:00, 31.52it/s]
Beam: 100%|██████████| 4096/4096 [00:47<00:00, 86.39it/s]


Epoch : 6
              Train Accuracy Char Level : 80.4136, Train Loss : 0.7448
              Validation Accuracy Char Level : 75.6841, Validation Loss : 0.9323
              Validation Accuracy Word Level : 40.2588,  Correctly predicted : 1649/4096


Training: 100%|██████████| 1600/1600 [02:37<00:00, 10.14it/s]
Validation: 100%|██████████| 128/128 [00:04<00:00, 31.46it/s]
Beam: 100%|██████████| 4096/4096 [00:47<00:00, 85.79it/s]


Epoch : 7
              Train Accuracy Char Level : 81.4526, Train Loss : 0.7123
              Validation Accuracy Char Level : 75.9326, Validation Loss : 0.9266
              Validation Accuracy Word Level : 41.2842,  Correctly predicted : 1691/4096


Training: 100%|██████████| 1600/1600 [02:37<00:00, 10.17it/s]
Validation: 100%|██████████| 128/128 [00:04<00:00, 31.55it/s]
Beam: 100%|██████████| 4096/4096 [00:47<00:00, 85.76it/s]


Epoch : 8
              Train Accuracy Char Level : 82.2051, Train Loss : 0.6889
              Validation Accuracy Char Level : 75.5498, Validation Loss : 0.9479
              Validation Accuracy Word Level : 41.1865,  Correctly predicted : 1687/4096


Training: 100%|██████████| 1600/1600 [02:37<00:00, 10.14it/s]
Validation: 100%|██████████| 128/128 [00:03<00:00, 32.01it/s]
Beam: 100%|██████████| 4096/4096 [00:47<00:00, 86.54it/s]


Epoch : 9
              Train Accuracy Char Level : 83.0228, Train Loss : 0.6623
              Validation Accuracy Char Level : 75.6355, Validation Loss : 0.9552
              Validation Accuracy Word Level : 41.2354,  Correctly predicted : 1689/4096


Training: 100%|██████████| 1600/1600 [02:37<00:00, 10.15it/s]
Validation: 100%|██████████| 128/128 [00:04<00:00, 31.51it/s]
Beam: 100%|██████████| 4096/4096 [00:47<00:00, 86.36it/s]

Epoch : 10
              Train Accuracy Char Level : 83.6507, Train Loss : 0.6439
              Validation Accuracy Char Level : 76.2211, Validation Loss : 0.9361
              Validation Accuracy Word Level : 42.3096,  Correctly predicted : 1733/4096





# QUESTION 4 : EVALUATE MODEL

In [37]:
def evaluate(preprocessd_data,trained_model)

    # Set the trained model to evaluation mode
    trained_model.eval()

    # Initialize variables for tracking predictions and evaluation
    correct_prediction = 0
    words = []
    translations = []
    prediction = []
    results = []

    # Use tqdm to visualize progress during inference
    total_words = len(preprocessed_data['test_words'])
    with tqdm(total = total_words) as pbar_:

        # Loop through each word in the test set
        index = 0
        while index < len(preprocessed_data['test_words']):
            word, translation = preprocessed_data['test_words'][index], preprocessed_data['test_translations'][index]

            # Perform beam search to generate a translation using the trained model
            ans = beam_search(trained_model, word, preprocessed_data, params['beam_width'], params['length_penalty'], params['cell_type'])

            # Store the word (without end token), translation (without start/end tokens), and predicted translation
            words.append(word[:-1])
            translations.append(translation[1:-1])
            prediction.append(ans)

            # Check if the predicted translation matches the reference translation
            val= ans == translation[1:-1]
            if val!=1 :
                results.append('No')
            else:
                correct_prediction = correct_prediction + 1
                results.append('Yes')
            index += 1
            pbar_.update(1)

    # Calculate accuracy based on correct predictions  
    cal=correct_prediction / total_words    
    accuracy = cal * 100
    print(f'Test Accuracy Word Level : {accuracy}, Correctly Predicted : {correct_prediction}')

    # Prepare a dictionary for logging predictions
    log = {'Word': words, 'Translation' : translations, 'Prediction' : prediction, 'Result' : results}
    path = '/kaggle/working/predictions_vanilla.csv'

    # Create a DataFrame from the logging dictionary and save it as a CSV file
    data_frame = pd.DataFrame(log)
    data_frame.to_csv(path, header = True, index = False)

    # Optionally display the DataFrame (for debugging or verification)
    pd.DataFrame(log)

100%|██████████| 4096/4096 [00:50<00:00, 80.72it/s]

Test Accuracy Word Level : 40.4296875, Correctly Predicted : 1656





Unnamed: 0,Word,Translation,Prediction,Result
0,thermax,थरमैक्स,थर्मक्स,No
1,sikhaaega,सिखाएगा,सिखाएगा,Yes
2,learn,लर्न,लीर्न,No
3,twitters,ट्विटर्स,ट्विटर्स,Yes
4,tirunelveli,तिरुनेलवेली,तिरुनेलवेली,Yes
...,...,...,...,...
4091,saflata,सफ़लता,सफलता,No
4092,shbana,शबाना,शबाना,Yes
4093,khaatootolaa,खातूटोला,खाटूटोला,No
4094,shivastava,शिवास्तव,शिवास्तव,Yes


# QUESTION 2 : Tuning Hyperparameters

In [38]:
# # Define individual hyperparameter values or ranges
# ne= {'values': [10]}
# ct = {'values': ['RNN', 'LSTM', 'GRU']}
# es = {'values': [128, 256, 512]}
# nl= {'values': [1, 2, 3]}
# dp= {'values': [0.3, 0.5, 0.7]}
# lr= {'values': [0.001, 0.005, 0.01, 0.1]}
# bs= {'values': [32, 64]}
# lp= {'values': [0.4, 0.5, 0.6]}
# bi= {'values': [True, False]}
# hs= {'values': [128, 256, 512]}
# opt={'values' : ['adam', 'sgd', 'rmsprop', 'adagrad']}
# bw = {'values': [1, 2, 3]}
# tf ={'values': [0.3, 0.5, 0.7]}

# # Define the sweep configuration dictionary
# sweep_config = {
#             'name': 'sweep 1 and 1.1 : random',
#             'method': 'random',
#             'metric': { 'goal': 'maximize','name': 'Accuracy'},
#             'parameters': 
#                 {
#                     'num_epochs': ne,
#                     'cell_type': ct,
#                     'embedding_size': es,
#                     'hidden_size': hs,
#                     'num_layers': nl ,
#                     'dropout': dp,
#                     'optimizer' : opt,
#                     'learning_rate': lr,
#                     'batch_size': bs,
#                     'teacher_fr' : tf,
#                     'length_penalty' : lp,
#                     'bi_dir' :bi ,
#                     'beam_width': bw
#                 }
#             }

In [39]:
# # Function to run sweep
# def run_sweep():
#     #Initialize Weights & Biases run for experiment tracking
#     init = wandb.init(project = 'DL-Assignment-3')
#     config = init.config
#     # Define parameters based on the configuration from Weights & Biases
#     params = {
#         'language' : 'hin',
#         'dataset_path' : '/kaggle/input/dl-ass3/aksharantar_sampled',
#         'num_epochs': config.num_epochs,
#         'cell_type': config.cell_type,
#         'embedding_size': config.embedding_size,
#         'hidden_size': config.hidden_size,
#         'num_layers_enc': config.num_layers,
#         'num_layers_dec': config.num_layers,
#         'dropout': config.dropout,
#         'optimizer' : config.optimizer,
#         'learning_rate': config.learning_rate,
#         'batch_size': config.batch_size,
#         'teacher_fr' : config.teacher_fr,
#         'length_penalty' : config.length_penalty,
#         'bi_dir' : config.bi_dir,
#         'beam_width' : config.beam_width,
#         'w_log' : 1
#     }
    
#     # Set the name of the Weights & Biases run based on the experiment configuration
#     wandb.run.name = f"c:{params['cell_type']}_e:{params['num_epochs']}_es:{params['embedding_size']}_hs:{params['hidden_size']}_nle:{params['num_layers_enc']}_nld:{params['num_layers_dec']}_o:{params['optimizer']}_lr:{params['learning_rate']}_bs:{params['batch_size']}_tf:{params['teacher_fr']}_lp:{params['length_penalty']}_b:{params['bi_dir']}_bw:{params['beam_width']}"
#     preprocessed_data = loadData(params)
#     tensors = create_tensor(preprocessed_data)
    
#     # Initialize the decoder, encoder, and seq2seq model based on the parameters
#     decoder = Decoder(params, preprocessed_data).to(device)
#     encoder = Encoder(params, preprocessed_data).to(device)
#     model = Seq2Seq(encoder, decoder, params, preprocessed_data).to(device) 
    
#     # Define the loss function (criterion) and optimizer based on the model and parameters
#     crit = nn.CrossEntropyLoss(ignore_index = 0)
#     opt = get_optim(model,params)
    
#     # Perform training and obtain validation accuracy with beam search
#     _, _, v_acc_beam = train(model, crit, opt, preprocessed_data, tensors, params)
    
#     # Log the validation accuracy with beam search to Weights & Biases
#     wandb.log({'Accuracy': v_acc_beam})

In [40]:
# # Initiate a hyperparameter sweep and obtain the sweep ID
# sweep_id = wandb.sweep(sweep_config, project='DL-Assignment-3')

# # Run the hyperparameter sweep using the defined `run_sweep` function as an agent
# # `count=25` specifies the number of runs (trials) to execute for the sweep
# wandb.agent(sweep_id, run_sweep, count = 30)

# # Finish the Weights & Biases run after completing the hyperparameter sweep
# wandb.finish()