In [None]:
# Import all the required libraries
import torch
import torch.nn as nn
import torch.optim as optim
import random
import unicodedata
import string
import os 
import pandas as pd
import wandb

In [None]:
# Pick the compute device once: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Echo the selected device as the cell output.
device

In [None]:
# Function to extend the index2char & char2index dictionaries from one data split.
# It reads the train, validation or test csv and creates (input word, output word) pairs.
def preprocessData(currdir, lang_chosen, index2char, char2index, data_type = 'train'):
    """Load one split of the transliteration data and grow the vocabulary.

    Args:
        currdir: Directory that contains one sub-directory per language.
        lang_chosen: Language code; the file read is
            ``<currdir>/<lang_chosen>/<lang_chosen>_<split>.csv``.
        index2char: Index -> character dictionary, extended IN PLACE.
        char2index: Character -> index dictionary, extended IN PLACE.
        data_type: ``"test"`` selects the test file, ``"val"`` the validation
            file; any other value selects the training file.

    Returns:
        A tuple ``(char_count, char2index, index2char, maxlength_input,
        maxlength_output, pair_list)`` where ``char_count`` maps every character
        seen in this split to its number of occurrences, the two ``maxlength``
        values are the longest input / output word of the split and
        ``pair_list`` is a list of ``(input_word, output_word)`` tuples.
    """
    # Map the requested split onto the file-name suffix used on disk.
    if data_type == "test":
        split = 'test'
    elif data_type == "val":
        split = 'valid'
    else:
        split = 'train'
    path = os.path.join(currdir, lang_chosen, lang_chosen + '_' + split + '.csv')

    # dtype=str + keep_default_na=False: words such as "null" or "nan" must stay
    # text; pandas would otherwise turn them into float NaN and len(word) would fail.
    data = pd.read_csv(path, names=['input', 'output'], dtype=str, keep_default_na=False)
    input_words = data['input'].to_list()
    output_words = data['output'].to_list()
    # Create pairs of words.
    pair_list = list(zip(input_words, output_words))

    # Reserved tokens already expected in the dictionaries passed in:
    # 0 -> SOS ('<')
    # 1 -> EOS ('>')
    # 2 -> PAD ('.')
    char_count = {}

    def _scan(words):
        """Count characters, register unseen ones and return the longest word length."""
        longest = 0
        for word in words:
            longest = max(longest, len(word))
            for char in word:
                # .get() also covers the reserved symbols, which are in the
                # vocabulary but have no count yet.
                char_count[char] = char_count.get(char, 0) + 1
                if char not in char2index:
                    new_index = len(char2index)
                    char2index[char] = new_index
                    index2char[new_index] = char
        return longest

    # Input words are scanned before output words so indices are assigned in that order.
    maxlength_input = _scan(input_words)
    maxlength_output = _scan(output_words)
    return char_count, char2index, index2char, maxlength_input, maxlength_output, pair_list
    
    

In [None]:
# Translate a word into the list of char2index indices of its characters.
def word2vec(char2index, word):
    """Return ``[char2index[c] for c in word]``.

    Raises:
        KeyError: if a character of ``word`` is missing from ``char2index``.
    """
    return [char2index[symbol] for symbol in word]

In [None]:
# Build the index tensor of a word: character indices, then EOS_token, then PAD_token filler.
def create_vec(char2index, maxlength, word):
    """Encode ``word`` as a ``torch.LongTensor``.

    The tensor holds the character indices of ``word``, one ``EOS_token`` and
    then ``maxlength - len(word)`` copies of ``PAD_token`` (none when the word
    is longer than ``maxlength``). No SOS token is added here.
    """
    indices = word2vec(char2index, word)
    # A negative repeat count yields an empty list, so long words get no padding.
    filler = [PAD_token] * (maxlength - len(word))
    return torch.LongTensor(indices + [EOS_token] + filler)
# Function to turn every (english, hindi) word pair into a pair of index tensors.
def create_vec_pair(char2index, maxlength, pair_list):
    """Apply ``create_vec`` to both words of each pair, keeping the list order.

    Returns:
        A list of ``(first_word_vec, second_word_vec)`` tuples, one per entry
        of ``pair_list``.
    """
    return [
        (
            create_vec(char2index, maxlength, pair[0]),
            create_vec(char2index, maxlength, pair[1]),
        )
        for pair in pair_list
    ]

In [None]:
# Encoder of the seq2seq model; the recurrent cell can be an RNN, a GRU or an LSTM.
class EncoderRNN(nn.Module):
    """Character-level encoder: embedding -> dropout -> recurrent layer.

    Args:
        hidden_size: Size of the recurrent hidden state.
        batch_size: Fixed batch size; ``forward`` reshapes to it, so every batch
            fed to the encoder must contain exactly this many sequences.
        bidirectional: Whether the recurrent layer is bidirectional.
        embedding_size: Size of the character embedding.
        dropout: Dropout probability applied to the embeddings and between
            stacked recurrent layers.
        cell_type: ``'GRU'`` or ``'LSTM'``; any other value selects ``nn.RNN``.
        num_layers_encoder: Number of stacked recurrent layers.
        input_size: Vocabulary size (number of embedding rows).
    """
    def __init__(self, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, input_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.bidirectional = bidirectional
        self.embedding_size = embedding_size
        self.cell_type = cell_type
        self.embedding = nn.Embedding(input_size, self.embedding_size)
        self.dropout = nn.Dropout(dropout)

        # Plain RNN unless a GRU or LSTM is requested.
        recurrent_cls = {'GRU': nn.GRU, 'LSTM': nn.LSTM}.get(self.cell_type, nn.RNN)
        # Inter-layer dropout has no effect with a single layer and only triggers
        # a PyTorch warning there, so it is switched off in that case.
        layer_dropout = dropout if num_layers_encoder > 1 else 0.0
        self.cell_layer = recurrent_cls(
            self.embedding_size,
            self.hidden_size,
            num_layers=num_layers_encoder,
            dropout=layer_dropout,
            bidirectional=bidirectional,
        )

    def forward(self, input, hidden):
        """Run one time step.

        Args:
            input: Character indices for the current step, one per batch element.
            hidden: Recurrent state (a ``(h, c)`` tuple for an LSTM).

        Returns:
            ``(output, hidden)`` as produced by the recurrent layer.
        """
        embedded = self.embedding(input)
        # The recurrent layer expects (seq_len=1, batch, embedding).
        embedded = self.dropout(embedded.view(1, self.batch_size, -1))
        output, hidden = self.cell_layer(embedded, hidden)
        return output, hidden

    def initHidden(self , num_layers):
        """Return an all-zero state of shape (num_layers * directions, batch_size, hidden_size).

        The tensor is created on the same device as the encoder's parameters.
        """
        num_directions = 2 if self.bidirectional else 1
        return torch.zeros(
            num_layers * num_directions,
            self.batch_size,
            self.hidden_size,
            device=self.embedding.weight.device,
        )

class DecoderRNN(nn.Module):
    """Character-level decoder: embedding -> dropout -> ReLU -> recurrent layer -> linear -> log-softmax.

    Args:
        batch_size: Fixed batch size; ``forward`` reshapes to it, so every batch
            fed to the decoder must contain exactly this many sequences.
        hidden_size: Size of the recurrent hidden state.
        bidirectional: Whether the recurrent layer is bidirectional.
        embedding_size: Size of the character embedding.
        dropout: Dropout probability applied to the embeddings and between
            stacked recurrent layers.
        cell_type: ``'GRU'`` or ``'LSTM'``; any other value selects ``nn.RNN``.
        num_layers_decoder: Number of stacked recurrent layers.
        output_size: Vocabulary size (embedding rows and output classes).
    """
    def __init__(self, batch_size, hidden_size , bidirectional, embedding_size, dropout, cell_type, num_layers_decoder, output_size):
        super(DecoderRNN, self).__init__()
        # Taking the values of all the hyperparameters from the input.
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.embedding_size = embedding_size
        self.cell_type = cell_type
        self.embedding = nn.Embedding(output_size, self.embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.num_layers_decoder = num_layers_decoder

        # Plain RNN unless a GRU or LSTM is requested.
        recurrent_cls = {'GRU': nn.GRU, 'LSTM': nn.LSTM}.get(self.cell_type, nn.RNN)
        # Inter-layer dropout has no effect with a single layer and only triggers
        # a PyTorch warning there, so it is switched off in that case.
        layer_dropout = dropout if self.num_layers_decoder > 1 else 0.0
        self.cell_layer = recurrent_cls(
            self.embedding_size,
            self.hidden_size,
            num_layers=self.num_layers_decoder,
            dropout=layer_dropout,
            bidirectional=self.bidirectional,
        )

        # A bidirectional layer concatenates both directions, doubling the feature size.
        num_directions = 2 if self.bidirectional else 1
        self.out = nn.Linear(self.hidden_size * num_directions, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Args:
            input: Character indices fed at this step, one per batch element.
            hidden: Recurrent state (a ``(h, c)`` tuple for an LSTM).

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has one row of
            log-probabilities over the vocabulary per batch element.
        """
        # Creating embedding and then computing output from it.
        embedded = self.embedding(input)
        output = self.dropout(embedded.view(1, self.batch_size, -1))
        # Using the non-linear ReLU function to add non-linearity.
        output = nn.functional.relu(output)
        # Computing output for the configured gru, rnn or lstm.
        output, hidden = self.cell_layer(output, hidden)
        # output[0] drops the length-1 sequence dimension before the projection.
        output = self.softmax(self.out(output[0]))
        return output, hidden

    # Function to create a hidden state which is initialised with all zeros.
    def initHidden(self):
        """Return zeros of shape (num_layers_decoder * directions, batch_size, hidden_size).

        The tensor is created on the same device as the decoder's parameters.
        """
        num_directions = 2 if self.bidirectional else 1
        return torch.zeros(
            num_directions * self.num_layers_decoder,
            self.batch_size,
            self.hidden_size,
            device=self.embedding.weight.device,
        )



In [None]:
# Function to train the model on one batch, using teacher forcing for about half of the batches.
def train(input_tensor, output_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size_repeat, max_length):
    """Run one optimisation step on a single batch and return its average loss.

    Args:
        input_tensor: Batch of encoded input words; dimensions 0 and 1 are
            swapped so that dimension 0 can be iterated step by step
            (presumably (batch, seq) on entry — confirm against the loader).
        output_tensor: Batch of encoded target words, same layout.
        encoder, decoder: Models exposing ``initHidden`` / ``__call__`` as used below.
        encoder_optimizer, decoder_optimizer: Optimizers stepped once each.
        criterion: Loss applied to every decoder step.
        batch_size: Number of sequences in the batch (length of the SOS input).
        cell_type: ``"LSTM"`` makes the initial state an ``(h, c)`` tuple.
        num_layers_encoder: Passed to ``encoder.initHidden``.
        embedding_size_repeat: Unused. The original signature listed
            ``embedding_size`` twice (a SyntaxError); the slot is kept under a
            new name so positional callers keep working.
        hidden_size, bidirectional, embedding_size, dropout, num_layers_decoder,
            learning_rate, max_length: Accepted for call compatibility; unused here.

    Returns:
        The summed step loss divided by the target length, as a Python float
        (``0.0`` when the target has no steps).
    """
    teacher_forcing_ratio = 0.5
    # For GRU & RNN the state is a single tensor ...
    encoder_hidden = encoder.initHidden(num_layers_encoder)
    # ... for LSTM it is a (hidden, cell) pair.
    if cell_type == "LSTM":
        encoder_cell_state = encoder.initHidden(num_layers_encoder)
        encoder_hidden = (encoder_hidden, encoder_cell_state)

    # Swap the first two dimensions so that indexing dimension 0 yields one
    # character position for every word of the batch.
    input_tensor = input_tensor.transpose(0, 1)
    output_tensor = output_tensor.transpose(0, 1)

    # Clear the gradients accumulated by the previous batch.
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    output_length = output_tensor.size(0)
    if output_length == 0:
        # Nothing to predict: there is no loss tensor to back-propagate.
        return 0.0

    for step in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[step], encoder_hidden)

    # Every sequence starts decoding from SOS, on the same device as the data.
    decoder_input = torch.LongTensor([SOS_token] * batch_size).to(input_tensor.device)

    # NOTE(review): the encoder state is handed to the decoder unchanged, which
    # presumably requires both to use the same number of layers/directions — confirm.
    decoder_hidden = encoder_hidden
    # Using the teacher forcing ratio of 50 %: decided once per batch.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for step in range(output_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, output_tensor[step])
        if use_teacher_forcing:
            # Feed the ground-truth character as the next input.
            decoder_input = output_tensor[step]
        else:
            # Feed the model's own best guess; .data keeps it out of the graph.
            topv, topi = decoder_output.data.topk(1)
            decoder_input = topi.view(-1)

    # Back-propagate and update the weights of both models.
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    # Returning the average loss per target position.
    return loss.item() / output_length


In [None]:
# Function to calculate the word-level accuracy and the accumulated loss of the trained model.
def evaluate(encoder, decoder, loader, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size_repeat, criterion, max_length, index2char):
    """Greedy-decode every batch of ``loader`` and score the predictions.

    A word counts as correct when the predicted characters, with every SOS /
    EOS / PAD token removed, equal the target characters filtered the same way.

    Args:
        encoder, decoder: Models exposing ``initHidden`` / ``__call__`` as used
            below. They are switched to eval mode for the duration of the call
            and restored to their previous mode afterwards.
        loader: Iterable of ``(batch_x, batch_y)`` index tensors.
        batch_size: Number of sequences per batch (length of the SOS input).
        cell_type: ``"LSTM"`` makes the initial state an ``(h, c)`` tuple.
        num_layers_encoder: Passed to ``encoder.initHidden``.
        criterion: Loss applied to every decoder step.
        index2char: Index -> character dictionary used to rebuild the words.
        embedding_size_repeat: Unused. The original signature listed
            ``embedding_size`` twice (a SyntaxError); the slot is kept under a
            new name so positional callers keep working.
        hidden_size, bidirectional, embedding_size, dropout, num_layers_decoder,
            learning_rate, max_length: Accepted for call compatibility; unused here.

    Returns:
        ``(accuracy, loss)``: accuracy in percent over all evaluated words
        (``0.0`` for an empty loader) and the SUM over batches of the
        per-position average loss (callers divide by the number of batches).
    """
    special_tokens = {SOS_token, EOS_token, PAD_token}
    loss = total = correct = 0

    # Run on whatever device the model lives on (CPU for parameter-free models).
    first_param = next(encoder.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    # Dropout must be inactive while scoring; remember the modes to restore them.
    encoder_was_training, decoder_was_training = encoder.training, decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            for batch_x, batch_y in loader:
                batch_x, batch_y = batch_x.to(model_device), batch_y.to(model_device)

                encoder_hidden = encoder.initHidden(num_layers_encoder)
                if cell_type == "LSTM":
                    encoder_cell_state = encoder.initHidden(num_layers_encoder)
                    encoder_hidden = (encoder_hidden, encoder_cell_state)

                # Swap dimensions so that dimension 0 walks over character positions.
                input_variable = batch_x.transpose(0, 1)
                output_variable = batch_y.transpose(0, 1)
                input_length = input_variable.size(0)
                target_length = output_variable.size(0)
                if target_length == 0:
                    continue

                for i in range(input_length):
                    encoder_output, encoder_hidden = encoder(input_variable[i], encoder_hidden)
                # Passing the last state of the encoder to the first step of the decoder.
                decoder_hidden = encoder_hidden
                decoder_input = torch.LongTensor([SOS_token] * batch_size).to(model_device)

                batch_loss = 0
                predicted_steps = []
                for j in range(target_length):
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                    batch_loss += criterion(decoder_output, output_variable[j])
                    # Greedy decoding: the most likely character is the next input.
                    topv, topi = decoder_output.data.topk(1)
                    decoder_input = topi.view(-1)
                    predicted_steps.append(decoder_input.tolist())

                # predicted_steps is position-major; zip(*...) regroups it per word.
                for predicted_row, target_row in zip(zip(*predicted_steps), batch_y.tolist()):
                    predicted_word = [index2char[t] for t in predicted_row if t not in special_tokens]
                    target_word = [index2char[t] for t in target_row if t not in special_tokens]
                    if predicted_word == target_word:
                        correct += 1
                    total += 1
                loss += batch_loss.item() / target_length
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    # Computing the accuracy once all batches are seen; guard the empty loader.
    accuracy = (correct / total) * 100 if total else 0.0
    return accuracy, loss


In [None]:
# Define the main function to run the training loop
def trainIters(encoder, decoder,train_Loader, val_Loader, max_length, max_of_all, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size_repeat, n_iters, index2char):
    """Train for ``n_iters`` epochs, validating and logging to wandb after each one.

    Args:
        encoder, decoder: Models to optimise (one NAdam optimizer each).
        train_Loader, val_Loader: Iterables of ``(x, y)`` batches.
        max_length: Forwarded to ``train``.
        max_of_all: Forwarded to ``evaluate`` as its ``max_length``.
        hidden_size ... learning_rate: Hyperparameters forwarded to ``train`` /
            ``evaluate`` and used to build the wandb run name.
        embedding_size_repeat: Unused. The original signature listed
            ``embedding_size`` twice (a SyntaxError); the slot is kept under a
            new name so positional callers keep working.
        n_iters: Number of epochs.
        index2char: Index -> character dictionary forwarded to ``evaluate``.

    Side effects:
        Prints per-epoch metrics, logs ``val_accuracy`` / ``val_loss`` /
        ``train_loss`` to wandb and renames the active wandb run.
    """
    # using Nadam as optimizer for learning
    optimizer_encoder = optim.NAdam(encoder.parameters(), lr = learning_rate)
    optimizer_decoder = optim.NAdam(decoder.parameters(), lr = learning_rate)
    # using negative log likelihood loss to compute loss.
    criterion = nn.NLLLoss()

    for epoch in range(1, n_iters + 1):
        print('Epoch No : ', epoch)
        # Make sure dropout is active for the training pass of every epoch.
        encoder.train()
        decoder.train()
        train_loss = 0

        for x, y in train_Loader:
            # transferring the x & y to the selected device.
            x, y = x.to(device), y.to(device)
            loss = train(x, y, encoder, decoder, optimizer_encoder, optimizer_decoder, criterion, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size, max_length)
            train_loss += loss
        # computing the validation accuracy and loss.
        val_accur, val_loss = evaluate(encoder, decoder, val_Loader, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size, criterion, max_of_all, index2char)
        print("Val Accuracy", val_accur, "Val Loss", val_loss/len(val_Loader),'Train Loss:' , train_loss / len(train_Loader))
        # storing the val_accur and val_loss for the plots.
        wandb.log({"val_accuracy" :val_accur, "val_loss" :val_loss/len(val_Loader), "train_loss" : train_loss / len(train_Loader)})

    # The run name is built from this function's own arguments; the original read
    # a `configuration` dict that does not exist in this scope.
    run_name = "bs_{}_emSz_{}_nEn_{}_nDec_{}_hl_{}_ct_{}_biDir_{}_lr_{}_drp_{}".format(batch_size, embedding_size, num_layers_encoder, num_layers_decoder, hidden_size, cell_type, bidirectional, learning_rate, dropout)
    # Assigning the name is enough; the argument-less wandb.run.save() call is deprecated.
    wandb.run.name = run_name


In [None]:
def run_sweep_without_attention() :
    """Train the attention-free seq2seq model once, with wandb tracking.

    The hyperparameters default to ``configuration`` below. When the function is
    started by a wandb sweep agent, the values chosen by the sweep are read back
    from ``wandb.config`` and take the place of the defaults.

    Relies on the module-level names ``currdir``, ``lang_chosen``,
    ``index2char``, ``char2index`` and ``device``. The two vocabulary
    dictionaries are extended in place by ``preprocessData``.
    """
    # Default hyperparameters for a stand-alone run.
    configuration = {
        'embedding_size' : 256,
        'cell_type' : 'LSTM',
        'hidden_size' : 128,
        'batch_size' : 64,
        'bi_directional' : False,
        'drop_out' : 0.0,
        'num_layers_encoder' : 1,
        'num_layers_decoder' : 1,
        'learning_rate' : 0.001,
        # NOTE(review): the original passed an undefined name `epochs`;
        # 10 is a placeholder default — set the intended number of epochs.
        'epochs' : 10,
    }
    # Initialise the run first so that sweep-selected values are visible below.
    wandb.init(project = 'deep_learning_assignment3', entity = 'cs22m081', config = configuration)
    config = dict(wandb.config)

    # storing all the values of hyperparameters in the appropriate variables
    batch_size = config['batch_size']
    embedding_size = config['embedding_size']
    cell_type = config['cell_type']
    hidden_size = config['hidden_size']
    # Accept the sweep spelling 'bidirectional' as well as 'bi_directional'.
    bi_directional = config.get('bidirectional', config['bi_directional'])
    drop_out = config['drop_out']
    num_layers_encoder = config['num_layers_encoder']
    num_layers_decoder = config['num_layers_decoder']
    learning_rate = config['learning_rate']
    epochs = config['epochs']

    # preprocessing of train , test & val dataset.
    # The results are bound to new local names: assigning to `char2index` /
    # `index2char` here would make them local and break the reads below.
    char_count, vocab_char2index, vocab_index2char, maxlength_input, maxlength_output, pair_list = preprocessData(currdir, lang_chosen, index2char, char2index, "train")
    val_char_count, vocab_char2index, vocab_index2char, val_maxlength_input, val_maxlength_output, val_pair_list = preprocessData(currdir, lang_chosen, vocab_index2char, vocab_char2index, "val")
    test_char_count, vocab_char2index, vocab_index2char, test_maxlength_input, test_maxlength_output, test_pair_list = preprocessData(currdir, lang_chosen, vocab_index2char, vocab_char2index, "test")
    maxlength = max(maxlength_input, maxlength_output) + 2
    # computing the maximum length of all the words.
    max_of_all = max([maxlength_input, maxlength_output, val_maxlength_input, val_maxlength_output, test_maxlength_input, test_maxlength_output])
    vec_pair_list = create_vec_pair(vocab_char2index, maxlength, pair_list)
    val_vec_pair_list = create_vec_pair(vocab_char2index, max_of_all, val_pair_list)
    test_vec_pair_list = create_vec_pair(vocab_char2index, max_of_all, test_pair_list)

    # Both models must live on the same device as the batches used for training.
    encoder = EncoderRNN(hidden_size, batch_size, bi_directional, embedding_size, drop_out, cell_type, num_layers_encoder, len(vocab_char2index)).to(device)
    decoder = DecoderRNN(batch_size, hidden_size , bi_directional, embedding_size, drop_out, cell_type, num_layers_decoder, len(vocab_char2index)).to(device)

    # Creating batches for each of the train, test & validation dataset.
    # drop_last=True: the models reshape every batch to exactly `batch_size`
    # sequences, so a shorter final batch cannot be processed (up to
    # batch_size - 1 samples per split are therefore skipped).
    train_loader = torch.utils.data.DataLoader(vec_pair_list, batch_size=batch_size, shuffle=True, drop_last=True)
    # No shuffling for validation / test so the evaluated subset is the same every epoch.
    val_loader = torch.utils.data.DataLoader(val_vec_pair_list, batch_size=batch_size, shuffle=False, drop_last=True)
    test_loader = torch.utils.data.DataLoader(test_vec_pair_list, batch_size=batch_size, shuffle=False, drop_last=True)

    # Training the model
    trainIters(encoder, decoder, train_loader, val_loader, maxlength, max_of_all, hidden_size, batch_size, bi_directional, embedding_size, drop_out, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size, epochs, vocab_index2char)


In [None]:
# Module-level parameters and special-token ids.
hidden_size = 256
MAX_LENGTH = 10
SOS_token, EOS_token, PAD_token = 0, 1, 2
# Location of the dataset: <currdir>/<lang_chosen>/...
lang_chosen = 'hin'
currdir = '/kaggle/input/akshantar-original/'
# Initial vocabulary holds only the special tokens:
# 0 -> SOS ('<')
# 1 -> EOS ('>')
# 2 -> PAD ('.')
char2index = {'<' : 0, '>' : 1, '.' : 2 }
# The reverse mapping is derived from char2index, giving {0:'<', 1:'>', 2:'.'}.
index2char = {position: symbol for symbol, position in char2index.items()}
## Run once with the default configuration.
run_sweep_without_attention()

In [None]:
# Sweep configuration: Bayesian search that maximises the validation accuracy.
sweep_config = {
    'method': 'bayes',
    # The metric name must be a key that the training loop logs with wandb.log;
    # the loop logs "val_accuracy".
    'metric': {
        'name': 'val_accuracy',
        'goal': 'maximize',
    },
    'parameters': {
        'hidden_size': {
            'values': [128, 256, 512]
        },
        'learning_rate': {
            'values': [1e-2, 1e-3]
        },
        'cell_type': {
            'values': ['LSTM', 'RNN', 'GRU']
        },
        'num_layers_encoder': {
            'values': [1, 2, 3]
        },
        'num_layers_decoder': {
            'values': [1, 2, 3]
        },
        'drop_out': {
            'values': [0.0, 0.2, 0.3]
        },
        'embedding_size': {
            'values': [64, 128, 256, 512]
        },
        'batch_size': {
            'values': [32, 64, 128]
        },
        # Spelled like the 'bi_directional' key of the run configuration.
        'bi_directional': {
            'values': [True, False]
        },
    },
}


In [None]:
# Register the sweep defined by sweep_config with wandb ...
sweep_id = wandb.sweep(
    sweep_config,
    entity="cs22m081",
    project="deep_learning_assignment3",
)
# ... and let an agent execute run_sweep_without_attention for 50 runs.
wandb.agent(sweep_id, run_sweep_without_attention, count = 50)

In [None]:
# Function to predict the words for each of the inputs in the loader and save the result in prediction.csv
def inference(encoder, decoder, loader, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size_repeat, max_length, index2char):
    """Greedy-decode every batch of ``loader`` and write the results to ``prediction.csv``.

    Args:
        encoder, decoder: Models exposing ``initHidden`` / ``__call__`` as used
            below. They are switched to eval mode for the duration of the call
            and restored to their previous mode afterwards.
        loader: Iterable of ``(batch_x, batch_y)`` index tensors.
        batch_size: Number of sequences per batch (length of the SOS input).
        cell_type: ``"LSTM"`` makes the initial state an ``(h, c)`` tuple.
        num_layers_encoder: Passed to ``encoder.initHidden``.
        index2char: Index -> character dictionary used to rebuild the words.
        embedding_size_repeat: Unused. The original signature listed
            ``embedding_size`` twice (a SyntaxError); the slot is kept under a
            new name so positional callers keep working.
        hidden_size, bidirectional, embedding_size, dropout, num_layers_decoder,
            learning_rate, max_length: Accepted for call compatibility; unused here.

    Returns:
        A DataFrame with the columns ``input``, ``pred`` and ``output`` holding
        one row per word of every batch; SOS / EOS / PAD tokens are removed from
        all three strings. The same frame is written to ``prediction.csv`` in
        the current working directory.
    """
    special_tokens = {SOS_token, EOS_token, PAD_token}
    predictions = {
        "input" : [],
        "pred" : [],
        "output" : []
    }

    def _decode(indices):
        """Join the characters of ``indices``, skipping the special tokens."""
        return "".join(index2char[t] for t in indices if t not in special_tokens)

    # Run on whatever device the model lives on (CPU for parameter-free models).
    first_param = next(encoder.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    # Dropout must be inactive while predicting; remember the modes to restore them.
    encoder_was_training, decoder_was_training = encoder.training, decoder.training
    encoder.eval()
    decoder.eval()
    try:
        # torch.no_grad() disables gradient tracking while predicting.
        with torch.no_grad():
            for batch_x, batch_y in loader:
                batch_x, batch_y = batch_x.to(model_device), batch_y.to(model_device)

                encoder_hidden = encoder.initHidden(num_layers_encoder)
                if cell_type == "LSTM":
                    encoder_cell_state = encoder.initHidden(num_layers_encoder)
                    encoder_hidden = (encoder_hidden, encoder_cell_state)

                # Swap dimensions so that dimension 0 walks over character positions.
                input_variable = batch_x.transpose(0, 1)
                output_variable = batch_y.transpose(0, 1)
                input_length = input_variable.size(0)
                target_length = output_variable.size(0)

                for i in range(input_length):
                    encoder_output, encoder_hidden = encoder(input_variable[i], encoder_hidden)

                decoder_hidden = encoder_hidden
                decoder_input = torch.LongTensor([SOS_token] * batch_size).to(model_device)

                predicted_steps = []
                for j in range(target_length):
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                    # Greedy decoding: the most likely character is the next input.
                    topv, topi = decoder_output.data.topk(1)
                    decoder_input = topi.view(-1)
                    predicted_steps.append(decoder_input.tolist())

                # storing all the words from the current batch in the predictions
                # dictionary; this has to happen inside the batch loop, otherwise
                # only the last batch would be kept.
                predictions["input"].extend(_decode(row) for row in batch_x.tolist())
                # predicted_steps is position-major; zip(*...) regroups it per word.
                predictions["pred"].extend(_decode(row) for row in zip(*predicted_steps))
                predictions["output"].extend(_decode(row) for row in batch_y.tolist())
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    predict = pd.DataFrame(predictions)
    # creating a csv file.
    predict.to_csv("prediction.csv")
    return predict

In [None]:
# Calling the inference function to create the csv file (the function defined in this notebook is `inference`, not `infer`).
# NOTE(review): encoder, decoder, test_loader, batch_size, bidirectional, embedding_size, dropout, cell_type,
# num_layers_encoder, num_layers_decoder, learning_rate and maxlength are only created as locals inside
# run_sweep_without_attention(); they have to be available at module scope before this cell can run.
pred = inference(encoder, decoder, test_loader, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size , maxlength, index2char)