In [None]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import random
from datetime import datetime
import wandb
import requests, zipfile, io
import pandas

In [None]:
#Download the zipped dataset from the Google Drive link and unpack it
#Ref - https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url
url = "https://drive.google.com/u/0/uc?id=1uRKU4as2NlS9i8sdLRS1e326vQRdhvfw&export=download"
response = requests.get(url)
#The whole archive is held in memory; the context manager closes it once extraction is done
#Files are extracted into the current working directory
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall()

In [None]:
#Never commit an API key with the notebook: without an explicit key, wandb.login()
#picks the credential up from the WANDB_API_KEY environment variable (or prompts for it).
#NOTE(review): the key that used to be hardcoded here should be treated as leaked and revoked.
wandb.login()

In [None]:
#Load the train / validation / test splits of the Hindi dataset
#Each file is read with the column labels 'English' and 'Hindi' supplied explicitly;
#the 'English' column is used as model input and the 'Hindi' column as target
csvFile = pandas.read_csv('/kaggle/working/aksharantar_sampled/hin/hin_train.csv', names = ['English', 'Hindi'])
train_input, train_output = csvFile['English'], csvFile['Hindi']

csvFile = pandas.read_csv('/kaggle/working/aksharantar_sampled/hin/hin_valid.csv', names = ['English', 'Hindi'])
valid_input, valid_output = csvFile['English'], csvFile['Hindi']

csvFile = pandas.read_csv('/kaggle/working/aksharantar_sampled/hin/hin_test.csv', names = ['English', 'Hindi'])
test_input, test_output = csvFile['English'], csvFile['Hindi']

In [None]:
#Reserved vocabulary indices used by both languages
#SOW -> start-of-word token, EOW -> end-of-word token
SOW_token, EOW_token = 0, 1


class Lang:
    """Character vocabulary of one language.

    Attributes:
        name: label of the language.
        char2index: maps a character to its integer index.
        index2char: inverse mapping; indices 0 and 1 are pre-filled with the
            placeholder characters "#" and "$" for the two reserved tokens.
        n_chars: number of indices handed out so far (reserved ones included).
    """

    def __init__(self, name):
        self.name = name
        self.char2index = {}
        #Indices 0 and 1 are reserved for the start-of-word / end-of-word tokens
        self.index2char = {0: "#", 1: "$"}
        self.n_chars = 2

    def addAllWords(self, words):
        """Register the characters of every word in the iterable `words`."""
        for single_word in words:
            self.addWord(single_word)

    def addWord(self, word):
        """Assign the next free index to each character of `word` not seen before."""
        for ch in word:
            if ch in self.char2index:
                continue
            next_index = self.n_chars
            self.char2index[ch] = next_index
            self.index2char[next_index] = ch
            self.n_chars = next_index + 1

In [None]:
#One vocabulary per language, built from the training split only:
#lang_input collects the English characters, lang_output the Hindi characters
lang_input, lang_output = Lang("English"), Lang("Hindi")
lang_input.addAllWords(train_input)
lang_output.addAllWords(train_output)

In [None]:
class EncoderRNN(nn.Module):
    """Encoder: embedding -> dropout -> multi-layer GRU / LSTM / RNN.

    Args:
        input_size: number of rows of the embedding table (source vocabulary size).
        hidden_size: size of the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
        dropout_p: dropout probability, applied to the embeddings and passed to the recurrent layer.
        batch_size: batch size the embedded input is reshaped to in forward().
        embedding_size: size of each embedding vector.
        cell_type: "GRU", "LSTM" or "RNN".
        bidirection: whether the recurrent layer is bidirectional.

    Raises:
        ValueError: if cell_type is not one of the supported values.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout_p, batch_size, embedding_size, cell_type = "LSTM", bidirection = False):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(dropout_p)
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.embedding_size = embedding_size
        self.cell_type = cell_type
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.bidirection = bidirection
        if(cell_type == "GRU"):
            self.gru = nn.GRU(embedding_size, hidden_size, num_layers, dropout = dropout_p, bidirectional = bidirection)
        elif(cell_type == "LSTM"):
            self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = dropout_p, bidirectional = bidirection)
        elif(cell_type == "RNN"):
            self.rnn = nn.RNN(embedding_size, hidden_size, num_layers, dropout = dropout_p, bidirectional = bidirection)
        else:
            #Without a recurrent layer forward() would silently hand back its input state
            raise ValueError("cell_type must be 'GRU', 'LSTM' or 'RNN', got {!r}".format(cell_type))

    def forward(self, input, hidden):
        """Encode a batch and return the final recurrent state.

        Args:
            input: tensor of character indices; after embedding it is viewed as
                (-1, batch_size, embedding_size).
            hidden: initial hidden state for GRU / RNN. It is not used for LSTM,
                whose layer is called without an initial state.

        Returns:
            hidden for GRU / RNN, the tuple (hidden, cell) for LSTM. Each state
            has shape (num_layers, batch_size, hidden_size), also when the
            encoder is bidirectional.
        """
        embedded = self.embedding(input).view(-1,self.batch_size, self.embedding_size)
        output = self.dropout(embedded)
        cell = None
        if(self.cell_type == "GRU"):
            _, hidden = self.gru(output, hidden)
        elif(self.cell_type == "LSTM"):
            _, (hidden, cell) = self.lstm(output)
        elif(self.cell_type == "RNN"):
            _, hidden = self.rnn(output, hidden)
        if self.bidirection:
            #The decoder is unidirectional, so the two directions of every layer are merged
            hidden = self._averageDirections(hidden)
            if(self.cell_type == "LSTM"):
                cell = self._averageDirections(cell)
        if self.cell_type == "LSTM":
            return hidden, cell
        else:
            return hidden

    def _averageDirections(self, state):
        """Average the forward and backward final states of each layer.

        PyTorch lays a bidirectional final state out layer-major, i.e. as
        (num_layers, num_directions, batch, hidden) flattened over the first two
        axes, so the direction axis is the second one after reshaping. The layer
        axis is kept even when num_layers is 1.
        """
        per_layer = state.reshape(state.size(0)//2, 2, state.size(1), state.size(2))
        return torch.add(per_layer[:, 0]*0.5, per_layer[:, 1]*0.5)

    def initHidden(self):
        """Return an all-zero initial hidden state on `device` (2*num_layers layers if bidirectional)."""
        if self.bidirection:
            return torch.zeros(2*self.num_layers, self.batch_size, self.hidden_size, device=device)
        else:
            return torch.zeros(self.num_layers, self.batch_size, self.hidden_size, device=device)

In [None]:
class DecoderRNN(nn.Module):
    """Decoder that predicts one output character per call.

    Pipeline: embedding -> dropout -> ReLU -> GRU / LSTM / RNN -> linear -> log-softmax.

    Args:
        hidden_size: size of the recurrent hidden state.
        output_size: number of output classes (rows of the embedding table).
        num_layers: number of stacked recurrent layers.
        dropout_p: dropout probability, applied to the embeddings and passed to the recurrent layer.
        batch_size: batch size the embedded input is reshaped to in forward().
        embedding_size: size of each embedding vector.
        cell_type: "GRU", "LSTM" or "RNN".
    """

    def __init__(self, hidden_size, output_size, num_layers, dropout_p, batch_size, embedding_size, cell_type = "LSTM"):
        super().__init__()
        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(dropout_p)
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.cell_type = cell_type
        rnn_args = (embedding_size, hidden_size, num_layers)
        if cell_type == "GRU":
            self.gru = nn.GRU(*rnn_args, dropout = dropout_p)
        elif cell_type == "LSTM":
            self.lstm = nn.LSTM(*rnn_args, dropout = dropout_p)
        elif cell_type == "RNN":
            self.rnn = nn.RNN(*rnn_args, dropout = dropout_p)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run a single decoding step.

        Args:
            input: indices of the previous characters, one per batch element.
            hidden: recurrent state; for LSTM a pair (hidden, cell).

        Returns:
            (log_probs, hidden) for GRU / RNN and (log_probs, hidden, cell) for
            LSTM, where log_probs is the log-softmax over the output classes
            computed from the first time step of the recurrent output.
        """
        step_input = self.embedding(input).view(-1, self.batch_size, self.embedding_size)
        step_input = torch.relu(self.dropout(step_input))
        if self.cell_type == "LSTM":
            output, (hidden, cell) = self.lstm(step_input, (hidden[0], hidden[1]))
            return self.softmax(self.out(output[0])), hidden, cell
        output = step_input
        if self.cell_type == "GRU":
            output, hidden = self.gru(step_input, hidden)
        elif self.cell_type == "RNN":
            output, hidden = self.rnn(step_input, hidden)
        return self.softmax(self.out(output[0])), hidden


In [None]:
def indexesFromWord(lang, word):
    """Translate every character of `word` into its index in `lang`.

    Args:
        lang: object exposing a `char2index` mapping.
        word: iterable of characters.

    Returns:
        List of indices, one per character, in order.

    Raises:
        KeyError: if a character is missing from `lang.char2index`.
    """
    lookup = lang.char2index
    indexes = []
    for ch in word:
        indexes.append(lookup[ch])
    return indexes

def tensorFromWord(lang, word):
    """Encode `word` as a column tensor of character indices terminated by EOW_token.

    Returns:
        torch.long tensor of shape (len(word) + 1, 1) placed on `device`.
    """
    #Index of every character, followed by the end-of-word marker
    char_ids = indexesFromWord(lang, word) + [EOW_token]
    column = torch.tensor(char_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)

In [None]:
#Train the model for one batch
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, batch_size, cell_type, teacher_forcing_ratio = 0.5):
    """Run one optimisation step on a single batch.

    Args:
        input_tensor: batch of source index sequences passed to the encoder.
        target_tensor: batch of target index sequences; target_tensor[di] holds
            the expected characters of decoding step di.
        encoder, decoder: the two halves of the model.
        encoder_optimizer, decoder_optimizer: optimizers stepped after backward().
        criterion: loss applied to the decoder output of every step.
        batch_size: number of words in the batch.
        cell_type: "LSTM" selects the code path that also carries a cell state.
        teacher_forcing_ratio: probability that the whole batch is decoded with
            teacher forcing (defaults to 0.5).

    Returns:
        The summed loss of all decoding steps divided by the target length.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    target_length = target_tensor.size(0)
    is_lstm = cell_type == "LSTM"

    #The final encoder state initialises the decoder state
    encoder_hidden = encoder.initHidden()
    if is_lstm:
        decoder_hidden, decoder_cell = encoder(input_tensor, encoder_hidden)
    else:
        decoder_hidden = encoder(input_tensor, encoder_hidden)

    #Every word starts decoding from the start-of-word token
    decoder_input = torch.tensor([SOW_token]*batch_size, device=device)

    #Decided once per batch, not once per decoding step
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for di in range(target_length):
        if is_lstm:
            decoder_output, decoder_hidden, decoder_cell = decoder(decoder_input, (decoder_hidden, decoder_cell))
        else:
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            #Input to the next step is the actual character, not the predicted one
            decoder_input = target_tensor[di]
        else:
            #Input to the next step is the character the model just predicted;
            #detach() keeps the gradient from flowing through the chosen index
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
def getBatchedTensorFromWords(words, batch_size, lang):
    """Encode `words`, pad them to a common length and cut them into batches.

    All words are padded together, so every batch has the same number of rows
    (the longest encoded word of `words`). Each returned tensor holds one word
    per column; the last one has fewer than batch_size columns when the number
    of words is not a multiple of batch_size.

    Returns:
        List of 2-D tensors on `device`, one per batch.
    """
    word_columns = [tensorFromWord(lang, w) for w in words]
    #Pad each word so that length of each word becomes same as max length of all words
    #Ref -> https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_sequence.html
    padded = nn.utils.rnn.pad_sequence(word_columns)
    padded = padded.squeeze(2).to(device)
    n_words = padded.shape[1]
    return [padded[:, start:start+batch_size] for start in range(0, n_words, batch_size)]

In [None]:
def trainIters(encoder, decoder, n_datapoints, epochs, learning_rate, batch_size, embedding_size, cell_type, num_layers_encoder, num_layers_decoder, hidden_size, bidirectional, dropout_encoder, dropout_decoder):
    """Train the model, log metrics to wandb and report train / test accuracy.

    The data comes from the module-level train / valid / test series and the
    lang_input / lang_output vocabularies. `n_datapoints` is accepted but not
    used. The hyperparameter arguments other than epochs, learning_rate,
    batch_size and cell_type only contribute to the wandb run name.
    """
    run_name = "embS_{}_nlEnc_{}_nlDec_{}_hl_{}_cellType_{}_biDir_{}_dropEnc_{}_dropDec_{}_ep_{}_bs_{}".format(
        embedding_size, num_layers_encoder, num_layers_decoder, hidden_size, cell_type,
        bidirectional, dropout_encoder, dropout_decoder, epochs, batch_size)

    criterion = nn.CrossEntropyLoss()
    encoder_optimizer = optim.NAdam(encoder.parameters(), lr=learning_rate, weight_decay = 0.0005)
    decoder_optimizer = optim.NAdam(decoder.parameters(), lr=learning_rate, weight_decay = 0.0005)

    #Convert data to batched data
    train_batch_input = getBatchedTensorFromWords(train_input, batch_size, lang_input)
    train_batch_target = getBatchedTensorFromWords(train_output, batch_size, lang_output)
    valid_batch_input = getBatchedTensorFromWords(valid_input, batch_size, lang_input)

    for epochNum in range(epochs):
        epoch_loss = 0
        for i, source_batch in enumerate(train_batch_input):
            #train() returns a per-step loss for the batch; weight it by the batch size
            batch_loss = train(source_batch, train_batch_target[i], encoder,
                               decoder, encoder_optimizer, decoder_optimizer, criterion, batch_size, cell_type)
            epoch_loss += batch_loss*batch_size
        print_loss_avg = epoch_loss / len(train_input)
        print("Average loss after ", epochNum+1, "epochs is ", print_loss_avg)

        #Validation accuracy is measured and logged after every epoch
        valid_accuracy = findAccuracy(encoder, decoder, valid_batch_input, valid_output, cell_type, len(valid_input), batch_size, False)
        print("Valid accuracy is ", valid_accuracy)
        wandb.log({"validation_accuracy": valid_accuracy, "training_loss": print_loss_avg, 'epoch': epochNum})

    train_accuracy = findAccuracy(encoder, decoder, train_batch_input, train_output, cell_type, len(train_input), batch_size, False)
    print("Train accuracy is ", train_accuracy)
    wandb.log({"training_accuracy": train_accuracy})

    #Name and close the wandb run; the test accuracy below is only printed
    wandb.run.name = run_name
    wandb.run.save()
    wandb.run.finish()

    #flag=True makes findAccuracy write the mispredicted test words to a file
    test_batch_input = getBatchedTensorFromWords(test_input, batch_size, lang_input)
    test_accuracy = findAccuracy(encoder, decoder, test_batch_input, test_output, cell_type, len(test_input), batch_size, True)
    print("Test accuracy", test_accuracy)


In [None]:
#Takes the model and one batch as input
#outputs predicted word for each english word of the batch
def evaluate(encoder, decoder, input_tensors, cell_type, batch_size):
    """Greedily decode one batch and return the predicted words.

    Args:
        encoder, decoder: the two halves of the model.
        input_tensors: batch of source index sequences; decoding runs for at
            most input_tensors.size(0) steps.
        cell_type: "LSTM" selects the code path that also carries a cell state.
        batch_size: number of words in the batch.

    Returns:
        List of batch_size strings. A word ends at the first predicted
        end-of-word token; index 0 (start-of-word / padding) is skipped.
    """
    #Dropout must be switched off while predicting; remember the current modes
    #so that they are restored afterwards and training continues unchanged
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_length = input_tensors.size(0)
            encoder_hidden = encoder.initHidden()

            #The final encoder state initialises the decoder state
            if cell_type == "LSTM":
                decoder_hidden, decoder_cell = encoder(input_tensors, encoder_hidden)
            else:
                decoder_hidden = encoder(input_tensors, encoder_hidden)

            decoder_input = torch.tensor([SOW_token]*batch_size, device=device)

            decoded_words = [""]*batch_size
            #finished[i] becomes True once word i has produced its end-of-word token
            finished = [False]*batch_size

            for di in range(input_length):
                if cell_type == "LSTM":
                    decoder_output, decoder_hidden, decoder_cell = decoder(decoder_input, (decoder_hidden, decoder_cell))
                else:
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                topv, topi = decoder_output.data.topk(1)

                #One transfer per step instead of one .item() call per word
                predicted = topi.view(-1).tolist()
                for i in range(batch_size):
                    if finished[i]:
                        continue
                    index = predicted[i]
                    if index == EOW_token:
                        finished[i] = True
                    elif index != 0:
                        decoded_words[i] += lang_output.index2char[index]

                if all(finished):
                    break
                decoder_input = topi.squeeze().detach()

            return decoded_words
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [None]:
def findAccuracy(encoder1, decoder1, input, actual_output, cell_type, n, batch_size, flag):
    """Return the percentage of exactly matching predicted words.

    Args:
        encoder1, decoder1: model halves forwarded to evaluate().
        input: sequence of batches; batch i covers the words
            actual_output[i*batch_size] ... actual_output[(i+1)*batch_size - 1].
        actual_output: expected words, indexable by position.
        cell_type, batch_size: forwarded to evaluate().
        n: number of words the accuracy is computed over.
        flag: when true, every mismatch is appended to
            "testDatasetWordsWithoutAttention.txt" as "<count> <expected>   <predicted>".

    Returns:
        correct / n * 100.
    """
    correct = 0
    wrong = 0
    #Explicit UTF-8 so the Hindi words are written correctly whatever the platform default is
    file1 = open("testDatasetWordsWithoutAttention.txt", "a", encoding="utf-8") if flag else None
    try:
        if flag:
            file1.write("Correct word      Predicted word\n")
        for i in range(len(input)):
            output_word = evaluate(encoder1, decoder1, input[i], cell_type, batch_size)
            offset = i*batch_size
            for k in range(batch_size):
                expected = actual_output[offset+k]
                if expected == output_word[k]:
                    correct += 1
                elif flag:
                    #Mismatches are only counted (and written) when a report is requested
                    wrong += 1
                    file1.write(str(wrong) + " " + expected + "   " + output_word[k] + "\n")
    finally:
        #Close the report even if evaluation fails half-way
        if file1 is not None:
            file1.close()
    return correct/n*100

In [None]:
def runSweep():
    """Start a wandb run, build the model from its configuration and train it.

    Used both for a direct call (config_defaults apply) and as the function of
    a wandb sweep agent (the sweep's values replace the defaults). Encoder and
    decoder share the same number of layers, read from the "num_layers" key.
    """
    config_defaults = {
        "embedding_size": 64,
        #Same key as in the sweep configuration, so a direct call finds it as well
        "num_layers": 3,
        "hidden_layer": 256,
        "cell_type": "LSTM",
        "bidirectional": True,
        "dropout_encoder": 0.2,
        "dropout_decoder": 0.3,
        "epochs": 20,
        "batch_size": 128,
    }
    wandb.init(project = 'Assignment 3', entity = 'cs22m006', config=config_defaults)
    embedding_size = wandb.config.embedding_size
    num_layers_encoder = wandb.config.num_layers
    num_layers_decoder = wandb.config.num_layers
    hidden_size = wandb.config.hidden_layer
    batch_size = wandb.config.batch_size
    epochs = wandb.config.epochs
    cell_type = wandb.config.cell_type
    bidirectional = wandb.config.bidirectional
    dropout_encoder = wandb.config.dropout_encoder
    dropout_decoder = wandb.config.dropout_decoder
    learning_rate = 0.001

    #To try one fixed configuration, overwrite the values read above at this point, e.g.
    #embedding_size = 256, num_layers_encoder = num_layers_decoder = 3, hidden_size = 512,
    #batch_size = 32, epochs = 30, cell_type = "LSTM", bidirectional = True,
    #dropout_encoder = 0.4, dropout_decoder = 0.2, learning_rate = 0.001

    #A fresh model is built for every run
    encoder1 = EncoderRNN(lang_input.n_chars, hidden_size, num_layers_encoder, dropout_encoder, batch_size, embedding_size, cell_type, bidirectional).to(device)
    decoder1 = DecoderRNN(hidden_size, lang_output.n_chars, num_layers_decoder, dropout_decoder, batch_size, embedding_size, cell_type).to(device)
    trainIters(encoder1, decoder1, len(train_input), epochs, learning_rate, batch_size, embedding_size, cell_type, num_layers_encoder, num_layers_decoder, hidden_size, bidirectional, dropout_encoder, dropout_decoder)

In [None]:
#Direct call outside of a sweep: runSweep passes its config_defaults to wandb.init
runSweep()

In [None]:
#Run this cell, if you want to run sweep
#Candidate values of every hyperparameter explored by the sweep
sweep_values = {
    "embedding_size": [512, 256, 64, 32],
    "num_layers": [3, 2, 1],
    "hidden_layer": [512, 256, 128],
    "cell_type": ["RNN", "GRU"],
    "bidirectional": [False, True],
    "dropout_encoder": [0.2, 0.3, 0.4],
    "dropout_decoder": [0.2, 0.3, 0.4],
    "epochs": [20, 30],
    "batch_size": [256, 128, 64, 32],
}
#Bayesian search that maximizes the validation accuracy logged during training
sweep_config = {
    "name": "CS6910 Assignment 3 - Cross Entropy Loss",
    "metric": {"name": "validation_accuracy", "goal": "maximize"},
    "method": "bayes",
    "parameters": {key: {"values": values} for key, values in sweep_values.items()},
}
sweep_id = wandb.sweep(sweep_config, entity="cs22m006", project="Assignment 3")
#The agent calls runSweep once per trial, for at most 200 trials
wandb.agent(sweep_id, runSweep, count = 200)