In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import unicodedata
import string
import os 
import pandas as pd
import wandb

In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

In [None]:
def preprocessData(currdir, lang_chosen, index2char, char2index, data_type = 'train'):
    """Load one split of a word-pair CSV and extend the shared vocabulary.

    The file is read from ``<currdir>/<lang_chosen>/<lang_chosen>_<split>.csv``
    as a headerless two-column CSV (column 0 = input word, column 1 = output
    word).

    Args:
        currdir: Directory holding one sub-directory per language.
        lang_chosen: Language code, used as sub-directory name and file prefix.
        index2char: Mapping index -> character. Mutated in place: every
            character not yet known is appended with the next free index.
        char2index: Mapping character -> index. Mutated in place, kept in
            step with ``index2char``.
        data_type: ``"test"`` loads ``_test.csv``, ``"val"`` loads
            ``_valid.csv``; any other value loads ``_train.csv``.

    Returns:
        Tuple ``(char_count, char2index, index2char, maxlength_input,
        maxlength_output, pair_list)`` where ``char_count`` maps every
        character seen in this split to its number of occurrences, the two
        ``maxlength_*`` values are the longest word lengths (0 for an empty
        file) and ``pair_list`` is a list of ``(input, output)`` tuples.

    NOTE(review): the caller presumably pre-seeds both dicts with the special
    tokens (0 = SOS ``'<'``, 1 = EOS ``'>'``, 2 = PAD ``'.'``) - confirm.
    """
    suffix_by_split = {'test': '_test.csv', 'val': '_valid.csv'}
    file_name = lang_chosen + suffix_by_split.get(data_type, '_train.csv')
    path = os.path.join(currdir, lang_chosen, file_name)

    # Read every field as plain text. With pandas' defaults a word such as
    # "null" or "NA" becomes a float NaN and numeric-looking cells become
    # numbers; both would break the per-character loops below.
    data = pd.read_csv(path, names=['input', 'output'], dtype=str, keep_default_na=False)
    inputs = data['input'].to_list()
    outputs = data['output'].to_list()
    pair_list = list(zip(inputs, outputs))

    maxlength_input = max((len(word) for word in inputs), default=0)
    maxlength_output = max((len(word) for word in outputs), default=0)

    # Single pass: inputs first, then outputs, so new characters receive the
    # same indices as the original two-loop implementation.
    char_count = {}
    for word in inputs + outputs:
        for char in word:
            # .get() also counts characters that collide with the special
            # token symbols ('<', '>', '.'); the old code raised KeyError.
            char_count[char] = char_count.get(char, 0) + 1
            if char not in char2index:
                # Derive the index once so both maps stay exact inverses.
                new_index = len(char2index)
                char2index[char] = new_index
                index2char[new_index] = char

    return char_count, char2index, index2char, maxlength_input, maxlength_output, pair_list
    
    

In [None]:
def word2vec(char2index, word):
    """Translate ``word`` into the list of vocabulary indices of its characters.

    Raises ``KeyError`` when a character is missing from ``char2index``.
    """
    return [char2index[symbol] for symbol in word]

In [None]:
def create_vec(char2index, maxlength, word):
    """Encode ``word`` as a LongTensor: indices, one EOS, then PAD filler.

    ``maxlength - len(word)`` PAD tokens are appended after the EOS token, so
    a word no longer than ``maxlength`` yields ``maxlength + 1`` entries.
    """
    token_ids = word2vec(char2index, word)
    # Multiplying a list by a non-positive number gives [], so words longer
    # than ``maxlength`` simply receive no padding.
    padding = [PAD_token] * (maxlength - len(word))
    return torch.LongTensor(token_ids + [EOS_token] + padding)

def create_vec_pair(char2index, maxlength, pair_list):
    """Encode every word pair with ``create_vec``.

    Returns a list of ``(input_tensor, output_tensor)`` tuples in the same
    order as ``pair_list``; element 0 of each pair is encoded first.
    """
    return [
        (
            create_vec(char2index, maxlength, pair[0]),
            create_vec(char2index, maxlength, pair[1]),
        )
        for pair in pair_list
    ]

In [None]:
def evaluate(encoder, decoder, loader, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size_dup, criterion, max_length, index2char):
    """Greedy-decode every batch of ``loader`` and score it.

    Args:
        encoder, decoder: The attention encoder / decoder modules.
        loader: Iterable of ``(batch_x, batch_y)`` index tensors shaped
            ``(batch, seq_len)``.
        cell_type: ``"LSTM"`` makes the initial state an ``(h, c)`` tuple.
        num_layers_encoder: Passed to ``encoder.initHidden``.
        embedding_size_dup: Second positional slot that call sites fill with
            ``embedding_size`` again. The original signature repeated the
            name ``embedding_size``, which is a SyntaxError; the slot is
            kept so positional callers still line up.
        criterion: Loss applied to each decoder step.
        index2char: Mapping index -> character used to rebuild words.
        hidden_size, batch_size, bidirectional, embedding_size, dropout,
        num_layers_decoder, learning_rate, max_length: accepted for
            call-site compatibility; not read here (sizes are taken from the
            tensors themselves).

    Returns:
        ``(accuracy, loss)``: percentage of words reproduced exactly, and
        the sum over batches of the per-step mean loss (the caller divides
        by the number of batches).
    """
    special_tokens = {SOS_token, EOS_token, PAD_token}
    # Follow the model's device instead of the undefined global ``use_cuda``.
    model_device = next(encoder.parameters()).device

    # Dropout must be disabled while scoring; restore the previous mode after.
    encoder_was_training, decoder_was_training = encoder.training, decoder.training
    encoder.eval()
    decoder.eval()

    loss = total = correct = 0
    accuracy = 0.0  # defined even when the loader is empty
    try:
        with torch.no_grad():
            for batch_x, batch_y in loader:
                input_variable = batch_x.transpose(0, 1).contiguous().to(model_device)
                output_variable = batch_y.transpose(0, 1).contiguous().to(model_device)
                input_length = input_variable.size(0)
                target_length = output_variable.size(0)
                # The last batch of a DataLoader may be smaller than batch_size.
                current_batch = input_variable.size(1)

                encoder_hidden = encoder.initHidden(num_layers_encoder)
                encoder_hidden = encoder_hidden[:, :current_batch].contiguous().to(model_device)
                if cell_type == "LSTM":
                    encoder_hidden = (encoder_hidden, torch.zeros_like(encoder_hidden))

                # Keep every encoder step: the attention decoder needs them.
                step_outputs = []
                for i in range(input_length):
                    encoder_output, encoder_hidden = encoder(input_variable[i], encoder_hidden)
                    step_outputs.append(encoder_output)
                encoder_outputs = torch.cat(step_outputs, dim=0)

                decoder_input = torch.full((current_batch,), SOS_token, dtype=torch.long, device=model_device)
                decoder_hidden = encoder_hidden
                predicted = torch.zeros(target_length, current_batch, dtype=torch.long)

                batch_loss = 0
                for j in range(target_length):
                    # The decoder returns (log_probs, hidden, attention weights).
                    decoder_result = decoder(decoder_input, decoder_hidden, encoder_outputs)
                    decoder_output, decoder_hidden = decoder_result[0], decoder_result[1]

                    batch_loss += criterion(decoder_output, output_variable[j])

                    # Greedy choice; squeeze(1) keeps the batch axis even
                    # when the batch holds a single word.
                    decoder_input = decoder_output.topk(1)[1].squeeze(1)
                    predicted[j] = decoder_input.cpu()

                predicted_rows = predicted.transpose(0, 1).tolist()
                for predicted_ids, target_ids in zip(predicted_rows, batch_y.tolist()):
                    predicted_chars = [index2char[t] for t in predicted_ids if t not in special_tokens]
                    target_chars = [index2char[t] for t in target_ids if t not in special_tokens]
                    if predicted_chars == target_chars:
                        correct += 1
                    total += 1

                accuracy = (correct / total) * 100
                loss += batch_loss.item() / target_length
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    return accuracy, loss


In [None]:
def inference(encoder, decoder, loader, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size_dup, max_length, index2char):
    """Greedy-decode every batch of ``loader`` and save the words to CSV.

    Args:
        encoder, decoder: The attention encoder / decoder modules.
        loader: Iterable of ``(batch_x, batch_y)`` index tensors shaped
            ``(batch, seq_len)``.
        cell_type: ``"LSTM"`` makes the initial state an ``(h, c)`` tuple.
        num_layers_encoder: Passed to ``encoder.initHidden``.
        embedding_size_dup: Second positional slot that call sites fill with
            ``embedding_size`` again. The original signature repeated the
            name ``embedding_size``, which is a SyntaxError; the slot is
            kept so positional callers still line up.
        index2char: Mapping index -> character used to rebuild words.
        hidden_size, batch_size, bidirectional, embedding_size, dropout,
        num_layers_decoder, learning_rate, max_length: accepted for
            call-site compatibility; not read here.

    Returns:
        A DataFrame with columns ``input``, ``pred`` and ``output`` holding
        one row per word of the loader. The same frame is written to
        ``prediction.csv`` in the working directory.
    """
    special_tokens = {SOS_token, EOS_token, PAD_token}

    def _to_word(token_ids):
        # Drop SOS/EOS/PAD and join the remaining characters.
        return "".join(index2char[t] for t in token_ids if t not in special_tokens)

    predictions = {
        "input" : [],
        "pred" : [],
        "output" : []
    }

    # Follow the model's device instead of the undefined global ``use_cuda``.
    model_device = next(encoder.parameters()).device

    # Dropout must be disabled while predicting; restore the mode afterwards.
    encoder_was_training, decoder_was_training = encoder.training, decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            for batch_x, batch_y in loader:
                input_variable = batch_x.transpose(0, 1).contiguous().to(model_device)
                input_length = input_variable.size(0)
                target_length = batch_y.size(1)
                # The last batch of a DataLoader may be smaller than batch_size.
                current_batch = input_variable.size(1)

                encoder_hidden = encoder.initHidden(num_layers_encoder)
                encoder_hidden = encoder_hidden[:, :current_batch].contiguous().to(model_device)
                if cell_type == "LSTM":
                    encoder_hidden = (encoder_hidden, torch.zeros_like(encoder_hidden))

                # Keep every encoder step: the attention decoder needs them.
                step_outputs = []
                for i in range(input_length):
                    encoder_output, encoder_hidden = encoder(input_variable[i], encoder_hidden)
                    step_outputs.append(encoder_output)
                encoder_outputs = torch.cat(step_outputs, dim=0)

                decoder_input = torch.full((current_batch,), SOS_token, dtype=torch.long, device=model_device)
                decoder_hidden = encoder_hidden
                predicted = torch.zeros(target_length, current_batch, dtype=torch.long)

                for j in range(target_length):
                    # The decoder returns (log_probs, hidden, attention weights).
                    decoder_result = decoder(decoder_input, decoder_hidden, encoder_outputs)
                    decoder_output, decoder_hidden = decoder_result[0], decoder_result[1]
                    decoder_input = decoder_output.topk(1)[1].squeeze(1)
                    predicted[j] = decoder_input.cpu()

                # Accumulate inside the loop: the original appended after the
                # loop and therefore kept only the final batch.
                predictions["input"].extend(_to_word(row) for row in batch_x.tolist())
                predictions["pred"].extend(_to_word(row) for row in predicted.transpose(0, 1).tolist())
                predictions["output"].extend(_to_word(row) for row in batch_y.tolist())
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    predict = pd.DataFrame(predictions)
    predict.to_csv("prediction.csv")
    return predict


# The function defined in this cell is ``inference``; ``infer`` does not exist.
# NOTE(review): encoder, decoder, test_loader, batch_size, bidirectional,
# embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder,
# learning_rate and maxlength are not assigned at module level anywhere in the
# visible notebook, so this call only works if they already exist in the
# kernel namespace - confirm before relying on Restart & Run All.
pred = inference(encoder, decoder, test_loader, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size , maxlength, index2char)
    

In [None]:
class EncoderRNN_Attention(nn.Module):
    """Character-level recurrent encoder (RNN / GRU / LSTM), fed one step at a time.

    Args:
        hidden_size: Width of the recurrent state.
        batch_size: Default batch size used by ``initHidden``.
        bidirectional: Whether the recurrent layer runs in both directions.
        embedding_size: Width of the character embeddings.
        dropout: Probability for the embedding dropout and, when there is
            more than one layer, the inter-layer dropout of the cell.
        cell_type: ``'GRU'`` or ``'LSTM'``; any other value selects ``nn.RNN``.
        num_layers_encoder: Number of stacked recurrent layers.
        num_layers_decoder, learning_rate: accepted for call-site
            compatibility; not used by the encoder.
        embedding_size_dup: Second positional slot that call sites fill with
            ``embedding_size`` again. The original signature repeated the
            name ``embedding_size``, which is a SyntaxError; the slot is
            kept so positional callers still line up.
        input_size: Vocabulary size (number of embedding rows).
    """

    def __init__(self, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size_dup, input_size):
        super(EncoderRNN_Attention, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.bidirectional = bidirectional
        self.cell_type = cell_type
        self.embedding_size = embedding_size
        self.num_layers_encoder = num_layers_encoder
        # Plain probability; the original read the undefined names
        # ``drop_out`` / ``self.drop_out``.
        self.drop_out = dropout
        self.embedding = nn.Embedding(input_size, self.embedding_size)
        self.dropout = nn.Dropout(dropout)

        # nn.RNN is the fallback for every cell_type other than GRU / LSTM.
        cell_class = {'GRU': nn.GRU, 'LSTM': nn.LSTM}.get(self.cell_type, nn.RNN)
        # PyTorch applies this dropout between stacked layers only and warns
        # when it is non-zero on a single layer.
        layer_dropout = dropout if self.num_layers_encoder > 1 else 0.0
        self.cell_layer = cell_class(
            self.embedding_size,
            self.hidden_size,
            num_layers=self.num_layers_encoder,
            dropout=layer_dropout,
            bidirectional=self.bidirectional,
        )

    def forward(self, input, hidden):
        """Encode one time step.

        ``input`` holds one token index per batch element; ``hidden`` is the
        recurrent state (an ``(h, c)`` tuple for LSTM). Returns
        ``(output, hidden)`` with ``output`` shaped
        ``(1, batch, hidden_size * num_directions)``.
        """
        embedded = self.embedding(input)
        # Infer the batch axis so a smaller final batch also works.
        output = self.dropout(embedded.view(1, -1, self.embedding_size))
        output, hidden = self.cell_layer(output, hidden)
        return output, hidden

    def initHidden(self , num_layers):
        """Return a zero state shaped ``(num_layers * num_directions, batch_size, hidden_size)``."""
        num_directions = 2 if self.bidirectional else 1
        # Allocate on the module's own device; the original ``res.to(device)``
        # discarded its result and always returned a CPU tensor.
        return torch.zeros(
            num_layers * num_directions,
            self.batch_size,
            self.hidden_size,
            device=self.embedding.weight.device,
        )

class DecoderRNN_Attention(nn.Module):
    """Recurrent decoder with additive attention over the encoder outputs.

    The layer widths assume the encoder was built with the same
    ``hidden_size`` and ``bidirectional`` flag, so that each encoder output
    has ``hidden_size * num_directions`` features.

    Args:
        hidden_size: Width of the recurrent state.
        batch_size: Default batch size used by ``initHidden``.
        bidirectional: Whether the recurrent layer runs in both directions.
        embedding_size: Width of the character embeddings.
        dropout: Probability stored in ``self.dropout`` and, when there is
            more than one layer, used as the inter-layer dropout of the cell.
        cell_type: ``'GRU'`` or ``'LSTM'``; any other value selects ``nn.RNN``.
        num_layers_decoder: Number of stacked recurrent layers.
        num_layers_encoder, learning_rate: accepted for call-site
            compatibility; not used by the decoder.
        embedding_size_dup: Second positional slot that call sites fill with
            ``embedding_size`` again. The original signature repeated the
            name ``embedding_size``, which is a SyntaxError; the slot is
            kept so positional callers still line up.
        output_size: Vocabulary size (embedding rows and output classes).
    """

    def __init__(self, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size_dup, output_size):
        super(DecoderRNN_Attention, self).__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.cell_type = cell_type
        self.bidirectional = bidirectional
        self.embedding_size = embedding_size
        self.num_layers_decoder = num_layers_decoder
        self.drop_out = dropout
        self.embedding = nn.Embedding(output_size, self.embedding_size)
        # The original referenced the undefined name ``drop_out`` here.
        self.dropout = nn.Dropout(dropout)

        num_directions = 2 if self.bidirectional else 1
        # Features per time step of the encoder outputs (and of this cell's output).
        feature_size = self.hidden_size * num_directions

        # nn.RNN is the fallback for every cell_type other than GRU / LSTM.
        cell_class = {'GRU': nn.GRU, 'LSTM': nn.LSTM}.get(self.cell_type, nn.RNN)
        # Pass a probability, not the nn.Dropout module as the original did;
        # PyTorch applies it between stacked layers only.
        layer_dropout = dropout if self.num_layers_decoder > 1 else 0.0
        self.cell_layer = cell_class(
            self.embedding_size + feature_size,  # embedded token + context vector
            self.hidden_size,
            num_layers=self.num_layers_decoder,
            dropout=layer_dropout,
            bidirectional=self.bidirectional,
        )

        # Attention layer: score(query, key) = v . tanh(W [query ; key])
        self.attn = nn.Linear(self.hidden_size + feature_size, self.hidden_size)
        self.v = nn.Linear(self.hidden_size, 1, bias=False)

        self.out = nn.Linear(feature_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one time step.

        Args:
            input: One token index per batch element.
            hidden: Recurrent state, or an ``(h, c)`` tuple for LSTM.
            encoder_outputs: Tensor ``(src_len, batch, hidden_size * num_directions)``.

        Returns:
            ``(log_probs, hidden, attn_weights)`` where ``log_probs`` is
            ``(batch, output_size)`` and ``attn_weights`` is
            ``(batch, src_len)`` with each row summing to 1.
        """
        # For LSTM only the hidden part of the (h, c) tuple acts as the query.
        state = hidden[0] if isinstance(hidden, tuple) else hidden
        query = state[-1]  # last slice of the layer/direction axis: (batch, hidden)
        src_len = encoder_outputs.size(0)

        # Compute attention scores for every source position.
        expanded_query = query.unsqueeze(0).expand(src_len, -1, -1)
        energy = torch.tanh(self.attn(torch.cat((expanded_query, encoder_outputs), dim=2)))
        scores = self.v(energy).squeeze(2)  # (src_len, batch)
        # Normalise over source positions, then put the batch axis first.
        attn_weights = torch.softmax(scores, dim=0).transpose(0, 1)

        # Compute context vector: weighted sum of encoder outputs -> (1, batch, features)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs.transpose(0, 1))
        context = context.transpose(0, 1)

        # Concatenate input and context vector
        embedded = self.embedding(input).view(1, -1, self.embedding_size)
        input_combined = torch.cat((embedded, context), dim=2)

        # Pass through decoder cell layer
        output, hidden = self.cell_layer(input_combined, hidden)

        # Compute output and return
        output = self.softmax(self.out(output[0]))
        return output, hidden, attn_weights

    def initHidden(self):
        """Return a zero state shaped ``(num_layers_decoder * num_directions, batch_size, hidden_size)``."""
        num_directions = 2 if self.bidirectional else 1
        # Allocate on the module's own device; the original ``res.to(device)``
        # discarded its result and always returned a CPU tensor.
        return torch.zeros(
            self.num_layers_decoder * num_directions,
            self.batch_size,
            self.hidden_size,
            device=self.embedding.weight.device,
        )

In [None]:
def train_Attention(input_tensor, output_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size_dup, max_length):
    """Run one optimisation step on a single batch.

    Args:
        input_tensor, output_tensor: Index tensors shaped ``(batch, seq_len)``.
        encoder, decoder: The attention encoder / decoder modules.
        encoder_optimizer, decoder_optimizer: Optimisers stepped once each.
        criterion: Loss applied to each decoder step.
        cell_type: ``"LSTM"`` makes the initial state an ``(h, c)`` tuple.
        num_layers_encoder: Passed to ``encoder.initHidden``.
        embedding_size_dup: Second positional slot that call sites fill with
            ``embedding_size`` again. The original signature repeated the
            name ``embedding_size``, which is a SyntaxError; the slot is
            kept so positional callers still line up.
        hidden_size, batch_size, bidirectional, embedding_size, dropout,
        num_layers_decoder, learning_rate, max_length: accepted for
            call-site compatibility; not read here (sizes are taken from the
            tensors themselves).

    Returns:
        The summed step loss divided by the target length, as a float.
    """
    teacher_forcing_ratio = 0.5
    # Follow the model's device instead of the undefined global ``use_cuda``.
    model_device = next(encoder.parameters()).device

    # Time-major layout: (seq_len, batch).
    input_tensor = input_tensor.transpose(0, 1).contiguous().to(model_device)
    output_tensor = output_tensor.transpose(0, 1).contiguous().to(model_device)
    input_length = input_tensor.size(0)
    output_length = output_tensor.size(0)
    # The last batch of a DataLoader may be smaller than batch_size.
    current_batch = input_tensor.size(1)

    encoder_hidden = encoder.initHidden(num_layers_encoder)
    encoder_hidden = encoder_hidden[:, :current_batch].contiguous().to(model_device)
    if cell_type == "LSTM":
        encoder_hidden = (encoder_hidden, torch.zeros_like(encoder_hidden))

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Encoder phase: keep every step's output for the attention mechanism.
    # Concatenating adapts to the encoder's output width (doubled when it is
    # bidirectional), unlike a buffer pre-sized with ``hidden_size``.
    step_outputs = []
    for i in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[i], encoder_hidden)
        step_outputs.append(encoder_output)
    encoder_outputs = torch.cat(step_outputs, dim=0)

    # Decoder phase
    decoder_input = torch.full((current_batch,), SOS_token, dtype=torch.long, device=model_device)
    decoder_hidden = encoder_hidden

    # One decision for the whole batch: feed the targets or the model's own guesses.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for step in range(output_length):
        # The decoder returns (log_probs, hidden, attention weights).
        decoder_result = decoder(decoder_input, decoder_hidden, encoder_outputs)
        decoder_output, decoder_hidden = decoder_result[0], decoder_result[1]
        loss += criterion(decoder_output, output_tensor[step])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = output_tensor[step]
        else:
            # Without teacher forcing: use its own prediction, detached from
            # the graph. squeeze(1) keeps the batch axis for batch size 1.
            decoder_input = decoder_output.detach().topk(1)[1].squeeze(1)

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / output_length


In [None]:
# Define the main function to run the training loop
def trainIters_Attention(encoder, decoder,train_Loader, val_Loader, max_length, max_of_all, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size_dup, n_iters, index2char):
    """Train for ``n_iters`` epochs and validate after each one.

    Two NAdam optimisers (one per module) and an NLL loss are created here.
    Each epoch prints the mean training loss, then the validation accuracy
    and mean validation loss returned by ``evaluate``.

    Args:
        encoder, decoder: The attention encoder / decoder modules.
        train_Loader, val_Loader: Loaders yielding ``(batch_x, batch_y)``.
        max_length: Forwarded to ``train_Attention``.
        max_of_all: Forwarded to ``evaluate`` as its ``max_length``.
        learning_rate: Learning rate of both optimisers.
        embedding_size_dup: Second positional slot that call sites fill with
            ``embedding_size`` again. The original signature repeated the
            name ``embedding_size``, which is a SyntaxError; the slot is
            kept so positional callers still line up.
        n_iters: Number of epochs.
        index2char: Mapping index -> character, forwarded to ``evaluate``.
        Remaining arguments are forwarded unchanged to the helpers.
    """
    optimizer_encoder = optim.NAdam(encoder.parameters(), lr = learning_rate)
    optimizer_decoder = optim.NAdam(decoder.parameters(), lr = learning_rate)
    criterion = nn.NLLLoss()

    # max(..., 1) avoids ZeroDivisionError on an empty loader.
    train_batches = max(len(train_Loader), 1)
    val_batches = max(len(val_Loader), 1)

    for epoch in range(1, n_iters + 1):
        print('Epoch No : ', epoch)
        train_loss = 0

        for x, y in train_Loader:
            train_loss += train_Attention(x, y, encoder, decoder, optimizer_encoder, optimizer_decoder, criterion, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size, max_length)
        mean_train_loss = train_loss / train_batches
        print('Train Loss: ', mean_train_loss)

        val_accur, val_loss = evaluate(encoder, decoder, val_Loader, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size, criterion, max_of_all, index2char)
        mean_val_loss = val_loss / val_batches
        print("Val Accuracy", val_accur, "Val Loss", mean_val_loss)

        # The sweep optimises 'val_accuracy'; report it whenever a wandb run
        # is active. Skipped otherwise so plain calls keep working.
        if wandb.run is not None:
            wandb.log({
                'epoch': epoch,
                'train_loss': mean_train_loss,
                'val_loss': mean_val_loss,
                'val_accuracy': val_accur,
            })

In [None]:
def run_sweep_with_attention():
    """Build the data, the attention encoder/decoder and train them.

    Reads the module-level settings ``currdir``, ``lang_chosen`` and the
    special-token indices, loads the train / val / test splits into one
    shared vocabulary, and trains with the configuration defined below.

    NOTE(review): this function neither calls ``wandb.init()`` nor reads
    ``wandb.config``, so when it is launched through ``wandb.agent`` every
    trial trains the same hard-coded configuration - confirm whether the
    values should come from the sweep instead.
    """
    # Prepare the data. All three splits extend the same vocabulary dicts,
    # which start with the special tokens, so validation/test characters
    # also receive an embedding row.
    index2char = {SOS_token: '<', EOS_token: '>', PAD_token: '.'}
    char2index = {'<': SOS_token, '>': EOS_token, '.': PAD_token}
    char_count, char2index, index2char, maxlength_input, maxlength_output, pair_list = preprocessData(currdir, lang_chosen, index2char, char2index, "train")
    val_char_count, _, _, val_maxlength_input, val_maxlength_output, val_pair_list = preprocessData(currdir, lang_chosen, index2char, char2index, "val")
    test_char_count, _, _, test_maxlength_input, test_maxlength_output, test_pair_list = preprocessData(currdir, lang_chosen, index2char, char2index, "test")

    maxlength = max(maxlength_input, maxlength_output) + 2

    max_of_all = max([maxlength_input, maxlength_output, val_maxlength_input, val_maxlength_output, test_maxlength_input, test_maxlength_output])
    vec_pair_list = create_vec_pair(char2index, maxlength, pair_list)
    val_vec_pair_list = create_vec_pair(char2index, max_of_all, val_pair_list)

    configuration = {
        'embedding_size' : 64,
        'cell_type' : 'GRU',
        'hidden_size' : 128,
        'batch_size' : 64,
        'bi_directional' : False,
        'drop_out' : 0.0,
        'num_layers' : 1,
        'learning_rate' : 0.001,
        'dropout_encoder' : 0.2,
        'dropout_decoder' : 0.2,
        'epochs' : 10,
    }
    batch_size = configuration['batch_size']
    embedding_size = configuration['embedding_size']
    cell_type = configuration['cell_type']
    hidden_size = configuration['hidden_size']
    bidirectional = configuration['bi_directional']
    dropout = configuration['drop_out']
    dropout_encoder = configuration['dropout_encoder']
    dropout_decoder = configuration['dropout_decoder']
    learning_rate = configuration['learning_rate']
    epochs = configuration['epochs']
    # The decoder starts from the encoder's final state, so both stacks
    # share the single 'num_layers' setting.
    num_layers_encoder = num_layers_decoder = configuration['num_layers']

    vocab_size = len(char2index)
    encoder = EncoderRNN_Attention(hidden_size, batch_size, bidirectional, embedding_size, dropout_encoder, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size, vocab_size)
    decoder = DecoderRNN_Attention(hidden_size, batch_size, bidirectional, embedding_size, dropout_decoder, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size, vocab_size)
    train_loader = torch.utils.data.DataLoader(vec_pair_list, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_vec_pair_list, batch_size=batch_size, shuffle=True)

    # Train the model
    trainIters_Attention(encoder, decoder, train_loader, val_loader, maxlength, max_of_all, hidden_size, batch_size, bidirectional, embedding_size, dropout, cell_type, num_layers_encoder, num_layers_decoder, learning_rate, embedding_size, epochs, index2char)



In [None]:
# Module-level settings, followed by a single training run.
lang_chosen = 'hin'
currdir = '/kaggle/input/akshantar-original/'
hidden_size = 256
MAX_LENGTH = 10
# Vocabulary indices reserved for start-of-sequence, end-of-sequence and padding.
SOS_token, EOS_token, PAD_token = 0, 1, 2
run_sweep_with_attention()

In [None]:
# Bayesian search that maximises 'val_accuracy'; every hyper-parameter is a
# discrete list of candidate values.
sweep_config_attn = dict(
    method='bayes',
    metric=dict(name='val_accuracy', goal='maximize'),
    parameters={
        option: {'values': candidates}
        for option, candidates in (
            ('epochs', [10]),
            ('bi_directional', [True, False]),
            ('cell_type', ['RNN', 'GRU', 'LSTM']),
            ('num_layers', [1, 2, 3]),
            ('hidden_size', [128, 256, 512]),
            ('batch_size', [32, 64, 128, 256]),
            ('dropout_encoder', [0.2, 0.3, 0.4]),
            ('dropout_decoder', [0.2, 0.3, 0.4]),
            ('embedding_size', [32, 64, 256, 512]),
        )
    },
)

In [None]:
# Register the sweep and let the agent run 50 trials. The configuration
# defined in this notebook is ``sweep_config_attn``; the original passed the
# undefined name ``sweep_config``.
sweep_id = wandb.sweep(sweep_config_attn, entity="cs22m081", project="deep_learning_assignment3")
wandb.agent(sweep_id, run_sweep_with_attention, count = 50)