In [1]:
pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.15.2-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.22.2-py2.py3-none-any.whl (203 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.3/203.3 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting

In [2]:
import torch
import pandas as pd
import os
import random
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
import gc
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
import wandb
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [3]:
!gdown "1uRKU4as2NlS9i8sdLRS1e326vQRdhvfw"
!unzip -q aksharantar_sampled.zip

Downloading...
From: https://drive.google.com/uc?id=1uRKU4as2NlS9i8sdLRS1e326vQRdhvfw
To: /content/aksharantar_sampled.zip
  0% 0.00/14.0M [00:00<?, ?B/s] 75% 10.5M/14.0M [00:00<00:00, 104MB/s]100% 14.0M/14.0M [00:00<00:00, 115MB/s]


In [4]:
class PrepText():
    def __init__ (self, maxSize):
        self.textToNumX = {}
        self.numToTextX = {}
        self.textToNumY = {}
        self.numToTextY = {}
        self.encodingLength = maxSize


    def makeDict(self, wordsX, wordsY):
        #print ("creating the dictionary.")


        self.textToNumX["PAD"] = 0
        self.textToNumX["SOS"] = 1
        self.textToNumX["EOS"] = 2
        self.count = 3
        for word in wordsX:
            for letter in word:
                if letter not in self.textToNumX:
                    self.textToNumX[letter] = self.count
                    self.count+=1

        
        for letter, number in self.textToNumX.items():
            self.numToTextX[number] = letter

        self.textToNumY["PAD"] = 0
        self.textToNumY["SOS"] = 1
        self.textToNumY["EOS"] = 2
        self.count = 3
        for word in wordsY:
            for letter in word:
                if letter not in self.textToNumY:
                    self.textToNumY[letter] = self.count
                    self.count+=1

        
        for letter, number in self.textToNumY.items():
            self.numToTextY[number] = letter

    
    def lenOutput(self):
        return len(self.numToTextY);


    def lenInput(self):
        return len(self.numToTextX);

        
    def vectorizeOneWord(self, wordX, wordY):
        self.vectorX = torch.zeros(self.encodingLength, dtype = torch.int)
        self.vectorY = torch.zeros(self.encodingLength, dtype = torch.int)


        #print("encoding english word: " + wordX + " encoding hindi word: " + wordY)

        self.count = 1
        self.vectorX[0] = self.textToNumX['SOS']
        for letter in wordX:
            self.vectorX[self.count] = self.textToNumX[letter]
            self.count += 1
        self.vectorX[self.count] = self.textToNumX['EOS']



        self.count = 1
        self.vectorY[0] = self.textToNumY['SOS']
        for letter in wordY:
            self.vectorY[self.count] = self.textToNumY[letter]
            self.count += 1
        self.vectorY[self.count] = self.textToNumY['EOS']

        return self.vectorX, self.vectorY

    def vectorToWord (self, x, y):
        wordA = []
        wordB = []

        for element in x:
            wordA.append(self.numToTextY[element.item()])

        for element1 in y:
            wordB.append(self.numToTextY[element1.item()])


        print(wordA)
        print(wordB)

        return wordA, wordB

In [5]:
class AksharantarData(Dataset):

    def __init__(self, rootPath, max_size):

        self.root  = rootPath
        self.df = pd.read_csv(self.root, names = ["english", "hindi"])


        self.english = self.df["english"]
        self.hindi = self.df["hindi"]


        self.vocab = PrepText(max_size)
        self.vocab.makeDict(self.english, self.hindi)

    
    def convertBack(self, inputX, inputY):
        return self.vocab.vectorToWord(inputX, inputY)


    def lenOutput(self):
        return self.vocab.lenOutput()


    def lenInput(self):
        return self.vocab.lenInput()

    def getDictEng (self):
        return self.vocab.textToNumX;

    def getDictHin (self):
        return self.vocab.textToNumY;

    
    def __len__(self):

        return len(self.df)


    def __getitem__ (self, idx):

        #print(idx)

        self.englishWord = self.english[idx]
        #print(self.englishWord)
        self.hindiWord = self.hindi[idx]
        #print(self.hindiWord)
        self.vecEncodedX, self.vecEncodedY = self.vocab.vectorizeOneWord(self.englishWord, self.hindiWord)
        return (self.vecEncodedX, self.vecEncodedY)

In [6]:
def createDataLoader (encodingLength, batchSize):


    # training data.
    trainData = AksharantarData("/content/aksharantar_sampled/hin/hin_train.csv", encodingLength)

    # validation data.
    valData = AksharantarData("/content/aksharantar_sampled/hin/hin_valid.csv", encodingLength) 

    # testing data.
    testData = AksharantarData("/content/aksharantar_sampled/hin/hin_test.csv", encodingLength)


    # determine the lengths of the different datasets.
    lenIn = trainData.lenInput()
    lenOut = trainData.lenOutput()


    # train data loader.
    trainLoader = DataLoader(trainData, shuffle = True, batch_size = batchSize)

    # validation data loader.
    valLoader = DataLoader(valData, shuffle = True, batch_size = batchSize)

    # test data loader.
    testLoader = DataLoader(testData, shuffle = True, batch_size = batchSize)

    # currently set it to false for debugging purposes.
    return trainLoader, valLoader, testLoader, lenIn+1, lenOut+1

In [7]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, biDirection):
        super(Encoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p, batch_first=True, bidirectional = biDirection )
        
    def forward(self, x):
        # x shape: (seq_length, N) where N is batch size

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding)
        # outputs shape: (seq_length, N, hidden_size)

        return hidden, cell

In [8]:
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p, biDirection):
        super(Decoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p, bidirectional = biDirection)

        self.gelu = nn.GELU()

        self.fc = nn.Linear(hidden_size*(int(biDirection)+1), output_size)

    def forward(self, x, hidden, cell):

        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))

        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))

        predictions = self.fc(self.gelu(outputs))

        predictions = predictions.squeeze(0)

        return predictions, hidden, cell

In [41]:
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, outputSize, teacherForce):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.outputSize = outputSize
        self.teacherForce = teacherForce


    def forward(self, source, target, teacherStat):
        batch_size = source.shape[0]
        target_len = target.shape[1]
        target_vocab_size = self.outputSize


        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        hidden, cell = self.encoder(source)
        
        x = target[:,0]

        for t in range(1, target_len):
            
            
            output, hidden, cell = self.decoder(x, hidden, cell)
          
            outputs[t-1] = output

            best_guess = output.argmax(dim =1)

            x = target[:,t] if random.random() < self.teacherForce and teacherStat else best_guess

        output, hidden, cell = self.decoder (x, hidden, cell)

        outputs[t] = output
        
        gc.collect()

        return outputs

            

In [40]:
def compile (inputSizeEncoder, inputSizeDecoder, encoderEmbedding, decoderEmbedding, hiddenSize, outputSize, numLayers, encDropout, decDropout, learningRate, biDirection, teach):


    # define the encoder models.
    encoder = Encoder (inputSizeEncoder, encoderEmbedding, hiddenSize, numLayers, encDropout, biDirection).to(device)
    decoder = Decoder (inputSizeDecoder, decoderEmbedding,  hiddenSize, outputSize, numLayers, decDropout, biDirection).to(device)

    
    # define the model.
    model = EncoderDecoder(encoder, decoder, outputSize, teach).to(device)


    # print the model parameters while at it.
    model.parameters


    # return all relevant stuff.
    return model, encoder, decoder

In [11]:
import gc
gc.collect()

13

In [36]:
def accuracy (x, y, batchSize):
    #x=torch.argmax(x,dim=1)
    #print(x.shape)
    #print(y.shape)
    # reshape to the batch size.
    x = x.reshape (int (x.shape[0]/batchSize), batchSize)
    y = y.reshape (int (y.shape[0]/batchSize), batchSize)
    
    x = x.T
    y = y.T
    #print(x[100])
    #print(y[100])


    # initialize correct to 0.0.
    correct = 0.0

    for i in range(batchSize):
        mask = torch.eq(y[i], 0).int()
        x[i] = (1-mask) * x[i]
        
        if torch.equal(x[i], y[i]):
            correct += 1
            #print (x[i])
            #print(y[i])
    
    return correct

In [49]:

def trainerLoop (trainLoader, valLoader, model, encoder, decoder, optimizer, criterion, encodingLength, num_epochs, batchSize, teacherDuration):

    for epoch in tqdm(range(num_epochs)):


        # initialize training accuracy and training loss.

        trainAcc = 0.0
        trainLoss = 0.0
        teacherStat = 0


        # switch model to training mode.
        model.train()


        # decide whether this epoch should have teacher forcing or not.
        if epoch < num_epochs*teacherDuration:
            teacherStat = 1

        # train all batches in the epoch.
        for x,y in trainLoader:

            x = x.to(device)
            y = y.to(device)

            output = model(x, y, epoch)
            

            output = output.reshape(-1, output.shape[2])

            y = y.T.reshape(-1)



            optimizer.zero_grad()
            loss = criterion(output, y.to(torch.long))
            trainLoss += loss
            loss.backward()


            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) # avoid exploding gradient problem
            optimizer.step()

            trainAcc += accuracy (output.argmax(1), y.to(torch.long), batchSize)

            #print(loss)


        # normalize loss and accuracy and also print them.
        trainLoss /= (51200*encodingLength)
        trainAcc /= (51200)
        tqdm.write(f"Training Loss : {trainLoss:.4f}, Training Accuracy : {trainAcc:.4f}")

        
        # change model to evaluation mode.
        model.eval()


        # initialize the validation accuracy and validation loss.
        valAcc = 0.0
        valLoss = 0.0



        # evaluate model for every batch.
        for x,y in valLoader: 


            # send data to device.
            x,y = x.to(device), y.to(device)


            # do a forward propagation.
            output = model(x,y, 0)


            # reshape the output and the target to fit the loss function.
            output = output.reshape(-1, output.shape[2])
            y = y.T.reshape(-1)


            # calculate loss.
            loss = criterion (output, y.to(torch.long))


            # update validation accuracy and validation loss.
            valAcc += accuracy (output.argmax(1), y.to(torch.long), batchSize)
            
            valLoss += loss


        # validation and accuracy to be normalized.
        valLoss /= len(valLoader)*batchSize*encodingLength
        valAcc /= len(valLoader)*batchSize

        #wandb.log({"TrainingLoss" : trainLoss, "ValidationLoss" : valLoss, "TrainingAccuracy" : trainAcc, "ValidationAccuracy" : valAcc})

        
        # print them actively with tqdm visualization bar.
        tqdm.write(f"Validation Loss = {valLoss} and Validation accuracy = {valAcc}")
    

        

In [46]:
def wandbTrainer ():

    # initialize the wandb run.
    #wandb.init(project = "DLAssignment3", entity = "cs22m028")


    # define where the parameters come from
    #parameters = wandb.config


    batchSize = 256
    encoderEmbedding = 256
    decoderEmbedding = 256
    hiddenSize = 256
    numLayers = 2
    encDropout = 0
    decDropout = 0
    num_epochs = 1
    learningRate = 0.001
    bidirectional = True


    # define the parameters for this training.
    # batchSize = parameters["batchSize"]
    # encoderEmbedding = parameters["Embedding"]
    # decoderEmbedding = parameters["Embedding"]
    # hiddenSize = parameters["hiddenSize"]
    # numLayers = parameters["numberOfLayers"]
    # encDropout = parameters["EncoderDropout"]
    # decDropout = parameters["DecoderDropout"]
    # num_epochs = parameters["epochs"]
    # learningRate = parameters["learningRate"]
    # bidirectional = parameters["bidirectional"]
    teach = 0.5
    duration = 0.5

    encodingLength = 35



    # obtain the dataLoader objects from the dataLoderCreator.
    trainLoader, valLoader, testLoader, inputSizeEncoder, inputSizeDecoder = createDataLoader (encodingLength, batchSize)


    # defince implicit parameters
    outputSize = inputSizeDecoder

    # Define the model, optimizer and Loss Function. 
    model, encoder, decoder = compile (inputSizeEncoder, inputSizeDecoder, encoderEmbedding, decoderEmbedding, hiddenSize, outputSize, numLayers, encDropout, decDropout, learningRate, bidirectional, teach)


    # define the optimizer and the loss function.
    criterion = nn.CrossEntropyLoss(reduction = "sum", ignore_index=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=learningRate)


    # Call the training function with appropriate parameters
    trainModel = trainerLoop (trainLoader, valLoader, model, encoder, decoder, optimizer, criterion, encodingLength, num_epochs, batchSize, duration)        


In [29]:
def getLogging (key, projectName, entityName):


    # initialize the wandb.
    wandb.login(key=key)


    # set up sweep configuration method.
    sweep_config = {
        'method': 'bayes'
        }


    # set up sweep metric.
    metric = {
        'name': 'val_acc',
        'goal': 'maximize'   
        }


    # set sweep config.
    sweep_config['metric'] = metric


    '''
        batchSize = parameters["batchSize"]
        inputSizeEncoder = parameters["inputEncoder"]
        inputSizeDecoder = parameters["inputDecoder"]
        outputSize = parameters["outputSize"]
        encoderEmbedding = parameters["encoderEmbedding"]
        decoderEmbedding = parameters["decoderEmbedding"]
        hiddenSize = parameters["hiddenSize"]
        numLayers = parameters["numberOfLayers"]
        encDropout = parameters["EncoderDropout"]
        decDropout = parameters["DecoderDropout"]
        num_epochs = parameters["epochs"]
        learningRate = parameters["learningRate"]
        bidirectinal = parameters["bidirectional"]
        encodingLength = parameters["encodingLength"]
    
    '''


    # setup a parameters dictionary.
    parameters_dict = {


        'epochs' : {
            'values':[10,15,20]
        },

        'batchSize' : {
            'values' : [128, 256, 512]
        },

        'Embedding' : {
            'values' : [128, 256, 512]
        },

        'hiddenSize' : {
            'values' : [256, 512, 1024]
        },

        'numberOfLayers' : {
            'values' : [2,4,8]
        },

        'EncoderDropout' : {
            'values' : [0.3, 0.5]
        },

        'DecoderDropout' : {
            'values' : [0.3, 0.5]
        },

        'learningRate' : {
            'values' : [0.001, 0.0001, 0.0005]
        },

        'bidirectional' : {
            'values' : [True, False]
        },

        'teacherForce' : {
            'values' : [0.5, 0.55, 0.6, 0.7]
        },

        'teacherDuration' : {
            'values' : [0.5, 0.55, 0.6, 0.7]
        }
    }


    # set up the sweep configuration parameters.
    sweep_config['parameters'] = parameters_dict

    # create a sweep_id
    sweep_id = wandb.sweep(sweep_config, project= "DLAssignment3")

    # wandb agent run.
    wandb.agent(sweep_id, project= "DLAssignment3" , function = wandbTrainer, count = 1)

In [50]:
if __name__ == "__main__":


    # Start wandb :
    #getLogging ("4a022304a9a0aebfd481babe48517c3bac750362", "DLAssignment3", "cs22m028")

    wandbTrainer()

    # Just Train maybe.. I don't know.

    # Need to write selection logic sooner or later.

  0%|          | 0/1 [00:00<?, ?it/s]

Training Loss : 0.0000, Training Accuracy : 0.9999
Validation Loss = 0.003264238592237234 and Validation accuracy = 0.9755859375
Training Loss : 0.6546, Training Accuracy : 0.0018
Validation Loss = 1.0081984996795654 and Validation accuracy = 0.0
