In [1]:
import torch
import pandas as pd
import os
import random
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
import gc
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [2]:
!gdown "1uRKU4as2NlS9i8sdLRS1e326vQRdhvfw"
!unzip -q aksharantar_sampled.zip

Downloading...
From: https://drive.google.com/uc?id=1uRKU4as2NlS9i8sdLRS1e326vQRdhvfw
To: /content/aksharantar_sampled.zip
  0% 0.00/14.0M [00:00<?, ?B/s]100% 14.0M/14.0M [00:00<00:00, 173MB/s]


In [3]:
class PrepText():
    def __init__ (self, maxSize):
        self.textToNumX = {}
        self.numToTextX = {}
        self.textToNumY = {}
        self.numToTextY = {}
        self.encodingLength = maxSize


    def makeDict(self, wordsX, wordsY):
        #print ("creating the dictionary.")


        self.textToNumX["PAD"] = 0
        self.textToNumX["SOS"] = 1
        self.textToNumX["EOS"] = 2
        self.count = 3
        for word in wordsX:
            for letter in word:
                if letter not in self.textToNumX:
                    self.textToNumX[letter] = self.count
                    self.count+=1

        
        for letter, number in self.textToNumX.items():
            self.numToTextX[number] = letter

        self.textToNumY["PAD"] = 0
        self.textToNumY["SOS"] = 1
        self.textToNumY["EOS"] = 2
        self.count = 3
        for word in wordsY:
            for letter in word:
                if letter not in self.textToNumY:
                    self.textToNumY[letter] = self.count
                    self.count+=1

        
        for letter, number in self.textToNumY.items():
            self.numToTextY[number] = letter

    
    def lenOutput(self):
        return len(self.numToTextY);


    def lenInput(self):
        return len(self.numToTextX);

        
    def vectorizeOneWord(self, wordX, wordY):
        self.vectorX = torch.zeros(self.encodingLength, dtype = torch.int)
        self.vectorY = torch.zeros(self.encodingLength, dtype = torch.int)


        #print("encoding english word: " + wordX + " encoding hindi word: " + wordY)

        self.count = 1
        self.vectorX[0] = self.textToNumX['SOS']
        for letter in wordX:
            self.vectorX[self.count] = self.textToNumX[letter]
            self.count += 1
        self.vectorX[self.count] = self.textToNumX['EOS']



        self.count = 1
        self.vectorY[0] = self.textToNumY['SOS']
        for letter in wordY:
            self.vectorY[self.count] = self.textToNumY[letter]
            self.count += 1
        self.vectorY[self.count] = self.textToNumY['EOS']

        self.count = 1

        return self.vectorX, self.vectorY

    def vectorToWord (self, x, y):
        wordA = []
        wordB = []

        for element in x:
            wordA.append(self.numToTextY[element.item()])

        for element1 in y:
            wordB.append(self.numToTextY[element1.item()])


        print(wordA)
        print(wordB)

        return wordA, wordB

In [4]:
class AksharantarData(Dataset):

    def __init__(self, rootPath, max_size):

        self.root  = rootPath
        self.df = pd.read_csv(self.root, names = ["english", "hindi"])


        self.english = self.df["english"]
        self.hindi = self.df["hindi"]


        self.vocab = PrepText(max_size)
        self.vocab.makeDict(self.english, self.hindi)

    
    def convertBack(self, inputX, inputY):
        return self.vocab.vectorToWord(inputX, inputY)


    def lenOutput(self):
        return self.vocab.lenOutput()


    def lenInput(self):
        return self.vocab.lenInput()

    def getDictEng (self):
        return self.vocab.textToNumX;

    def getDictHin (self):
        return self.vocab.textToNumY;

    
    def __len__(self):

        return len(self.df)


    def __getitem__ (self, idx):

        #print(idx)

        self.englishWord = self.english[idx]
        #print(self.englishWord)
        self.hindiWord = self.hindi[idx]
        #print(self.hindiWord)
        self.vecEncodedX, self.vecEncodedY = self.vocab.vectorizeOneWord(self.englishWord, self.hindiWord)
        return (self.vecEncodedX, self.vecEncodedY)

In [5]:
def createDataLoader (encodingLength, batchSize):


    # training data.
    trainData = AksharantarData("/content/aksharantar_sampled/tam/tam_train.csv", encodingLength)

    # validation data.
    valData = AksharantarData("/content/aksharantar_sampled/tam/tam_valid.csv", encodingLength) 

    # testing data.
    testData = AksharantarData("/content/aksharantar_sampled/tam/tam_test.csv", encodingLength)


    # determine the lengths of the different datasets.
    lenIn = trainData.lenInput()
    lenOut = trainData.lenOutput()


    # train data loader.
    trainLoader = DataLoader(trainData, shuffle = False, batch_size = batchSize)

    # validation data loader.
    valLoader = DataLoader(valData, shuffle = True, batch_size = batchSize)

    # test data loader.
    testLoader = DataLoader(testData, shuffle = True, batch_size = batchSize)

    # currently set it to false for debugging purposes.
    return trainLoader, valLoader, testLoader, lenIn+1, lenOut+1

In [6]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, biDirection):
        super(Encoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p, batch_first=True, bidirectional = biDirection )
        
    def forward(self, x):
        # x shape: (seq_length, N) where N is batch size

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding)
        # outputs shape: (seq_length, N, hidden_size)

        return hidden, cell

In [7]:
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p, biDirection):
        super(Decoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p, bidirectional = biDirection)

        self.gelu = nn.LeakyReLU()

        self.fc = nn.Linear(hidden_size*(int(biDirection)+1), output_size)

    def forward(self, x, hidden, cell):

        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))

        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))

        predictions = self.fc(outputs)

        predictions = predictions.squeeze(0)

        return predictions, hidden, cell

In [15]:
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, outputSize):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.outputSize = outputSize

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[0]
        target_len = target.shape[1]
        target_vocab_size = self.outputSize


        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        hidden, cell = self.encoder(source)



        
        x = target[:,0]

        for t in range(1, target_len):
            
            
            output, hidden, cell = self.decoder(x, hidden, cell)
          
            outputs[t-1] = output

            best_guess = output.argmax(dim =1)

            x = target[:,t] if random.random() < teacher_force_ratio else best_guess

        output, hidden, cell = self.decoder (x, hidden, cell)

        outputs[t] = output
        
        gc.collect()

        return outputs

            

In [9]:
def compile (inputSizeEncoder, inputSizeDecoder, encoderEmbedding, decoderEmbedding, hiddenSize, outputSize, numLayers, encDropout, decDropout, learningRate, biDirection):


    # define the encoder models.
    encoder = Encoder (inputSizeEncoder, encoderEmbedding, hiddenSize, numLayers, encDropout, biDirection).to(device)
    decoder = Decoder (inputSizeDecoder, decoderEmbedding,  hiddenSize, outputSize, numLayers, decDropout, biDirection).to(device)

    
    # define the model.
    model = EncoderDecoder(encoder, decoder, outputSize).to(device)


    # print the model parameters while at it.
    print(model.parameters)


    # return all relevant stuff.
    return model, encoder, decoder

In [29]:
def accuracy (x, y, batchSize):
    #x=torch.argmax(x,dim=1)
  #  print(x.shape)
 #   print(y.shape)
    # reshape to the batch size.
    x = x.reshape (int (x.shape[0]/batchSize), batchSize)
    y = y.reshape (int (y.shape[0]/batchSize), batchSize)
    
    x = x.T
    y = y.T
#    print(x[100])
#    print(y[100])


    # initialize correct to 0.0.
    correct = 0.0

    for i in range(batchSize):
        mask = torch.eq(y[i], 0).int()
        x[i] = (1-mask) * x[i]
        
        if torch.equal(x[i], y[i]):
            correct += 1
            #print (x[i])
            #print(y[i])
    
    return correct

In [28]:

def trainerLoop (trainLoader, valLoader, model, encoder, decoder, optimizer, criterion, encodingLength, num_epochs, batchSize):

    for epoch in tqdm(range(num_epochs)):


        # initialize training accuracy and training loss.

        trainAcc = 0.0
        trainLoss = 0.0
        batchNo = 0

        # switch model to training mode.
        model.train()


        # train all batches in the epoch.
        for x,y in trainLoader:
            batchNo += 1
            x = x.to(device)
            y = y.to(device)

            output = model(x, y)
            

            output = output.reshape(-1, output.shape[2])

            y = y.T.reshape(-1)



            optimizer.zero_grad()
            loss = criterion(output, y.to(torch.long))
            with torch.no_grad():
              trainLoss += loss.item()
            loss.backward()


            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) # avoid exploding gradient problem
            optimizer.step()

            trainAcc += accuracy (output.argmax(1), y.to(torch.long), batchSize)
            
            
           

        # normalize loss and accuracy and also print them.
        trainLoss /= (len(trainLoader)*batchSize*encodingLength)
        trainAcc /= (len(trainLoader)*batchSize)
        trainAcc *= 100
        tqdm.write(f"Training Loss : {trainLoss:.4f}, Training Accuracy : {trainAcc:.4f}")    


        # Calculate the validation accuracy and loss now.

        valLoss = 0.0
        valAcc = 0.0


        for x,y in valLoader:

            x,y = x.to(device), y.to(device)

            output = model (x,y,0)

            output = output.reshape(-1, output.shape[2])

            y = y.T
            y = y.reshape(-1)

            loss = criterion (output, y.to(torch.long))

            with torch.no_grad():
                valLoss += loss.item()

            valAcc += accuracy (output.argmax(1), y.to(torch.long), batchSize)

        valLoss /= (len(valLoader)*batchSize*encodingLength)
        valAcc *= 100
        valAcc /= len(valLoader)*batchSize

        print(f"Validation Loss : {valLoss}, Validation Accuracy : {valAcc}")
        

In [27]:
def valAccuracy (x, y, batchSize):
    #x=torch.argmax(x,dim=1)

    x = x.reshape (int (x.shape[0]/batchSize), batchSize)
    x = x.T


    #print(x.shape)
    #print(y.shape)
    # reshape to the batch size.
    
    #y = y.T
#    print(x[100])
#    print(y[100])


    # initialize correct to 0.0.
    correct = 0.0

    for i in range(batchSize):
        mask = torch.eq(y[i], 0).int()
        x[i] = (1-mask) * x[i]
        
        if torch.equal(x[i], y[i]):
            correct += 1
            #print (x[i])
            #print(y[i])
    
    return correct

In [30]:
def wandbTrainer ():







    # define the parameters for this training.
    batchSize = 256
    encoderEmbedding = 256
    decoderEmbedding = 256
    hiddenSize = 256
    numLayers = 2
    encDropout = 0
    decDropout = 0
    num_epochs = 3
    learningRate = 0.001
    bidirectional = True
    

    encodingLength = 35



    # obtain the dataLoader objects from the dataLoderCreator.
    trainLoader, valLoader, testLoader, inputSizeEncoder, inputSizeDecoder = createDataLoader (encodingLength, batchSize)


    # defince implicit parameters
    outputSize = inputSizeDecoder

    # Define the model, optimizer and Loss Function. 
    model, encoder, decoder = compile (inputSizeEncoder, inputSizeDecoder, encoderEmbedding, decoderEmbedding, hiddenSize, outputSize, numLayers, encDropout, decDropout, learningRate, bidirectional)


    # define the optimizer and the loss function.
    criterion = nn.CrossEntropyLoss(reduction = "sum", ignore_index=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)




    # Call the training function with appropriate parameters
    trainModel = trainerLoop (trainLoader, valLoader, model, encoder, decoder, optimizer, criterion, encodingLength, num_epochs, batchSize)        


In [31]:
if __name__ == "__main__":


    # Start wandb :
    wandbTrainer()

    # Just Train maybe.. I don't know.

    # Need to write selection logic sooner or later.

<bound method Module.parameters of EncoderDecoder(
  (encoder): Encoder(
    (dropout): Dropout(p=0, inplace=False)
    (embedding): Embedding(30, 256)
    (rnn): LSTM(256, 256, num_layers=2, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0, inplace=False)
    (embedding): Embedding(50, 256)
    (rnn): LSTM(256, 256, num_layers=2, bidirectional=True)
    (gelu): LeakyReLU(negative_slope=0.01)
    (fc): Linear(in_features=512, out_features=50, bias=True)
  )
)>


  0%|          | 0/3 [00:00<?, ?it/s]

Training Loss : 0.5318, Training Accuracy : 2.2773
Validation Loss : 1.508928782599313, Validation Accuracy : 0.0
Training Loss : 0.2315, Training Accuracy : 15.9199
Validation Loss : 1.6848846708025251, Validation Accuracy : 0.0
Training Loss : 0.1340, Training Accuracy : 38.5859
Validation Loss : 1.9447686331612724, Validation Accuracy : 0.0
