In [1]:
import torch
import pandas as pd
import os
import random
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm


In [2]:
# Fetch the dataset archive with the `gdown` command-line tool (the argument is presumably a
# Google Drive file id — confirm) and unpack it quietly into the current working directory.
# NOTE(review): the recorded output below shows `gdown` was not on PATH, so the unzip step had no archive to open.
!gdown "1uRKU4as2NlS9i8sdLRS1e326vQRdhvfw"
!unzip -q aksharantar_sampled.zip

zsh:1: command not found: gdown
unzip:  cannot find or open aksharantar_sampled.zip, aksharantar_sampled.zip.zip or aksharantar_sampled.zip.ZIP.


In [3]:
class PrepText():
    """Character-level vocabulary plus fixed-length encoder/decoder for word pairs.

    Two independent vocabularies are kept: ``X`` for the source words and ``Y``
    for the target words. Ids 0, 1 and 2 are always ``"PAD"``, ``"SOS"`` and
    ``"EOS"``; every distinct character seen by :meth:`makeDict` gets the next
    free id.

    Attributes:
        textToNumX / textToNumY: token -> id for source / target.
        numToTextX / numToTextY: id -> token for source / target.
        encodingLength: length of every encoded vector (SOS + characters + EOS + padding).
    """

    # Order matters: the position in this tuple is the id of the special token.
    SPECIAL_TOKENS = ("PAD", "SOS", "EOS")

    def __init__ (self, maxSize):
        """Create an empty vocabulary that encodes words into vectors of length ``maxSize``."""
        self.textToNumX = {}
        self.numToTextX = {}
        self.textToNumY = {}
        self.numToTextY = {}
        self.encodingLength = maxSize

    @staticmethod
    def _extendVocab(words, textToNum, numToText):
        """Register the special tokens and every unseen character of ``words`` in the given maps."""
        for index, token in enumerate(PrepText.SPECIAL_TOKENS):
            textToNum[token] = index

        for word in words:
            for letter in word:
                if letter not in textToNum:
                    # len() is always the next free id, so calling makeDict again
                    # extends the vocabulary instead of re-issuing ids from 3.
                    textToNum[letter] = len(textToNum)

        for letter, number in textToNum.items():
            numToText[number] = letter

    def makeDict(self, wordsX, wordsY):
        """Build (or extend) both vocabularies from iterables of source and target words."""
        self._extendVocab(wordsX, self.textToNumX, self.numToTextX)
        self._extendVocab(wordsY, self.textToNumY, self.numToTextY)

    def lenOutput(self):
        """Return the number of target-side tokens, special tokens included."""
        return len(self.numToTextY)

    def lenInput(self):
        """Return the number of source-side tokens, special tokens included."""
        return len(self.numToTextX)

    def _encode(self, word, textToNum):
        """Encode one word as ``[SOS, chars..., EOS, PAD...]`` of length ``encodingLength``."""
        needed = len(word) + 2  # SOS + characters + EOS
        if needed > self.encodingLength:
            # Same exception type the out-of-bounds tensor write used to produce, with a usable message.
            raise IndexError(
                f"word {word!r} needs {needed} positions but encodingLength is {self.encodingLength}"
            )

        ids = [textToNum["SOS"]]
        ids.extend(textToNum[letter] for letter in word)
        ids.append(textToNum["EOS"])

        vector = torch.zeros(self.encodingLength, dtype = torch.int)  # 0 is the PAD id
        vector[:needed] = torch.tensor(ids, dtype = torch.int)
        return vector

    def vectorizeOneWord(self, wordX, wordY):
        """Encode a (source, target) word pair.

        Returns:
            Tuple ``(vectorX, vectorY)`` of ``torch.int`` tensors of length ``encodingLength``.

        Raises:
            KeyError: a character is not in the corresponding vocabulary.
            IndexError: a word does not fit into ``encodingLength`` positions.
        """
        return self._encode(wordX, self.textToNumX), self._encode(wordY, self.textToNumY)

    @staticmethod
    def _decode(vector, numToText):
        """Turn a sequence of ids back into a string, dropping SOS/PAD and stopping at EOS."""
        letters = []
        for element in vector:
            token = numToText[int(element)]
            if token == "EOS":
                break
            if token in ("PAD", "SOS"):
                continue
            letters.append(token)
        return "".join(letters)

    def vectorToWord (self, vectorA, vectorB):
        """Decode a (source, target) pair of id vectors back into words.

        The lookup goes through the id -> token maps; special tokens are not part
        of the returned strings, so ``vectorToWord(*vectorizeOneWord(a, b)) == (a, b)``.

        Raises:
            KeyError: a vector contains an id that is not in the vocabulary.
        """
        return self._decode(vectorA, self.numToTextX), self._decode(vectorB, self.numToTextY)

In [4]:
import numpy as np
class AksharantarData(Dataset):
    """Dataset of (english, hindi) word pairs read from a header-less two-column CSV.

    Each item is the pair of fixed-length id vectors produced by the vocabulary's
    ``vectorizeOneWord``.

    Args:
        rootPath: path of the CSV file.
        max_size: encoded vector length used when a new vocabulary is built.
        vocab: optional ready-made vocabulary object (e.g. the ``vocab`` of another
            split) to reuse, so that several splits share the same token ids. When
            given, ``max_size`` is not used and no new vocabulary is built.
    """

    def __init__(self, rootPath, max_size, vocab=None):
        self.root  = rootPath
        # dtype=str / keep_default_na=False: words such as "nan", "null" or "NA" and
        # digit-only strings must stay text instead of becoming float NaN / integers.
        self.df = pd.read_csv(
            self.root,
            names = ["english", "hindi"],
            dtype = str,
            keep_default_na = False,
        )

        self.english = self.df["english"]
        self.hindi = self.df["hindi"]

        if vocab is None:
            vocab = PrepText(max_size)
            vocab.makeDict(self.english, self.hindi)
        self.vocab = vocab

    def lenOutput(self):
        """Return the size of the target-side vocabulary."""
        return self.vocab.lenOutput()

    def lenInput(self):
        """Return the size of the source-side vocabulary."""
        return self.vocab.lenInput()

    def getDictEng (self):
        """Return the source-side token -> id mapping."""
        return self.vocab.textToNumX

    def getDictHin (self):
        """Return the target-side token -> id mapping."""
        return self.vocab.textToNumY

    def __len__(self):
        """Return the number of word pairs in the CSV."""
        return len(self.df)

    def __getitem__ (self, idx):
        """Return ``(encodedEnglish, encodedHindi)`` for the pair at position ``idx``."""
        # Positional lookup; nothing is cached on self so items do not leak into each other.
        englishWord = self.english.iloc[idx]
        hindiWord = self.hindi.iloc[idx]
        vecEncodedX, vecEncodedY = self.vocab.vectorizeOneWord(englishWord, hindiWord)
        return (vecEncodedX, vecEncodedY)

In [5]:
# Folder holding the extracted Hindi CSVs; update if the archive was unpacked elsewhere.
DATA_DIR = "/content/aksharantar_sampled/hin"
# Length of every encoded word vector (SOS + characters + EOS + padding).
MAX_SEQ_LEN = 35

# NOTE(review): each AksharantarData instance builds its vocabulary from its own CSV,
# so the same character may get different ids in the train, validation and test splits.
trainData = AksharantarData(f"{DATA_DIR}/hin_train.csv", MAX_SEQ_LEN)
valData = AksharantarData(f"{DATA_DIR}/hin_valid.csv", MAX_SEQ_LEN)
testData = AksharantarData(f"{DATA_DIR}/hin_test.csv", MAX_SEQ_LEN)


In [6]:
from torch.utils.data import DataLoader


# Mini-batch size shared by the three loaders.
BATCH_SIZE = 128

# Training batches are reshuffled every epoch.
trainloader = DataLoader(trainData, shuffle = True, batch_size = BATCH_SIZE)

# Evaluation loaders keep the file order: shuffling adds nothing to summed metrics
# and makes per-batch inspection non-reproducible.
valLoader = DataLoader(valData, shuffle = False, batch_size = BATCH_SIZE)
testLoader = DataLoader(testData, shuffle = False, batch_size = BATCH_SIZE)

In [7]:
import torch.nn as nn

In [8]:
class Encoder(nn.Module):
    """Embeds a batch of source id sequences and summarises it with a multi-layer LSTM.

    Args:
        input_size: number of rows in the embedding table.
        embedding_size: width of each embedding vector.
        hidden_size: LSTM hidden width.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, used on the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)

        self.embedding = nn.Embedding(
            num_embeddings=input_size, embedding_dim=embedding_size
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )

    def forward(self, x):
        """Return the final ``(hidden, cell)`` LSTM state for ``x`` of shape (seq_length, N)."""
        # (seq_length, N) -> (seq_length, N, embedding_size)
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        # The per-step outputs are not needed; only the final state is handed to the decoder.
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell

In [9]:
class Decoder (nn.Module):
    """Single-step LSTM decoder: previous token ids in, next-token scores out.

    ``p`` is only used as the dropout between stacked LSTM layers; the embeddings
    are not dropped out here.

    NOTE(review): a second class named ``Decoder`` is defined further down in this
    notebook; once that cell runs, it replaces this definition.
    """

    def __init__ (self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super().__init__()
        # Keep the construction arguments around for inspection.
        self.input_size, self.embedding_size = input_size, embedding_size
        self.output_size, self.hidden_size = output_size, hidden_size
        self.num_layers = num_layers

        self.embedding_layer = nn.Embedding(
            num_embeddings=input_size, embedding_dim=embedding_size
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward (self, X, hidden, cell):
        """Advance the decoder by one step and return ``(predictions, hidden, cell)``."""
        # Add a leading length-1 sequence axis for the LSTM.
        step_input = X.unsqueeze(0)
        step_embedding = self.embedding_layer(step_input)

        step_output, state = self.rnn(step_embedding, (hidden, cell))
        hidden, cell = state

        # Drop the length-1 sequence axis again after projecting to output_size.
        scores = self.fc(step_output).squeeze(0)
        return scores, hidden, cell


In [10]:
class Decoder(nn.Module):
    """Single-step LSTM decoder with dropout on the token embeddings.

    Args:
        input_size: number of rows in the embedding table.
        embedding_size: width of each embedding vector.
        hidden_size: LSTM hidden width.
        output_size: number of scores produced per token.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, used on the embeddings and between LSTM layers.
    """

    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)

        self.embedding = nn.Embedding(
            num_embeddings=input_size, embedding_dim=embedding_size
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden, cell):
        """Consume one token per batch element and return ``(predictions, hidden, cell)``.

        ``x`` has shape (N); the returned predictions have shape (N, output_size).
        """
        # One decoding step is a sequence of length 1: (N) -> (1, N).
        step_tokens = x.unsqueeze(0)

        # (1, N) -> (1, N, embedding_size)
        step_embedding = self.dropout(self.embedding(step_tokens))

        # (1, N, embedding_size) -> (1, N, hidden_size), plus the updated state.
        step_output, state = self.rnn(step_embedding, (hidden, cell))
        hidden, cell = state

        # (1, N, output_size) -> (N, output_size), the layout the loss function expects.
        scores = self.fc(step_output)
        return scores.squeeze(0), hidden, cell

In [11]:
class EncoderDecoder(nn.Module):
    """Sequence-to-sequence wrapper: encode the source once, then decode step by step.

    The module is self-contained: the output vocabulary size is read from the
    decoder's final linear layer (``decoder.fc``) and the result tensor is
    allocated on the device of ``source``, so no notebook globals are needed.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Return per-step scores of shape (target_len, N, vocab).

        Args:
            source: source ids, shape (source_len, N).
            target: target ids, shape (target_len, N); ``target[0]`` is the first decoder input.
            teacher_force_ratio: probability, drawn per step, of feeding the ground-truth
                token instead of the decoder's own best guess. Use 0 for pure greedy decoding.

        Returns:
            Tensor whose slice ``[t]`` holds the scores predicted for ``target[t]``;
            slice ``[0]`` is never written and stays zero.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        # The encoder's final state is the decoder's initial context.
        hidden, cell = self.encoder(source)

        # First decoder input: row 0 of the target (the start token for every batch element).
        x = target[0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output

            # Most likely token for every batch element.
            best_guess = output.argmax(1)

            # Teacher forcing: sometimes feed the true token, otherwise the model's guess.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

            

In [12]:
# Training hyper parameters
# Same value the DataLoaders were built with, so the two can never drift apart.
batchSize = BATCH_SIZE

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): lenInput()/lenOutput() already count PAD, SOS and EOS, so the "+1"
# reserves one extra index that no token maps to.
inputSizeEncoder = trainData.lenInput()+1
inputSizeDecoder = trainData.lenOutput()+1
outputSize = trainData.lenOutput()+1

# Model shape.
encoderEmbedding = 300
decoderEmbedding = 300
hiddenSize = 1024
numLayers = 8

# Regularisation and optimisation.
encDropout = 0.5
decDropout = 0.5
num_epochs = 10
learningRate = 0.001

In [13]:
# Build the two halves of the seq2seq model and move everything to the selected device.
encoder = Encoder (inputSizeEncoder, encoderEmbedding, hiddenSize, numLayers, encDropout).to(device)
decoder = Decoder (inputSizeDecoder, decoderEmbedding,  hiddenSize, outputSize, numLayers, decDropout).to(device)
model = EncoderDecoder(encoder, decoder).to(device)

# Every target is padded to a fixed length, so most positions are PAD; excluding them
# keeps the summed loss about the real characters instead of about predicting padding.
criterion = nn.CrossEntropyLoss(reduction = "sum", ignore_index = trainData.getDictHin()["PAD"])
optimizer = torch.optim.Adam(model.parameters(), lr=learningRate)

In [13]:
# The method is referenced but not called, so the cell echoes the bound-method repr,
# which embeds the printed module tree shown below.
model.parameters

<bound method Module.parameters of EncoderDecoder(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(29, 300)
    (rnn): LSTM(300, 1024, num_layers=8, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(67, 300)
    (rnn): LSTM(300, 1024, num_layers=8, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=67, bias=True)
  )
)>

In [14]:


# Debug switches kept from the original cell, now explicit:
#   maxTrainBatches     - stop every training epoch after this many batches (None = full epoch)
#   stopAfterFirstEpoch - leave the epoch loop after one train/validate round
maxTrainBatches = 100
stopAfterFirstEpoch = True

padIdx = trainData.getDictHin()["PAD"]


def _forwardBatch(x, y, teacherForceRatio):
    """Run the model on one loader batch.

    Returns ``(loss, correctWords, nTokens, nWords)`` where ``loss`` is the summed
    criterion value (still attached to the graph), ``nTokens`` the number of target
    positions that contribute to it and ``nWords`` the number of words in the batch.
    """
    # The DataLoader yields (batch, seq) but the model is time-major (seq, batch).
    x = x.to(device).T
    y = y.to(device).long().T

    logits = model(x, y, teacherForceRatio)          # (seq, batch, vocab)

    # Position 0 is the start token: the model never predicts it (its slice stays zero).
    logits = logits[1:]
    gold = y[1:]
    loss = criterion(logits.reshape(-1, logits.shape[2]), gold.reshape(-1))

    # Word-level accuracy: one row per word, padding positions never count as mistakes.
    goldWords = gold.T                               # (batch, seq - 1)
    predWords = logits.detach().argmax(2).T
    predWords = torch.where(goldWords == padIdx, goldWords, predWords)
    # wordAccuracy is defined in a later cell of this notebook; that cell must have been run first.
    correctWords = wordAccuracy(predWords, goldWords, goldWords.shape[0])

    nTokens = int((goldWords != criterion.ignore_index).sum())
    return loss, correctWords, nTokens, goldWords.shape[0]


for epoch in tqdm(range(num_epochs)):
    print (f'Epoch {epoch}/{num_epochs}')

    # ---- training ----
    model.train()
    trainLoss, trainCorrect, trainTokens, trainWords = 0.0, 0, 0, 0

    for batchNo, (x, y) in enumerate(trainloader, start=1):
        optimizer.zero_grad()
        loss, correctWords, nTokens, nWords = _forwardBatch(x, y, 0.5)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) # avoid exploding gradient problem
        optimizer.step()

        # .item() detaches the value so the autograd graph of each batch can be freed.
        trainLoss += loss.item()
        trainCorrect += correctWords
        trainTokens += nTokens
        trainWords += nWords

        if maxTrainBatches is not None and batchNo >= maxTrainBatches:
            print(f"After {batchNo} batches,========")
            break

    # Normalise by what was actually processed, not by a hard-coded dataset size.
    trainLoss /= max(trainTokens, 1)
    trainAcc = trainCorrect / max(trainWords, 1)
    print(f"Training Loss : {trainLoss}")
    print(f"Training accuracy : {trainAcc}")

    # ---- validation ----
    model.eval()
    valLoss, valCorrect, valTokens, valWords = 0.0, 0, 0, 0

    with torch.inference_mode():
        for x, y in valLoader:
            # No teacher forcing: the model must decode from its own predictions.
            loss, correctWords, nTokens, nWords = _forwardBatch(x, y, 0.0)
            valLoss += loss.item()
            valCorrect += correctWords
            valTokens += nTokens
            valWords += nWords

    valLoss /= max(valTokens, 1)
    valAcc = valCorrect / max(valWords, 1)
    print(f"Validation Loss = {valLoss} and Validation accuracy = {valAcc}")

    if stopAfterFirstEpoch:
        break


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0/10


KeyboardInterrupt: ignored

In [None]:
# Number of validation batches times the nominal batch size; this is an upper bound on the
# number of validation samples whenever the final batch is not full.
len(valLoader)*batchSize

4096

In [15]:
def wordAccuracy (prediction, target, batchSize):
    """Count how many of the first ``batchSize`` entries of ``prediction`` equal ``target``.

    Entries are taken along the first dimension and compared with ``torch.equal``:
    for 2-D inputs of shape (words, positions) an entry is a whole row, so a word
    only counts when every position matches; for 1-D inputs single ids are compared.

    Args:
        prediction: tensor of predicted ids.
        target: tensor of reference ids, laid out like ``prediction``.
        batchSize: maximum number of leading entries to compare. Fewer are compared
            when the tensors are shorter (e.g. the last, partial batch of a loader).

    Returns:
        The number of matching entries as a Python ``int``.
    """
    limit = max(batchSize, 0)

    answer = 0
    # zip stops at the shorter tensor, so a partial batch cannot index out of range.
    for predicted, expected in zip(prediction[:limit], target[:limit]):
        answer += int(torch.equal(predicted, expected))

    return answer