In [1]:
#INSPIRED FROM NLP FROM SCRATCH -PYTORCH OFFICIAL DOCS.
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import pickle
import bcolz

In [2]:
###PHASE 1 - DATA PROCESSING (REUSABLE CODE FOR 3,4,5)######

#1. Extract Glove Embedding Dictionary. Maps almost every single word in every language to vector.
vectors = bcolz.open('6B.50.dat')[:]
words = pickle.load(open('6B.50_words.pkl', 'rb'))
word2idx = pickle.load(open('6B.50_idx.pkl', 'rb'))

glove = {w: vectors[word2idx[w]] for w in words}

In [3]:
#2. Initialise Language Database

SOS_token = 0
EOS_token = 1

class LanguageDatabase:
    def __init__(self,language):
        #At the begining only SOS and EOS characters are in vocabulary
        self.language_name = language
        self.num_words = 2;
        self.word2index = {}
        self.index2word = {0:"SOS",1:"EOS"}
        self.word2count = {}
    
    def len_vocab(self):
        return self.num_words
    
    def add_new(self,text):
        for token in text.split(' '):
            if(token not in self.word2index):
                self.word2index[token] = self.num_words
                self.index2word[self.num_words] = token
                self.word2count[token] = 1
                self.num_words+=1;
            else:
                self.word2count[token] += 1

In [4]:
#Preprocessing English Text(Imperfect.. Need to be better. Know REGEX?? Pls help here)
# Basically remove punctuations attached with text. for example 'is!' should be replaced by 'is'. 'What?' should be replaced by What.
#Helps reduce redundancy,

#Convert all chatracters to lowercase and strip trailing spaces
def Preprocess(s):
    s = s.lower().strip()
    
    #REMOVE NON LETTER CHARACTERS
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    
    return s

#BS Function -  DO SOMETHING
def Preprocess_Hindi(s):
    lines = re.split('\W=',s)
    s= "";
    for l in lines:
        if(s!=""):
            s = s + l;
        else:
            s = s + " " + l;
    return s

In [5]:
#3. Read file and initialise Hindi and English Language Database.The hindi database occassionaly contains English words.
def init_db():
    training_en_lines = open('DataSet2/train.en',encoding='utf-8').read().strip().split('\n')
    training_hi_lines = open('DataSet2/train.hi',encoding='utf-8').read().strip().split('\n')
    validation_en_lines = open('DataSet2/dev.en',encoding='utf-8').read().strip().split('\n')
    validation_hi_lines = open('DataSet2/dev.hi',encoding='utf-8').read().strip().split('\n')
    
    #PAIRS Structure ENGLISH -> HINDI
    training_pairs = [[Preprocess(l1),l2] for l1,l2 in zip(training_en_lines,training_hi_lines)]
    validation_pairs = [[Preprocess(l1),l2] for l1,l2 in zip(validation_en_lines,validation_hi_lines)]
    
    #ENGLISH
    en_lang_db = LanguageDatabase('english')
    
    #HINDI
    hi_lang_db = LanguageDatabase('hindi')
    
    print('Database Initialised. Populating....')
    #Database must know every single word that could be present in the language in our data.
    
    #Training Text
    for [en,hi] in training_pairs:
        en_lang_db.add_new(en)
        hi_lang_db.add_new(hi)
        
    #Validation Text
    for [en,hi] in validation_pairs:
        en_lang_db.add_new(en)
        hi_lang_db.add_new(hi)
        
    print('Finished!')
        
    return en_lang_db,hi_lang_db,training_pairs,validation_pairs

In [6]:
EnglishDB,HindiDB,train_pairs,val_pairs = init_db()

Database Initialised. Populating....
Finished!


In [7]:
#Crucial Parameters - MUST BE CORRECT!
TRAIN_VOCAB_LENGTH = EnglishDB.len_vocab()
TRANSLATE_VOCAB_LENGTH = HindiDB.len_vocab()

In [8]:
def Get_Training_matrix(VOCAB_LENGTH,DB):
    weights_matrix = np.zeros((VOCAB_LENGTH,50))
    discovered_words = 0
    for i in range(VOCAB_LENGTH):
        word = DB.index2word[i]
        try:
            weights_matrix[i] = glove[word]
            discovered_words+=1
        except KeyError:
            weights_matrix[i] = np.random.normal(scale = 0.6,size=(50,))
    return weights_matrix

In [9]:
#5. Extract Word Vectors for each word corresponding to indexes in database as a matrix.


# This matrix can be multiplied with one hot representation of words to get the corresponding Glove representation
# Initialise embedding layer weights with this matrix

TRAIN_weights_matrix = torch.tensor(Get_Training_matrix(TRAIN_VOCAB_LENGTH,EnglishDB))
#Confirmed
TRAIN_weights_matrix[2] == torch.tensor(glove[EnglishDB.index2word[2]])

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True])

In [10]:
TRANSLATION_weights_matrix = torch.tensor(Get_Training_matrix(TRANSLATE_VOCAB_LENGTH,HindiDB))
#Confirmed
TRANSLATION_weights_matrix[2] == torch.tensor(glove[HindiDB.index2word[2]])

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True])

In [11]:
#### PHASE 2 - THE MODEL ####
#7 Embed Layer Creator with Glove Weights(Adopted from Medium Article on using Pretrained Networks)
def init_embed_layer(weights_matrix,notrain=False):
    num_embeddings, embeddin_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings,embeddin_dim)
    if notrain:
        emb_layer.requires_grad = False
    return emb_layer,num_embeddings,embeddin_dim

In [12]:
#8. Get the pairs reading for putting in model.
def get_tensors_from_pair(pair):
    indexes = [EnglishDB.word2index[word] for word in pair[0].split(' ')]
    while(len(indexes) < 100):
        indexes.append(EOS_token)
    input_tensor = torch.tensor(indexes, dtype=torch.long, device=device)
    indexes = [HindiDB.word2index[word] for word in pair[1].split(' ')]
    while(len(indexes) < 100):
        indexes.append(EOS_token)
    target_tensor = torch.tensor(indexes, dtype=torch.long, device=device)
    return (input_tensor,target_tensor)

training_pairs_en = [get_tensors_from_pair(p)[0] for p in train_pairs]
training_pairs_hi = [get_tensors_from_pair(p)[1] for p in train_pairs]
validation_pairs_en = [get_tensors_from_pair(p)[0] for p in val_pairs]
validation_pairs_hi = [get_tensors_from_pair(p)[1] for p in val_pairs]

In [13]:
TP_en = torch.stack(training_pairs_en)
TP_hi = torch.stack(training_pairs_hi)
VP_en = torch.stack(validation_pairs_en)
VP_hi = torch.stack(validation_pairs_hi)

In [14]:
class training_set(torch.utils.data.Dataset):
    def __init__(self,En,Hi):
        self.En = En                          # set data
        self.Hi = Hi                           # set lables

    def __len__(self):
        return len(self.En)                   # return length

    def __getitem__(self, idx):
        return [self.En[idx], self.Hi[idx]] 

In [15]:
training_dataset = training_set(TP_en,TP_hi)
validation_dataset = training_set(VP_en,VP_hi)

In [16]:
train_loader = torch.utils.data.DataLoader(training_dataset, batch_size=250, shuffle=True)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=250, shuffle=True)

In [17]:
#6 Encoder Model using Embedding and LSTM.
class Encoder(nn.Module):
    def __init__(self, input_size, embed_layer_size, hidden_layer_size,weights_matrix, n_layers=1):
        
        #Input size , embed_layer_size = weights_matrix.shape
        
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        self.input_size = input_size
        self.n_layers = n_layers
        
        #Define Embedding Layer
        #DO NOT CHANGE THE GLOVE EMBEDDING WEIGHTS. THEY ARE PERFECTION.
        self.embedding, num_embeddings, embeddin_dim = init_embed_layer(weights_matrix,True)
        self.embedding.weight = nn.Parameter(weights_matrix)
        
        self.rnn = nn.LSTM(embed_layer_size, hidden_layer_size, n_layers)
    
    def forward(self, input, hidden, cell):
        #print('INPUT SHAPE',input.shape)
        
        #Generate GloVe Encoding from One Hot Representation of the word
        
        encoding = self.embedding(input)
        
        #print(encoding)
        #print(encoding.shape,hidden.shape,cell.shape)
        #print('WORD REPRESENTATION IN ENCODER',encoding.shape)
        outputs,(hidden,cell) = self.rnn(encoding.float(),(hidden.float(), cell.float()))
        
        #print('OUTPUTS SHAPE',outputs.shape,hidden.shape,cell.shape)
        
        #print('ENCODER DONE')
        
        #We only need the decoded hidden and state values for encoder.
        return hidden,cell
    
    def initHidden(self,batchSize):
        return torch.zeros(1,batchSize,self.hidden_layer_size,device=device)


In [18]:
#7 Decoder Model using Embedding and LSTM.
class Decoder(nn.Module):
    def __init__(self, output_dim, embed_dim, hid_dim, weights_matrix, n_layers = 1):
        super().__init__()
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        #Define Embedding Layer
        #DO NOT CHANGE THE GLOVE EMBEDDING WEIGHTS. THEY ARE PERFECTION.
        self.embedding, num_embeddings, embeddin_dim = init_embed_layer(weights_matrix,True)
        self.embedding.weight = nn.Parameter(weights_matrix)
        
        self.rnn = nn.LSTM(embed_dim, hid_dim, n_layers)
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
    
    def forward(self, input, hidden, state):
        
        #print('INPUT SHAPE',input.shape)
        #print('HIDDEN SHAPE',hidden.shape)
        #print('STATE SHAPE',state.shape)
        
        input = input.unsqueeze(0)
        
        #print('INPUT IN DECODER',input.shape)
        #Generate GloVe Encoding from One Hot Representation of the word
        encoding = self.embedding(input)
        
        #print('WORD REPRESENTATION IN DECODER',encoding.shape)
        #print(encoding.shape,hidden.shape,state.shape)
        outputs,(hidden,state) = self.rnn(encoding.float(),(hidden.float(), state.float()))
        
        #print('OUTPUTS',outputs.shape)
        #print('HIDDEN_OUTPUTS',hidden.shape)
        #print('HIDDEN_STATE',state.shape)
        
        prediction = self.fc_out(outputs.squeeze(0))
        
        #print(prediction.shape)
        
        return prediction, hidden, state
    
    def initHidden(self,batchSize):
        return torch.zeros(1,batchSize,self.hid_dim,device=device)

In [19]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        h0 = self.encoder.initHidden(250)
        cell0 = self.encoder.initHidden(250)
        hidden, cell = self.encoder(src,h0,cell0)
        
        #print('HIDDEN_INPUT',hidden.shape)
        #print('CELL_INPUT',cell.shape)
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1
        
        return outputs

In [20]:
#9. Initialize model params and optimizer..Potential HyperParameters.
enc = Encoder(TRAIN_weights_matrix.shape[0], TRAIN_weights_matrix.shape[1], 100, TRAIN_weights_matrix, 1)
dec = Decoder(TRANSLATION_weights_matrix.shape[0], TRANSLATION_weights_matrix.shape[1], 100, TRANSLATION_weights_matrix, 1)
model = Seq2Seq(enc, dec, device).to(device)

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, 1.0, 3.0)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(19279, 50)
    (rnn): LSTM(50, 100)
  )
  (decoder): Decoder(
    (embedding): Embedding(40651, 50)
    (rnn): LSTM(50, 100)
    (fc_out): Linear(in_features=100, out_features=40651, bias=True)
  )
)

In [21]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 7,223,851 trainable parameters


In [22]:
optimizer = optim.Adam(model.parameters())

In [23]:
def train(model, dataloader, optimizer , clip):
    
    model.train()
    
    epoch_loss = 0
    
    dataindex = 0
    
    for src,trg in dataloader:
        
        print('START')
        src = src.t()
        trg = trg.t()
        
        optimizer.zero_grad()
        
        #print('SRC shape ', src.shape)
        #print('TRG shape ', src.shape)
        print('MODELLING')
        output = model(src, trg)
        print('FINISHED MODELLING')
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = torch.flatten(trg[1:])
        s = nn.Softmax(dim=1)
        output = s(output)
        #print('OK')
        #Need to set ignore index
        print(output)
        criterion = nn.CrossEntropyLoss()
        print(output<0)
        print(trg<0)
        loss = criterion(output, trg)
        print(loss.item())
        
        #loss.backward()
        print('OK')
        #torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        print('STEPPED')
        epoch_loss += loss.item()
        dataindex+=1
        
    return epoch_loss / len(iterator)

In [None]:
train_loss = train(model, train_loader, optimizer, 1.0)

START
MODELLING
FINISHED MODELLING
tensor([[2.1248e-07, 7.2887e-14, 4.1564e-09,  ..., 4.1813e-11, 2.4648e-12,
         2.4946e-08],
        [2.1248e-07, 7.2887e-14, 4.1564e-09,  ..., 4.1813e-11, 2.4648e-12,
         2.4946e-08],
        [2.1248e-07, 7.2887e-14, 4.1564e-09,  ..., 4.1813e-11, 2.4648e-12,
         2.4946e-08],
        ...,
        [2.1248e-07, 7.2887e-14, 4.1564e-09,  ..., 4.1813e-11, 2.4648e-12,
         2.4946e-08],
        [2.1248e-07, 7.2887e-14, 4.1564e-09,  ..., 4.1813e-11, 2.4649e-12,
         2.4944e-08],
        [2.1248e-07, 7.2887e-14, 4.1564e-09,  ..., 4.1813e-11, 2.4649e-12,
         2.4944e-08]], grad_fn=<SoftmaxBackward>)
tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., Fal

In [None]:
print(x.shape)