# Hindi-to-English NMT with a 2-Layer Unidirectional LSTM Encoder and Decoder



In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm
!python -m spacy download en

Cloning the Indic NLP Library (and its resources) used to process Hindi sentences.

In [None]:
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!pip install Morfessor

fatal: destination path 'indic_nlp_library' already exists and is not an empty directory.
fatal: destination path 'indic_nlp_resources' already exists and is not an empty directory.


In [None]:
# Filesystem locations of the two Indic NLP checkouts cloned above.
#   INDIC_NLP_LIB_HOME  -> library source tree (appended to sys.path before import)
#   INDIC_NLP_RESOURCES -> resources repository handed to the library at start-up
INDIC_NLP_LIB_HOME = "/content/indic_nlp_library"
INDIC_NLP_RESOURCES = "/content/indic_nlp_resources"

Importing all the packages and libraries needed for the neural machine translation (NMT) model.

In [None]:
# --- Indic NLP bootstrap (order matters) -----------------------------------
# The cloned checkout is put on sys.path first so that the `indicnlp` imports
# below can be resolved from it.
import sys
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))
from indicnlp import common
# Point the library at the cloned resources directory before loading it.
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader
# Presumably initialises the library using the resources path set above.
loader.load()
# --- Data handling ------------------------------------------------------------
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
# --- Modelling ----------------------------------------------------------------
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import math
import time
# --- Tokenisation and batching --------------------------------------------------
from indicnlp.tokenize import indic_tokenize  
from torchtext.legacy.data import Field, BucketIterator, TabularDataset
import spacy

In [None]:
# Seed every RNG the notebook relies on (Python, NumPy, PyTorch CPU/GPU) so that
# the split, weight initialisation and teacher forcing are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # silently ignored when CUDA is unavailable

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Parallel corpus: keep only the two text columns, then hold out 2% for validation.
df = pd.read_csv('./drive/MyDrive/train/train.csv')
df = df[['hindi', 'english']]
train, val = train_test_split(df, test_size=0.02, random_state=SEED)
train.to_csv('train.csv', index=False)
val.to_csv('val.csv', index=False)

# Hindi sentences to translate, plus the training split re-read from disk.
df2 = pd.read_csv('hindistatements.csv')
df3 = pd.read_csv('train.csv')

# Each test sentence is paired with the English text of the same-numbered row
# of train.csv. That text is NOT a translation of the test sentence; presumably
# it is only a placeholder so the test CSV has both columns (TODO confirm).
# Fail up front with a clear message instead of an IndexError part-way through
# writing the file.
if len(df2) > len(df3):
    raise ValueError(
        f"hindistatements.csv has {len(df2)} rows but train.csv only has "
        f"{len(df3)}; not enough rows to fill the 'english' column."
    )

# newline='' is what the csv module requires for files handed to csv.writer;
# the explicit UTF-8 encoding keeps Devanagari text independent of the platform default.
with open('actual_test.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["hindi", "english"])
    # zip stops after len(df2) pairs, matching the original index loop.
    writer.writerows(zip(df2['hindi'], df3['english']))

# Round-trip through pandas to produce the file that TabularDataset reads.
df4 = pd.read_csv('actual_test.csv')
df4.to_csv('test.csv', index=False)

In [None]:
# Load spaCy's small English pipeline once; in the code shown here only its
# `.tokenizer` is used (by tokenize_english).
spacy_en = spacy.load('en_core_web_sm')
def tokenize_hindi(text):
    """Return the tokens that Indic NLP's ``trivial_tokenize`` yields for ``text``, as a list."""
    tokens = indic_tokenize.trivial_tokenize(text)
    return list(tokens)

def tokenize_english(text):
    """Return the ``.text`` of every token produced by the spaCy English tokenizer for ``text``."""
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

# Field definitions: both sides are wrapped in <sos>/<eos> markers; only the
# English (target) side is lower-cased.
SRC = Field(tokenize=tokenize_hindi, init_token='<sos>', eos_token='<eos>', lower=False)
TRG = Field(tokenize=tokenize_english, init_token='<sos>', eos_token='<eos>', lower=True)

# CSV column name -> (attribute name on each example, Field that processes it).
fields = {
    'hindi': ('src', SRC),
    'english': ('trg', TRG),
}

# The three CSV files are read from the current working directory.
train_data, valid_data, test_data = TabularDataset.splits(
    path='.',
    format='csv',
    fields=fields,
    train='train.csv',
    validation='val.csv',
    test='test.csv',
)

# Vocabularies are built from the training split only, with min_freq=2
# (rarer tokens are expected to fall back to the unknown token; see torchtext docs).
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

# Batches of 32 examples, sorted inside each batch by source-sentence length.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=32,
    sort_key=lambda example: len(example.src),
    sort_within_batch=True,
    device=device,
)

In [None]:
class Encoder(nn.Module):
    """Embed source token indices and summarise them with a multi-layer LSTM.

    Only the LSTM's final hidden and cell states are returned; its per-step
    outputs are discarded.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        """Build the embedding table, the LSTM stack and the dropout layer.

        Args:
            input_dim: number of rows in the embedding table (source vocabulary size).
            emb_dim: embedding width, also the LSTM input size.
            hid_dim: LSTM hidden size.
            n_layers: number of stacked LSTM layers.
            dropout: probability used both inside the LSTM and on the embeddings.
        """
        super().__init__()

        # Kept as attributes so Seq2Seq can compare them with the decoder's.
        self.hid_dim, self.n_layers = hid_dim, n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` and return the LSTM's final ``(hidden, cell)`` pair.

        ``src`` holds token indices; with the LSTM's default (non batch-first)
        layout it is expected to be shaped (src_len, batch).
        """
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        _, (hidden, cell) = self.rnn(embedded)
        return hidden, cell


class Decoder(nn.Module):
    """Single-step LSTM decoder that maps one token per sequence to vocabulary scores."""

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        """Build the embedding table, LSTM stack, output projection and dropout.

        Args:
            output_dim: target vocabulary size (embedding rows and projection width).
            emb_dim: embedding width, also the LSTM input size.
            hid_dim: LSTM hidden size.
            n_layers: number of stacked LSTM layers.
            dropout: probability used both inside the LSTM and on the embeddings.
        """
        super().__init__()

        # Read by Seq2Seq (output_dim sizes its result buffer; the other two
        # are compared with the encoder's).
        self.output_dim, self.hid_dim, self.n_layers = output_dim, hid_dim, n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Advance the decoder by one time step.

        Args:
            input: 1-D tensor of token indices, one per sequence in the batch.
            hidden: LSTM hidden state carried over from the previous step.
            cell: LSTM cell state carried over from the previous step.

        Returns:
            ``(prediction, hidden, cell)``: the projected scores for this step
            and the updated LSTM state.
        """
        # Add a leading length-1 axis so the LSTM sees a one-step sequence.
        step_input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_input))
        rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # Drop that axis again before projecting to vocabulary scores.
        prediction = self.fc_out(rnn_out.squeeze(0))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing."""

    def __init__(self, encoder, decoder, device):
        """Store the two sub-networks and check that their LSTM states are compatible.

        Args:
            encoder: module whose forward returns ``(hidden, cell)`` and which
                exposes ``hid_dim`` and ``n_layers``.
            decoder: module exposing ``output_dim``, ``hid_dim`` and ``n_layers``;
                called as ``decoder(input, hidden, cell)``.
            device: device on which the output buffer is allocated.

        Raises:
            AssertionError: if hidden sizes or layer counts differ.
        """
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # The encoder's final (hidden, cell) is passed straight to the decoder,
        # so both LSTM stacks must have the same state shape.
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Decode ``trg.shape[0] - 1`` steps and return the per-step scores.

        Args:
            src: source batch handed unchanged to the encoder.
            trg: target token indices indexed as ``trg[t]`` per time step
                (time on axis 0, batch on axis 1); ``trg[0]`` seeds the decoder.
            teacher_forcing_ratio: probability, drawn independently at every
                step, of feeding the ground-truth token instead of the model's
                own arg-max prediction.

        Returns:
            Tensor of shape (trg_len, batch_size, decoder.output_dim). Row 0 is
            never written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Allocate directly on the target device instead of on CPU followed by a copy.
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=self.device)

        hidden, cell = self.encoder(src)
        input = trg[0, :]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output

            # One random draw per step decides between ground truth and prediction.
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1

        return outputs

In [None]:

# Both sides share the same sizes: 256-d embeddings, 512-d hidden state,
# 2 stacked LSTM layers and dropout 0.5. Vocabulary sizes come from the fields.
enc = Encoder(input_dim=len(SRC.vocab), emb_dim=256, hid_dim=512, n_layers=2, dropout=0.5)
dec = Decoder(output_dim=len(TRG.vocab), emb_dim=256, hid_dim=512, n_layers=2, dropout=0.5)

# Wrap the pair and move every parameter to the selected device.
model = Seq2Seq(encoder=enc, decoder=dec, device=device).to(device)

def init_weights(m):
    """Re-draw every parameter reachable from ``m`` uniformly from [-0.08, 0.08], in place."""
    for param in m.parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
     
# nn.Module.apply invokes init_weights on the model and on each of its submodules.
model.apply(init_weights)

# Adam with its default hyper-parameters (no learning rate or betas are passed).
optimizer = optim.Adam(model.parameters())
# Index of the target padding token; handed to the loss as ignore_index so that
# padded target positions do not contribute to it.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean of the per-batch losses.

    Args:
        model: called as ``model(src, trg)``; must return scores whose last
            axis is the vocabulary and whose first axis is the time step.
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: optimizer over ``model.parameters()``.
        criterion: loss called as ``criterion(scores_2d, targets_1d)``.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()
    epoch_loss = 0

    for batch in iterator:
        src = batch.src
        trg = batch.trg

        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]

        # Skip time step 0 on both sides and flatten the remaining steps.
        # reshape (rather than view) also accepts non-contiguous tensors, for
        # which view raises a RuntimeError.
        output = output[1:].reshape(-1, output_dim)
        trg = trg[1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()
        # Rescale gradients so their global norm does not exceed `clip`.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss without updating the model.

    The model is switched to eval mode, gradients are disabled, and the model
    is called with 0 as its third argument (the teacher forcing ratio), so
    decoding is driven by the model's own predictions.

    Args:
        model: called as ``model(src, trg, 0)``; must return scores whose last
            axis is the vocabulary and whose first axis is the time step.
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: loss called as ``criterion(scores_2d, targets_1d)``.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for batch in iterator:
            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0)
            output_dim = output.shape[-1]

            # Skip time step 0 on both sides and flatten the remaining steps.
            # reshape (rather than view) also accepts non-contiguous tensors,
            # for which view raises a RuntimeError.
            output = output[1:].reshape(-1, output_dim)
            trg = trg[1:].reshape(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole (minutes, seconds)."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # Whatever is left after removing the whole minutes, truncated to an int.
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:

N_EPOCHS = 10  # number of full passes over the training iterator
CLIP = 1       # maximum gradient norm handed to train()

# Per-epoch loss history, printed by a later cell.
training_loss = []
validation_loss = []
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    training_loss.append(train_loss)
    validation_loss.append(valid_loss)

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when validation loss improves, so '2LSTM.pt' always holds
    # the weights with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '2LSTM.pt')

    # The elapsed time was previously computed but never reported.
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'Train Loss: {train_loss:.3f}')
    print(f'Val. Loss: {valid_loss:.3f}')
    print("=====================================================")

Epoch: 01
Train Loss: 4.881
Val. Loss: 4.743
Epoch: 02
Train Loss: 4.212
Val. Loss: 4.466
Epoch: 03
Train Loss: 3.857
Val. Loss: 4.342
Epoch: 04
Train Loss: 3.580
Val. Loss: 4.252
Epoch: 05
Train Loss: 3.355
Val. Loss: 4.205
Epoch: 06
Train Loss: 3.168
Val. Loss: 4.194
Epoch: 07
Train Loss: 3.015
Val. Loss: 4.198
Epoch: 08
Train Loss: 2.890
Val. Loss: 4.217
Epoch: 09
Train Loss: 2.781
Val. Loss: 4.241
Epoch: 10
Train Loss: 2.685
Val. Loss: 4.265


In [None]:
# Dump the raw loss histories gathered by the training loop
# (one value per epoch: training first, then validation).
print(training_loss)
print(validation_loss)

[4.881316395496667, 4.211683106407209, 3.85655080891751, 3.5799211834598363, 3.3553206841125562, 3.16809576316175, 3.01523053212461, 2.8901991181755493, 2.780895181822792, 2.6850299606600156]
[4.742612112313509, 4.465935755521059, 4.3415204510092735, 4.251764938235283, 4.205137647688389, 4.193554386496544, 4.198031611740589, 4.217468999326229, 4.240992661565542, 4.265145502984524]


In [None]:
# Restore the checkpoint written by the training loop (lowest validation loss).
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only runtime.
model.load_state_dict(torch.load('2LSTM.pt', map_location=device))
model.eval()

example_idx = 66
example = test_data.examples[example_idx]
print('Hindi sentence:', ' '.join(example.src))

# Field.process pads and numericalises the single-example batch.
src_tensor = SRC.process([example.src]).to(device)

# Greedy decoding that does not depend on the target column. In test.csv that
# column is filled from train.csv rows, so it must not dictate how many tokens
# are generated. Decoding stops at <eos> or after MAX_DECODE_LEN steps.
MAX_DECODE_LEN = 50
sos_idx = TRG.vocab.stoi[TRG.init_token]
eos_idx = TRG.vocab.stoi[TRG.eos_token]

output_idx = []  # predicted target indices, excluding <sos> and <eos>
with torch.no_grad():
    hidden, cell = model.encoder(src_tensor)
    next_token = torch.tensor([sos_idx], device=device)
    for _ in range(MAX_DECODE_LEN):
        prediction, hidden, cell = model.decoder(next_token, hidden, cell)
        next_token = prediction.argmax(1)
        if next_token.item() == eos_idx:
            break
        output_idx.append(next_token.item())

Predicted_English = ' '.join(TRG.vocab.itos[idx] for idx in output_idx)
print("Predicted_English:", Predicted_English)

Hindi sentence: क्यों ?
Predicted_English: why ? ? <eos> <eos>
