### Import Required Libraries

In [None]:
import string
import re
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras_preprocessing.sequence import pad_sequences
from keras.models import load_model
from tensorflow.keras import optimizers
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_colwidth', 200)

## Read IIT Data Bombay

In [None]:
# !pip install datasets==1.18.1
from datasets import load_dataset
dataset = load_dataset("cfilt/iitb-english-hindi")

In [None]:
data_size=15000

In [None]:
hin_eng=[]
for translation_pair in dataset["train"]["translation"][:15000]:
  source_sentence = translation_pair["hi"]
  target_sentence = translation_pair["en"]
  hin_eng.append([target_sentence, source_sentence ])

hin_eng = array(hin_eng)
del dataset

### Text Pre-Processing

#### Text Cleaning

Let's take a look at our data, then we will decide which pre-processing steps to adopt.

We will get rid of the punctuation marks, and then convert the text to lower case.

In [None]:
# Remove punctuation
hin_eng[:,0] = [s.translate(str.maketrans('', '', string.punctuation)) for s in hin_eng[:,0]]
hin_eng[:,1] = [s.translate(str.maketrans('', '', string.punctuation)) for s in hin_eng[:,1]]

In [None]:
# convert to lowercase
for i in range(len(hin_eng)):
    hin_eng[i,0] = hin_eng[i,0].lower()
    
    hin_eng[i,1] = hin_eng[i,1].lower()

#### Text to Sequence Conversion

To feed our data in a Seq2Seq model, we will have to convert both the input and the output sentences into integer sequences of fixed length. Before that, let's visualise the length of the sentences. We will capture the lengths of all the sentences in two separate lists for English and German, respectively.

In [None]:
# empty lists
eng_l = []
deu_l = []

# populate the lists with sentence lengths
for i in hin_eng[:,0]:
    eng_l.append(len(i.split()))

for i in hin_eng[:,1]:
    deu_l.append(len(i.split()))

In [None]:
length_df = pd.DataFrame({'eng':eng_l, 'deu':deu_l})

In [None]:
length_df.hist(bins = 30)
plt.show()

The maximum length of the German sentences is 11 and that of the English phrases is 8.

Let's vectorize our text data by using Keras's Tokenizer() class. It will turn our sentences into sequences of integers. Then we will pad those sequences with zeros to make all the sequences of same length.

In [None]:
# function to build a tokenizer
def tokenization(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [None]:
# prepare english tokenizer
eng_tokenizer = tokenization(hin_eng[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1

eng_length = 8
print('English Vocabulary Size: %d' % eng_vocab_size)

In [None]:
# prepare Deutch tokenizer
deu_tokenizer = tokenization(hin_eng[:, 1])
deu_vocab_size = len(deu_tokenizer.word_index) + 1

deu_length = 8
print('Deutch Vocabulary Size: %d' % deu_vocab_size)

Given below is a function to prepare the sequences. It will also perform sequence padding to a maximum sentence length as mentioned above. 

In [None]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    seq = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    seq = pad_sequences(seq, maxlen=length, padding='post')
    return seq

### Model Building

We will now split the data into train and test set for model training and evaluation, respectively.

In [None]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(hin_eng, test_size=0.2, random_state = 12)

It's time to encode the sentences. We will encode German sentences as the input sequences and English sentences as the target sequences. It will be done for both train and test datasets.

In [None]:
# prepare training data
trainX = encode_sequences(deu_tokenizer, deu_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])

In [None]:
# prepare validation data
testX = encode_sequences(deu_tokenizer, deu_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])

Now comes the exciting part! Let us define our Seq2Seq model architecture. We are using an Embedding layer and an LSTM layer as our encoder and another LSTM layer followed by a Dense layer as the decoder.  

In [None]:
# build NMT model
def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
    model = Sequential()
    model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
    model.add(LSTM(units))
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(units, return_sequences=True))
    model.add(Dense(out_vocab, activation='softmax'))
    return model

We are using RMSprop optimizer in this model as it is usually a good choice for recurrent neural networks.

In [None]:
model = build_model(deu_vocab_size, eng_vocab_size, deu_length, eng_length, 512)
rms = optimizers.RMSprop(lr=0.001)
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')

Please note that we have used __'sparse_categorical_crossentropy'__ as the loss function because it allows us to use the target sequence as it is instead of one hot encoded format. One hot encoding the target sequences with such a huge vocabulary might consume our system's entire memory.

It seems we are all set to start training our model. We will train it for 30 epochs and with a batch size of 512. You may change and play these hyperparameters. We will also be using __ModelCheckpoint()__ to save the best model with lowest validation loss. I personally prefer this method over early stopping.

In [None]:
filename = 'model.h1.27_nov_22'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1), 
          epochs=30, batch_size=512, 
          validation_split = 0.2,
          callbacks=[checkpoint], verbose=0)

Let's compare the training loss and the validation loss.

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['train','validation'])
plt.show()

### Make Predictions

Let's load the saved model to make predictions.

In [None]:
import numpy as np

In [None]:
model = load_model(filename, compile=False  )
# preds = model.predict_classes(testX.reshape((testX.shape[0],testX.shape[1])))
# preds = (model.predict(testX.reshape((testX.shape[0],testX.shape[1]))) > 0.5).astype("int32")
predict_x=model.predict(testX.reshape((testX.shape[0],testX.shape[1]))) 
classes_x=np.argmax(predict_x,axis=1)
classes_x

In [211]:
preds=classes_x

In [215]:
def get_word(n, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    return None

In [217]:
# convert predictions into text (English)
preds_text = []

for i in preds:
    temp = []
    for j in range(len(i)):
        t = get_word(i[j], eng_tokenizer)
        if j > 0:
            if (t == get_word(i[j-1], eng_tokenizer)) or (t == None):
                temp.append('')
            else:
                temp.append(t)
             
        else:
            if(t == None):
                temp.append('')
            else:
                temp.append(t)            
   
    preds_text.append(' '.join(temp))

In [224]:
pred_df = pd.DataFrame({'actual' : test[:,0], 'predicted' : preds_text})


In [222]:
pd.set_option('display.max_colwidth', 200)

In [223]:
pred_df.head(15)

Unnamed: 0,actual,predicted
0,hopscotch,s the the the the the the the the the the the the the the the the the the the the the the the the the t...
1,cancel command,project the the of project the the the the the the the the the of the the the the of the the a of the the a the of the of of the of ...
2,the five of spades,project the a of of the the of the the the of of s the the the of the of the a the of a a the to a of the of a the ...
3,watch operations,project the the of file s file s the the the the the a the the the the to the a the the the the of the the the a of the the a the the of the ...
4,web,s the the the the the the the the the the the the the the the the the the the the the the the the the the the...
5,foreground color,project the of the the the the the of of the of the a of the of the the the the of the of of ...
6,loaded session,project the the of the to a the the the the a the the the the file the the the the the the the the the the of the the to the the the ...
7,place the seven of hearts next to the six of hearts,project to the a a of the project the the s of a the s the the the to the a project s to s s the s the s project s s s the the project ...
8,deal a card from the deck,project to the a to s a of s of to to of a of to to of the of to the a to of s of to a to file to of to a to s s to to of to of to of to s to of t...
9,underline plain,s project the project the the the the project the the the the project the project project the the the the the the the the the project the ...


In [225]:
pred_df.tail(15)

Unnamed: 0,actual,predicted
2985,widgets,project the the the the the the the the the the the the ...
2986,anjuta manual,project file the of the the of a project a the of the the a the the the to the the a the file the the the of the the a of the the a the of the of...
2987,view help for this game,project the a the to of the the a the the the the a the the a of a of to the of the to the to the a to to the of a the a to the to the ...
2988,preferred plugins,project the the a project a the the the the a the the the a the the a the the the the the a the a the the the the of the the th...
2989,short cut,project the the the the the the the the the the the the ...
2990,gtype,project a the the the of project the the ...
2991,update,project the to the the a to the the to the the the to the the the the to the the the the the the the ...
2992,try dealing a row of cards,project to of a to s a s a of a to of a of to the a s to a the of to the to s of s a to s to a to file to s to a to s to s to s to s to a the s to s to s of to s o...
2993,save the selected style with the modified parameters,project the a the to s of file to of project file to the file s the file to of the to to of file of the project to of s project project a a s of the file to project a ...
2994,joker,s the the the the the the the the the the the the the the the the the the the the the the the ...


In [226]:
pred_df.sample(15)

Unnamed: 0,actual,predicted
2412,type,project the of s ...
1246,git push complete,project the of the of project s project of the the of the the project the the the the the s the project of the the the the of the the project of the ...
2264,open documents,project a the of s the to the of a a the ...
1939,move a onto the ten of diamonds,project a s the a the project of file the project the of a of file to the the file the a project the of file the project to a file to file project to project to of the ...
2341,available plugins,project the the of a the the the the the a the the the a the the a the the the the the the the a of the the a the the the the of th...
570,clean module,project the a project file a the the the the project the the the the file the file the the the the the the the project the the project the ...
1428,view,project the the the the to the the the the the the the the the the the the the the the the the the the the...
2766,started,project the the the of the the the of the the the the the the the o...
2542,cannot open s s,project a to a to a of a the a to a a to the a the to of a of the a of the s of file of of a to of the of a to a to the file to of the to a of a of to of ...
1181,gnome games web site,project the the the the the the the the the the the the to the the the the the of a the the the the of the the of the the ...
