In [None]:
#Make imports
import numpy as np
import re
import pickle
import os
import seaborn as sns
import string
import pandas as pd
import tensorflow as tf

In [None]:
# Mount Google Drive at /content/drive (Colab only); the corpus CSV is read
# from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the parallel corpus. Later cells use its 'source', 'english_sentence'
# and 'hindi_sentence' columns.
df=pd.read_csv("/content/drive/MyDrive/Hindi_English_Truncated_Corpus.csv")

In [None]:
# Drop the provenance column. A non-in-place drop with errors='ignore' keeps
# this cell re-runnable: the in-place version raised KeyError on a second run
# because 'source' was already gone.
df = df.drop(columns=['source'], errors='ignore')

# Keep rows whose English side is 21-199 characters long. Rows with a missing
# English sentence fail both comparisons and are filtered out as well.
english_length = df['english_sentence'].str.len()
mask = (english_length > 20) & (english_length < 200)
df = df.loc[mask]

# Fixed seed so the 64,000-row subset is reproducible.
df = df.sample(64000, random_state=1)
df.head()


Unnamed: 0,english_sentence,hindi_sentence
63241,Indian News Service - National News Agency,इण्डियन न्यूज सर्विस - राष्ट्रीय समाचार एजेंसी
81404,"In West Bengal , it seems set to eat humble pi...",पश्चिम बंगाल में तो वह अपमान का घूंट पीने को भ...
8803,One american dollar is equal to 60 pakistani r...,एक अमरीकी डालर की कीमत लगभग ६० पाकिस्तानी रुपय...
73434,"but between those high highs,",लेकिन इन बेहतरीन लम्हों के बीच
65711,Every other politician went along because when...,और वजह यह थी कि आर्थिक मामलं पर हमेशा विफल विच...


In [None]:
def _add_markers(sentence):
  """Wrap one sentence in the <START>/<END> markers used throughout training."""
  # str() guards against non-string cells (e.g. NaN floats), which would
  # otherwise raise TypeError on concatenation. Previously only the English
  # side had this guard.
  return "<START> " + str(sentence) + " <END>"


# Both languages go through the same helper so they are treated identically.
eng = df["english_sentence"].apply(_add_markers)
hind = df["hindi_sentence"].apply(_add_markers)


In [None]:

# Characters stripped before splitting into words. '<' and '>' are not in the
# list, so the <START>/<END> markers are kept as whole tokens.
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
oov_token = '<unk>'

# Both tokenizers share the same configuration; each is fitted on its own language.
tokenizer_options = dict(filters=filters, oov_token=oov_token)

eng_tokenizer = tf.keras.preprocessing.text.Tokenizer(**tokenizer_options)
eng_tokenizer.fit_on_texts(eng)

hind_tokenizer = tf.keras.preprocessing.text.Tokenizer(**tokenizer_options)
hind_tokenizer.fit_on_texts(hind)

In [None]:
# Sanity check: both sides must hold the same number of sentences.
print(len(eng), len(hind))
print()
# Show the first sentence pair by POSITION. After .sample() the Series index
# holds the original row labels, so eng[1] is a label lookup that raises
# KeyError unless row label 1 happened to be drawn.
eng.iloc[0], hind.iloc[0]

64000 64000



("<START> I'd like to tell you about one such child, <END>",
 '<START> मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी, <END>')

In [None]:
#Some parameters
vocab_size = 52000  # NOTE(review): not referenced by any cell visible in this notebook - confirm before removing
total_sentences = 30000  # cap on sentence pairs collected into en_data/hi_data; also used to size the train split
maxlen = 20  # word-count threshold for the pair filter, padded sequence length, and max decoding steps
epochs = 80  # passed to model.fit
validation_split = 0.05  # passed to model.fit as validation_split

In [None]:
# Collect up to total_sentences pairs in which BOTH sides have at most maxlen
# whitespace-separated words (the <START>/<END> markers count as words).
en_data = []
hi_data = []

for en, hi in zip(eng, hind):
  if len(en_data) >= total_sentences:
    break
  # max(), not min(): with min() a pair passed as soon as its SHORTER side
  # fit, so an over-long sentence on the other side slipped through and was
  # later truncated by pad_sequences.
  longest_side = max(len(en.split()), len(hi.split()))
  if longest_side <= maxlen:
    en_data.append(en)
    hi_data.append(hi)

# Kept for backward compatibility with anything that reads the counter.
cnt = len(en_data)

In [None]:
# Tokenise the length-filtered pairs (en_data/hi_data), not the whole corpus.
# Using eng/hind here meant the maxlen filter and the total_sentences cap never
# reached the training arrays, so over-long sentences were fed to
# pad_sequences and truncated.
en_sequences = eng_tokenizer.texts_to_sequences(en_data)
hi_sequences = hind_tokenizer.texts_to_sequences(hi_data)

# +1 because tokenizer word indices start at 1; index 0 is the padding id
# (the Embedding layers are built with mask_zero=True).
english_vocab_size = len(eng_tokenizer.word_index) + 1
hindi_vocab_size = len(hind_tokenizer.word_index) + 1
print("English Vocab Size: ", english_vocab_size)
print("Hindi Vocab Size: ", hindi_vocab_size)

English Vocab Size:  44305
Hindi Vocab Size:  51960


In [None]:
#Prepare encoder data
# Zero-pad each English sequence at the end (padding='post') to maxlen columns.
# NOTE(review): `truncating` is left at its default, so sequences longer than
# maxlen are cut - presumably from the front, which would drop the <START>
# token; confirm against the Keras pad_sequences documentation.
encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(en_sequences, maxlen=maxlen, padding='post')

In [None]:
#Prepare decoder data
# The decoder reads each Hindi sequence without its last token and is trained
# to emit the same sequence without its first token (shifted by one step).
decoder_inputs = [sequence[:-1] for sequence in hi_sequences]
decoder_outputs = [sequence[1:] for sequence in hi_sequences]

# Zero-pad both at the end so they share the (num_sentences, maxlen) shape.
decoder_inputs, decoder_outputs = (
    tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen, padding='post')
    for sequences in (decoder_inputs, decoder_outputs)
)

In [None]:
# Training and Testing split
# 95%, 5%
split = int(0.95 * total_sentences)

# Training tensors: the first `split` rows of every array.
X_train = [encoder_inputs[:split], decoder_inputs[:split]]
y_train = decoder_outputs[:split]

# Test data to evaluate our NMT model using BLEU score.
# Take the sentences AFTER the split point. Slicing [:split] here reused the
# training range, so the "test" set was the training sentences.
X_test = en_data[split:]
y_test = hi_data[split:]

print(X_train[0].shape, X_train[1].shape, y_train.shape)

(28500, 20) (28500, 20) (28500, 20)


In [None]:
# Inspect the [encoder_inputs, decoder_inputs] training pair.
X_train

[array([[    2,    71,   419, ...,     0,     0,     0],
        [ 5505, 11454,    94, ...,  4969,  3537,     3],
        [    2,    37,   587, ...,     0,     0,     0],
        ...,
        [    2,     8,    15, ...,     0,     0,     0],
        [    2,    89,   741, ...,     0,     0,     0],
        [    2,     7,   123, ...,     0,     0,     0]], dtype=int32),
 array([[    2, 13532,  4759, ...,     0,     0,     0],
        [  176, 16315,     4, ...,   121,  2541,     6],
        [    2,    13,  1425, ...,     0,     0,     0],
        ...,
        [    2,   197,     5, ...,     0,     0,     0],
        [    2,   147,    45, ...,     0,     0,     0],
        [    2,  7816,    69, ...,     0,     0,     0]], dtype=int32)]

In [None]:
# English vocabulary size; used as the encoder Embedding's input dimension.
english_vocab_size

44305

In [None]:
# Width shared by both Embedding layers and both LSTM layers.
d_model = 256

# NOTE(review): no layer below is given an explicit name, yet the reload cell
# looks layers up as 'input_1', 'lstm', 'input_2', 'embedding_1', 'lstm_1' and
# 'dense'. Those names presumably depend on layer creation order and on how
# many layers were already built in the session - confirm before re-running or
# reordering this cell.

#Encoder
# Variable-length sequences of English token ids.
inputs = tf.keras.layers.Input(shape=(None,))
# mask_zero=True: id 0 (the padding value) is treated as a masked timestep.
x = tf.keras.layers.Embedding(english_vocab_size, d_model, mask_zero=True)(inputs)
# The per-step output is discarded; only the final hidden and cell states are
# kept and used to initialise the decoder.
# NOTE(review): activation='relu' is set explicitly on both LSTMs, and
# TerminateOnNaN is attached in the fit cell - possibly related; confirm.
_,state_h,state_c = tf.keras.layers.LSTM(d_model,activation='relu',return_state=True)(x)

#Decoder
# Variable-length sequences of Hindi token ids (the decoder inputs).
targets = tf.keras.layers.Input(shape=(None,))
# The decoder layers are kept in variables so the same objects can be called
# again with different inputs.
embedding_layer = tf.keras.layers.Embedding(hindi_vocab_size, d_model, mask_zero=True)
x = embedding_layer(targets)
decoder_lstm = tf.keras.layers.LSTM(d_model,activation='relu',return_sequences=True, return_state=True)
# Start the decoder from the encoder's final states; its own states are ignored here.
x,_,_ = decoder_lstm(x, initial_state=[state_h, state_c])
# Softmax over the Hindi vocabulary at every timestep.
dense1 = tf.keras.layers.Dense(hindi_vocab_size, activation='softmax')
x = dense1(x)

# Training model: (English ids, Hindi decoder inputs) -> per-step word distribution.
model = tf.keras.models.Model(inputs=[inputs, targets],outputs=x)
model.summary()

# Sparse loss: the targets are integer token ids, not one-hot vectors.
loss = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer='rmsprop', loss=loss, metrics=['accuracy'])



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    11342080    ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    13301760    ['input_2[0][0]']                
                                                                                              

In [None]:
# Convert the [encoder, decoder] input pair and the targets to ndarrays, then
# report their shapes (inputs first, targets second).
X_train = np.array(X_train)
y_train = np.array(y_train)
for converted in (X_train, y_train):
  print(converted.shape)

(2, 28500, 20)
(28500, 20)


In [None]:
# Checkpoint the model to Drive, keeping only the epoch with the highest
# validation accuracy. Without save_best_only=True the monitor/mode arguments
# have no effect and every epoch overwrites the file, so the saved model is
# simply the last epoch (including a diverged one).
save_model_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='./drive/MyDrive/en-hi.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
)


In [None]:
# Train on the (encoder input, decoder input) pair. The checkpoint callback
# writes the model to Drive and TerminateOnNaN aborts if the loss becomes NaN.
training_callbacks = [save_model_callback, tf.keras.callbacks.TerminateOnNaN()]
model.fit(
    [X_train[0], X_train[1]],
    y_train,
    epochs=epochs,
    validation_split=validation_split,
    callbacks=training_callbacks,
)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
 91/847 [==>...........................] - ETA: 2:00 - loss: 4.1322 - accuracy: 0.3466

In [None]:

#Retrieve previously saved stuff
# Reload the checkpoint written during training and print its architecture.
saved_model = tf.keras.models.load_model('./drive/MyDrive/en-hi.h5')
saved_model.summary()

# Encoder side: the input tensor and the encoder LSTM's final states.
inputs = saved_model.get_layer('input_1').output
_, state_h, state_c = saved_model.get_layer('lstm').output

# Decoder side: the input tensor plus the trained layers that the inference
# decoder calls again.
targets = saved_model.get_layer('input_2').output
embedding_layer, decoder_lstm, dense1 = (
    saved_model.get_layer(layer_name)
    for layer_name in ('embedding_1', 'lstm_1', 'dense')
)

In [None]:
d_model = 256
#Inference Model

#Encoder
# English token ids -> final (hidden, cell) states.
encoder = tf.keras.models.Model(inputs=inputs, outputs=[state_h, state_c])

#Decoder
# The states are fed in explicitly so decoding can proceed one token at a time.
decoder_input_h = tf.keras.layers.Input(shape=(d_model,))
decoder_input_c = tf.keras.layers.Input(shape=(d_model,))
decoder_states_in = [decoder_input_h, decoder_input_c]

x = embedding_layer(targets)
x, decoder_output_h, decoder_output_c = decoder_lstm(x, initial_state=decoder_states_in)
x = dense1(x)

# (token ids, h, c) -> (word distribution, new h, new c)
decoder = tf.keras.models.Model(
    inputs=[targets, decoder_input_h, decoder_input_c],
    outputs=[x, decoder_output_h, decoder_output_c],
)

In [None]:


def predict_sentence(en_input):
  """Greedily translate one English sentence to Hindi.

  Uses the notebook-level `eng_tokenizer`, `hind_tokenizer`, `encoder`,
  `decoder` and `maxlen`.

  Args:
    en_input: English sentence, with or without the <START>/<END> markers.

  Returns:
    The predicted Hindi words joined by single spaces ('' if the first
    predicted token is <end> or the padding id).
  """
  # Training sentences were wrapped in <START>/<END> before tokenising, so the
  # encoder must see the same markers at inference time.
  text = str(en_input).strip()
  lowered = text.lower()
  if not lowered.startswith('<start>'):
    text = '<START> ' + text
  if not lowered.endswith('<end>'):
    text = text + ' <END>'

  # Explicit (1, seq_len) array so the input is unambiguously a single batch.
  input_seq = np.array(eng_tokenizer.texts_to_sequences([text]))
  next_h, next_c = encoder.predict(input_seq, verbose=0)

  # Look the start id up instead of hard-coding it; fall back to the previous
  # hard-coded value if the marker is somehow missing from the vocabulary.
  start_token = hind_tokenizer.word_index.get('<start>', 2)
  curr_token = np.zeros((1, 1))
  curr_token[0, 0] = start_token

  predicted_words = []
  for _ in range(maxlen):
    output, next_h, next_c = decoder.predict([curr_token, next_h, next_c], verbose=0)
    next_token = int(np.argmax(output[0, 0, :]))
    # .get(): id 0 (padding) has no entry in index_word and used to raise KeyError.
    next_word = hind_tokenizer.index_word.get(next_token)
    if next_word is None or next_word == '<end>':
      break
    predicted_words.append(next_word)
    curr_token[0, 0] = next_token

  return ' '.join(predicted_words)


# Smoke test: translate one short English phrase and display the prediction.
e="but between those high highs"
h=predict_sentence(e)
h

In [None]:
#Testing and Analysis
import nltk

# Hand-picked evaluation triples, aligned by position: English input, a Hindi
# reference labelled "Dataset", and a Hindi reference labelled "Google Translated".
engtest=["in this book the condition of those who died is there","and deal with the subject in a way","politicians do not have permission to do what needs to be done"]
hintest=["इसमें तुमसे पूर्व गुज़रे हुए लोगों के हालात हैं"," और कैसे इस विषय से निपटें","राजनीतिज्ञों के पास जो कार्य करना चाहिए वह करने कि अनुमति नहीं है"]
gts=["इस पुस्तक में मरने वालों की स्थिति है","और एक तरह से विषय से निपटें","राजनेताओं को वह करने की अनुمति नहीं है जो करने की आवश्यकता है".replace("अनुمति", "अनुमति")]

candidates = []
references = []

for en_sentence, dataset_reference, google_reference in zip(engtest, hintest, gts):
  pred_sentence = predict_sentence(en_sentence)
  candidates.append(pred_sentence.split())

  # These references carry no <START>/<END> markers, so every word is kept;
  # slicing [1:-1] dropped the first and last real words of each reference.
  dataset_tokens = dataset_reference.split()

  print("Input: ", en_sentence)
  print("Prediction: ", pred_sentence)
  print("Google Translated Reference: ", google_reference)
  print("Dataset Reference: ", ' '.join(dataset_tokens))
  print()

  # Each candidate is scored against both references.
  references.append([dataset_tokens, google_reference.split()])

print(nltk.translate.bleu_score.corpus_bleu(references, candidates))