## Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
%cd "/content/drive/MyDrive/EnglishHindiTranslationNLP"

/content/drive/MyDrive/EnglishHindiTranslationNLP


In [None]:
!pip install datasets

## Helper 

In [None]:
import pickle

# saving
def save_variable(variable, file_path):
    with open(file_path, 'wb') as handle:
        pickle.dump(variable, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
def load_variable(file_path):
    with open(file_path, 'rb') as handle:
        variable = pickle.load(handle)
    return variable

## Imports

In [None]:
#Make imports
import numpy as np
import re
import pickle
import os
import seaborn as sns
import string
import tensorflow as tf
import pandas as pd
from dataset import prepare_data
from tqdm import tqdm

## Define Constants

In [None]:
#Some parameters
vocab_size = 10000
total_sentences = 25000
maxlen = 16
epochs = 70
validation_split = 0.05
max_sentence_length= maxlen

## Load Data

In [None]:
train = prepare_data(type='train', max_entries=total_sentences)
en_data = []
hi_data = []
cnt = 0
for (en,hi) in tqdm(zip(train['en'].to_list(), train['hi'].to_list())):
  l = min(len(en.split()), len(hi.split()))
  if l <= maxlen:
    en_data.append(en)
    hi_data.append(hi)
    cnt += 1
  if cnt == total_sentences:
    break

25000it [00:00, 181172.24it/s]


## Tokenize Text 

In [None]:
#Tokenize the texts and convert to sequences
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
en_tokenizer.fit_on_texts(en_data)
en_sequences = en_tokenizer.texts_to_sequences(en_data)

hi_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
hi_tokenizer.fit_on_texts(hi_data)
hi_sequences = hi_tokenizer.texts_to_sequences(hi_data)

english_vocab_size = len(en_tokenizer.word_index) + 1
hindi_vocab_size = len(hi_tokenizer.word_index) + 1
print("English Vocab Size: ", english_vocab_size)
print("Hindi Vocab Size: ", hindi_vocab_size)

English Vocab Size:  1922
Hindi Vocab Size:  2400


## Pad Sequences

In [None]:
#Prepare encoder data
encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(en_sequences, maxlen=maxlen, padding='post')

In [None]:
#Prepare decoder data
decoder_inputs = []
decoder_outputs = []

for hi in hi_sequences:
  decoder_inputs.append(hi[:-1])
  decoder_outputs.append(hi[1:])

decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_inputs, maxlen=maxlen, padding='post')
decoder_outputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_outputs, maxlen=maxlen, padding='post')

## Test train split

In [None]:
# Training and Testing split
# 95%, 5%
split = int(0.9 * total_sentences)

X_train = [encoder_inputs[:split], decoder_inputs[:split]]
y_train = decoder_outputs[:split]

# Test data to evaluate our NMT model using BLEU score
X_test = en_data[:split]
y_test = hi_data[:split]

print(X_train[0].shape, X_train[1].shape, y_train.shape)

(22500, 16) (22500, 16) (22500, 16)


## Define Model

In [None]:
import tensorflow as tf
class LSTMModel(tf.keras.Model):

    def __init__(self,encoder_vocab_size = None, decoder_vocab_size = None, embedding_size = 128,*args, **kwargs):
        super().__init__(*args, **kwargs)
        self.encoder_vocab_size = encoder_vocab_size
        self.decoder_vocab_size = decoder_vocab_size
        self.embedding_size     = embedding_size
        # encoder
        self.input_1      = tf.keras.layers.InputLayer(input_shape=(None,), name = 'input_1')
        self.embedding_1  =   tf.keras.layers.Embedding(encoder_vocab_size, embedding_size,mask_zero=True, name = 'embedding_1')
        self.encoder_lstm =   tf.keras.layers.LSTM(embedding_size, return_state=True,  name ='encoder_lstm' )
        # decoder 
        self.input_2      = tf.keras.layers.InputLayer(input_shape=(None,), name='input_2')
        self.embedding_2  = tf.keras.layers.Embedding(decoder_vocab_size, embedding_size,mask_zero=True,name= "embedding_2")
        self.decoder_lstm = tf.keras.layers.LSTM(embedding_size,activation='relu',return_sequences=True, return_state=True, name ='decoder_lstm' )

        self.token_layer = tf.keras.layers.Dense(decoder_vocab_size,activation='softmax', name = 'token_layer')

    def call(self,inputs):
        encoder_input = self.input_1(inputs[0])
        decoder_input = self.input_2(inputs[1])
        # encode the inputs 
        encoder_embed = self.embedding_1(encoder_input)
        # run rnn on the encoded sequence
        _, state_h, state_c = self.encoder_lstm(encoder_embed)
        # decode the target 
        decoder_embed = self.embedding_2(decoder_input)
        x, _,_ = self.decoder_lstm(decoder_embed, initial_state=[state_h, state_c])
        return self.token_layer(x)
  
    # def get_config(self):
    #   config = super.get_config()
    #   config['encoder_vocab_size'] = self.encoder_vocab_size
    #   config['decoder_vocab_size'] = self.decoder_vocab_size
    #   config['embedding_size']     = self.embedding_size
    #   return config
    def predict_sequence(self,text, input_tokenizer, output_tokenizer, max_len=maxlen):
      if type(text)!=list:
        text = [text]
      input_sequence = input_tokenizer.texts_to_sequences(text)
      print(input_sequence)
      if type(input_sequence)==list:
        input_sequence = np.array(input_sequence)
      encoder_embed = self.embedding_1(input_sequence)
        # run rnn on the encoded sequence
      _, next_h, next_c = self.encoder_lstm(encoder_embed)
      curr_token = [[0]]
      curr_token[0][0] = output_tokenizer.word_index['<START>']

      out_seq = ""
      for i in range(max_len):
        print(curr_token)
        decoder_embedding = self.embedding_2(np.array(curr_token))
        x, next_h, next_c = self.decoder_lstm(decoder_embedding, initial_state=[next_h, next_c])
        x = self.token_layer(x)
        next_token = np.argmax(x[0,0,:])
        next_word = output_tokenizer.index_word[next_token]
        if next_word =="<END>":
          break
        curr_token[0][0] = next_token
        #curr_token[0].append(next_token)
        out_seq= out_seq+" "+ next_word
      return out_seq

In [None]:
model = LSTMModel(encoder_vocab_size=english_vocab_size, decoder_vocab_size= hindi_vocab_size ,embedding_size=64) 
model(np.array([[[1]*max_sentence_length], [[1]*max_sentence_length]]))
model.summary()

loss = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer='rmsprop', loss=loss, metrics=['accuracy'])



UnknownError: Fail to find the dnn implementation. [Op:CudnnRNNV3]

In [None]:
#Save model after each epoch
save_model_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=Weights_DIR+"\\model\\",
    monitor='val_accuracy',
    mode='max'
)

In [None]:
model.fit(X_train, y_train, epochs=10,batch_size= 64, validation_split=validation_split, callbacks=[save_model_callback, tf.keras.callbacks.TerminateOnNaN()])

Epoch 1/10

In [None]:
model.save_weights(Weights_DIR+"\\model.h5")

In [None]:
saved_model = LSTMModel(encoder_vocab_size=english_vocab_size, decoder_vocab_size= hindi_vocab_size ) 
saved_model(np.array([[[1]*max_sentence_length], [[1]*max_sentence_length]]))
saved_model.load_weights(Weights_DIR+"\\model.h5")
saved_model.predict_sequence("Hi what is your name", en_tokenizer, hi_tokenizer)

In [None]:
#Testing and Analysis
import nltk

candidates = []
references = []

ctr = 20 
i = 0

while ctr>0:
  l = len(X_test[i].split())
  if l<=maxlen:   #Choose only sentences of length in range [5,15]
    pred_sentence = predict_sentence(X_test[i])
    candidates.append(pred_sentence.split())

    print("Input: ", X_test[i])
    print("Prediction: ", pred_sentence)

    # google_translated_sentence = translate_client.translate(X_test[i], target_language='hi')['translatedText']
    
    # print("Google Translated Reference: ", google_translated_sentence)
    print("Dataset Reference: ", ' '.join(y_test[i].split()[1:-1]))
    print()
    references.append([y_test[i].split()[1:-1]])

    ctr -= 1
  i += 1

print(nltk.translate.bleu_score.corpus_bleu(references, candidates))

In [None]:
#Testing and Analysis
import nltk

candidates = []
references = []

ctr = 20 
i = 0

while ctr>0:
  l = len(X_test[i].split())
  if l<=maxlen:   #Choose only sentences of length in range [5,15]
    pred_sentence = predict_sentence(X_test[i])
    candidates.append(pred_sentence.split())

    print("Input: ", X_test[i])
    print("Prediction: ", pred_sentence)

    # google_translated_sentence = translate_client.translate(X_test[i], target_language='hi')['translatedText']
    
    # print("Google Translated Reference: ", google_translated_sentence)
    print("Dataset Reference: ", ' '.join(y_test[i].split()[1:-1]))
    print()
    references.append([y_test[i].split()[1:-1]])

    ctr -= 1
  i += 1

print(nltk.translate.bleu_score.corpus_bleu(references, candidates))