In [1]:

import os
from helper import load_variable, save_variable
import tensorflow as tf
import numpy as np
from config import Weights_DIR
from dataset import prepare_data
from models.lstm.model import LSTMModel
max_sentence_length = 16
train_df = prepare_data(type='train', max_entries=1000)

In [11]:
class Translate:
    def __init__(self, path_to_weights=Weights_DIR, data=None, max_sentence_length=16, embedding_size=256):
        if path_to_weights is not None:
            self.path_to_weights = path_to_weights
        
        if data is not None:
            self.data = data
        else:
            print(
                "to instantiate the translation pass the path to weights or pass the training data")
            assert False
        self.en_hi_model = None
        self.hi_en_model = None
        self.english_tokenizer = tf.keras.preprocessing.text.Tokenizer(
            filters='', oov_token='<OOV>', lower=False)
        self.hindi_tokenizer = tf.keras.preprocessing.text.Tokenizer(
            filters='', oov_token='<OOV>', lower=False)
        self.english_vocab_size = None
        self.hindi_vocab_size = None
        self.validation_split = 0.15
        self.max_sentence_length = max_sentence_length

    def load_weights(self):
        if os.path.exists(os.path.join(self.path_to_weights, 'english_tokenizer')):
            self.english_tokenizer = load_variable(
                os.path.join(self.path_to_weights, 'english_tokenizer'))
            self.english_vocab_size = len(
                self.english_tokenizer.word_index) + 1
        else:
            print('Could not find english tokenizer weights')

        if os.path.exists(os.path.join(self.path_to_weights, 'hindi_tokenizer')):
            self.hindi_tokenizer = load_variable(
                os.path.join(self.path_to_weights, 'hindi_tokenizer'))
            self.hindi_vocab_size = len(self.hindi_tokenizer.word_index)+1
        else:
            print("Could not find hindi_tokenizer weights")

        if os.path.exists(os.path.join(self.path_to_weights, 'hi_en_model.h5')):
            self.hi_en_model = LSTMModel(encoder_vocab_size=self.hindi_vocab_size, decoder_vocab_size=self.english_vocab_size)
            self.hi_en_model(np.array([[[1]*self.max_sentence_length], [[1]*self.max_sentence_length]]))
            self.hi_en_model.load_weights(os.path.join(self.path_to_weights, 'hi_en_model.h5'))
        else:
            print("could not load hi_en_model")

        if os.path.exists(os.path.join(self.path_to_weights, 'en_hi_model.h5')):
            self.en_hi_model = LSTMModel(encoder_vocab_size=self.english_vocab_size, decoder_vocab_size=self.hindi_vocab_size)
            self.en_hi_model(np.array([[[1]*self.max_sentence_length], [[1]*self.max_sentence_length]]))
            self.en_hi_model.load_weights(os.path.join(self.path_to_weights, 'en_hi_model.h5'))
        else:
            print('Could not load en_hi_model')

    def tokenize_data(self):
        print("filtering the sentences on the basis of their length")
        data = self.data
        data = data[data['en'].apply(
            lambda x: len(x.split()) < max_sentence_length)]
        data = data[data['hi'].apply(
            lambda x: len(x.split()) < max_sentence_length)]
        print(
            f"{data.__len__()} sentence pairs are valid or less than max_sentence_length")
        english_sentences = self.data['en'].to_list()
        hindi_sentences = self.data['hi'].to_list()

        print("start tokenization..")
        self.english_tokenizer.fit_on_texts(english_sentences)
        self.hindi_tokenizer.fit_on_texts(hindi_sentences)
        self.english_sequences = self.english_tokenizer.texts_to_sequences(
            english_sentences)
        self.hindi_sequences = self.hindi_tokenizer.texts_to_sequences(
            hindi_sentences)

        self.english_vocab_size = len(self.english_tokenizer.word_index) + 1
        self.hindi_vocab_size = len(self.hindi_tokenizer.word_index) + 1
        print("English Vocab Size: ", self.english_vocab_size)
        print("Hindi Vocab Size: ", self.hindi_vocab_size)

    def train_en_hi(self, num_epochs=10, optimizer='rmsprop', metrics=['accuracy'], **kwargs):

        if self.en_hi_model == None:
            self.en_hi_model = LSTMModel(encoder_vocab_size=self.english_vocab_size,
                                         decoder_vocab_size=self.hindi_vocab_size)
        encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(self.english_sequences,
                                                                       maxlen=max_sentence_length,
                                                                       padding='post')

        decoder_inputs = []
        decoder_outputs = []
        for sentence in self.hindi_sequences:
            decoder_inputs.append(sentence[:-1])
            decoder_outputs.append(sentence[1:])
        decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_inputs,
                                                                       maxlen=max_sentence_length,
                                                                       padding='post')
        decoder_outputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_outputs,
                                                                        maxlen=max_sentence_length,
                                                                        padding='post')
        self.en_hi_model.compile(optimizer=optimizer,
                                 loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                                 metrics=metrics)

        callback1 = tf.keras.callbacks.ModelCheckpoint(
            filepath=Weights_DIR+"\\model\\",
            monitor='val_accuracy',
            mode='max'
        )
        self.en_hi_model.fit([encoder_inputs, decoder_inputs],
                             decoder_outputs,epochs=num_epochs ,validation_split=self.validation_split,
                             callbacks=[callback1])

    def train_hi_en(self, num_epochs=10, optimizer='rmsprop', metrics=['accuracy'], **kwargs):
        if self.hi_en_model == None:
            self.hi_en_model = LSTMModel(encoder_vocab_size=self.hindi_vocab_size,
                                         decoder_vocab_size=self.english_vocab_size)
        encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(self.hindi_sequences,
                                                                       maxlen=max_sentence_length,
                                                                       padding='post')

        decoder_inputs = []
        decoder_outputs = []
        for sentence in self.english_sequences:
            decoder_inputs.append(sentence[:-1])
            decoder_outputs.append(sentence[1:])
        decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_inputs,
                                                                       maxlen=max_sentence_length,
                                                                       padding='post')
        decoder_outputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_outputs,
                                                                        maxlen=max_sentence_length,
                                                                        padding='post')
        self.hi_en_model.compile(optimizer=optimizer,
                                 loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                                 metrics=metrics)

        callback1 = tf.keras.callbacks.ModelCheckpoint(
            filepath=Weights_DIR+"\\model\\",
            monitor='val_accuracy',
            mode='max'
        )
        self.hi_en_model.fit([encoder_inputs, decoder_inputs],
                             decoder_outputs, epochs= num_epochs, validation_split=self.validation_split,
                             callbacks=[callback1])
                             
    def train(self, model_to_train=None, num_epochs=2, optimizer='rmsprop', metrics=['accuracy'], **kwargs):
        self.tokenize_data()
        if model_to_train == 'en_hi':
            self.train_en_hi(num_epochs=num_epochs,
                             optimizer=optimizer, metrics=metrics, **kwargs)
        elif model_to_train == 'hi_en':
            self.train_hi_en(num_epochs=num_epochs,
                             optimizer=optimizer, metrics=metrics, **kwargs)
        else:
            self.train_hi_en(num_epochs=num_epochs,
                             optimizer=optimizer, metrics=metrics, **kwargs)
            self.train_en_hi(num_epochs=num_epochs,
                             optimizer=optimizer, metrics=metrics, **kwargs)

    def save_model(self, path=None):
        if path != None and os.path.exists(path):
            self.path_to_weights = path
        print(f"saving model at {self.path_to_weights}")

        save_variable(self.english_tokenizer, os.path.join(
            self.path_to_weights + "\\english_tokenizer"))
        save_variable(self.hindi_tokenizer, os.path.join(
            self.path_to_weights + "\\hindi_tokenizer"))
        self.en_hi_model.save_weights(os.path.join(self.path_to_weights + "\\en_hi_model.h5"))
        self.hi_en_model.save_weights(os.path.join(self.path_to_weights + "\\hi_en_model.h5"))



    def translate_sentence_to_hindi(self,sentence):
        if self.english_tokenizer ==None or self.hindi_tokenizer==None or self.en_hi_model==None:
            print("the translate object is not initialized properly")
            return sentence
        sequence = self.english_tokenizer.texts_to_sequences([sentence])
        embedding = self.en_hi_model.encoder(np.array(sequence))
        _, next_h, next_c= self.en_hi_model.encoder_lstm(embedding)
        curr_token = [[0]]
        curr_token[0][0] = self.hindi_tokenizer.word_index['<START>']
        predict_sentence = ''
        for i in range(max_sentence_length):
                # print(curr_token, next_h.shape, next_c.shape)
                temp = self.en_hi_model.decoder(np.array(curr_token))
                output , next_h, next_c= self.en_hi_model.decoder_lstm(temp,initial_state= [next_h, next_c])
                next_token = np.argmax(output[0, 0, :])
                next_word =self.hindi_tokenizer.index_word[next_token]

                if next_word == '<END>':
                    break
                predict_sentence =predict_sentence+" " + next_word
                curr_token[0][0] = next_token
                #curr_token[0].append(next_token)
        return predict_sentence


    def translate_sentence_to_english(self,sentence):
        if self.english_tokenizer ==None or self.hindi_tokenizer==None or self.hi_en_model==None:
            print("the translate object is not initialized properly")
            return sentence
        sequence = self.hindi_tokenizer.texts_to_sequences([sentence])
        embedding = self.hi_en_model.encoder(np.array(sequence))
        _, next_h, next_c= self.hi_en_model.encoder_lstm(embedding)
        curr_token = [[0]]
        curr_token[0][0] = self.english_tokenizer.word_index['<START>']
        predict_sentence = ''
        for i in range(max_sentence_length):
                # print(curr_token, next_h.shape, next_c.shape)
                temp = self.hi_en_model.decoder(np.array(curr_token))
                output , next_h, next_c= self.hi_en_model.decoder_lstm(temp,initial_state= [next_h, next_c])
                next_token = np.argmax(output[0, 0, :])
                next_word =self.english_tokenizer.index_word[next_token]

                if next_word == '<END>':
                    break
                predict_sentence =predict_sentence+" " + next_word
                curr_token[0][0] = next_token
                #curr_token[0].append(next_token)
        return predict_sentence

if __name__ == "__main__":
    # trans = Translate(path_to_weights=Weights_DIR, data=train_df)
    # trans.train(num_epochs=1)
    # trans.save_model()
    trans1 = Translate(path_to_weights=Weights_DIR, data=train_df)
    trans1.load_weights()
    en ='give your application an accessibility workout ' 
    print(en)
    hi = trans1.translate_sentence_to_hindi(en)
    print(hi)
    en = trans1.translate_sentence_to_english(hi)
    print(en)

give your application an accessibility workout 
(1, 6, 256)
 स्रोत स्रोत विस्तारः अपने अपने अपने अपने अपने अपने अपने अपने अपने अपने अपने अपने अपने
(1, 16, 256)
 by and and current current current current current current event event event event event event event


In [7]:
train_df['en'][0]

'<START> give your application an accessibility workout <END>'

In [2]:
#Make imports
import numpy as np
import re
import pickle
import os
import seaborn as sns
import string

In [3]:
import pandas as pd

In [4]:
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

Tensorflow version 2.3.0


In [5]:
from dataset import prepare_data

In [6]:
train = prepare_data(type='train', max_entries=10000)

In [7]:
#Some parameters
vocab_size = 10000
total_sentences = 25000
maxlen = 10
epochs = 70
validation_split = 0.05

In [8]:
from tqdm import tqdm

In [9]:
en_data = []
hi_data = []

cnt = 0

for (en,hi) in tqdm(zip(train['en'].to_list(), train['hi'].to_list())):
  l = min(len(en.split()), len(hi.split()))
  if l <= maxlen:
    en_data.append(en)
    hi_data.append(hi)
    cnt += 1
  if cnt == total_sentences:
    break

10000it [00:00, 149256.58it/s]


In [10]:
#Tokenize the texts and convert to sequences
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
en_tokenizer.fit_on_texts(en_data)
en_sequences = en_tokenizer.texts_to_sequences(en_data)

hi_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
hi_tokenizer.fit_on_texts(hi_data)
hi_sequences = hi_tokenizer.texts_to_sequences(hi_data)

english_vocab_size = len(en_tokenizer.word_index) + 1
hindi_vocab_size = len(hi_tokenizer.word_index) + 1
print("English Vocab Size: ", english_vocab_size)
print("Hindi Vocab Size: ", hindi_vocab_size)

English Vocab Size:  1428
Hindi Vocab Size:  1849


In [11]:
#Prepare encoder data
encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(en_sequences, maxlen=maxlen, padding='post')

In [12]:
#Prepare decoder data
decoder_inputs = []
decoder_outputs = []

for hi in hi_sequences:
  decoder_inputs.append(hi[:-1])
  decoder_outputs.append(hi[1:])

decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_inputs, maxlen=maxlen, padding='post')
decoder_outputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_outputs, maxlen=maxlen, padding='post')

In [13]:
# Training and Testing split
# 95%, 5%
split = int(0.95 * total_sentences)

X_train = [encoder_inputs[:split], decoder_inputs[:split]]
y_train = decoder_outputs[:split]

# Test data to evaluate our NMT model using BLEU score
X_test = en_data[:split]
y_test = hi_data[:split]

print(X_train[0].shape, X_train[1].shape, y_train.shape)

(9549, 10) (9549, 10) (9549, 10)


In [14]:
#Define LSTM model
d_model = 256

#Encoder
inputs = tf.keras.layers.Input(shape=(None,))
x = tf.keras.layers.Embedding(english_vocab_size, d_model, mask_zero=True)(inputs)
_,state_h,state_c = tf.keras.layers.LSTM(d_model,activation='relu',return_state=True)(x)

#Decoder
targets = tf.keras.layers.Input(shape=(None,))
embedding_layer = tf.keras.layers.Embedding(hindi_vocab_size, d_model, mask_zero=True)
x = embedding_layer(targets)
decoder_lstm = tf.keras.layers.LSTM(d_model,activation='relu',return_sequences=True, return_state=True)
x,_,_ = decoder_lstm(x, initial_state=[state_h, state_c])
dense1 = tf.keras.layers.Dense(hindi_vocab_size, activation='softmax')
x = dense1(x)

model = tf.keras.models.Model(inputs=[inputs, targets],outputs=x)
model.summary()

loss = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer='rmsprop', loss=loss, metrics=['accuracy'])

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 256)    365568      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    473344      input_2[0][0]                    
_______________________________________________________________________________________

In [15]:
from config import *

In [16]:
#Save model after each epoch
save_model_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=Weights_DIR+"\\model\\",
    monitor='val_accuracy',
    mode='max'
)

In [17]:
model.fit(X_train, y_train, epochs=3, validation_split=validation_split, callbacks=[save_model_callback, tf.keras.callbacks.TerminateOnNaN()])

Epoch 1/3
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: c:\Users\Abhishek pandir\ML\EnglishHindiTranslationNLP\weights\model\assets
Epoch 2/3

KeyboardInterrupt: 

In [18]:
model.save(Weights_DIR+"\\model.h5")

In [19]:
#Retrieve previously saved stuff
saved_model = tf.keras.models.load_model(Weights_DIR+"\\model.h5")

saved_model.summary()

inputs = saved_model.get_layer('input_1').output
_,state_h,state_c = saved_model.get_layer('lstm').output
targets = saved_model.get_layer('input_2').output
embedding_layer = saved_model.get_layer('embedding_1')
decoder_lstm = saved_model.get_layer('lstm_1')
dense1 = saved_model.get_layer('dense')

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 256)    365568      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    473344      input_2[0][0]                    
_______________________________________________________________________________________

In [20]:
#Inference Model

#Encoder
encoder = tf.keras.models.Model(inputs, [state_h, state_c])

#Decoder
decoder_input_h = tf.keras.layers.Input(shape=(d_model,))
decoder_input_c = tf.keras.layers.Input(shape=(d_model,))
x = embedding_layer(targets)
x, decoder_output_h, decoder_output_c = decoder_lstm(x, initial_state=[decoder_input_h, decoder_input_c])
x = dense1(x)
decoder = tf.keras.models.Model([targets] + [decoder_input_h, decoder_input_c], 
                                [x] + [decoder_output_h, decoder_output_c])

In [98]:
decoder.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    473344      input_2[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 256)]        0                                            
_______________________________________________________________________________________

In [74]:
def predict_sentence(en_input):
  input_seq = en_tokenizer.texts_to_sequences([en_input])
  next_h, next_c = encoder.predict(input_seq)
  
  curr_token = np.zeros(1)
  curr_token[0] = hi_tokenizer.word_index['<START>']
  
  pred_sentence = ''
  
  for i in range(maxlen):
    print()
    output, next_h, next_c = decoder.predict([curr_token] + [next_h, next_c])
    next_token = np.argmax(output[0, 0, :])
    next_word = hi_tokenizer.index_word[next_token]
    if next_word == '<END>':
      break
    else:
      pred_sentence += ' ' + next_word
      curr_token[0] = next_token

  return pred_sentence

predict_sentence("Give your application an accessibility workout")

[[1, 260, 91, 21, 119, 476]]
[[0.01981037 0.00133565]] [2.] [[0.0428213  0.00343366]]
[[0.01179558 0.00039951]] [7.] [[0.03238681 0.00165538]]
[[0.00087991 0.00012648]] [26.] [[0.00734414 0.00050285]]
[[0.00030256 0.00863845]] [26.] [[0.00162887 0.0265565 ]]
[[6.6287947e-05 1.2668375e-02]] [14.] [[0.00034521 0.03790924]]
[[9.8019900e-06 2.5867904e-03]] [15.] [[7.5834236e-05 1.0165131e-02]]


' एक खाली खाली ले जाएँ'

In [30]:
#Testing and Analysis
import nltk

candidates = []
references = []

ctr = 20 
i = 0

while ctr>0:
  l = len(X_test[i].split())
  if l<=maxlen:   #Choose only sentences of length in range [5,15]
    pred_sentence = predict_sentence(X_test[i])
    candidates.append(pred_sentence.split())

    print("Input: ", X_test[i])
    print("Prediction: ", pred_sentence)

    # google_translated_sentence = translate_client.translate(X_test[i], target_language='hi')['translatedText']
    
    # print("Google Translated Reference: ", google_translated_sentence)
    print("Dataset Reference: ", ' '.join(y_test[i].split()[1:-1]))
    print()
    references.append([y_test[i].split()[1:-1]])

    ctr -= 1
  i += 1

print(nltk.translate.bleu_score.corpus_bleu(references, candidates))

1
1
1
1
1
1
Input:  <START> give your application an accessibility workout <END>
Prediction:   a को एक खाली खाँचा
Dataset Reference:  अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें

1
1
1
1
1
Input:  <START> accerciser accessibility explorer <END>
Prediction:   एक खाली पत्ता चलें
Dataset Reference:  एक्सेर्साइसर पहुंचनीयता अन्वेषक

1
1
1
1
1
1
1
1
Input:  <START> the default plugin layout for the bottom panel <END>
Prediction:   s हेतु s के लिए अंतर्क्रियात्मक खाका
Dataset Reference:  निचले पटल के लिए डिफोल्ट प्लगइन खाका

1
1
1
1
1
1
1
1
Input:  <START> the default plugin layout for the top panel <END>
Prediction:   s हेतु s के लिए अंतर्क्रियात्मक खाका
Dataset Reference:  ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका

1
1
1
1
1
Input:  <START> highlight duration <END>
Prediction:   ताश को हाइलाइट करें
Dataset Reference:  अवधि को हाइलाइट रकें

1
1
1
1
1
1
1
1
Input:  <START> highlight border color <END>
Prediction:   a को एक पत्ता पत्ता ले जाएँ
Dataset Reference:  सीमांत बोर्डर के रंग को हाइलाइट

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
