## Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd "/content/drive/MyDrive/EnglishHindiTranslationNLP"

/content/drive/MyDrive/EnglishHindiTranslationNLP


In [3]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 5.2 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 43.2 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.7.1-py3-none-any.whl (141 kB)
[K     |████████████████████████████████| 141 kB 69.8 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 10.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |██████████████████████████

## Helper 

In [4]:
import pickle

# saving
def save_variable(variable, file_path):
    with open(file_path, 'wb') as handle:
        pickle.dump(variable, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
def load_variable(file_path):
    with open(file_path, 'rb') as handle:
        variable = pickle.load(handle)
    return variable

## Imports

In [29]:
#Make imports
import numpy as np
import re
import pickle
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 


from config import *
import seaborn as sns
import string
import tensorflow as tf
import pandas as pd
from dataset import prepare_data
from tqdm import tqdm
import random

In [None]:
def set_seeds(seed=10):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)

## Setup Gpu

In [9]:
physical_devices = tf.config.list_physical_devices('GPU')
# tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

## Define Constants

In [10]:
#Some parameters
vocab_size = 10000
total_sentences = 50000
maxlen = 16
epochs = 70
validation_split = 0.05
max_sentence_length= maxlen

## Load Data

In [11]:
train = prepare_data(type='train', max_entries=total_sentences)
en_data = []
hi_data = []
cnt = 0
for (en,hi) in tqdm(zip(train['en'].to_list(), train['hi'].to_list())):
  l = min(len(en.split()), len(hi.split()))
  if l <= maxlen:
    en_data.append(en)
    hi_data.append(hi)
    cnt += 1
  if cnt == total_sentences:
    break

50000it [00:00, 411648.59it/s]


## Tokenize Text 

In [12]:
#Tokenize the texts and convert to sequences
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
en_tokenizer.fit_on_texts(en_data)
en_sequences = en_tokenizer.texts_to_sequences(en_data)

hi_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
hi_tokenizer.fit_on_texts(hi_data)
hi_sequences = hi_tokenizer.texts_to_sequences(hi_data)

english_vocab_size = len(en_tokenizer.word_index) + 1
hindi_vocab_size = len(hi_tokenizer.word_index) + 1
print("English Vocab Size: ", english_vocab_size)
print("Hindi Vocab Size: ", hindi_vocab_size)

English Vocab Size:  3401
Hindi Vocab Size:  4427


## Define Model Arcitechre 

In [23]:
import tensorflow as tf
class LSTMModel(tf.keras.Model):

    def __init__(self,encoder_vocab_size = None, decoder_vocab_size = None, embedding_size = 128, num_rnn_units=32,*args, **kwargs):
        super().__init__(*args, **kwargs)
        self.encoder_vocab_size = encoder_vocab_size
        self.decoder_vocab_size = decoder_vocab_size
        self.embedding_size     = embedding_size
        # encoder
        self.input_1      = tf.keras.layers.InputLayer(input_shape=(None,), name = 'input_1')
        self.embedding_1  =   tf.keras.layers.Embedding(encoder_vocab_size, embedding_size,mask_zero=True, name = 'embedding_1')
        self.encoder_lstm =   tf.keras.layers.LSTM(num_rnn_units, return_state=True,  name ='encoder_lstm' )
        # decoder 
        self.input_2      = tf.keras.layers.InputLayer(input_shape=(None,), name='input_2')
        self.embedding_2  = tf.keras.layers.Embedding(decoder_vocab_size, embedding_size,mask_zero=True,name= "embedding_2")
        self.decoder_lstm = tf.keras.layers.LSTM(num_rnn_units,activation='relu',return_sequences=True, return_state=True, name ='decoder_lstm' )

        self.token_layer = tf.keras.layers.Dense(decoder_vocab_size,activation='softmax', name = 'token_layer')

    def call(self,inputs):
        encoder_input = self.input_1(inputs[0])
        decoder_input = self.input_2(inputs[1])
        # encode the inputs 
        encoder_embed = self.embedding_1(encoder_input)
        # run rnn on the encoded sequence
        _, state_h, state_c = self.encoder_lstm(encoder_embed)
        # decode the target 
        decoder_embed = self.embedding_2(decoder_input)
        x, _,_ = self.decoder_lstm(decoder_embed, initial_state=[state_h, state_c])
        return self.token_layer(x)
  
    # def get_config(self):
    #   config = super.get_config()
    #   config['encoder_vocab_size'] = self.encoder_vocab_size
    #   config['decoder_vocab_size'] = self.decoder_vocab_size
    #   config['embedding_size']     = self.embedding_size
    #   return config
    def predict_sequence(self,text, input_tokenizer, output_tokenizer, max_len=maxlen):
      if type(text)!=list:
        text = [text]
      input_sequence = input_tokenizer.texts_to_sequences(text)
      if type(input_sequence)==list:
        input_sequence = np.array(input_sequence)
      encoder_embed = self.embedding_1(input_sequence)
        # run rnn on the encoded sequence
      _, next_h, next_c = self.encoder_lstm(encoder_embed)
      curr_token = [[0]]
      curr_token[0][0] = output_tokenizer.word_index['<START>']

      out_seq = ""
      for i in range(max_len):
        decoder_embedding = self.embedding_2(np.array(curr_token))
        x, next_h, next_c = self.decoder_lstm(decoder_embedding, initial_state=[next_h, next_c])
        x = self.token_layer(x)
        next_token = np.argmax(x[0,0,:])
        next_word = output_tokenizer.index_word[next_token]
        if next_word =="<END>":
          break
        curr_token[0][0] = next_token
        #curr_token[0].append(next_token)
        out_seq= out_seq+" "+ next_word
      return out_seq

## English To Hindi Model [link text](https://)

### Prepare input Output Data For English to Hindi Translation Model

In [63]:
#Prepare encoder data
encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(en_sequences, maxlen=maxlen, padding='post')

#Prepare decoder data
decoder_inputs = []
decoder_outputs = []

for hi in hi_sequences:
  decoder_inputs.append(hi[:-1])
  decoder_outputs.append(hi[1:])

decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_inputs, maxlen=maxlen, padding='post')
decoder_outputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_outputs, maxlen=maxlen, padding='post')

# Training and Testing split
# 95%, 5%
split = int(0.9 * total_sentences)

X_train = [encoder_inputs[:split], decoder_inputs[:split]]
y_train = decoder_outputs[:split]

# Test data to evaluate our NMT model using BLEU score
X_test = en_data[:split]
y_test = hi_data[:split]

print(X_train[0].shape, X_train[1].shape, y_train.shape)

(45000, 16) (45000, 16) (45000, 16)


### Initialize model 

In [64]:
en_hi_model = LSTMModel(encoder_vocab_size=english_vocab_size, decoder_vocab_size= hindi_vocab_size ,embedding_size=256, num_rnn_units=64) 
en_hi_model(np.array([[[1]*max_sentence_length], [[1]*max_sentence_length]]))
en_hi_model.summary()
# model.load_weights(Weights_DIR+"\\model.h5")
loss = tf.keras.losses.SparseCategoricalCrossentropy()

Model: "lstm_model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     multiple                  870656    
                                                                 
 encoder_lstm (LSTM)         multiple                  82176     
                                                                 
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_2 (Embedding)     multiple                  1133312   
                                                                 
 decoder_lstm (LSTM)         multiple                  82176     
                                                                 
 token_layer (Dense)         multiple                 

### Train model

In [65]:
#Save model after each epoch
save_model_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=Weights_DIR+"\\model\\",
    monitor='val_accuracy',
    mode='max'
)
en_hi_model.compile(optimizer='rmsprop', loss=loss, metrics=['accuracy'])
en_hi_model.fit(X_train, y_train, epochs=10,batch_size= 64, validation_split=validation_split, callbacks=[save_model_callback, tf.keras.callbacks.TerminateOnNaN()])

Epoch 1/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 2/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 3/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 4/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 5/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 6/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 7/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 8/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 9/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 10/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets




<keras.callbacks.History at 0x7ff86888b950>

### Save Model

In [66]:
en_hi_model.save_weights(Weights_DIR+"\\en_hi_model.h5")

### Test Model

In [67]:
#Testing and Analysis

candidates = []
references = []

ctr = 10 
i = 0

while ctr>0:
  l = len(X_test[i].split())
  if l<=maxlen:   #Choose only sentences of length in range [5,15]
    pred_sentence = en_hi_model.predict_sequence(X_test[i],en_tokenizer, hi_tokenizer)
    candidates.append(pred_sentence.split())

    print("Input: ", X_test[i])
    print("Prediction: ", pred_sentence)

    # google_translated_sentence = translate_client.translate(X_test[i], target_language='hi')['translatedText']
    
    # print("Google Translated Reference: ", google_translated_sentence)
    print("Dataset Reference: ", ' '.join(y_test[i].split()[1:-1]))
    print()
    references.append([y_test[i].split()[1:-1]])

    ctr -= 1
  i += 1

#print(nltk.translate.bleu_score.corpus_bleu(references, candidates))

Input:  <START> give your application an accessibility workout <END>
Prediction:   अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
Dataset Reference:  अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें

Input:  <START> accerciser accessibility explorer <END>
Prediction:   एक्सेर्साइसर पहुंचनीयता अन्वेषक
Dataset Reference:  एक्सेर्साइसर पहुंचनीयता अन्वेषक

Input:  <START> the default plugin layout for the bottom panel <END>
Prediction:   ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका
Dataset Reference:  निचले पटल के लिए डिफोल्ट प्लगइन खाका

Input:  <START> the default plugin layout for the top panel <END>
Prediction:   ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका
Dataset Reference:  ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका

Input:  <START> a list of plugins that are disabled by default <END>
Prediction:   उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है
Dataset Reference:  उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है

Input:  <START> highlight duration <END>
Prediction:   हाइलाइ

## Hindi To English Translation Model

### Prepare input Output Data For Hindi To English Translation Model

In [68]:
#Prepare encoder data
encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(hi_sequences, maxlen=maxlen, padding='post')

#Prepare decoder data
decoder_inputs = []
decoder_outputs = []

for en in en_sequences:
  decoder_inputs.append(en[:-1])
  decoder_outputs.append(en[1:])

decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_inputs, maxlen=maxlen, padding='post')
decoder_outputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_outputs, maxlen=maxlen, padding='post')

# Training and Testing split
# 95%, 5%
split = int(0.9 * total_sentences)

X_train = [encoder_inputs[:split], decoder_inputs[:split]]
y_train = decoder_outputs[:split]

# Test data to evaluate our NMT model using BLEU score
X_test = hi_data[:split]
y_test = en_data[:split]

print(X_train[0].shape, X_train[1].shape, y_train.shape)

(45000, 16) (45000, 16) (45000, 16)


### Initialize model 

In [69]:
hi_en_model = LSTMModel(encoder_vocab_size=hindi_vocab_size, decoder_vocab_size=  english_vocab_size, embedding_size=256, num_rnn_units=64) 
hi_en_model(np.array([[[1]*max_sentence_length], [[1]*max_sentence_length]]))
hi_en_model.summary()
# model.load_weights(Weights_DIR+"\\model.h5")
loss = tf.keras.losses.SparseCategoricalCrossentropy()
hi_en_model.compile(optimizer='rmsprop', loss=loss, metrics=['accuracy'])

Model: "lstm_model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     multiple                  1133312   
                                                                 
 encoder_lstm (LSTM)         multiple                  82176     
                                                                 
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_2 (Embedding)     multiple                  870656    
                                                                 
 decoder_lstm (LSTM)         multiple                  82176     
                                                                 
 token_layer (Dense)         multiple                

### Train model

In [70]:
#Save model after each epoch
save_model_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=Weights_DIR+"\\model\\",
    monitor='val_accuracy',
    mode='max'
)

In [71]:
hi_en_model.fit(X_train, y_train, epochs=10,batch_size= 64, validation_split=validation_split, callbacks=[save_model_callback, tf.keras.callbacks.TerminateOnNaN()])

Epoch 1/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 2/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 3/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 4/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 5/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 6/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 7/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 8/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 9/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


Epoch 10/10



INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/EnglishHindiTranslationNLP/weights\model\/assets




<keras.callbacks.History at 0x7ff85afba490>

### Save Model

In [72]:
hi_en_model.save_weights(Weights_DIR+"\\hi_en_model.h5")

### Test Model

In [73]:
hi_en_model.predict_sequence('<START> उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है<END>', hi_tokenizer, en_tokenizer)

' a list of plugins that are disabled by default'

In [74]:
#Testing and Analysis

candidates = []
references = []

ctr = 10 
i = 0

while ctr>0:
  l = len(X_test[i].split())
  if l<=maxlen:   #Choose only sentences of length in range [5,15]
    pred_sentence = hi_en_model.predict_sequence(X_test[i], hi_tokenizer,en_tokenizer)
    candidates.append(pred_sentence.split())

    print("Input: ", X_test[i])
    print("Prediction: ", pred_sentence)

    # google_translated_sentence = translate_client.translate(X_test[i], target_language='hi')['translatedText']
    
    # print("Google Translated Reference: ", google_translated_sentence)
    print("Dataset Reference: ", ' '.join(y_test[i].split()[1:-1]))
    print()
    references.append([y_test[i].split()[1:-1]])

    ctr -= 1
  i += 1

#print(nltk.translate.bleu_score.corpus_bleu(references, candidates))

Input:  <START> अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें <END>
Prediction:   give your application an accessibility workout
Dataset Reference:  give your application an accessibility workout

Input:  <START> एक्सेर्साइसर पहुंचनीयता अन्वेषक <END>
Prediction:   accerciser accessibility explorer
Dataset Reference:  accerciser accessibility explorer

Input:  <START> निचले पटल के लिए डिफोल्ट प्लगइन खाका <END>
Prediction:   the default plugin layout for the bottom panel
Dataset Reference:  the default plugin layout for the bottom panel

Input:  <START> ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका <END>
Prediction:   the default plugin layout for the top so that contain alphanumeric or characters
Dataset Reference:  the default plugin layout for the top panel

Input:  <START> उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है <END>
Prediction:   a list of plugins that are disabled by default
Dataset Reference:  a list of plugins that are disabled by default

Input:  <START> अवधि क