## Imports

In [1]:
#Make imports
import numpy as np
import re
import pickle
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
from config import *
import seaborn as sns
import string
import tensorflow as tf
import pandas as pd
from dataset import prepare_data
from tqdm import tqdm
from helper import save_variable, load_variable
import random

In [2]:
def set_seeds(seed=10):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)

## Setup Gpu

In [3]:
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

## Define Constants

In [4]:
#Some parameters
vocab_size = 10000
total_sentences = 50000
maxlen = 16
epochs = 70
validation_split = 0.05
max_sentence_length= 16

## Load Data

In [5]:
train = prepare_data(type='train', max_entries=total_sentences)
en_data = []
hi_data = []
cnt = 0
for (en,hi) in tqdm(zip(train['en'].to_list(), train['hi'].to_list())):
  l = min(len(en.split()), len(hi.split()))
  if l <= maxlen:
    en_data.append(en)
    hi_data.append(hi)
    cnt += 1
  if cnt == total_sentences:
    break

50000it [00:00, 495221.00it/s]


## Tokenize Text 

In [6]:
#Tokenize the texts and convert to sequences
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
en_tokenizer.fit_on_texts(en_data)
en_sequences = en_tokenizer.texts_to_sequences(en_data)

hi_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
hi_tokenizer.fit_on_texts(hi_data)
hi_sequences = hi_tokenizer.texts_to_sequences(hi_data)

english_vocab_size = len(en_tokenizer.word_index) + 1
hindi_vocab_size = len(hi_tokenizer.word_index) + 1
print("English Vocab Size: ", english_vocab_size)
print("Hindi Vocab Size: ", hindi_vocab_size)

English Vocab Size:  3401
Hindi Vocab Size:  4427


In [7]:
#Prepare encoder data
encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(en_sequences, maxlen=maxlen, padding='post')

#Prepare decoder data
decoder_inputs = []
decoder_outputs = []

for hi in hi_sequences:
  decoder_inputs.append(hi[:-1])
  decoder_outputs.append(hi[1:])

decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_inputs, maxlen=maxlen, padding='post')
decoder_outputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_outputs, maxlen=maxlen, padding='post')

# Training and Testing split
# 95%, 5%
split = int(0.9 * total_sentences)

X_train = [encoder_inputs[:split], decoder_inputs[:split]]
y_train = decoder_outputs[:split]

# Test data to evaluate our NMT model using BLEU score
X_test = en_data[:split]
y_test = hi_data[:split]

print(X_train[0].shape, X_train[1].shape, y_train.shape)

(45000, 16) (45000, 16) (45000, 16)


### Initialize model 

In [8]:
from models.lstm.model import LSTMModel

In [11]:
en_hi_model = LSTMModel(encoder_vocab_size=english_vocab_size, decoder_vocab_size= hindi_vocab_size ,embedding_size=256, num_rnn_units=64) 
en_hi_model(np.array([[[1]*max_sentence_length], [[1]*max_sentence_length]]))
en_hi_model.summary()
en_hi_model.load_weights(Weights_DIR+"\\en_hi_model.h5")

Model: "lstm_model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_1 (Embedding)      multiple                  870656    
_________________________________________________________________
encoder_lstm (LSTM)          multiple                  82176     
_________________________________________________________________
input_2 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_2 (Embedding)      multiple                  1133312   
_________________________________________________________________
decoder_lstm (LSTM)          multiple                  82176     
_________________________________________________________________
token_layer (Dense)          multiple                 

### Test Model

In [12]:
#Testing and Analysis
candidates = []
references = []
ctr = 10 
i = 0
while ctr>0:
  l = len(X_test[i].split())
  if l<=maxlen:   #Choose only sentences of length in range [5,15]
    pred_sentence = en_hi_model.predict_sequence(X_test[i],en_tokenizer, hi_tokenizer)
    candidates.append(pred_sentence.split())

    print("Input: ", X_test[i])
    print("Prediction: ", pred_sentence)

    # google_translated_sentence = translate_client.translate(X_test[i], target_language='hi')['translatedText']
    
    # print("Google Translated Reference: ", google_translated_sentence)
    print("Dataset Reference: ", ' '.join(y_test[i].split()[1:-1]))
    print()
    references.append([y_test[i].split()[1:-1]])

    ctr -= 1
  i += 1

#print(nltk.translate.bleu_score.corpus_bleu(references, candidates))

Input:  <START> give your application an accessibility workout <END>
Prediction:   अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
Dataset Reference:  अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें

Input:  <START> accerciser accessibility explorer <END>
Prediction:   एक्सेर्साइसर पहुंचनीयता अन्वेषक
Dataset Reference:  एक्सेर्साइसर पहुंचनीयता अन्वेषक

Input:  <START> the default plugin layout for the bottom panel <END>
Prediction:   ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका
Dataset Reference:  निचले पटल के लिए डिफोल्ट प्लगइन खाका

Input:  <START> the default plugin layout for the top panel <END>
Prediction:   ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका
Dataset Reference:  ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका

Input:  <START> a list of plugins that are disabled by default <END>
Prediction:   उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है
Dataset Reference:  उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है

Input:  <START> highlight duration <END>
Prediction:   हाइलाइ

## Hindi To English Translation Model

### Prepare input Output Data For Hindi To English Translation Model

In [13]:
#Prepare encoder data
encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(hi_sequences, maxlen=maxlen, padding='post')

#Prepare decoder data
decoder_inputs = []
decoder_outputs = []

for en in en_sequences:
  decoder_inputs.append(en[:-1])
  decoder_outputs.append(en[1:])

decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_inputs, maxlen=maxlen, padding='post')
decoder_outputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_outputs, maxlen=maxlen, padding='post')

# Training and Testing split
# 95%, 5%
split = int(0.9 * total_sentences)

X_train = [encoder_inputs[:split], decoder_inputs[:split]]
y_train = decoder_outputs[:split]

# Test data to evaluate our NMT model using BLEU score
X_test = hi_data[:split]
y_test = en_data[:split]

print(X_train[0].shape, X_train[1].shape, y_train.shape)

(45000, 16) (45000, 16) (45000, 16)


### Initialize model 

In [14]:
hi_en_model = LSTMModel(encoder_vocab_size=hindi_vocab_size, decoder_vocab_size=  english_vocab_size, embedding_size=256, num_rnn_units=64) 
hi_en_model(np.array([[[1]*max_sentence_length], [[1]*max_sentence_length]]))
hi_en_model.summary()
hi_en_model.load_weights(Weights_DIR+"\\hi_en_model.h5")

Model: "lstm_model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_1 (Embedding)      multiple                  1133312   
_________________________________________________________________
encoder_lstm (LSTM)          multiple                  82176     
_________________________________________________________________
input_2 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_2 (Embedding)      multiple                  870656    
_________________________________________________________________
decoder_lstm (LSTM)          multiple                  82176     
_________________________________________________________________
token_layer (Dense)          multiple                 

### Test Model

In [16]:
#Testing and Analysis

candidates = []
references = []

ctr = 10 
i = 0

while ctr>0:
  l = len(X_test[i].split())
  if l<=maxlen:   #Choose only sentences of length in range [5,15]
    pred_sentence = hi_en_model.predict_sequence(X_test[i], hi_tokenizer,en_tokenizer)
    candidates.append(pred_sentence.split())

    print("Input: ", X_test[i])
    print("Prediction: ", pred_sentence)

    # google_translated_sentence = translate_client.translate(X_test[i], target_language='hi')['translatedText']
    
    # print("Google Translated Reference: ", google_translated_sentence)
    print("Dataset Reference: ", ' '.join(y_test[i].split()[1:-1]))
    print()
    references.append([y_test[i].split()[1:-1]])

    ctr -= 1
  i += 1

Input:  <START> अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें <END>
Prediction:   give your application an accessibility workout
Dataset Reference:  give your application an accessibility workout

Input:  <START> एक्सेर्साइसर पहुंचनीयता अन्वेषक <END>
Prediction:   accerciser accessibility explorer
Dataset Reference:  accerciser accessibility explorer

Input:  <START> निचले पटल के लिए डिफोल्ट प्लगइन खाका <END>
Prediction:   the default plugin layout for the bottom panel
Dataset Reference:  the default plugin layout for the bottom panel

Input:  <START> ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका <END>
Prediction:   the default plugin layout for the top so that contain alphanumeric or characters
Dataset Reference:  the default plugin layout for the top panel

Input:  <START> उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है <END>
Prediction:   a list of plugins that are disabled by default
Dataset Reference:  a list of plugins that are disabled by default

Input:  <START> अवधि क