In [None]:
# Size tag of the trained model: appended to '../../model/lstm/' to locate the
# saved artefacts and reported as "Dataset Size" in the metrics table.
model_size = "10000"

In [None]:
import numpy as np
import pickle
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
import os
import sys
import pandas as pd


In [None]:
def handle_helper_functions(project_root='/Users/learn/Desktop/Projects/machine-translation'):
    """Make the project's helper directories importable from this notebook.

    Appends the ``test/helper``, ``utils``, ``data`` and ``visualization``
    directories under ``project_root`` to ``sys.path``, in that order.
    Directories that are already on ``sys.path`` are skipped, so re-running
    the cell does not pile up duplicate entries.

    Args:
        project_root: Root directory of the machine-translation checkout.
            The default is the location used by the original notebook; pass
            another path when running on a different machine.
    """
    sub_directories = (
        ('test', 'helper'),
        ('utils',),
        ('data',),
        ('visualization',),
    )
    for parts in sub_directories:
        directory = os.path.join(project_root, *parts)
        if directory not in sys.path:
            sys.path.append(directory)

handle_helper_functions()

In [None]:
from performance_metric import create_dataframe_to_score, bleu_score, calculate_ROUGE
from bar_chart import plot_chart_for_all_metrics, plotBarChart

In [None]:
def load_models_and_parameters(model_size):
    """Load the saved LSTM model together with its tokenizers and parameters.

    Args:
        model_size: Directory that holds the saved artefacts (despite the
            name, the caller in this notebook passes a path).  A ``'/'`` is
            appended before each file name is added.

    Returns:
        Tuple ``(model, src_tokenizer, target_tokenizer, src_parameters,
        target_parameters)``.
    """
    path = model_size + '/'

    from tensorflow import keras

    def _load_pickle(file_name):
        # Read one pickled artefact from the model directory.
        # SECURITY: pickle.load can execute arbitrary code while loading;
        # only point this at artefacts you produced or otherwise trust.
        with open(path + file_name, 'rb') as handle:
            return pickle.load(handle)

    model = keras.models.load_model(path + 'lstm_model')
    src_parameters = _load_pickle("src_parameters.pickle")
    src_tokenizer = _load_pickle("src_tokenizer.pickle")
    target_parameters = _load_pickle("target_parameters.pickle")
    target_tokenizer = _load_pickle("target_tokenizer.pickle")
    return model, src_tokenizer, target_tokenizer, src_parameters, target_parameters

# Directory of the saved artefacts for the selected model size.
model_path = '../../model/lstm/' + model_size
(
    model,
    src_tokenizer,
    target_tokenizer,
    src_parameters,
    target_parameters,
) = load_models_and_parameters(model_path)

In [None]:

# Sequence lengths and vocabulary sizes stored alongside the model.
src_length, src_vocab_size = (
    src_parameters["src_length"],
    src_parameters["src_vocab_size"],
)
target_length, target_vocab_size = (
    target_parameters["target_length"],
    target_parameters["target_vocab_size"],
)

print(src_length, target_length, src_vocab_size, target_vocab_size)




In [None]:
from iit_dataset import createDataset

In [None]:
# Value passed to createDataset as `data_size` for the "test" split.
data_size = 1000

pool_oftexts, pairs = createDataset(data_size=data_size, type="test")
# `.values` is taken so that rows/columns can be sliced as test[:, idx] below
# (presumably pool_oftexts is a pandas DataFrame - TODO confirm).
dataset = pool_oftexts.values
test = dataset

In [None]:
# Language names shown (upper-cased) in the header printed by compare_prediction.
source_str = "Hindi"
target_str = "English"


In [None]:
def encode_sequences(tokenizer, length, lines):
    """Integer-encode text lines and pad them to a common length.

    Args:
        tokenizer: Object exposing ``texts_to_sequences``.
        length: Value passed to ``pad_sequences`` as ``maxlen``.
        lines: Texts to encode.

    Returns:
        The output of ``pad_sequences`` for the encoded lines, padded at the
        end of each sequence (``padding='post'``) with 0 values.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),  # text -> integer sequences
        maxlen=length,
        padding='post',
    )
 
def encode_output(sequences, vocab_size):
    """One-hot encode each integer sequence of the target side.

    Args:
        sequences: Array of integer sequences; ``shape[0]`` and ``shape[1]``
            are read to size the result.
        vocab_size: Number of classes used for the one-hot encoding.

    Returns:
        Array reshaped to ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    one_hot_rows = [to_categorical(row, num_classes=vocab_size) for row in sequences]
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return np.array(one_hot_rows).reshape(n_rows, n_steps, vocab_size)

In [None]:
def word_for_id(integer, tokenizer):
    """Map a word index back to its word.

    Args:
        integer: Word index to look up.
        tokenizer: Object with a ``word_index`` mapping (word -> index).  When
            it also exposes a non-empty ``index_word`` mapping (index -> word),
            that mapping is used for a direct lookup.

    Returns:
        The word whose index equals ``integer``, or ``None`` if no word has
        that index.
    """
    reverse_index = getattr(tokenizer, 'index_word', None)
    if reverse_index:
        # Direct lookup instead of scanning the whole vocabulary for every
        # predicted token.
        return reverse_index.get(integer)
    # Fallback for tokenizers that only provide the forward mapping.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
 
def predict_seq(model, tokenizer, source):
    """Generate the target-language text for one encoded source sequence.

    Args:
        model: Object whose ``predict(source, verbose=0)`` returns a batch;
            only its first item is decoded.
        tokenizer: Tokenizer handed to ``word_for_id`` to turn ids into words.
        source: Encoded source batch fed to the model.

    Returns:
        The decoded words joined by single spaces.  Decoding stops at the
        first id for which ``word_for_id`` returns ``None``.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for scores in step_scores:
        # The highest-scoring entry of each step is taken as the predicted id.
        word = word_for_id(np.argmax(scores), tokenizer)
        if word is None:
            break
        words.append(word)
    return ' '.join(words)

def compare_prediction(model, tar_tokenizer, sources, raw_dataset, limit=20):
    """Print source text, reference translation and model output side by side.

    Args:
        model: Translation model forwarded to ``predict_seq``.
        tar_tokenizer: Target-side tokenizer forwarded to ``predict_seq``.
        sources: Encoded source sequences; each one is reshaped into a batch
            of one before prediction.
        raw_dataset: Indexable collection whose i-th item unpacks into
            ``(raw_source_text, raw_target_text)`` for ``sources[i]``.
        limit: Maximum number of rows to print.

    The header uses the notebook-level ``source_str`` and ``target_str``.
    """
    src = f'{source_str.upper()} (SOURCE)'
    tgt = f'{target_str.upper()} (TARGET)'
    pred = f'AUTOMATIC TRANSLATION IN {target_str.upper()}'
    print(f'{src:30} {tgt:25} {pred}\n')

    for i, source in enumerate(sources):
        # Checked before translating so that exactly `limit` rows are shown
        # and no prediction is computed for a row that will not be printed.
        if i >= limit:
            break
        source = source.reshape((1, source.shape[0]))
        translation = predict_seq(model, tar_tokenizer, source)
        raw_src, raw_target = raw_dataset[i]
        print(f' {i+1}. {raw_src:30} || {raw_target:25} || {translation}')
 


In [None]:
# Prepare test data
idx_src=0
idx_tar=1
testX = encode_sequences(src_tokenizer, src_length, test[:, idx_src])
testY = encode_sequences(target_tokenizer, target_length, test[:, idx_tar])
testY = encode_output(testY, target_vocab_size)

In [None]:
# Show source / reference / model output for the first rows of the test set.
print('\n\n### Result on the Test Set ###')
compare_prediction(
    model=model,
    tar_tokenizer=target_tokenizer,
    sources=testX,
    raw_dataset=test,
)

### Statistics for Single Model

In [None]:
# Gather the inputs for scoring from the project helper, then compute BLEU and ROUGE.
actual, predicted, actual_rouge, average_cosine = create_dataframe_to_score(
    model, target_tokenizer, testX, test
)
bleu_test = bleu_score(actual, predicted)
rouge_test = calculate_ROUGE(actual=actual_rouge, predicted=predicted)

# One-row summary: dataset size, ROUGE values, average cosine similarity, BLEU values.
keys = ["Dataset Size"] + list(rouge_test.keys()) + ["cosine_similarity"] + list(bleu_test.keys())
values = [model_size] + list(rouge_test.values()) + [average_cosine] + list(bleu_test.values())

table = pd.DataFrame(data=[values], columns=keys)

In [None]:

# One bar chart per metric, followed by a combined chart built from `table`.
plotBarChart(
    x=bleu_test.keys(),
    height=bleu_test.values(),
    title="BLEU Score with the test set",
)
plotBarChart(
    x=rouge_test.keys(),
    height=rouge_test.values(),
    title="ROUGE Score with the test set",
)
plotBarChart(
    x="cosine_average",
    height=average_cosine,
    title="Cosine Score with the test set",
)

plot_chart_for_all_metrics(table=table)


### Transliterate Romanized Hindi (Latin Script) to Hindi Script

In [None]:
! pip install google-transliteration-api
from google.transliteration import transliterate_word


In [None]:

# Transliterate a romanized Hindi sentence, requesting a single suggestion.
transliterate_eng_hindi = transliterate_word(
    'yah hamaare desh ke lie vaastav mein anivaary vastu hai.',
    lang_code='hi',
    max_suggestions=1,
)
print(transliterate_eng_hindi)

In [None]:
encoded_hindi = encode_sequences(src_tokenizer, src_length, transliterate_eng_hindi)
# (transliterated source, reference English translation) in the two-item
# layout that compare_prediction unpacks from raw_dataset[i].
actual = [[transliterate_eng_hindi[0], 'It is really essential item for our country.']]
# Placed last so the notebook displays it: only the final expression of a
# cell is rendered, a bare expression in the middle of a cell shows nothing.
encoded_hindi

In [None]:
# Translate the transliterated sentence and print it next to its reference.
compare_prediction(
    model=model,
    tar_tokenizer=target_tokenizer,
    sources=encoded_hindi,
    raw_dataset=actual,
)