In [1]:
import string
import re
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import optimizers
import matplotlib.pyplot as plt
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Show long sentences in full when DataFrames are displayed.
pd.set_option('display.max_colwidth', 200)

# Tab-separated parallel corpus; the ENGLISH and HEBREW columns are read
# into two plain Python lists.
eng_heb = pd.read_csv('translations.csv', sep='\t')
englishList = eng_heb['ENGLISH'].tolist()
hebrewList = eng_heb['HEBREW'].tolist()

# To experiment on a subset, slice both lists identically, e.g.:
# englishList = englishList[:50000]
# hebrewList = hebrewList[:50000]

# Per-sentence word counts (splitting on single spaces) and the longest
# sentence of each language; the maxima serve as padding lengths further down.
eng_l = [len(sentence.split(" ")) for sentence in englishList]
eng_length = max(eng_l, default=0)
print('English Longest Sentense: %d' % eng_length)

heb_l = [len(sentence.split(" ")) for sentence in hebrewList]
heb_length = max(heb_l, default=0)
print('Hebrew Longest Sentense: %d' % heb_length)


# function to build a tokenizer
def tokenization(lines):
    """Create a Keras ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: the texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted


# Fit one tokenizer per language on the full sentence lists.
eng_tokenizer = tokenization(englishList)
heb_tokenizer = tokenization(hebrewList)

# Vocabulary size = number of distinct words + 1; the extra slot leaves room
# for index 0, which the padding step uses as its fill value.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
print('English Vocabulary Size: %d' % eng_vocab_size)

heb_vocab_size = 1 + len(heb_tokenizer.word_index)
print('Hebrew Vocabulary Size: %d' % heb_vocab_size)

English Longest Sentense: 1634
Hebrew Longest Sentense: 97
English Vocabulary Size: 20863
Hebrew Vocabulary Size: 52349


In [3]:

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` and pad every sequence to ``length``.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        length: value passed to ``pad_sequences`` as ``maxlen``.
        lines: the texts to encode.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``, i.e. the
        fill values are placed after the encoded words.
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    return pad_sequences(token_ids, maxlen=length, padding='post')


# Split both languages in a single call so the English/Hebrew pairs stay
# aligned row by row. (Previously the Hebrew split was taken from
# `englishList`, so the "Hebrew" inputs were actually English sentences.)
eng_train, eng_test, heb_train, heb_test = train_test_split(
    englishList, hebrewList, test_size=0.2, random_state=12
)

# prepare training data
# heb as input and eng as output
trainX = encode_sequences(heb_tokenizer, heb_length, heb_train)
trainY = encode_sequences(eng_tokenizer, eng_length, eng_train)

# prepare validation data
testX = encode_sequences(heb_tokenizer, heb_length, heb_test)
testY = encode_sequences(eng_tokenizer, eng_length, eng_test)

In [4]:
print('about to load models')
# Saved NMT checkpoints referenced by this notebook:
#   model.h1.12_nov_20 - trained for 6 epochs locally using the first 50,000 sentence pairs (train + test)
#   model.h1.20_nov_20 - trained for 30 epochs on mamba using the first 50,000 sentence pairs (train + test)
#   model.h1.24_nov_20 - trained for 90 epochs on mamba using the first 50,000 sentence pairs (train + test)
#   model.h1.03_dec_20 - trained for __ epochs on mamba using the whole 18 million pairs (train + test)

# NOTE: Change the models that you want to load by putting their file name in this array
models = ['model.h1.03_dec_20']

# Load every checkpoint first so that the progress message below is truthful
# (it used to be printed before anything had been loaded).
loaded_models = [load_model(model_path) for model_path in models]
print('models loaded')

# `Sequential.predict_classes` no longer exists in recent TensorFlow/Keras
# releases; taking the argmax over the last axis of `predict` is the documented
# replacement for multi-class outputs.
# NOTE(review): assumes the final layer is a softmax over the English
# vocabulary - confirm against the model definition.
# testX is passed as-is: the old reshape to (shape[0], shape[1]) was a no-op.
preds = [argmax(nmt_model.predict(testX), axis=-1) for nmt_model in loaded_models]
print('predictions made')

about to load models
models loaded


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


KeyboardInterrupt: 

In [None]:

def get_word(n, tokenizer):
    """Look up the word whose index in ``tokenizer.word_index`` equals ``n``.

    Args:
        n: the integer index to search for.
        tokenizer: object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The first word (in mapping order) mapped to ``n``, or ``None`` when no
        word has that index.
    """
    matches = (token for token, idx in tokenizer.word_index.items() if idx == n)
    return next(matches, None)


def get_preds(ps, tokenizer=None):
    """Convert rows of predicted word indices into space-joined sentences.

    Each index is mapped back to its word. An index with no word, or a word
    identical to the one decoded at the previous position, is emitted as an
    empty string, so the joined sentence keeps one slot per input position.

    Args:
        ps: iterable of index sequences, one sequence per sentence.
        tokenizer: object exposing a ``word_index`` mapping of word -> index.
            Defaults to the notebook-level ``eng_tokenizer``.

    Returns:
        A list with one string per input sequence.
    """
    if tokenizer is None:
        tokenizer = eng_tokenizer

    # Invert the vocabulary once. The previous implementation scanned the whole
    # word_index twice for every token, which is prohibitively slow for large
    # vocabularies and long padded sequences.
    index_to_word = {}
    for word, index in tokenizer.word_index.items():
        # Keep the first word per index, matching a first-match linear scan.
        index_to_word.setdefault(index, word)

    p_text = []
    for p in ps:
        words = []
        previous = None
        for position, token_id in enumerate(p):
            current = index_to_word.get(token_id)
            # Blank out unknown indices and immediate repetitions.
            if current is None or (position > 0 and current == previous):
                words.append('')
            else:
                words.append(current)
            previous = current
        p_text.append(' '.join(words))
    return p_text


# convert predictions into text (English)
print('getting words')

# Decoding is the slow part, so only the first MAX_EVAL_SENTENCES test
# sentences are decoded. The cap is applied to every loaded model (it used to
# be applied to the first model only), keeping all models comparable.
MAX_EVAL_SENTENCES = 10000
preds = [pred[:MAX_EVAL_SENTENCES] for pred in preds]
preds_text = [get_preds(pred) for pred in preds]



In [None]:
print('getting scores')
# get bleu and meteor scores
nltk.download('wordnet')

result_columns = ['actual', 'predicted', 'bleu', 'meteor']
model_frames = []
for pred_text in preds_text:
    print(f"length {len(pred_text)}")
    # Predictions may cover only a prefix of the test set, so score exactly the
    # sentences that have a prediction instead of indexing past the end.
    references = eng_test[:len(pred_text)]
    hypotheses = pred_text[:len(references)]

    bleu_score = []
    met_score = []
    for reference, hypothesis in zip(references, hypotheses):
        # sentence_bleu expects a LIST of tokenised references. Passing the
        # bare token list made every word a separate "reference", which drove
        # the score to ~0. split() without arguments also drops the empty
        # placeholder tokens present in decoded predictions.
        bleu_score.append(sentence_bleu([reference.split()], hypothesis.split()))
        # Likewise meteor_score takes a list of references.
        # NOTE(review): string inputs match the NLTK API this notebook was
        # written against; newer NLTK releases expect pre-tokenised inputs.
        met_score.append(meteor_score([reference], hypothesis))

    length = len(bleu_score)
    if length:
        print(f'Average BLEU score: {sum(bleu_score)/length}')
        print(f'Average METEOR score: {sum(met_score)/length}')
    else:
        print('No predictions to score')

    model_frames.append(pd.DataFrame({'actual': references, 'predicted': hypotheses,
                                      'bleu': bleu_score, 'meteor': met_score}))
print('saving')
# build dataframe and look at model results
# Concatenate once at the end; DataFrame.append is gone from current pandas.
if model_frames:
    preds_df = pd.concat(model_frames, ignore_index=True)
else:
    preds_df = pd.DataFrame(columns=result_columns)
pd.set_option('display.max_colwidth', 200)
preds_df.to_csv('preds.csv', sep='\t', index=False)
