In [10]:
import string
import re
import os
from pickle import dump, load
from unicodedata import normalize
from numpy import array, argmax
from numpy.random import rand, shuffle
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical, plot_model, pad_sequences
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Embedding, RepeatVector, TimeDistributed
from keras.callbacks import ModelCheckpoint
from nltk.translate.bleu_score import corpus_bleu

Data Preprocessing

In [None]:
def load_doc(filename):
    """Read a whole UTF-8 text file into memory.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete file contents as a single ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

def to_pairs(doc):
    """Split a loaded document into per-line lists of tab-separated fields.

    Leading and trailing whitespace of the whole document is stripped first,
    then every line is split on tab characters.

    Args:
        doc: Full text of the corpus, one entry per line.

    Returns:
        A list with one list of fields per line.
    """
    rows = []
    for row in doc.strip().split('\n'):
        rows.append(row.split('\t'))
    return rows

def clean_pairs(lines):
    """Normalise every sentence of every pair and return them as an array.

    Each sentence is decomposed with NFD and reduced to ASCII (dropping
    anything that cannot be encoded), split on whitespace, lower-cased,
    stripped of punctuation and non-printable characters, and finally
    filtered so that only purely alphabetic tokens remain.

    Args:
        lines: Iterable of pairs, each pair being an iterable of strings.

    Returns:
        A numpy array built from the list of cleaned pairs.
    """
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    strip_punct = str.maketrans('', '', string.punctuation)

    def _clean(sentence):
        # Accented characters lose their combining marks here.
        ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = non_printable.sub('', token.lower().translate(strip_punct))
            # Tokens containing digits (or left empty) are dropped.
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[_clean(sentence) for sentence in pair] for pair in lines])

def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and report the path on stdout.

    Args:
        sentences: Any picklable object (the cleaned sentence pairs).
        filename: Destination path; an existing file is overwritten.

    Raises:
        OSError: If the destination cannot be opened for writing.
    """
    # Close the handle deterministically so the data is flushed to disk
    # before anything tries to read the file back.
    with open(filename, 'wb') as file:
        dump(sentences, file)
    print('Saved: %s' % filename)

def load_clean_data(filename):
    """Load a pickled object from ``filename``.

    Args:
        filename: Path of a pickle file.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened.
    """
    # SECURITY: pickle.load can execute arbitrary code; only use this on
    # files produced by this notebook, never on untrusted input.
    with open(filename, 'rb') as file:
        return load(file)

# Cached artefacts produced by the preprocessing step below.
file1 = './content/data/fra_eng.pkl'        # all cleaned pairs
file2 = './content/data/fra_eng_both.pkl'   # shuffled subset (train + test)
file3 = './content/data/fra_eng_train.pkl'  # training split
file4 = './content/data/fra_eng_test.pkl'   # test split

# Checking if all preprocessed files exist
if all(os.path.exists(path) for path in (file1, file2, file3, file4)):
    print("Preprocessed files already exist. Skipping data preparation.")
else:
    filename = './content/data/fra.txt'
    doc = load_doc(filename)
    pairs = to_pairs(doc)
    cleaned_pairs = clean_pairs(pairs)
    save_clean_data(cleaned_pairs, file1)

    # Spot-check the cleaning; bounded so a corpus with fewer than 100
    # pairs does not raise IndexError.
    for i in range(min(100, len(cleaned_pairs))):
        print('[%s] => [%s]' % (cleaned_pairs[i,0], cleaned_pairs[i,1]))

    raw_dataset = load_clean_data(file1)
    # Keep only the first n_sentences pairs of the corpus.
    n_sentences = 15000
    dataset = raw_dataset[:n_sentences, :]
    # NOTE: no random seed is set in the visible code, so the shuffle (and
    # therefore the train/test split) differs between fresh runs.
    shuffle(dataset)
    # First 12000 shuffled pairs train the model, the remainder is the test set.
    train, test = dataset[:12000], dataset[12000:]

    save_clean_data(dataset, file2)
    save_clean_data(train, file3)
    save_clean_data(test, file4)


Sequence Encoding

In [12]:
def create_tokenizer(lines):
    """Build a Keras ``Tokenizer`` fitted on the given sentences.

    Args:
        lines: Sequence of sentences used to fit the tokenizer.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

def max_length(lines):
    """Return the largest whitespace-separated word count among ``lines``.

    Args:
        lines: Non-empty iterable of sentences.

    Returns:
        The word count of the longest sentence.

    Raises:
        ValueError: If ``lines`` is empty.
    """
    word_counts = [len(sentence.split()) for sentence in lines]
    return max(word_counts)

def encode_sequences(tokenizer, length, lines):
    """Convert sentences to integer sequences padded to a fixed length.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        length: Target sequence length passed to ``pad_sequences`` as ``maxlen``.
        lines: Sentences to encode.

    Returns:
        The result of ``pad_sequences`` with padding applied after each
        sequence (``padding='post'``).
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    return pad_sequences(token_ids, maxlen=length, padding='post')

def encode_output(sequences, vocab_size):
    """One-hot encode every integer sequence.

    Args:
        sequences: 2-D array of integer sequences (its first two shape
            entries are used for the final reshape).
        vocab_size: Number of classes for the one-hot encoding.

    Returns:
        An array reshaped to
        ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    n_samples, n_steps = sequences.shape[0], sequences.shape[1]
    return one_hot.reshape(n_samples, n_steps, vocab_size)

Model Architecture

In [13]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the (uncompiled) encoder-decoder translation network.

    Args:
        src_vocab: Source vocabulary size (embedding input dimension).
        tar_vocab: Target vocabulary size (width of the softmax output).
        src_timesteps: Length of the input sequences.
        tar_timesteps: Number of output timesteps to generate.
        n_units: Embedding dimension and LSTM unit count.

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    stack = (
        # Encoder: embed the source tokens (index 0 is masked) and
        # summarise the sentence into a single vector.
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        # Decoder: repeat that vector once per target timestep and emit a
        # distribution over the target vocabulary at every step.
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    )
    model = Sequential()
    for layer in stack:
        model.add(layer)
    return model

English/French Tokenizers & Stats

In [15]:
# Reload the cached pickles so this cell works whether or not the
# preprocessing branch above was executed in this session.
dataset = load_clean_data('./content/data/fra_eng_both.pkl')
train = load_clean_data('./content/data/fra_eng_train.pkl')
test = load_clean_data('./content/data/fra_eng_test.pkl')

# Column 0 holds the English sentences. The tokenizer is fitted on the
# combined train + test data, so both splits share one vocabulary.
eng_tokenizer = create_tokenizer(dataset[:,0])
# +1 because index 0 is not assigned to any word
# (presumably reserved for padding — see mask_zero=True in define_model).
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:,0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length) + "\n")

# Column 1 holds the French sentences; same procedure as for English.
fra_tokenizer = create_tokenizer(dataset[:,1])
fra_vocab_size = len(fra_tokenizer.word_index) + 1
fra_length = max_length(dataset[:,1])
print('French Vocabulary Size: %d' % fra_vocab_size)
print('French Max Length: %d' % (fra_length))

English Vocabulary Size: 2888
English Max Length: 5

French Vocabulary Size: 5797
French Max Length: 11


Preparing Training/Testing Data & Compiling Model

In [16]:
# French (column 1) is the model input; English (column 0) is the target.
# Targets are padded integer sequences that are then one-hot encoded to
# match the softmax output and the categorical cross-entropy loss.
trainX = encode_sequences(fra_tokenizer, fra_length, train[:,1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:,0])
trainY = encode_output(trainY, eng_vocab_size)

# Same encoding for the held-out split.
testX = encode_sequences(fra_tokenizer, fra_length, test[:,1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:,0])
testY = encode_output(testY, eng_vocab_size)

# 256 is passed as n_units (embedding size and LSTM width).
model = define_model(fra_vocab_size, eng_vocab_size, fra_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Build explicitly so summary() can report shapes before any training.
model.build(input_shape=(None, fra_length))

model.summary()

#plot_model(model, to_file='./content/model-architecture.png', show_shapes=True)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 11, 256)           1484032   
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 repeat_vector (RepeatVector  (None, 5, 256)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 5, 256)            525312    
                                                                 
 time_distributed (TimeDistr  (None, 5, 2888)          742216    
 ibuted)                                                         
                                                                 
Total params: 3,276,872
Trainable params: 3,276,872
Non-trainable params: 0

Model Training

In [None]:
# Path where the best model (lowest validation loss) is checkpointed.
filename = './content/eng-fra-model.keras'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# NOTE(review): the test split doubles as validation data, so checkpoint
# selection is influenced by the same data later reported as "testing".
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Model Evaluation

In [20]:
def word_for_id(integer, tokenizer):
    """Reverse-lookup the word mapped to ``integer`` in the tokenizer.

    Args:
        integer: Token index to look up.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The first word whose index equals ``integer``, or ``None`` when no
        word has that index.
    """
    matches = (
        candidate
        for candidate, token_id in tokenizer.word_index.items()
        if token_id == integer
    )
    return next(matches, None)

def predict_sequence(model, tokenizer, source):
    """Translate one encoded source sentence into a target-language string.

    Args:
        model: Model exposing ``predict``; only the first item of the
            returned batch is used.
        tokenizer: Target-language tokenizer used to map indices to words.
        source: Encoded source sequence passed to ``model.predict``.

    Returns:
        The predicted words joined by single spaces.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for scores in step_scores:
        # Highest-scoring index at this timestep.
        token = word_for_id(argmax(scores), tokenizer)
        # An index with no word ends the sentence.
        if token is None:
            break
        words.append(token)
    return ' '.join(words)

def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every source sequence and print corpus BLEU-1..4 scores.

    Args:
        model: Trained translation model passed on to ``predict_sequence``.
        tokenizer: Target-language tokenizer used to decode predictions.
        sources: Array of encoded source sequences, one row per sentence.
        raw_dataset: Cleaned text pairs aligned with ``sources``; each entry
            unpacks as ``(target_text, source_text)``.

    Returns:
        None. The first ten translations and the BLEU scores are printed.
    """
    actual, predicted = [], []
    for i, source in enumerate(sources):
        # Add a batch dimension: the model predicts on (1, timesteps).
        source = source.reshape((1, source.shape[0]))
        # Decode with the tokenizer that was passed in. The previous code
        # ignored this parameter and read the global `eng_tokenizer`.
        translation = predict_sequence(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu expects a list of reference token lists per hypothesis.
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    # NOTE(review): these weights sum to 0.9 rather than 1.0; kept unchanged
    # so scores stay comparable with previously recorded outputs.
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# Reload the checkpoint written by ModelCheckpoint (the best-val_loss model),
# replacing the in-memory model from the final training epoch.
model = load_model('./content/eng-fra-model.keras')

# Only the first 10 sentences of each split are evaluated, so the BLEU
# scores below are computed on 10 samples each.
print('Training Data Evaluation')
evaluate_model(model, eng_tokenizer, trainX[:10], train[:10])

print("\n" 'Testing Data Evaluation')
evaluate_model(model, eng_tokenizer, testX[:10], test[:10])


Training Data Evaluation
src=[jai besoin detudier], target=[i need to study], predicted=[i need to study]
src=[tom est un heros], target=[tom is a hero], predicted=[tom is a hero]
src=[quel gros chien], target=[what a big dog], predicted=[what a big dog]
src=[ce sont les maths que je prefere], target=[i like math best], predicted=[i like your you]
src=[je te crois vraiment], target=[i do believe you], predicted=[i do believe you]
src=[je men chargerai], target=[ill handle this], predicted=[ill handle it]
src=[je les ai soudoyees], target=[i bribed them], predicted=[i bribed them]
src=[puisje aller au lit], target=[may i go to bed], predicted=[may i go to bed]
src=[je ne laime pas], target=[i dont like him], predicted=[i dont like it]
src=[jai un ranch], target=[i have a ranch], predicted=[i have a ranch]
BLEU-1: 0.897436
BLEU-2: 0.879575
BLEU-3: 0.862512
BLEU-4: 0.789582

Testing Data Evaluation
src=[jarrive a le ressentir], target=[i can feel it], predicted=[i can remember it]
src=[ja