<h1 style="color:blue;"> LANGUAGE TRANSLATION </h1>

## German to English Translation

In [2]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [31]:
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, Embedding, RepeatVector, TimeDistributed
from tensorflow.keras.callbacks import ModelCheckpoint

In [44]:
import string
import re
from pickle import dump, load
from unicodedata import normalize
from numpy import array, argmax
from numpy.random import rand, shuffle
from nltk.translate.bleu_score import corpus_bleu

## Load Data

Data preparation is divided into two subsections:

 - Clean Text
 - Split Text

In [5]:
 
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, mode='rt', encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text
 
# split a loaded document into sentences
def to_pairs(doc):
	lines = doc.strip().split('\n')
	pairs = [line.split('\t') for line in  lines]
	return pairs
 
# clean a list of lines
def clean_pairs(lines):
	cleaned = list()
	# prepare regex for char filtering
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for pair in lines:
		clean_pair = list()
		for line in pair:
			# normalize unicode characters
			line = normalize('NFD', line).encode('ascii', 'ignore')
			line = line.decode('UTF-8')
			# tokenize on white space
			line = line.split()
			# convert to lowercase
			line = [word.lower() for word in line]
			# remove punctuation from each token
			line = [word.translate(table) for word in line]
			# remove non-printable chars form each token
			line = [re_print.sub('', w) for w in line]
			# remove tokens with numbers in them
			line = [word for word in line if word.isalpha()]
			# store as string
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)
	return array(cleaned)

In [6]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)
 
# load dataset
filename = 'deu-eng.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-german.pkl')
# spot check
for i in range(100):
	print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-german.pkl
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stop] => [stopp]
[wait] => [warte]
[hello] => [hallo]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[smile] => [lacheln]
[cheers] => [zum wohl]
[freeze] => [keine bewegung]
[freeze] => [stehenbleiben]
[got it] => [verstanden]
[got it] => [einverstanden]
[he ran] => [er rannte]
[he ran] => [er lief]
[hop in] => [mach mit]
[hug me] => [druck mich]
[hug me] => [nimm mich in den arm]
[hug me] => [umarme mich]
[i fell] => [ich fiel]
[i fell] => [ich fiel hin]
[i fell] => [ich sturzte]
[i fell] => [ich bin hingefallen]
[i fell] => [ich bin gesturzt]
[i know] => [ich wei]
[i lied] => [ich habe gelogen]
[i lost] => [ich habe verloren]
[im] => [ich bin jahre alt]
[im] => [ich bin]
[im ok] => [mir gehts gut]
[im ok] => [es geht mir gut]
[no way] => [unmoglich]
[no way] => [da

The clean data contains a little over 150,000 phrase pairs and some of the pairs toward the end of the file are very long.

In [7]:
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))
 
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)
 
# load dataset
raw_dataset = load_clean_sentences('english-german.pkl')
 
# reduce dataset size
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]
# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


In [8]:
# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

In [9]:
dataset

array([['ask tom', 'frag tom'],
       ['it gets better', 'es wird besser'],
       ['tom has asthma', 'tom hat asthma'],
       ...,
       ['i dont get it', 'das kapier ich nicht'],
       ['dont follow me', 'komm mir nicht hinterher'],
       ['ill play along', 'ich werde mitarbeiten']], dtype='<U370')

In [10]:
print(train.shape)
print(test.shape)

(9000, 2)
(1000, 2)


## Tokenize

In [11]:
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer


# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)


In [12]:

# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))


# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))


English Vocabulary Size: 2404
English Max Length: 5
German Vocabulary Size: 3856
German Max Length: 10


## Padding

Each input and output sequence must be encoded to integers and padded to the maximum phrase length. This is because we will use a word embedding for the input sequences and one hot encode the output sequences

In [13]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

The output sequence needs to be one-hot encoded. This is because the model will predict the probability of each word in the vocabulary as output

In [14]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y


## Data Prep

In [15]:
# prepare training data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

In [16]:
trainX.shape

(9000, 10)

In [17]:
trainX

array([[ 338,    3,    0, ...,    0,    0,    0],
       [   5,   48,  263, ...,    0,    0,    0],
       [   3,   12, 2006, ...,    0,    0,    0],
       ...,
       [   1,   47,    4, ...,    0,    0,    0],
       [   3, 1313,    0, ...,    0,    0,    0],
       [  44,   46,  214, ...,    0,    0,    0]])

In [18]:
trainY.shape

(9000, 5, 2404)

In [19]:
trainY[:2]

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]]], dtype=float32)

## Train Neural Translation Model

We will use an encoder-decoder LSTM model on this problem. In this architecture, the input sequence is encoded by a front-end model called the encoder then decoded word by word by a backend model called the decoder.

In [32]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units, return_sequences = True))
    model.add(Dropout(0.2))
    model.add(LSTM(n_units))
    model.add(Dropout(0.2))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model


In [33]:
# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy',metrics = ['Accuracy'])
# summarize defined model
print(model.summary())
#plot_model(model, to_file='model.png', show_shapes=True)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 10, 256)           987136    
_________________________________________________________________
lstm_7 (LSTM)                (None, 10, 256)           525312    
_________________________________________________________________
dropout (Dropout)            (None, 10, 256)           0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
repeat_vector_3 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 5, 256)           

In [34]:
# fit model
#filename = 'model.h5'
#checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=50, batch_size=64, 
          validation_data=(testX, testY), 
          #callbacks=[checkpoint], 
          verbose=2)

Train on 9000 samples, validate on 1000 samples
Epoch 1/50
9000/9000 - 11s - loss: 4.3526 - Accuracy: 0.0000e+00 - val_loss: 3.6819 - val_Accuracy: 0.0000e+00
Epoch 2/50
9000/9000 - 2s - loss: 3.5434 - Accuracy: 0.0000e+00 - val_loss: 3.5449 - val_Accuracy: 0.0000e+00
Epoch 3/50
9000/9000 - 2s - loss: 3.4267 - Accuracy: 0.0000e+00 - val_loss: 3.4916 - val_Accuracy: 0.0000e+00
Epoch 4/50
9000/9000 - 2s - loss: 3.3415 - Accuracy: 0.0000e+00 - val_loss: 3.4529 - val_Accuracy: 0.0000e+00
Epoch 5/50
9000/9000 - 2s - loss: 3.2735 - Accuracy: 0.0000e+00 - val_loss: 3.4303 - val_Accuracy: 0.0000e+00
Epoch 6/50
9000/9000 - 2s - loss: 3.2025 - Accuracy: 0.0000e+00 - val_loss: 3.3649 - val_Accuracy: 0.0000e+00
Epoch 7/50
9000/9000 - 2s - loss: 3.0810 - Accuracy: 0.0000e+00 - val_loss: 3.2749 - val_Accuracy: 0.0000e+00
Epoch 8/50
9000/9000 - 2s - loss: 2.9792 - Accuracy: 0.0000e+00 - val_loss: 3.2105 - val_Accuracy: 0.0000e+00
Epoch 9/50
9000/9000 - 2s - loss: 2.8862 - Accuracy: 0.0000e+00 - val_l

<tensorflow.python.keras.callbacks.History at 0x1d609fa60c8>

## Evaluate Neural Translation Model

In [40]:
# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

Next, we can repeat this for each source phrase in a dataset and compare the predicted result to the expected target phrase in English.
We can print some of these comparisons to screen to get an idea of how the model performs in practice.
We will also calculate the BLEU scores to get a quantitative idea of how well the model has performed

In [41]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
	actual, predicted = list(), list()
	for i, source in enumerate(sources):
		# translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict_sequence(model, eng_tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append([raw_target.split()])
		predicted.append(translation.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [45]:
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)


train
src=[frag tom], target=[ask tom], predicted=[ask tom]
src=[es wird besser], target=[it gets better], predicted=[it works hard]
src=[tom hat asthma], target=[tom has asthma], predicted=[tom has hiccups]
src=[du kannst gehen], target=[you may go], predicted=[you may go]
src=[ich fuhle mich beschutzt], target=[i feel protected], predicted=[i feel refreshed]
src=[tom beschleunigte], target=[tom accelerated], predicted=[toms accelerated]
src=[ist tom allein], target=[is tom alone], predicted=[is tom hungry]
src=[hast du tom verklagt], target=[did you sue tom], predicted=[did you stop tom]
src=[mach die ture zu], target=[close the door], predicted=[lock the door]
src=[ich bin ziemlich beschaftigt], target=[im quite busy], predicted=[im still busy]
BLEU-1: 0.704258
BLEU-2: 0.591665
BLEU-3: 0.488556
BLEU-4: 0.272257


In [46]:
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

test
src=[habt ihr zeit], target=[do you have time], predicted=[do you have]
src=[konnen wir jetzt gehen], target=[can we leave now], predicted=[can we go now]
src=[tom blieb], target=[tom stayed], predicted=[tom slipped]
src=[es geht mir phantastisch], target=[i feel fantastic], predicted=[i feel it]
src=[ich habe toms schlussel], target=[i have toms key], predicted=[i not a crook]
src=[gehe arbeiten], target=[go to work], predicted=[go to sleep]
src=[wir sind ungeduldig], target=[were impatient], predicted=[were are]
src=[lasst mich sehen], target=[let me see], predicted=[let me see it]
src=[ich bin moslem], target=[i am a muslim], predicted=[i am a student]
src=[das auto gehort mir], target=[i own this car], predicted=[the is blushed]
BLEU-1: 0.460236
BLEU-2: 0.319018
BLEU-3: 0.239483
BLEU-4: 0.099681
