##  Neural Machine Translation
###  By Chandra S Narain Kappera

### Dataset:
Tab seprated German and English Sentences

In [62]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from nltk.translate.bleu_score import corpus_bleu

### Data Exploration:

In [2]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [13]:
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs

In [14]:
data = to_pairs(load_doc('deu.txt'))

In [21]:
data[200:210]

[['Get away!', 'Verpiss dich!'],
 ['Get away!', 'Hau ab!'],
 ['Get away!', 'Verschwinde!'],
 ['Get away!', 'Verdufte!'],
 ['Get away!', 'Mach dich fort!'],
 ['Get away!', 'Zieh Leine!'],
 ['Get away!', 'Mach dich vom Acker!'],
 ['Get away!', 'Verzieh dich!'],
 ['Get away!', 'Verkrümele dich!'],
 ['Get away!', 'Troll dich!']]

### Data Cleaning
Remove all punctuation characters.  
Normalize all Unicode characters to ASCII (e.g. Latin characters).  
Normalize the case to lowercase.  
Remove any remaining tokens that are not alphabetic.  


In [26]:
import string
import re
from unicodedata import normalize

def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

In [30]:
# save a list of clean sentences to file
from pickle import dump

def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

In [32]:
clean_pairs = clean_pairs(data)
save_clean_data(clean_pairs,'clean_data.pkl')

Saved: clean_data.pkl


In [34]:
for i in range(10):
    print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stop] => [stopp]
[wait] => [warte]


### Test Train Split

In [36]:
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle
 
# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))
 
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)
 
# load dataset
raw_dataset = load_clean_sentences('clean_data.pkl')
 
# reduce dataset size
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]
# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


### Load the train and test datasets:

In [37]:
# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))
 
# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

### Map text to integers

In [43]:
# fit a tokenizer
from keras.preprocessing.text import Tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [44]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

English Vocabulary Size: 2516
English Max Length: 5
German Vocabulary Size: 3871
German Max Length: 10


In [47]:
# encode and pad sequences
from keras.preprocessing.sequence import pad_sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

In [48]:
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

### Prepare the data

In [50]:
# prepare training data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

### Define the NMT model

In [52]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model
 
# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# summarize defined model
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 10, 256)           990976    
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 5, 2516)           646612    
Total params: 2,688,212
Trainable params: 2,688,212
Non-trainable params: 0
_________________________________________________________________
None


In [54]:
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Train on 9000 samples, validate on 1000 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 3.61593, saving model to model.h5
 - 48s - loss: 4.3741 - val_loss: 3.6159
Epoch 2/30

Epoch 00002: val_loss improved from 3.61593 to 3.47968, saving model to model.h5
 - 51s - loss: 3.4355 - val_loss: 3.4797
Epoch 3/30

Epoch 00003: val_loss improved from 3.47968 to 3.40885, saving model to model.h5
 - 50s - loss: 3.2929 - val_loss: 3.4088
Epoch 4/30

Epoch 00004: val_loss improved from 3.40885 to 3.30110, saving model to model.h5
 - 49s - loss: 3.1791 - val_loss: 3.3011
Epoch 5/30

Epoch 00005: val_loss improved from 3.30110 to 3.16015, saving model to model.h5
 - 49s - loss: 2.9956 - val_loss: 3.1602
Epoch 6/30

Epoch 00006: val_loss improved from 3.16015 to 3.05548, saving model to model.h5
 - 50s - loss: 2.8204 - val_loss: 3.0555
Epoch 7/30

Epoch 00007: val_loss improved from 3.05548 to 2.96088, saving model to model.h5
 - 50s - loss: 2.6625 - val_loss: 2.9609
Epoch 8/30

Epoch 

<keras.callbacks.History at 0x1dc031d6f28>

### Evaluate

In [56]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [57]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [58]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, eng_tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append(raw_target.split())
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [63]:
evaluate_model(model, eng_tokenizer, testX, test)

src=[alle mann von bord], target=[abandon ship], predicted=[face aboard]
src=[wie kann ich mich nutzlich machen], target=[how can i help], predicted=[why did i ask]
src=[klapp dein buch zu], target=[close your book], predicted=[the your hat]
src=[ich will das], target=[i want that], predicted=[i want this]
src=[ihr werdet euch verlaufen], target=[youll get lost], predicted=[youll look you]
src=[sie sind ein feigling], target=[youre a coward], predicted=[youre a coward]
src=[zeig sie ihr], target=[show it to her], predicted=[show it to her]
src=[ich brauche nur tom], target=[i only need tom], predicted=[i need to tom]
src=[ich erinnere mich], target=[i remember], predicted=[i surrender]
src=[ich mochte das gleiche], target=[i want the same], predicted=[i want want too]


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.067749
BLEU-2: 0.253323
BLEU-3: 0.429322
BLEU-4: 0.489847


### Extensions

**Data Cleaning**  Different data cleaning operations could be performed on the data, such as not removing punctuation or normalizing case, or perhaps removing duplicate English phrases.   
**Vocabulary** The vocabulary could be refined, perhaps removing words used less than 5 or 10 times in the dataset and replaced with “unk“.  
**More Data** - The dataset used to fit the model could be expanded to 50,000, 100,000 phrases, or more.  
**Layers** The encoder and/or the decoder models could be expanded with additional layers and trained for more epochs, providing more representational capacity for the model.  
**Units** The number of memory units in the encoder and decoder could be increased, providing more representational capacity for the model.  
**Regularization** The model could use regularization, such as weight or activation regularization, or the use of dropout on the LSTM layers.  
**Pre-Trained Word Vectors** Pre-trained word vectors could be used in the model.  
Recursive Model. A recursive formulation of the model could be used where the next word in the output sequence could be conditional on the input sequence and the output sequence generated so far.  

In [65]:
#Inspired by: MachineLearningMastery.com