# English to French Neural-Machine Translation Example
A working example of a sequence-to-sequence model using bidirectional LSTM layers.
Training the model took ~2 hours on a machine with a quad-core 4.2 GHz CPU, a GTX 1080 Ti GPU, and 64 GB of memory.

In [1]:
import os, sys, re, string
import numpy as np
from unicodedata import normalize

### Import and normalize raw text data

In [2]:
# Functions to help clean and normalize the inputs:

def to_pairs(text):
    ''' Splits raw text into one list of tab-separated fields per line (English first, French second). '''
    # Surrounding whitespace is stripped first so a trailing newline does not produce an empty last row.
    return [row.split('\t') for row in text.strip().split('\n')]

def _normalize_sentence(sent):
    ''' Helper function for clean_pairs(); normalize the input sentences (remove puncuations, special characters, capitalizations, etc.). '''
    re_print = re.compile(r'[^{}]'.format(re.escape(string.printable)))
    punc_table = str.maketrans('', '', string.punctuation)

    # Text normalization:
    sent = normalize('NFD', sent).encode('ascii', 'ignore').decode('utf-8')  # Normalize unicode characters
    sent = sent.split()                                                      # Tokenize on white space
    sent = [word.lower() for word in sent]                                   # Convert to lower case
    sent = [word.translate(punc_table) for word in sent]                     # Remove punctuation characters
    sent = [re_print.sub('', w) for w in sent]                               # Remove non-printable characters
    sent = [word for word in sent if word.isalpha()]                         # Remove non-word tokens

    # Reformat as a string and return:
    return ' '.join(sent)

def clean_pairs(lines):
    ''' Normalizes every sentence of every pair with _normalize_sentence() and returns a NumPy array. '''
    return np.array([[_normalize_sentence(sentence) for sentence in pair] for pair in lines])

In [3]:
# Import data and clean input sentences.
# The whole file is read as UTF-8 text, split into tab-separated sentence pairs by to_pairs(),
# and then normalized by clean_pairs(), which returns a NumPy array:
with open('English-French.txt', encoding='utf-8') as f:
    text = f.read()

sent_pairs = to_pairs(text)
sent_pairs_cleaned = clean_pairs(sent_pairs)

In [4]:
# Have a look at our cleaned input data (rows 45000 through 45004, one [English, French] pair per row):
sent_pairs_cleaned[45000:45005]

array([['staying home isnt fun', 'rester chez soi na rien damusant'],
       ['staying home isnt fun', 'rester chez soi nest pas marrant'],
       ['stop biting your nails', 'arrete de ronger tes ongles'],
       ['stop deluding yourself', 'arrete de te mentir a toimeme'],
       ['stop deluding yourself', 'arretez de vous mentir a vousmeme']],
      dtype='<U314')

### Limit our training data, and create train-test splits
Some of the sentences in the dataset can be extremely long. The performance of LSTMs starts to drop off when the sequences get too long (look into attention models for a potential enhancement), and we also don't want training to take too long. So for this example, we'll limit the data to sentences with 5 words or fewer for English and 7 words or fewer for French. This leaves about 67K sentence pairs; after shuffling, the first 55,000 are used for training and the remaining ones for validation.

In [5]:
# Limit to only the sentences with reasonable lengths (at most 5 English words and 7 French words):
MAX_ENG_WORDS = 5
MAX_FRA_WORDS = 7
sent_pairs_cleaned_sub = np.array([
    pair for pair in sent_pairs_cleaned
    if len(pair[0].split()) <= MAX_ENG_WORDS and len(pair[1].split()) <= MAX_FRA_WORDS
])
print('Full dataset:', len(sent_pairs_cleaned_sub))

# Keep an unshuffled copy for creating the tokenizers, so they always see the sentences in file order:
sent_pairs_cleaned_ordered = sent_pairs_cleaned_sub.copy()

# Shuffle input data and split into training & testing.
# The shuffle is seeded so that re-running the notebook (e.g. after a kernel restart, before loading a
# previously saved model) reproduces the same split; otherwise "test" sentences could be ones the saved
# model was trained on.
SPLIT_SEED = 42
N_TRAIN = 55000
np.random.RandomState(SPLIT_SEED).shuffle(sent_pairs_cleaned_sub)  # in-place shuffle of the rows
train = sent_pairs_cleaned_sub[:N_TRAIN]
test = sent_pairs_cleaned_sub[N_TRAIN:]
print('Training sentences:', len(train), '\nTesting sentences:', len(test))

Full dataset: 67222
Training sentences: 55000 
Testing sentences: 12222


### Tokenization and sequence encoding
With our training and testing data created, the next step is to prepare the data for modeling. Namely:
1. Tokenization: Separate sentences into arrays of individual words.
2. Indexing: Map each word (for both languages) to an integer index. This is what will feed into the LSTM.
3. Padding: Zero-pad sequences so that they're all the same length.
4. Encode target: Since the target of the prediction will be a sequence as well, we must one-hot-encode the target sentence.

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import to_categorical, plot_model
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector, TimeDistributed, Bidirectional

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [7]:
# Creating tokenizers:

def create_tokenizer(lines):
    ''' Returns a Keras Tokenizer (default settings) fitted on the given set of sentences. '''
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

def max_length(lines):
    ''' Returns the largest whitespace-delimited word count found among the given sentences. '''
    word_counts = [len(sentence.split()) for sentence in lines]
    return max(word_counts)

In [8]:
# English tokenizer (fitted on column 0 of the unshuffled sentence pairs):
eng_tokenizer = create_tokenizer(sent_pairs_cleaned_ordered[:, 0])
# +1 presumably reserves index 0, which the sequences are zero-padded with later — confirm against Tokenizer docs:
eng_vocab_size = len(eng_tokenizer.word_index) + 1
# Longest English sentence, in words; used as the padded input length:
eng_length = max_length(sent_pairs_cleaned_ordered[:, 0])
print('English vocabulary size:', eng_vocab_size)
print('English max sentence length:', eng_length)

# French tokenizer (fitted on column 1 of the unshuffled sentence pairs), same steps as for English:
fra_tokenizer = create_tokenizer(sent_pairs_cleaned_ordered[:, 1])
fra_vocab_size = len(fra_tokenizer.word_index) + 1
# Longest French sentence, in words; used as the padded output length:
fra_length = max_length(sent_pairs_cleaned_ordered[:, 1])
print('French vocabulary size:', fra_vocab_size)
print('French max sentence length:', fra_length)

English vocabulary size: 8345
English max sentence length: 5
French vocabulary size: 16001
French max sentence length: 7


In [9]:
# Create sentence encodings for both the source and target:

def encode_sequences(tokenizer, length, lines):
    ''' Converts sentences to integer-coded sequences, padded at the end ('post') up to `length`. '''
    # NOTE(review): sentences longer than `length` are cut by pad_sequences using its default
    # truncating mode — confirm that is the desired behavior for inference-time inputs.
    int_sequences = tokenizer.texts_to_sequences(lines)
    padded = pad_sequences(int_sequences, maxlen=length, padding='post')
    return padded

def ohe_output(sequences, vocab_size):
    ''' One-hot-encodes every integer-coded target sentence into `vocab_size` classes.

        Returns an array of shape (sequences.shape[0], sequences.shape[1], vocab_size).
    '''
    n_sentences, n_steps = sequences.shape[0], sequences.shape[1]
    encoded = np.array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    return encoded.reshape(n_sentences, n_steps, vocab_size)

In [10]:
# Prepare training data.
# Inputs (X) are padded integer sequences of English words (column 0); targets (Y) are padded
# integer sequences of French words (column 1) that are then one-hot-encoded:
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_sequences(fra_tokenizer, fra_length, train[:, 1])
trainY = ohe_output(trainY, fra_vocab_size)

# Prepare validation data (same encoding, applied to the held-out `test` rows):
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_sequences(fra_tokenizer, fra_length, test[:, 1])
testY = ohe_output(testY, fra_vocab_size)

### Build neural-machine translation model
Here I chose to build a bidirectional LSTM.

In [11]:
# Define sequence-to-sequence LSTM model.
# Note: embedding_dim is used both as the embedding size and as the number of units in each LSTM below.
embedding_dim = 100
model = Sequential()

# As usual, we start with our embedding layer. It maps each English word index to an embedding_dim-sized
# vector; mask_zero=True tells Keras to treat index 0 (the padding value) as masked:
model.add(Embedding(eng_vocab_size, embedding_dim, input_length=eng_length, mask_zero=True))

# Encoding LSTM layer; notice that we pass the output to a RepeatVector() instead of using return_sequences because the output
# sequence must feed into the decoding layer which has a different number of time steps
# (eng_length steps go in, fra_length steps must come out):
model.add(Bidirectional(LSTM(embedding_dim)))
model.add(RepeatVector(fra_length))

# Decoding LSTM layer; notice how we add a time-distributed dense layer at the end, which applies a softmax
# over the French vocabulary at each of the fra_length output steps:
model.add(Bidirectional(LSTM(embedding_dim, return_sequences=True)))
model.add(TimeDistributed(Dense(fra_vocab_size, activation='softmax')))

In [12]:
# View model architecture (layer types, output shapes and parameter counts):
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5, 100)            834500    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               160800    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 7, 200)            0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 7, 200)            240800    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 7, 16001)          3216201   
Total params: 4,452,301
Trainable params: 4,452,301
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Train model; notice that we're defining some callback features to control the training process:
#  - EarlyStopping stops training once val_loss has not improved for 3 consecutive epochs (patience=3).
#  - ModelCheckpoint writes the model to disk each time val_loss improves (save_best_only=True).
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=1, mode='auto')
checkpoint = ModelCheckpoint('Eng-Fra_Translation_2.h5', monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
callbacks_list = [earlystop, checkpoint]

# The targets are one-hot-encoded, hence categorical cross-entropy; the `test` split serves as validation data:
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(trainX, trainY, epochs=50, batch_size=64, validation_data=(testX, testY), callbacks=callbacks_list, verbose=1)

Train on 55000 samples, validate on 12222 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 4.28439, saving model to Eng-Fra_Translation_2.h5
Epoch 2/50

Epoch 00002: val_loss improved from 4.28439 to 3.76657, saving model to Eng-Fra_Translation_2.h5
Epoch 3/50

Epoch 00003: val_loss improved from 3.76657 to 3.37896, saving model to Eng-Fra_Translation_2.h5
Epoch 4/50

Epoch 00004: val_loss improved from 3.37896 to 3.11658, saving model to Eng-Fra_Translation_2.h5
Epoch 5/50

Epoch 00005: val_loss improved from 3.11658 to 2.90914, saving model to Eng-Fra_Translation_2.h5
Epoch 6/50

Epoch 00006: val_loss improved from 2.90914 to 2.74014, saving model to Eng-Fra_Translation_2.h5
Epoch 7/50

Epoch 00007: val_loss improved from 2.74014 to 2.60592, saving model to Eng-Fra_Translation_2.h5
Epoch 8/50

Epoch 00008: val_loss improved from 2.60592 to 2.48950, saving model to Eng-Fra_Translation_2.h5
Epoch 9/50

Epoch 00009: val_loss improved from 2.48950 to 2.40335, saving model t

<keras.callbacks.History at 0x20b729809b0>

### Save/Load model

In [10]:
from keras.models import load_model

In [None]:
# Save model (the current in-memory model, written to a single HDF5 file):
model.save('Eng-Fra_Translation_Test.h5')

In [11]:
# Load saved model.
# NOTE(review): this filename matches neither the checkpoint path used during training
# ('Eng-Fra_Translation_2.h5') nor the path used by model.save() ('Eng-Fra_Translation_Test.h5');
# confirm that the file exists and was trained with the same tokenizers:
model = load_model('Eng-Fra_Translation_BidirLSTM.h5')

### Have a look at some model translation outputs

In [12]:
# Create table of id-to-word mappings by inverting each tokenizer's word -> id index:
eng_id_to_word = {v:k for k, v in eng_tokenizer.word_index.items()}
fra_id_to_word = {v:k for k, v in fra_tokenizer.word_index.items()}
# Add an explicit entry for id 0 (presumably the padding id, which word_index does not contain).
# NOTE(review): the value is the integer 0, not a string; decode_prediction() skips id 0, but any
# other code that joins these values as text would fail on it.
eng_id_to_word[0] = 0
fra_id_to_word[0] = 0

In [13]:
# Functions to help with predicting new sentences:

def encode_input(sent, tokenizer, input_length):
    ''' Normalizes one raw English sentence and encodes it as a padded, integer-coded batch of size one. '''
    normalized = _normalize_sentence(sent)
    batch_of_one = [normalized]
    return encode_sequences(tokenizer, input_length, batch_of_one)

def decode_prediction(pred, pred_length, id_to_word_mapping):
    ''' Converts a prediction output matrix to readable French.
        Note: The id_to_word_mapping must be built using the same French tokenizer as was used to train the model!

        pred: batch of predictions; only the first item (pred[0]) is decoded, and pred[0][t] must
              hold the per-word scores for output step t.
        pred_length: number of output steps to decode.
        id_to_word_mapping: dict from word id to word; id 0 is treated as padding and skipped.
        Returns the predicted words joined by spaces (an empty string if only padding was predicted).
    '''
    # Most likely word id at each output step:
    pred_ids = [pred[0][step].argmax() for step in range(pred_length)]
    pred_words = [id_to_word_mapping[i] for i in pred_ids if i != 0]
    # Drop a duplicated final word. The length check matters: predictions with fewer than two
    # words (a one-word translation, or nothing but padding) used to raise IndexError here.
    if len(pred_words) >= 2 and pred_words[-1] == pred_words[-2]:
        pred_words = pred_words[:-1]
    return ' '.join(pred_words)

def translate_eng_fra(in_sentence, model):
    ''' Main translation function: encodes the English sentence, runs the model, decodes the French output. '''
    return decode_prediction(
        model.predict(encode_input(in_sentence, eng_tokenizer, eng_length)),
        fra_length,
        fra_id_to_word,
    )

In [14]:
# Try your own sentences (add them to this list):
sentences = [
    "I love the movie.",
    "The weather is beautiful.",
    "God bless you!",
    "What's your favorite book?",
    "My mother is well.",
]

for idx, sentence in enumerate(sentences):
    # Separator line between consecutive examples (none before the first or after the last):
    if idx:
        print('-------------')
    print('English :', sentence)
    print('Model   :', translate_eng_fra(sentence, model))

English : I love the movie.
Model   : jadore le film
-------------
English : The weather is beautiful.
Model   : il est est belle
-------------
English : God bless you!
Model   : que dieu vous benisse
-------------
English : What's your favorite book?
Model   : quel est ton livre prefere
-------------
English : My mother is well.
Model   : ma mere est bien


In [16]:
# Pick one random sentence pair from the test set and compare the reference with the model output:
rn = np.random.choice(range(len(test)))
rn_eng, rn_fra = test[rn]  # each row is an [English, French] pair
print('English :', rn_eng)
print('French  :', rn_fra)
print('---------')
pred = translate_eng_fra(rn_eng, model)
print('Model   :', pred)

English : its not easy raising children
French  : ce nest pas facile delever des enfants
---------
Model   : ce nest pas facile delever des enfants
