# Machine Translation Project

### Step 1. Import dependencies and script setup

In [13]:
%load_ext autoreload
%autoreload 1

import os
import collections
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Concatenate, LSTM
from keras.layers.embeddings import Embedding
from tensorflow.keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from tensorflow.python.client import device_lib

# Data loader
def load_data(path, encoding="utf-8"):
    """Read a text file and return its content split into lines.

    Args:
        path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            accented characters are decoded the same way on every platform
            instead of depending on the locale's default encoding.

    Returns:
        list[str]: The file content split on '\n'. A file ending with a
        newline therefore yields a trailing empty string.
    """
    with open(path, "r", encoding=encoding) as f:
        data = f.read()
    return data.split('\n')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# CPU/GPU device check
# Prints the devices reported by TensorFlow's device_lib so the reader can see
# whether a GPU is visible before the training cells are run.
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8435159831571808138
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 5376753664
locality {
  bus_id: 1
  links {
  }
}
incarnation: 11986777501007342572
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:01:00.0, compute capability: 8.6"
xla_global_id: 416903419
]


### Step 2. Data load and exploration

In [18]:
# Load both corpora with the same helper; later cells pair the two lists by
# index (sentence i in English with sentence i in French).
english_sentences, french_sentences = (
    load_data(corpus_path)
    for corpus_path in ('data/small_vocab_en', 'data/small_vocab_fr')
)

In [21]:
# Sentence examples: show the first two English/French pairs, numbered from 1.
for sample_i in range(2):
    sample_number = sample_i + 1
    print('English sentence {}:  {}'.format(sample_number, english_sentences[sample_i]))
    print('French translation {}:  {}'.format(sample_number, french_sentences[sample_i]))

English sentence 1:  new jersey is sometimes quiet during autumn , and it is snowy in april .
French translation 1:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
English sentence 2:  the united states is usually chilly during july , and it is usually freezing in november .
French translation 2:  les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .


In [23]:
# Data exploration and main metrics

# One Counter per language: whitespace-separated word -> number of occurrences.
english_words_counter = collections.Counter(
    word for sentence in english_sentences for word in sentence.split())
french_words_counter = collections.Counter(
    word for sentence in french_sentences for word in sentence.split())

# The per-word counts add up to the total number of words, so the totals are
# read from the counters instead of scanning the corpora again.
english_word_total = sum(english_words_counter.values())
english_top_words = list(zip(*english_words_counter.most_common(10)))[0]
print('{} English words.'.format(english_word_total))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(english_top_words) + '"')
print()
french_word_total = sum(french_words_counter.values())
french_top_words = list(zip(*french_words_counter.most_common(10)))[0]
print('{} French words.'.format(french_word_total))
print('{} unique French words.'.format(len(french_words_counter)))
print('10 Most common words in the French dataset:')
print('"' + '" "'.join(french_top_words) + '"')

1823250 English words.
227 unique English words.
10 Most common words in the English dataset:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

1961295 French words.
355 unique French words.
10 Most common words in the French dataset:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


### Step 3. Data preprocessing

#### Tokenization

In [26]:
# Tokenization helper function

def tokenize(x):
    """Fit a fresh Keras Tokenizer on x and encode x with it.

    Args:
        x: Iterable of sentences (strings) to fit on and encode.

    Returns:
        tuple: (sequences, tokenizer) where sequences is the result of
        texts_to_sequences(x) and tokenizer is the fitted Tokenizer.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(x)
    encoded_sentences = fitted_tokenizer.texts_to_sequences(x)
    return encoded_sentences, fitted_tokenizer

# Tokenize Example output
# Three short sentences are enough to show the word -> id mapping and the
# resulting id sequences.
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i in range(len(text_sentences)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(text_sentences[sample_i]))
    print('  Output: {}'.format(text_tokenized[sample_i]))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


#### Padding

In [28]:
# Padding helper function

def pad(x, length=None):
    """Post-pad every sequence in x to a common length.

    Args:
        x: Collection of id sequences.
        length: Target length. When None, the length of the longest
            sequence in x is used.

    Returns:
        The result of Keras' pad_sequences with padding='post'.
    """
    target_length = max(map(len, x)) if length is None else length
    return pad_sequences(x, maxlen=target_length, padding='post')

# Pad Tokenized output
# Shows each tokenized example next to its padded version.
test_pad = pad(text_tokenized)
for sample_i in range(len(text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(text_tokenized[sample_i])))
    print('  Output: {}'.format(test_pad[sample_i]))

Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]


#### Preprocessing pipeline

In [30]:
# Preprocessing pipeline helper function

def preprocess(x, y):
    """Tokenize and pad the source sentences x and the target sentences y.

    Args:
        x: Source-language sentences.
        y: Target-language sentences.

    Returns:
        tuple: (preprocess_x, preprocess_y, x_tk, y_tk). preprocess_x is the
        padded id matrix for x, preprocess_y the padded id matrix for y with
        one extra trailing axis of size 1, and x_tk / y_tk are the fitted
        tokenizers.
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    preprocess_x = pad(x_ids)
    # Trailing axis of size 1: the targets are later fed to
    # sparse_categorical_crossentropy as one id per time step.
    preprocess_y = np.expand_dims(pad(y_ids), axis=-1)

    return preprocess_x, preprocess_y, x_tk, y_tk

# Run the preprocessing pipeline on the full corpora.
preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer =\
    preprocess(english_sentences, french_sentences)

max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]

# Number of distinct words seen by each tokenizer.
english_word_count = len(english_tokenizer.word_index)
french_word_count = len(french_tokenizer.word_index)

# The tokenizer numbers words from 1 and padding uses id 0, so valid ids span
# 0..word_count. Embedding(input_dim=...) and the softmax output layer must
# therefore be sized word_count + 1, otherwise the highest word id falls
# outside the embedding table / the set of output classes.
english_vocab_size = english_word_count + 1
french_vocab_size = french_word_count + 1

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
# The printed figures are the distinct-word counts (padding id excluded).
print("English vocabulary size:", english_word_count)
print("French vocabulary size:", french_word_count)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 345


## Model descriptions, in order of increasing complexity

* Model 1. Simple RNN (GRU)
* Model 2. GRU With embedding
* Model 3. Bidirectional GRU
* Model 4. Encoder-Decoder GRU
* Model 5. Final model (combination of the above)

In [37]:
# IDs to text helper function

def logits_to_text(logits, tokenizer):
    """Convert per-step scores into a space-separated sentence.

    Args:
        logits: 2-D array; the argmax along axis 1 is taken as the word id
            of each step.
        tokenizer: Object exposing a word_index mapping of word -> id.

    Returns:
        str: The words for the chosen ids joined by single spaces, with
        id 0 rendered as '<PAD>'.
    """
    vocabulary = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))
    vocabulary[0] = '<PAD>'
    predicted_ids = np.argmax(logits, axis=1)
    return ' '.join(vocabulary[token_id] for token_id in predicted_ids)

### Model 1. Simple RNN (GRU)

In [39]:
def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile a two-layer GRU model that emits one word per input step.

    Args:
        input_shape: Shape of the training input including the batch axis;
            only input_shape[1:] is used.
        output_sequence_length: Not used by this architecture.
        english_vocab_size: Not used by this architecture.
        french_vocab_size: Number of units of the softmax output layer.

    Returns:
        A compiled Keras Model (Adam with learning rate 1e-3, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    sequence_input = Input(shape=input_shape[1:])

    hidden = sequence_input
    for _ in range(2):
        # return_sequences=True keeps one output per time step.
        hidden = GRU(64, return_sequences=True)(hidden)
    hidden = Dense(128, activation='relu')(hidden)
    word_probabilities = Dense(french_vocab_size, activation='softmax')(hidden)

    network = Model(inputs=sequence_input, outputs=word_probabilities)
    network.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate=1e-3),
        metrics=['accuracy'],
    )
    return network

# Reshaping the input to work with a basic RNN:
# pad the English ids to the French length (the model emits one word per input
# step), then add a trailing feature axis of size 1.
tmp_x = np.expand_dims(pad(preproc_english_sentences, max_french_sequence_length), axis=-1)

# Train the neural network
simple_rnn_model = simple_model(
    input_shape=tmp_x.shape,
    output_sequence_length=max_french_sequence_length,
    english_vocab_size=english_vocab_size,
    french_vocab_size=french_vocab_size,
)
simple_rnn_model.fit(
    x=tmp_x,
    y=preproc_french_sentences,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
)

# Print prediction(s): translate the first training sentence only.
simple_rnn_logits = simple_rnn_model.predict(tmp_x[:1])[0]
print(logits_to_text(simple_rnn_logits, french_tokenizer))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
new jersey est parfois calme en mois et il et il est est en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


### Model 2. GRU With embedding

In [42]:
def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile a two-layer GRU model on top of a learned embedding.

    Args:
        input_shape: Shape of the training input including the batch axis;
            input_shape[1:] is the model input shape and input_shape[1] the
            embedding's input_length.
        output_sequence_length: Not used by this architecture.
        english_vocab_size: input_dim of the Embedding layer.
            NOTE(review): Keras documents input_dim as largest id + 1 —
            confirm the caller's value accounts for padding id 0.
        french_vocab_size: Number of units of the softmax output layer.

    Returns:
        A compiled Keras Model (Adam with learning rate 1e-3, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    token_ids = Input(shape=input_shape[1:])

    hidden = Embedding(
        input_dim=english_vocab_size,
        output_dim=128,
        input_length=input_shape[1],
    )(token_ids)
    for _ in range(2):
        hidden = GRU(64, return_sequences=True)(hidden)
    hidden = Dense(128, activation='relu')(hidden)
    word_probabilities = Dense(french_vocab_size, activation='softmax')(hidden)

    network = Model(inputs=token_ids, outputs=word_probabilities)
    network.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate=1e-3),
        metrics=['accuracy'],
    )
    return network

# Inputs reshape
# The Embedding layer takes the integer ids directly, so the padded matrix is
# used as-is (no trailing feature axis is added here).
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)

rnn_model_with_embedding = embed_model(
    input_shape=tmp_x.shape,
    output_sequence_length=max_french_sequence_length,
    english_vocab_size=english_vocab_size,
    french_vocab_size=french_vocab_size,
)
rnn_model_with_embedding.fit(
    x=tmp_x,
    y=preproc_french_sentences,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
)

# Print predictions: translate the first training sentence only.
embedding_logits = rnn_model_with_embedding.predict(tmp_x[:1])[0]
print(logits_to_text(embedding_logits, french_tokenizer))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
new jersey est parfois calme en l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


### Model 3. Bidirectional GRU

In [46]:
def bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile a model made of two stacked bidirectional GRU layers.

    Args:
        input_shape: Shape of the training input including the batch axis;
            only input_shape[1:] is used.
        output_sequence_length: Not used by this architecture.
        english_vocab_size: Not used by this architecture.
        french_vocab_size: Number of units of the softmax output layer.

    Returns:
        A compiled Keras Model (Adam with learning rate 1e-3, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    sequence_input = Input(shape=input_shape[1:])

    hidden = sequence_input
    for _ in range(2):
        hidden = Bidirectional(GRU(64, return_sequences=True))(hidden)
    hidden = Dense(128, activation='relu')(hidden)
    word_probabilities = Dense(french_vocab_size, activation='softmax')(hidden)

    network = Model(inputs=sequence_input, outputs=word_probabilities)
    network.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate=1e-3),
        metrics=['accuracy'],
    )
    return network

# Inputs reshape
# Pad the English ids to the French length and add a trailing feature axis of
# size 1, since this model has no embedding layer.
tmp_x = np.expand_dims(pad(preproc_english_sentences, max_french_sequence_length), axis=-1)

bidirectional_rnn_model = bd_model(
    input_shape=tmp_x.shape,
    output_sequence_length=max_french_sequence_length,
    english_vocab_size=english_vocab_size,
    french_vocab_size=french_vocab_size,
)

# Train the model
bidirectional_rnn_model.fit(
    x=tmp_x,
    y=preproc_french_sentences,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
)

# Print predictions: translate the first training sentence only.
bidirectional_logits = bidirectional_rnn_model.predict(tmp_x[:1])[0]
print(logits_to_text(bidirectional_logits, french_tokenizer))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
new jersey est parfois calme en mois de il est est en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


### Model 4. Encoder-Decoder GRU

In [50]:
def encdec_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile a GRU encoder-decoder model.

    The encoder GRU reduces the input sequence to a single vector. That vector
    is repeated output_sequence_length times and decoded by a second GRU whose
    initial state is the encoder's returned state.

    Args:
        input_shape: Shape of the training input including the batch axis;
            only input_shape[1:] is used.
        output_sequence_length: Number of decoder time steps.
        english_vocab_size: Not used by this architecture.
        french_vocab_size: Number of units of the softmax output layer.

    Returns:
        A compiled Keras Model (Adam with learning rate 1e-3, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    source_sequence = Input(shape=input_shape[1:])

    # Encoder: return_state=True makes the layer return its output and its state.
    sentence_vector, encoder_state = GRU(
        64, return_sequences=False, return_state=True)(source_sequence)

    # Decoder: the same sentence vector is presented at every output step.
    repeated_vector = RepeatVector(output_sequence_length)(sentence_vector)
    decoded = GRU(64, return_sequences=True)(repeated_vector, initial_state=encoder_state)
    word_probabilities = Dense(french_vocab_size, activation='softmax')(decoded)

    network = Model(inputs=source_sequence, outputs=word_probabilities)
    network.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate=1e-3),
        metrics=['accuracy'],
    )
    return network

# Inputs reshape
# Pad the English ids to the French length and add a trailing feature axis of
# size 1, since this model has no embedding layer.
tmp_x = np.expand_dims(pad(preproc_english_sentences, max_french_sequence_length), axis=-1)

encdec_rnn_model = encdec_model(
    input_shape=tmp_x.shape,
    output_sequence_length=max_french_sequence_length,
    english_vocab_size=english_vocab_size,
    french_vocab_size=french_vocab_size,
)

# Train the model
encdec_rnn_model.fit(
    x=tmp_x,
    y=preproc_french_sentences,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
)

# Print model predictions: translate the first training sentence only.
encdec_logits = encdec_rnn_model.predict(tmp_x[:1])[0]
print(logits_to_text(encdec_logits, french_tokenizer))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
new jersey est parfois agréable en mois mais il est est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


### Model 5. Final model (combination of the above)

In [None]:
def model_final(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile the final model: embedding + bidirectional GRU encoder-decoder.

    Args:
        input_shape: Shape of the training input including the batch axis;
            only input_shape[1:] is used.
        output_sequence_length: Number of decoder time steps.
        english_vocab_size: input_dim of the Embedding layer.
            NOTE(review): Keras documents input_dim as largest id + 1 —
            confirm the caller's value accounts for padding id 0.
        french_vocab_size: Number of units of the softmax output layer.

    Returns:
        A compiled Keras Model (Adam with learning rate 1e-3, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    token_ids = Input(shape=input_shape[1:])

    # Encoder: embedding followed by two bidirectional GRU layers. The second
    # one returns a single vector plus two states (presumably forward and
    # backward, in that order — confirm against the Keras Bidirectional docs).
    embedded = Embedding(input_dim=english_vocab_size, output_dim=256)(token_ids)
    encoded = Bidirectional(GRU(128, return_sequences=True))(embedded)
    sentence_vector, state_a, state_b = Bidirectional(
        GRU(128, return_sequences=False, return_state=True))(encoded)

    # Decoder: the sentence vector is repeated for every output step and the
    # first decoder layer starts from the encoder's two states.
    repeated_vector = RepeatVector(output_sequence_length)(sentence_vector)
    decoded = Bidirectional(GRU(128, return_sequences=True))(
        repeated_vector, initial_state=[state_a, state_b])
    decoded = Bidirectional(GRU(128, return_sequences=True))(decoded)
    decoded = Dense(256, activation='relu')(decoded)
    word_probabilities = Dense(french_vocab_size, activation='softmax')(decoded)

    network = Model(inputs=token_ids, outputs=word_probabilities)
    network.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate=1e-3),
        metrics=['accuracy'],
    )
    return network

# Inputs reshape
# The model starts with an Embedding layer, so the padded id matrix is used
# as-is (no trailing feature axis).
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)

custom_model = model_final(
    input_shape=tmp_x.shape,
    output_sequence_length=max_french_sequence_length,
    english_vocab_size=english_vocab_size,
    french_vocab_size=french_vocab_size,
)

# Train the model
custom_model.fit(
    x=tmp_x,
    y=preproc_french_sentences,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

In [None]:
# Shell commands run from the notebook: stage the working directory and commit.
# NOTE(review): `git add .` stages everything git does not ignore under the
# current directory — confirm no data files or checkpoints get committed.
!git add .
!git commit -m "Model 5. Final model (combination of the above)"

## Final predictions with examples

In [None]:
# Final predictions function

def final_predictions(x, y, x_tk, y_tk):
    """Train the final model on (x, y) and print two sample translations.

    Sample 1 is a fixed English sentence, printed with its expected French
    translation. Sample 2 is the first training sentence, printed with its
    reference target decoded from y.

    Args:
        x: Padded source id matrix; x.shape[-1] is the source sequence length.
        y: Padded target ids; len(y[0]) is used as the output sequence length.
        x_tk: Fitted source tokenizer (word_index, texts_to_sequences).
        y_tk: Fitted target tokenizer (word_index).

    Returns:
        None. The results are printed.
    """
    # Size the model from the tokenizers that were passed in rather than from
    # notebook globals. Word ids start at 1 and 0 is the padding id, hence +1.
    source_vocab_size = len(x_tk.word_index) + 1
    target_vocab_size = len(y_tk.word_index) + 1

    model = model_final(x.shape, len(y[0]), source_vocab_size, target_vocab_size)
    model.fit(x, y, batch_size=128, epochs=10, validation_split=0.2)

    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'

    sentence = 'he saw a old yellow truck'
    # Encode with the tokenizer itself so the sentence gets the same
    # normalisation as the training data; words the tokenizer has not seen
    # are skipped instead of raising KeyError.
    sentence = x_tk.texts_to_sequences([sentence])
    sentence = pad_sequences(sentence, maxlen=x.shape[-1], padding='post')
    sentences = np.array([sentence[0], x[0]])
    predictions = model.predict(sentences, batch_size=len(sentences))

    print('Sample 1:')
    print(' '.join([y_id_to_word[np.argmax(step_scores)] for step_scores in predictions[0]]))
    print('Il a vu un vieux camion jaune')
    print('Sample 2:')
    print(' '.join([y_id_to_word[np.argmax(step_scores)] for step_scores in predictions[1]]))
    # Each entry of y[0] is indexed through np.max to obtain a scalar id.
    print(' '.join([y_id_to_word[np.max(target_id)] for target_id in y[0]]))

final_predictions(preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer)

In [None]:
# Shell commands run from the notebook: stage the working directory and commit.
# NOTE(review): `git add .` stages everything git does not ignore under the
# current directory — confirm no data files or checkpoints get committed.
!git add .
!git commit -m "final predictions and example sentences"