# Machine Translation Project (English- French)

## Import libraries

In [3]:
import collections
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras. layers import Input, Dense, Embedding, GRU, LSTM, Bidirectional, Dropout, Activation, TimeDistributed, RepeatVector
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [4]:
import tensorflow as tf
print("GPUs Available:", len(tf.config.list_physical_devices('GPU')))
print(tf.config.list_physical_devices('GPU'))


GPUs Available: 1
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [5]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15800800925622539688
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 4408931328
locality {
  bus_id: 1
}
incarnation: 15410409250742514012
physical_device_desc: "device: 0, name: DML, pci bus id: <undefined>"
xla_global_id: -1
]


## Load Data

In [7]:
def load_data(path):
    with open(path, 'r') as f:
        data = f.read()
    return data.split('\n')

english_sen = load_data("data/english.txt")
french_sen = load_data("data/french.txt")

In [8]:
print(english_sen[0:5])
print(french_sen[0:5])

['new jersey is sometimes quiet during autumn , and it is snowy in april .', 'the united states is usually chilly during july , and it is usually freezing in november .', 'california is usually quiet during march , and it is usually hot in june .', 'the united states is sometimes mild during june , and it is cold in september .', 'your least liked fruit is the grape , but my least liked is the apple .']
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril .", 'les Ã©tats-unis est gÃ©nÃ©ralement froid en juillet , et il gÃ¨le habituellement en novembre .', 'california est gÃ©nÃ©ralement calme en mars , et il est gÃ©nÃ©ralement chaud en juin .', 'les Ã©tats-unis est parfois lÃ©gÃ¨re en juin , et il fait froid en septembre .', 'votre moins aimÃ© fruit est le raisin , mais mon moins aimÃ© est la pomme .']


## Sample data

In [10]:
english_sen_words = []
french_sen_words = []
for sentences in english_sen:
    for words in sentences.split():
        english_sen_words.append(words)
english_sen_words_counter = collections.Counter(english_sen_words)

for sentences in french_sen:
    for words in sentences.split():
        french_sen_words.append(words)
french_sen_words_counter = collections.Counter(french_sen_words)

In [11]:
print(f"No: of words in english dataset: {len(english_sen_words)}")
print(f"No: of Unique words in english dataset: {len(english_sen_words_counter)}")
print(f"Top 10 most Common english words: {[key for key, value in english_sen_words_counter.most_common(10)]}")
print()
print(f"No: of words in french dataset: {len(french_sen_words)}")
print(f"No: of Unique words in french dataset: {len(french_sen_words_counter)}")
print(f"Top 10 most Common french words: {[key for key, value in french_sen_words_counter.most_common(10)]}")

No: of words in english dataset: 1823250
No: of Unique words in english dataset: 227
Top 10 most Common english words: ['is', ',', '.', 'in', 'it', 'during', 'the', 'but', 'and', 'sometimes']

No: of words in french dataset: 1961295
No: of Unique words in french dataset: 355
Top 10 most Common french words: ['est', '.', ',', 'en', 'il', 'les', 'mais', 'et', 'la', 'parfois']


## Preprocessing data

1. Tokenize the words into ids
2. Add padding to make all the sequences the same length.

In [14]:
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x), tokenizer
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']

text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer)
print(text_tokenizer.word_index)
print(text_tokenized)
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print(f'Sequence {sample_i + 1} in text_sentences')
    print(f'  Input:  {sent}')
    print(f'  Output: {token_sent}')

<keras.preprocessing.text.Tokenizer object at 0x000002229B7EF310>
{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}
[[1, 2, 4, 5, 6, 7, 1, 8, 9], [10, 11, 12, 2, 13, 14, 15, 16, 3, 17], [18, 19, 3, 20, 21]]
Sequence 1 in text_sentences
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in text_sentences
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in text_sentences
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


In [15]:

def pad_simple(sentences, length=None):
    # if user did NOT specify length, use max sentence length
    if length is None:
        length = max(len(s) for s in sentences)

    # pad all sentences to THIS length
    return pad_sequences(sentences, maxlen=length, padding='post')

# Apply padding
padded = pad_simple(text_tokenized)

# Print original vs padded
for i, (orig, padded_sent) in enumerate(zip(text_tokenized, padded), 1):
    print(f"Sentence {i}")
    print("  Original:", orig)
    print("  Padded:  ", padded_sent)


Sentence 1
  Original: [1, 2, 4, 5, 6, 7, 1, 8, 9]
  Padded:   [1 2 4 5 6 7 1 8 9 0]
Sentence 2
  Original: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
  Padded:   [10 11 12  2 13 14 15 16  3 17]
Sentence 3
  Original: [18, 19, 3, 20, 21]
  Padded:   [18 19  3 20 21  0  0  0  0  0]


In [16]:
def preprocess(x,y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)
    
    preprocess_x = pad_simple(preprocess_x)
    preprocess_y = pad_simple(preprocess_y)
    
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
    
    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer = preprocess(english_sen, french_sen)

max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 345


In [17]:
len(english_tokenizer.word_index)

199

## Models
- Model1 is simple RNN
- Model2 is Bidirectional RNN
- Model3 is an embedding RNN

* Since model gives tokans values of french language so it is necessary to convert back them to texts using tokanizer.word_index.

In [20]:
def logits_to_text(logits, tokenizer):
    Index_to_word = {id:word for word, id in tokenizer.word_index.items()}
    Index_to_word[0] = '<PPAD>'

    return ' '.join([Index_to_word[prediction] for prediction in np.argmax(logits, 1)])

In [21]:
from keras.models import Sequential
from keras.layers import GRU, TimeDistributed, Dense, Dropout, Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

def simple_model(output_sequence_length, english_vocab_size, french_vocab_size):

    learning_rate = 0.005

    model = Sequential()

    model.add(Embedding(
        input_dim=english_vocab_size,
        output_dim=256,
        input_length=output_sequence_length
    ))

    model.add(GRU(
        256,
        return_sequences=True,
        reset_after=False
    ))

    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy']
    )

    return model


tmp_x = pad_simple(preproc_english_sentences, length=max_french_sequence_length)
tmp_x = tmp_x.astype('float32')   # ❗ VERY IMPORTANT

y = preproc_french_sentences


simple_rnn_model = simple_model(
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size
)

simple_rnn_model.fit(
    tmp_x,
    y,
    batch_size=1024,
    epochs=5,
    validation_split=0.2
)


Epoch 1/5

KeyboardInterrupt: 

In [None]:
# Print prediction(s)
print("Prediciton:")
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sen[:1])

print('\nOriginal text:')
print(english_sen[:1])

## Model 2: Bidirectional RNNs

In [None]:
def bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    
    #Hyperparameters
    learning_rate = 0.005
    
    # Build the layers
    model = Sequential()
    model.add(Bidirectional(GRU(128, return_sequences=True, reset_after=False), input_shape=input_shape[1:]))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))
    
    # Compile model
    model.compile(loss = sparse_categorical_crossentropy,
                  optimizer = Adam(learning_rate),
                  metrics = ['accuracy'])
    
    return model

tmp_x = pad_simple(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

# Train the neural network
bd_rnn_model = bd_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

print(bd_rnn_model.summary())

bd_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=5, validation_split=0.2)

In [None]:
# Print prediction(s)
print("Prediciton:")
print(logits_to_text(bd_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sen[:1])

print('\nOriginal text:')
print(english_sen[:1])

## Model 3: Embedding

In [29]:
def bidirectional_embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    
    # Hyperparameters
    learning_rate = 0.005
    
    # Build the layers
    model = Sequential()
    model.add(Embedding(english_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:]))
    model.add(Bidirectional(GRU(256, return_sequences=True, reset_after=False)))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))
    
    # Compile model
    model.compile(loss = sparse_categorical_crossentropy,
                  optimizer = Adam(learning_rate),
                  metrics = ['accuracy'])
    
    return model

tmp_x = pad_simple(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2]))

# Build the model
embed_rnn_model = bidirectional_embed_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

print(embed_rnn_model.summary())

embed_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=5, validation_split=0.2)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 21, 256)           50944     
                                                                 
 bidirectional (Bidirectiona  (None, 21, 512)          787968    
 l)                                                              
                                                                 
 time_distributed_2 (TimeDis  (None, 21, 1024)         525312    
 tributed)                                                       
                                                                 
 dropout_1 (Dropout)         (None, 21, 1024)          0         
                                                                 
 time_distributed_3 (TimeDis  (None, 21, 345)          353625    
 tributed)                                                       
                                                      

<keras.callbacks.History at 0x222ffad3ee0>

In [30]:
# Print prediction(s)
print("Prediciton:")
print(logits_to_text(embed_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sen[:1])

print('\nOriginal text:')
print(english_sen[:1])

Prediciton:
new jersey est parfois calme pendant l' automne et il est neigeux en avril <PPAD> <PPAD> <PPAD> <PPAD> <PPAD> <PPAD> <PPAD>

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

Original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


In [31]:
embed_rnn_model.save('english_to_french_model.keras')
# Serialize English Tokenizer to JSON
with open('english_tokenizer.json', 'w', encoding='utf8') as f:
    f.write(json.dumps(english_tokenizer.to_json(), ensure_ascii=False))
    
# Serialize French Tokenizer to JSON
with open('french_tokenizer.json', 'w', encoding='utf8') as f:
    f.write(json.dumps(french_tokenizer.to_json(), ensure_ascii=False))
    
# Save max lengths
max_french_sequence_length_json = max_french_sequence_length
with open('sequence_length.json', 'w', encoding='utf8') as f:
    f.write(json.dumps(max_french_sequence_length_json, ensure_ascii=False))