## Neural translation model


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import unicodedata
import re
from IPython.display import Image
# for splitting sentences 
import re
# for finding random samples
import random
from tensorflow.keras.layers import Layer, concatenate, Input, Masking, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

We will use a language dataset from http://www.manythings.org/anki/ to build a neural translation model. This dataset consists of over 200,000 pairs of sentences in English and German. 



In [None]:
# Load the dataset: keep only the first NUM_EXAMPLES lines of the corpus file.
from itertools import islice

NUM_EXAMPLES = 20000
with open('data/deu.txt', 'r', encoding='utf8') as f:
    # Iterate the file lazily so that only the lines we keep are ever read,
    # instead of loading every line into memory and discarding most of them.
    data_examples = list(islice(f, NUM_EXAMPLES))
            

In [None]:
# preprocess English and German sentences

def unicode_to_ascii(s):
    """Return `s` with combining marks removed.

    The string is decomposed with NFD normalisation and every character whose
    Unicode category is 'Mn' is dropped; all other characters are kept
    unchanged and in order.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_sentence(sentence):
    """Normalise a raw sentence into lowercase, space-separated tokens.

    Steps, in order: lowercase and strip; rewrite ü/ä/ö/ß as ue/ae/oe/ss;
    remove remaining combining marks with `unicode_to_ascii`; put spaces
    around ? . ! and ,; replace every run of characters other than
    a-z ? . ! , ' with a single space; collapse repeated spaces; strip.
    """
    text = sentence.lower().strip()

    # German transliterations are applied before the accent stripping below.
    german_rewrites = ((r"ü", 'ue'), (r"ä", 'ae'), (r"ö", 'oe'), (r'ß', 'ss'))
    for pattern, replacement in german_rewrites:
        text = re.sub(pattern, replacement, text)

    text = unicode_to_ascii(text)

    # Punctuation spacing, character whitelist, then whitespace collapsing.
    cleanup_steps = (
        (r"([?.!,])", r" \1 "),
        (r"[^a-z?.!,']+", " "),
        (r'[" "]+', " "),
    )
    for pattern, replacement in cleanup_steps:
        text = re.sub(pattern, replacement, text)

    return text.strip()

#### The custom translation model

The custom model consists of an encoder RNN and a decoder RNN. The encoder takes words of an English sentence as input, and uses a pre-trained word embedding to embed the words into a 128-dimensional space. To indicate the end of the input sentence, a special end token (in the same 128-dimensional space) is passed in as an input. This token is a TensorFlow Variable that is learned in the training phase (unlike the pre-trained word embedding, which is frozen).

The decoder RNN takes the internal state of the encoder network as its initial state. A start token is passed in as the first input, which is embedded using a learned German word embedding. The decoder RNN then makes a prediction for the next German word, which during inference is then passed in as the following input, and this process is repeated until the special `<end>` token is emitted from the decoder.

## 1. Text preprocessing


In [None]:
# Each raw line is tab-separated: column 0 is English, column 1 is German.
# German sentences are wrapped in <start>/<end> markers after normalisation.
en_sentences = []
de_sentences = []
for example in data_examples:
    columns = example.split('\t')
    en_sentences.append(preprocess_sentence(columns[0]))
    de_sentences.append("<start> " + preprocess_sentence(columns[1]) + " <end>")

# Fit a word-level tokenizer on the German side; filters='' keeps punctuation
# and the <start>/<end> markers intact as tokens.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=None,
    filters='',
    lower=True,
)
tokenizer.fit_on_texts(de_sentences)
de_sentences_sequences = tokenizer.texts_to_sequences(de_sentences)

# Show five distinct random examples (before padding is applied).
random_indices = np.random.choice(len(de_sentences), size=5, replace=False)
for sample_idx in random_indices:
    print(f"English sentence:\t\t{en_sentences[sample_idx]}")
    print(f"German sentence:\t\t{de_sentences[sample_idx]}")
    print(f"German sentence sequence:\t{de_sentences_sequences[sample_idx]}\n")

# Padding: zeros are appended at the end of every shorter sequence.
de_sentences_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    de_sentences_sequences, padding='post'
)

English sentence:		who has it now ?
German sentence:		<start> wer hat ihn jetzt ? <end>
German sentence sequence:	[1, 43, 16, 44, 62, 7, 2]

English sentence:		tom was here .
German sentence:		<start> tom war hier . <end>
German sentence sequence:	[1, 5, 24, 33, 3, 2]

English sentence:		tom will miss me .
German sentence:		<start> tom wird mich vermissen . <end>
German sentence sequence:	[1, 5, 48, 22, 473, 3, 2]

English sentence:		i'm shy .
German sentence:		<start> ich bin schuechtern . <end>
German sentence sequence:	[1, 4, 15, 337, 3, 2]

English sentence:		i can't eat .
German sentence:		<start> ich kann nicht essen . <end>
German sentence sequence:	[1, 4, 30, 12, 154, 3, 2]



## 2. Prepare the data with tf.data.Dataset objects

#### Load the embedding layer
As part of the dataset preprocessing, use a pre-trained English word embedding module from TensorFlow Hub. The URL for the module is https://tfhub.dev/google/tf2-preview/nnlm-en-dim128-with-normalization/1. This module has also been made available as a complete saved model in the folder `'./models/tf2-preview_nnlm-en-dim128_1'`. 

This embedding takes a batch of text tokens in a 1-D tensor of strings as input. It then embeds the separate tokens into a 128-dimensional space. 


In [None]:
# Load embedding module from Tensorflow Hub

# The module is loaded from the local saved-model folder rather than the hub
# URL. It is declared to take scalar strings (input_shape=[], dtype=tf.string)
# and to output 128-dimensional vectors (output_shape=[128]).
embedding_layer = hub.KerasLayer("./models/tf2-preview_nnlm-en-dim128_1", 
                                 output_shape=[128], input_shape=[], dtype=tf.string)

In [None]:
# Test the layer

# Embed seven string tokens and display the shape of the result.
embedding_layer(tf.constant(["these", "aren't", "the", "droids", "you're", "looking", "for"])).shape

TensorShape([7, 128])

In [None]:
# Split dataset
# 20% of the (English sentence, German sequence) pairs are held out for
# validation. No random_state is passed, so the split differs between runs.
en_train, en_val, de_train, de_val = train_test_split(en_sentences, de_sentences_sequences, test_size=0.2)
# Create train and validation Datasets
# Each Dataset element is one (English string, German token-id row) pair.
train_dataset = tf.data.Dataset.from_tensor_slices((en_train, de_train))
val_dataset = tf.data.Dataset.from_tensor_slices((en_val, de_val))

In [None]:
def preprocess_dataset(dataset, max_len=13, batch_size=16):
    """Turn (English string, German ids) pairs into padded, batched tensors.

    Pipeline, applied in this order:
      1. split each English sentence on single spaces;
      2. embed the resulting tokens with the module-level `embedding_layer`;
      3. keep only examples with strictly fewer than `max_len` English tokens;
      4. pad the embedded sentence with zero rows at the front, up to
         `max_len` rows;
      5. batch into groups of `batch_size`, dropping the last partial batch.

    Args:
        dataset: a `tf.data.Dataset` whose elements are `(en, de)` pairs, with
            `en` a scalar string. `de` is passed through untouched.
        max_len: number of rows every embedded English sentence is padded to;
            sentences with `max_len` or more tokens are filtered out.
        batch_size: number of examples per batch.

    Returns:
        The transformed `tf.data.Dataset`.
    """
    def map_whitespace_split(en, de):
        return tf.strings.split(en, sep=" "), de

    def map_embed_sentences(en, de):
        return embedding_layer(en), de

    def filter_cut(en, de):
        # tf.shape gives the runtime token count; the static shape is unknown
        # inside the Dataset pipeline.
        return tf.math.less(tf.shape(en)[0], max_len)

    def map_pad_sentences(en, de):
        # Pad only before the first row of axis 0; the embedding axis is
        # left unpadded.
        padding_needed = [[max_len - tf.shape(en)[0], 0], [0, 0]]
        return tf.pad(en, padding_needed), de

    dataset = dataset.map(map_whitespace_split)
    dataset = dataset.map(map_embed_sentences)
    dataset = dataset.filter(filter_cut)
    dataset = dataset.map(map_pad_sentences)
    dataset = dataset.batch(batch_size, drop_remainder=True)

    return dataset


# Apply the same preprocessing pipeline to both splits, replacing the raw
# Datasets with their embedded, padded and batched versions.
train_dataset = preprocess_dataset(train_dataset)
val_dataset = preprocess_dataset(val_dataset)
# Print the element spec for each Dataset
print(train_dataset.element_spec)
print(val_dataset.element_spec)

(TensorSpec(shape=(16, None, 128), dtype=tf.float32, name=None), TensorSpec(shape=(16, 14), dtype=tf.int32, name=None))
(TensorSpec(shape=(16, None, 128), dtype=tf.float32, name=None), TensorSpec(shape=(16, 14), dtype=tf.int32, name=None))


In [None]:
# Pull a single batch from the training Dataset and report the tensor shapes.
# `en` and `de` are reused by later cells as an example batch.
en, de = next(iter(train_dataset.take(1)))
print(f"Shape of the English data example from the training Dataset: {en.shape}")
# Both tensors come from the training Dataset and only the shape is printed,
# so the label says exactly that.
print(f"Shape of the German data example from the training Dataset: {de.shape}")

Shape of the English data example from the training Dataset: (16, 13, 128)
German data example Tensor from the validation Dataset: (16, 14)


## 3. Create the custom layer



In [None]:
class CustomLayer(tf.keras.layers.Layer):
    """Append one trainable 128-element token vector to every sequence.

    The token is stored in `token_embedding`, initialised from a uniform
    random draw. `call` repeats it once per batch element and concatenates it
    after the input along axis 1.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        initial_token = tf.random.uniform(shape=(128,))
        self.token_embedding = tf.Variable(initial_value=initial_token, trainable=True)

    def call(self, input):
        batch_size = tf.shape(input)[0]
        token_dim = tf.shape(self.token_embedding)[0]
        # Reshape the token to (1, 1, token_dim) and repeat it along the
        # batch axis so that it lines up with `input` for concatenation.
        single_token = tf.reshape(self.token_embedding, shape=(1, 1, token_dim))
        batch_tokens = tf.tile(single_token, [batch_size, 1, 1])
        return tf.keras.layers.concatenate([input, batch_tokens], axis=1)

In [None]:

# Compare the example batch shape before and after passing it through a
# freshly constructed CustomLayer.
print(f"Batch shape before the end token embedding: {en.shape}")
print(f"Batch shape after the end token embedding: {CustomLayer()(en).shape}")

Batch shape before the end token embedding: (16, 13, 128)
Batch shape after the end token embedding: (16, 14, 128)


## 4. Build the encoder network


In [None]:
# Encoder model
def EncoderModel(input_shape):
    """Build the encoder as a functional Keras model.

    The input passes through `CustomLayer`, then a `Masking` layer with
    mask_value=0, then a 512-unit LSTM created with return_state=True.

    Args:
        input_shape: per-example input shape (without the batch axis),
            forwarded to `tf.keras.layers.Input`.

    Returns:
        A `tf.keras.Model` mapping the input to
        `[hidden_state, cell_state]` of the LSTM; the LSTM's regular output
        is not exposed.
    """
    inputs = tf.keras.layers.Input(shape=input_shape)
    with_end_token = CustomLayer()(inputs)
    masked = tf.keras.layers.Masking(mask_value=0)(with_end_token)
    recurrent = tf.keras.layers.LSTM(512, return_state=True)
    _, hidden_state, cell_state = recurrent(masked)
    return tf.keras.Model(inputs, [hidden_state, cell_state])

encoder_model = EncoderModel((13,128))

In [None]:
# Run the encoder once on the example batch and report both returned states,
# rather than doing a separate forward pass for each print.
example_hidden_state, example_cell_state = encoder_model(en)
print(f"Encoder model hidden state example shape: {example_hidden_state.shape}")
print(f"Encoder model cell state example shape: {example_cell_state.shape}\n")

Encoder model hidden state example shape: (16, 512)
Encoder model cell state example shape: (16, 512)



In [None]:
# Print the encoder model summary
# (layer names, output shapes and parameter counts)
encoder_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 13, 128)]         0         
_________________________________________________________________
custom_layer_1 (CustomLayer) (None, 14, 128)           128       
_________________________________________________________________
masking (Masking)            (None, 14, 128)           0         
_________________________________________________________________
lstm (LSTM)                  [(None, 512), (None, 512) 1312768   
Total params: 1,312,896
Trainable params: 1,312,896
Non-trainable params: 0
_________________________________________________________________


## 5. Build the decoder network


In [None]:
# Decoder model
class DecoderModel(tf.keras.Model):
    """Decoder: token embedding -> LSTM -> per-step vocabulary logits.

    The vocabulary size is read from the module-level `tokenizer` as
    `len(tokenizer.word_index) + 1`. The embedding is created with
    mask_zero=True, so id 0 is treated as padding.
    """

    def __init__(self, **kwargs):
        super(DecoderModel, self).__init__(**kwargs)
        # Computed once and shared by the embedding (input ids) and the dense
        # layer (output logits) so the two cannot drift apart.
        vocab_size = len(tokenizer.word_index) + 1
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True)
        self.lstm = tf.keras.layers.LSTM(512, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, input, hidden_state=None, cell_state=None):
        """Run the decoder over a batch of token-id sequences.

        Args:
            input: integer token ids fed to the embedding layer.
            hidden_state: initial LSTM hidden state (e.g. from the encoder),
                or None.
            cell_state: initial LSTM cell state (e.g. from the encoder),
                or None.

        Returns:
            `(logits, hidden, cell)`: the dense-layer output for every time
            step and the LSTM's final hidden and cell states.

        Raises:
            ValueError: if exactly one of `hidden_state` / `cell_state` is
                given.
        """
        if (hidden_state is None) != (cell_state is None):
            raise ValueError("hidden_state and cell_state must be given together or both omitted")
        # Forward an initial state only when one was supplied; a list of
        # Nones is not a valid initial state for the LSTM layer.
        initial_state = None if hidden_state is None else [hidden_state, cell_state]

        x = self.embedding(input)
        x, hidden, cell = self.lstm(x, initial_state=initial_state)
        x = self.dense(x)
        return x, hidden, cell

decoder_model = DecoderModel()

In [None]:
# Feed the example batch through the encoder, then use the encoder states as
# the decoder's initial state for the German example batch.
encoder_hidden_state, encoder_cell_state = encoder_model(en)
decoder_output, decoder_hidden_state, decoder_cell_state = decoder_model(de, encoder_hidden_state, encoder_cell_state)

# Report the shapes of the three decoder outputs.
print("Output shape: {}".format(decoder_output.shape))
print("Hidden state shape: {}".format(decoder_hidden_state.shape))
print("Cell state shape: {}\n".format(decoder_cell_state.shape))

Output shape: (16, 14, 5744)
Hidden state shape: (16, 512)
Cell state shape: (16, 512)



In [None]:
# Print the decoder model summary (layers and parameter counts).
decoder_model.summary()

Model: "decoder_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  735232    
_________________________________________________________________
lstm_1 (LSTM)                multiple                  1312768   
_________________________________________________________________
dense (Dense)                multiple                  2946672   
Total params: 4,994,672
Trainable params: 4,994,672
Non-trainable params: 0
_________________________________________________________________


## 6. Make a custom training loop


In [None]:
# The decoder's dense layer has no activation, so the loss works on logits.
loss_func = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()

# Define a function that computes the forward and backward pass for the translation model
@tf.function
def loss_opt_fun(en_input, de_input, de_output, loss_f):
    """Compute the loss and its gradients for one batch.

    Args:
        en_input: batch passed to the module-level `encoder_model`.
        de_input: decoder input token ids.
        de_output: target token ids compared against the decoder output.
        loss_f: callable invoked as `loss_f(de_output, decoder_output)`.

    Returns:
        `(loss, grads)` where `grads` lines up with
        `encoder_model.trainable_variables + decoder_model.trainable_variables`.
        No optimizer step is performed here.
    """
    with tf.GradientTape() as g:
        encoder_hidden_state, encoder_cell_state = encoder_model(en_input)
        decoder_output, _, _ = decoder_model(de_input, encoder_hidden_state, encoder_cell_state)
        loss = tf.math.reduce_mean(loss_f(de_output, decoder_output))

    # Differentiate after leaving the tape context so that the backward pass
    # itself is not recorded. The variable list is read after the forward
    # pass, by which point both models have created their weights.
    trainable_variables = encoder_model.trainable_variables + decoder_model.trainable_variables
    grads = g.gradient(loss, trainable_variables)
    return loss, grads

In [None]:
epochs = 1
loss_train = []  # mean training loss, one entry per epoch
loss_val = []    # mean validation loss, one entry per epoch


@tf.function
def _validation_loss(en_input, de_input, de_output):
    """Forward pass only: loss of the current models on one batch."""
    encoder_hidden_state, encoder_cell_state = encoder_model(en_input)
    decoder_output, _, _ = decoder_model(de_input, encoder_hidden_state, encoder_cell_state)
    return tf.math.reduce_mean(loss_func(de_output, decoder_output))


for epoch in range(epochs):
    # --- training: one optimizer step per batch ---
    loss_batch = []
    for en, de in train_dataset:
        # Decoder input is the sequence without its last token and the target
        # is the sequence without its first token.
        decoder_inputs, decoder_outputs = de[:, :-1], de[:, 1:]
        loss, grads = loss_opt_fun(en, decoder_inputs, decoder_outputs, loss_func)
        optimizer.apply_gradients(zip(grads, encoder_model.trainable_variables + decoder_model.trainable_variables))
        loss_batch.append(loss.numpy())

    loss_train.append(np.mean(loss_batch))

    # --- validation: loss only, no gradients are computed or applied ---
    loss_batch = []
    for en, de in val_dataset:
        decoder_inputs, decoder_outputs = de[:, :-1], de[:, 1:]
        loss = _validation_loss(en, decoder_inputs, decoder_outputs)
        loss_batch.append(loss.numpy())

    loss_val.append(np.mean(loss_batch))
    print(f"Epoch {epoch};\tLoss train {loss_train[-1]};\tLoss val {loss_val[-1]}")

KeyboardInterrupt: 

In [None]:
# Loss vs Epoch plots for the training and validation sets
# Both curves are drawn on one figure; each list holds one value per epoch.
fig = plt.figure(figsize=(10,5))
plt.plot(loss_train)
plt.plot(loss_val)
plt.title("Training and Validation Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
# Legend entries follow the order of the two plot calls above.
plt.legend(['Training', 'Validation'], loc='best')
plt.show()

## 7. Use the model to translate


In [None]:
# Translate five randomly chosen English sentences with greedy decoding.

# Sample from the examples that were actually loaded, not a fixed count.
random_ind = np.random.choice(len(data_examples), 5)
examples = [data_examples[ind] for ind in random_ind]
english_sentences = [sentence.split('\t')[0] for sentence in examples]
processed_english = [preprocess_sentence(sentence) for sentence in english_sentences]

start = tokenizer.word_index['<start>']
end = tokenizer.word_index['<end>']

# Number of time steps the encoder input was built with.
max_english_tokens = encoder_model.input_shape[1]
# Upper bound on generated tokens, so decoding terminates even when the model
# never predicts <end>. Uses the padded length of the German sequences.
max_translation_len = de_sentences_sequences.shape[1]

examples_tokens = []
for p_english in processed_english:
    english = tf.strings.split(p_english, sep=" ")
    english = embedding_layer(english)
    # Truncate over-long sentences: the padding amount below must not go
    # negative and the encoder expects exactly max_english_tokens rows.
    english = english[:max_english_tokens]
    english = tf.pad(english, [[max_english_tokens - len(english), 0], [0, 0]], constant_values=0)
    english = tf.expand_dims(english, 0)  # add the batch axis
    hidden_state, cell_state = encoder_model(english)

    translated_tokens = []
    tf_token = tf.constant([[start]])
    for _ in range(max_translation_len):
        # One decoder step; the returned states seed the next step.
        output_1, hidden_state, cell_state = decoder_model(tf_token, hidden_state, cell_state)
        output_2 = tf.argmax(output_1, 2).numpy()[0, 0]
        if output_2 == end:
            break
        translated_tokens.append(output_2)
        tf_token = tf.constant([[output_2]])
    examples_tokens.append(translated_tokens)

In [None]:
# Map token ids back to words and join each translation into one string.
inv_german_index = {value: key for key, value in tokenizer.word_index.items()}

# Ids without an entry in the index are skipped instead of raising KeyError.
# The decoder's output layer has one more unit than `word_index` has words,
# so it can predict id 0, which has no word.
german_sentences = [
    " ".join(inv_german_index[token] for token in example_token if token in inv_german_index)
    for example_token in examples_tokens
]

In [None]:
# Render the results as a plain-text table. PrettyTable is not imported in
# this notebook's import cell, so the table is built with string formatting.
headers = ('English sentences', 'German Translations')
rows = list(zip(english_sentences, german_sentences))

# Column width = longest cell in that column, header included.
widths = [max(len(str(cell)) for cell in column) for column in zip(headers, *rows)]
separator = "+-" + "-+-".join("-" * width for width in widths) + "-+"


def _format_row(cells):
    """Left-align each cell to its column width, between | delimiters."""
    return "| " + " | ".join(str(cell).ljust(width) for cell, width in zip(cells, widths)) + " |"


table = "\n".join(
    [separator, _format_row(headers), separator]
    + [_format_row(row) for row in rows]
    + [separator]
)

print(table)