# Assignment 3: Build a seq2seq model for machine translation.


## 1. Data preparation

1. Download data (e.g., "deu-eng.zip") from http://www.manythings.org/anki/
2. Unzip the .ZIP file.
3. Put the .TXT file (e.g., "deu.txt") in the directory "./Data/".

In [1]:
%%time
# importing necessary libraries

import re
import string
from unicodedata import normalize
import numpy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from IPython.display import SVG
from tensorflow.keras.utils import model_to_dot, plot_model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model
from IPython.display import SVG
from tensorflow.keras.utils import model_to_dot, plot_model
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu
from tensorflow.keras.layers import Input, LSTM, Bidirectional, Dense, Concatenate, Attention
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction


CPU times: user 4.2 s, sys: 397 ms, total: 4.59 s
Wall time: 5.76 s


In [2]:
%%time
# defining a function to load the content of a text file into memory.

def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 7.63 µs


In [3]:
%%time

# Function to split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs


CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs


In [4]:
%%time
# function to clean the text data

def clean_data(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # Only normalize the English sentences
            if not any(ord(char) > 127 for char in line):
                line = normalize('NFD', line).encode('ascii', 'ignore').decode('UTF-8')
            # Tokenize on white space
            line = line.split()
            # Convert to lowercase
            line = [word.lower() for word in line]
            # Remove punctuation from each token
            line = [word.translate(table) for word in line]
            # Remove tokens with numbers in them for English sentences only
            if all(ord(char) <= 127 for word in line for char in word):
                line = [word for word in line if word.isalpha()]
            # Store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return numpy.array(cleaned)

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 8.34 µs


In [5]:
%%time
# file name
filename = 'spa.txt'

# number of training samples
n_train = 4000

CPU times: user 0 ns, sys: 3 µs, total: 3 µs
Wall time: 7.15 µs


In [6]:
%%time

# load dataset
doc = load_doc(filename)

# split into Language1-Language2 pairs
pairs = to_pairs(doc)

# clean sentences
clean_pairs = clean_data(pairs)[0:n_train, :]

CPU times: user 566 ms, sys: 14.1 ms, total: 580 ms
Wall time: 905 ms


In [7]:
%%time
# extracting and preprocessing text data

input_texts = clean_pairs[:, 0]
target_texts = ['\t' + text + '\n' for text in clean_pairs[:, 1]]

print('Length of input_texts:  ' + str(input_texts.shape))
print('Length of target_texts: ' + str(input_texts.shape))

Length of input_texts:  (4000,)
Length of target_texts: (4000,)
CPU times: user 5.25 ms, sys: 25 µs, total: 5.27 ms
Wall time: 11 ms


In [8]:
%%time

# calculating the maximum sequence length for both input and target sentences in my dataset

max_encoder_seq_length = max(len(line) for line in input_texts)
max_decoder_seq_length = max(len(line) for line in target_texts)

print('max length of input  sentences: %d' % (max_encoder_seq_length))
print('max length of target sentences: %d' % (max_decoder_seq_length))

max length of input  sentences: 12
max length of target sentences: 39
CPU times: user 4.16 ms, sys: 17 µs, total: 4.18 ms
Wall time: 4.2 ms


## 2. Text processing

### 2.1. Convert texts to sequences

- Input: A list of $n$ sentences (with max length $t$).
- It is represented by a $n\times t$ matrix after the tokenization and zero-padding.

In [9]:
%%time

# defining the encode and pad sequences function:
def text2sequences(max_len, lines):
    tokenizer = Tokenizer(char_level=True, filters='')
    tokenizer.fit_on_texts(lines)
    seqs = tokenizer.texts_to_sequences(lines)
    seqs_pad = pad_sequences(seqs, maxlen=max_len, padding='post')
    return seqs_pad, tokenizer.word_index


encoder_input_seq, input_token_index = text2sequences(max_encoder_seq_length,
                                                      input_texts)
decoder_input_seq, target_token_index = text2sequences(max_decoder_seq_length,
                                                       target_texts)

print('shape of encoder_input_seq: ' + str(encoder_input_seq.shape))
print('shape of input_token_index: ' + str(len(input_token_index)))
print('shape of decoder_input_seq: ' + str(decoder_input_seq.shape))
print('shape of target_token_index: ' + str(len(target_token_index)))

shape of encoder_input_seq: (4000, 12)
shape of input_token_index: 27
shape of decoder_input_seq: (4000, 39)
shape of target_token_index: 41
CPU times: user 129 ms, sys: 946 µs, total: 130 ms
Wall time: 225 ms


In [10]:
%%time
# determining the number of tokens in both the encoder and decoder

num_encoder_tokens = len(input_token_index) + 1
num_decoder_tokens = len(target_token_index) + 1

print('num_encoder_tokens: ' + str(num_encoder_tokens))
print('num_decoder_tokens: ' + str(num_decoder_tokens))

num_encoder_tokens: 28
num_decoder_tokens: 42
CPU times: user 1.25 ms, sys: 24 µs, total: 1.27 ms
Wall time: 1.28 ms


**Remark:** To this end, the input language and target language texts are converted to 2 matrices.

- Their number of rows are both n_train.
- Their number of columns are respective max_encoder_seq_length and max_decoder_seq_length.

In [11]:
%%time
# The followings print a sentence and its representation as a sequence.

target_texts[100]

decoder_input_seq[100, :]

CPU times: user 34 µs, sys: 4 µs, total: 38 µs
Wall time: 41.2 µs


array([ 5,  7, 18,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0], dtype=int32)

## 2.2. One-hot encode

- Input: A list of $n$ sentences (with max length $t$).
- It is represented by a $n\times t$ matrix after the tokenization and zero-padding.
- It is represented by a $n\times t \times v$ tensor ($t$ is the number of unique chars) after the one-hot encoding.

In [12]:
%%time
# one hot encode target sequence
def onehot_encode(sequences, max_len, vocab_size):
    n = len(sequences)
    data = numpy.zeros((n, max_len, vocab_size))
    for i in range(n):
        data[i, :, :] = to_categorical(sequences[i], num_classes=vocab_size)
    return data

encoder_input_data = onehot_encode(encoder_input_seq, max_encoder_seq_length, num_encoder_tokens)
decoder_input_data = onehot_encode(decoder_input_seq, max_decoder_seq_length, num_decoder_tokens)

decoder_target_seq = numpy.zeros(decoder_input_seq.shape)
decoder_target_seq[:, 0:-1] = decoder_input_seq[:, 1:]
decoder_target_data = onehot_encode(decoder_target_seq,
                                    max_decoder_seq_length,
                                    num_decoder_tokens)

print(encoder_input_data.shape)
print(decoder_input_data.shape)

(4000, 12, 28)
(4000, 39, 42)
CPU times: user 175 ms, sys: 58.3 ms, total: 233 ms
Wall time: 415 ms


## 3. Build the networks (for training)

- Build encoder, decoder, and connect the two modules to get "model".

- Fit the model on the bilingual data to train the parameters in the encoder and decoder.

### 3.1. Encoder network

- Input:  one-hot encode of the input language

- Return:

    -- output (all the hidden states   $h_1, \cdots , h_t$) are always discarded
    
    -- the final hidden state  $h_t$
    
    -- the final conveyor belt $c_t$

In [13]:
%%time

latent_dim = 256

# inputs of the encoder network
encoder_inputs = Input(shape=(None, num_encoder_tokens),
                       name='encoder_inputs')

# set the LSTM layer


encoder_bilstm = Bidirectional(LSTM(latent_dim, return_state=True,
                                  dropout=0.5, name='encoder_bilstm'))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_bilstm(encoder_inputs)

state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
# build the encoder network model
encoder_model = Model(inputs=encoder_inputs,
                      outputs=[state_h, state_c],
                      name='encoder')

CPU times: user 1.44 s, sys: 433 ms, total: 1.87 s
Wall time: 2.04 s


In [14]:
%%time
# Print a summary and save the encoder network structure to "./encoder.pdf"

SVG(model_to_dot(encoder_model, show_shapes=False).create(prog='dot', format='svg'))

plot_model(
    model=encoder_model, show_shapes=False,
    to_file='encoder.pdf'
)

encoder_model.summary()

Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_inputs (InputLayer  [(None, None, 28)]           0         []                            
 )                                                                                                
                                                                                                  
 bidirectional (Bidirection  [(None, 512),                583680    ['encoder_inputs[0][0]']      
 al)                          (None, 256),                                                        
                              (None, 256),                                                        
                              (None, 256),                                                        
                              (None, 256)]                                                  

### 3.2. Decoder network

- Inputs:  

    -- one-hot encode of the target language
    
    -- The initial hidden state $h_t$
    
    -- The initial conveyor belt $c_t$

- Return:

    -- output (all the hidden states) $h_1, \cdots , h_t$

    -- the final hidden state  $h_t$ (discarded in the training and used in the prediction)
    
    -- the final conveyor belt $c_t$ (discarded in the training and used in the prediction)

In [15]:
%%time
latent_dim = 2*latent_dim
# inputs of the decoder network
decoder_input_h = Input(shape=(latent_dim,), name='decoder_input_h')
decoder_input_c = Input(shape=(latent_dim,), name='decoder_input_c')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# set the LSTM layer
decoder_lstm = LSTM(latent_dim, return_sequences=True,
                    return_state=True, dropout=0.5, name='decoder_lstm')
decoder_lstm_outputs, state_h, state_c = decoder_lstm(decoder_input_x,
                                                      initial_state=[decoder_input_h, decoder_input_c])

# set the dense layer
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_lstm_outputs)

# build the decoder network model
decoder_model = Model(inputs=[decoder_input_x, decoder_input_h, decoder_input_c],
                      outputs=[decoder_outputs, state_h, state_c],
                      name='decoder')

CPU times: user 762 ms, sys: 23.9 ms, total: 786 ms
Wall time: 842 ms


In [16]:
%%time
# Print a summary and save the decoder network structure to "./decoder.pdf"

SVG(model_to_dot(decoder_model, show_shapes=False).create(prog='dot', format='svg'))

plot_model(
    model=decoder_model, show_shapes=False,
    to_file='decoder.pdf'
)

decoder_model.summary()

Model: "decoder"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 decoder_input_x (InputLaye  [(None, None, 42)]           0         []                            
 r)                                                                                               
                                                                                                  
 decoder_input_h (InputLaye  [(None, 512)]                0         []                            
 r)                                                                                               
                                                                                                  
 decoder_input_c (InputLaye  [(None, 512)]                0         []                            
 r)                                                                                         

### 3.3. Connect the encoder and decoder

In [17]:
%%time
# input layers
encoder_input_x = Input(shape=(None, num_encoder_tokens), name='encoder_input_x')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# connect encoder to decoder
encoder_final_states = encoder_model([encoder_input_x])
decoder_lstm_output, _, _ = decoder_lstm(decoder_input_x, initial_state=encoder_final_states)
decoder_pred = decoder_dense(decoder_lstm_output)

model = Model(inputs=[encoder_input_x, decoder_input_x],
              outputs=decoder_pred,
              name='model_training')

print(state_h)
print(decoder_input_h)

KerasTensor(type_spec=TensorSpec(shape=(None, 512), dtype=tf.float32, name=None), name='decoder_lstm/PartitionedCall:2', description="created by layer 'decoder_lstm'")
KerasTensor(type_spec=TensorSpec(shape=(None, 512), dtype=tf.float32, name='decoder_input_h'), name='decoder_input_h', description="created by layer 'decoder_input_h'")
CPU times: user 781 ms, sys: 19.2 ms, total: 800 ms
Wall time: 825 ms


In [18]:
%%time
# displaying the summary of the model

SVG(model_to_dot(model, show_shapes=False).create(prog='dot', format='svg'))

plot_model(
    model=model, show_shapes=False,
    to_file='model_training.pdf'
)

model.summary()

Model: "model_training"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_input_x (InputLaye  [(None, None, 28)]           0         []                            
 r)                                                                                               
                                                                                                  
 decoder_input_x (InputLaye  [(None, None, 42)]           0         []                            
 r)                                                                                               
                                                                                                  
 encoder (Functional)        [(None, 512),                583680    ['encoder_input_x[0][0]']     
                              (None, 512)]                                           

### 3.5. Fit the model on the bilingual dataset

- encoder_input_data: one-hot encode of the input language

- decoder_input_data: one-hot encode of the input language

- decoder_target_data: labels (left shift of decoder_input_data)

- tune the hyper-parameters

- stop when the validation loss stop decreasing.

In [19]:
%%time
# printing the shape:
print('shape of encoder_input_data' + str(encoder_input_data.shape))
print('shape of decoder_input_data' + str(decoder_input_data.shape))
print('shape of decoder_target_data' + str(decoder_target_data.shape))

shape of encoder_input_data(4000, 12, 28)
shape of decoder_input_data(4000, 39, 42)
shape of decoder_target_data(4000, 39, 42)
CPU times: user 153 µs, sys: 18 µs, total: 171 µs
Wall time: 400 µs


In [20]:
%%time
# compiling the model

model.compile(optimizer='adam', loss='categorical_crossentropy')

model.fit([encoder_input_data, decoder_input_data],  # training data
          decoder_target_data,                       # labels (left shift of the target sequences)
          batch_size=64, epochs=50, validation_split=0.2)

model.save('seq2seq.h5')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
CPU times: user 48 s, sys: 2.47 s, total: 50.4 s
Wall time: 1min 25s


  saving_api.save_model(


## 4. Make predictions


### 4.1. Translate English to XXX

1. Encoder read a sentence (source language) and output its final states, $h_t$ and $c_t$.
2. Take the [star] sign "\t" and the final state $h_t$ and $c_t$ as input and run the decoder.
3. Get the new states and predicted probability distribution.
4. sample a char from the predicted probability distribution
5. take the sampled char and the new states as input and repeat the process (stop if reach the [stop] sign "\n").

In [21]:
%%time
# Reverse-lookup token index to decode sequences back to something readable.
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

CPU times: user 31 µs, sys: 2 µs, total: 33 µs
Wall time: 37 µs


In [22]:
%%time
# defining decoder layer
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = numpy.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # this line of code is greedy selection
        # try to use multinomial sampling instead (with temperature)
        sampled_token_index = numpy.argmax(output_tokens[0, -1, :])


        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        target_seq = numpy.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        states_value = [h, c]

    return decoded_sentence


CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 7.87 µs


In [23]:
%%time
# Take one sequence (part of the training set) for trying out decoding.
for seq_index in range(50, 70):
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('English:       ', input_texts[seq_index])
    print('Your target language (true): ', target_texts[seq_index][0:-1])
    print('Your target language (pred): ', decoded_sentence[0:-1])


-
English:        i try
Your target language (true):  	lo intento
Your target language (pred):  conrí
-
English:        i won
Your target language (true):  	¡he ganado
Your target language (pred):  ¡hora ara
-
English:        i won
Your target language (true):  	¡he ganado yo
Your target language (pred):  ¡hora ara
-
English:        oh no
Your target language (true):  	¡oh no
Your target language (pred):  ¡es gandio
-
English:        relax
Your target language (true):  	tomátelo con soda
Your target language (pred):  ¡alarel
-
English:        relax
Your target language (true):  	tranquila
Your target language (pred):  ¡alarel
-
English:        shoot
Your target language (true):  	¡fuego
Your target language (pred):  ¡dispara
-
English:        shoot
Your target language (true):  	¡disparad
Your target language (pred):  ¡dispara
-
English:        shoot
Your target language (true):  	¡disparen
Your target language (pred):  ¡dispara
-
English:        shoot
Your target language (true):  	di

### 4.2. Translate an English sentence to the target language (Spanish)

1. Tokenization
2. One-hot encode
3. Translate

In [24]:
%%time
# translating a source sentence from one language to another using a sequence-to-sequence model

input_sentence = 'I love you'

input_sequence = text2sequences(max_encoder_seq_length, [input_sentence])[0]

input_x = onehot_encode(input_sequence, max_encoder_seq_length, num_encoder_tokens)

translated_sentence = decode_sequence(input_x)

print('source sentence is: ' + input_sentence)
print('translated sentence is: ' + translated_sentence)

source sentence is: I love you
translated sentence is: me perdo ayuda

CPU times: user 1.11 s, sys: 37 ms, total: 1.15 s
Wall time: 1.21 s


## 5. Evaluate the translation using BLEU score

Reference:
- https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
- https://en.wikipedia.org/wiki/BLEU


**Hint:**

- Randomly partition the dataset to training, validation, and test.

- Evaluate the BLEU score using the test set. Report the average.

- A reasonable BLEU score should be 0.1 ~ 0.5. Over-high or over-low means something is wrong.

In [25]:
%%time
# Splitting the data
train_pairs, temp_pairs = train_test_split(clean_pairs, test_size=0.2, random_state=42)
valid_pairs, test_pairs = train_test_split(temp_pairs, test_size=0.5, random_state=42)


CPU times: user 5 ms, sys: 949 µs, total: 5.95 ms
Wall time: 10.4 ms


In [26]:
%%time
# defining a function called translate that takes a list of sentences and translates them using a sequence-to-sequence model.
def translate(sentences):
    translated_sentences = []
    for sentence in sentences:
        input_sequence = text2sequences(max_encoder_seq_length, [sentence])[0]
        input_x = onehot_encode(input_sequence, max_encoder_seq_length, num_encoder_tokens)
        translated_sentence = decode_sequence(input_x)
        translated_sentences.append(translated_sentence.strip())
    return translated_sentences

test_input_texts = test_pairs[:50, 0]
translated_texts = translate(test_input_texts)


CPU times: user 44.7 s, sys: 1.53 s, total: 46.2 s
Wall time: 50.4 s


In [27]:
%%time
# Split the reference sentences and translations into words
references = [ref.split() for ref in test_pairs[:50, 1]]
translations = [trans.split() for trans in translated_texts]


bleu_score_2gram = corpus_bleu([[ref] for ref in references], translations, weights=(0.25, 0, 0, 0))

print(f'BLEU-2 Score: {bleu_score_2gram:.4f}')


BLEU-2 Score: 0.3806
CPU times: user 6.85 ms, sys: 26 µs, total: 6.88 ms
Wall time: 6.99 ms


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [28]:
%%time
# split the reference and translation sentences into words
references = [ref.split() for ref in test_pairs[:50, 1]]
translations = [trans.split() for trans in translated_texts]

# Define a SmoothingFunction
smoothie = SmoothingFunction().method4

# Calculate the BLEU score with smoothing
bleu_score_2gram = corpus_bleu([[ref] for ref in references], translations, weights=(0.25, 0, 0, 0), smoothing_function=smoothie)

print(f'BLEU-2 Score with Smoothing: {bleu_score_2gram:.4f}')


BLEU-2 Score with Smoothing: 0.3806
CPU times: user 2.27 ms, sys: 12 µs, total: 2.28 ms
Wall time: 2.31 ms
