## Import Libraries

In [55]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

## Downloading the dataset (Spanish to English)

In [58]:
# Download the file
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"

## Convert Unicode to ASCII

In [59]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

## Preprocess sentence

This will remove unwanted characters such as !, ?, > and other special characters which does not help in performance of the model.\
Also, it will remove some non alpha characters and replace them with space

In [60]:
def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())

    # filling space between the word and the punctuation
    # eg: "How are you?" => "How are you ."
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

    # Remove spaces from start and the end
    w = w.strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [61]:
en_sentence = u"i am learning."
sp_sentence = u"Estoy aprendiendo."
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence).encode('utf-8'))

<start> i am learning . <end>
b'<start> estoy aprendiendo . <end>'


## Creating dataset

Implementing the above function for whole dataset and then converting it into the pairs.\
[[english, spanish],
 [english, spanish],
 [......., .......]]


In [62]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, SPANISH]
def create_dataset(path, num_examples):
    
    #read the file on specified path and then split on based of new line
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

    # make word pairs.
    # For every training example, preprocess every word and put them in a lost.
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]

    return zip(*word_pairs)

In [63]:
# Calling the above function
en, sp = create_dataset(path_to_file, None)
print(en[-1])
print(sp[-1])

<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


## Tokenization and Padding

Tokenization - convert the sequence into list of words\
Padding - Pad zeroes at the end of every sequence to make its length equal to maximum length

In [64]:
def tokenize(lang):
    
    # create tokenizer object. we dont need to specify filters as we have already preprocessed our sequences.
    # filters will remove the charctes that we provide as an arguement
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='')
    
    # fit tokenizer on the texts
    lang_tokenizer.fit_on_texts(lang)
    
    # transform each text into series of integers
    tensor = lang_tokenizer.texts_to_sequences(lang)

    # padd sequences to the maximum length. padding="post" means that zeroes will be padded to the end of sequence
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')

    return tensor, lang_tokenizer

## Load dataset

This will implement above twwo functions for whole dataset

In [65]:
def load_dataset(path, num_examples=None):
    
    # creating cleaned input, output pairs
    # inp_lang will be our imput, while targ lang will be our predictions
    inp_lang, targ_lang = create_dataset(path, num_examples)

    # tokenize input text
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    
    # tokenize target text
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [66]:
# We will limit to 30000 training examples for faster trasining
num_examples = 30000

#calling load_dataset function
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

## Train Test Split

In [67]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)


In [68]:
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

24000 24000 6000 6000


In [71]:
"""def convert(lang, tensor):
    for t in tensor:
        if t!=0:
            print ("%d ----> %s" % (t, lang.index_word[t]))

print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])


"""

'def convert(lang, tensor):\n    for t in tensor:\n        if t!=0:\n            print ("%d ----> %s" % (t, lang.index_word[t]))\n\nprint ("Input Language; index to word mapping")\nconvert(inp_lang, input_tensor_train[0])\nprint ()\nprint ("Target Language; index to word mapping")\nconvert(targ_lang, target_tensor_train[0])\n\n\n'

## Hyperparameters

Buffer_size helps creating the buffer at the backend for specific number of elements. This will reserve a space for improving performance\
batch size is number of training examples in each batch. \
steps_per_epoch is number of iterations in obe epoch. For dataset of size 100 and batch size as 10, the steps_per_epoch will be equal to 10\
embedding_dim is size of vector by which each word will be represented\
vocab_inp_size and vocab_tar_size is the size of vocabulary of input and target dataset\
\
tf.data.Dataset.from_tensor_slices converts every element into tensors. For example, tf.data.Dataset.from_tensor_slices([1, 2, 3]) will output-\
tf.Tensor(1, shape=(), dtype=int32), tf.Tensor(2, shape=(), dtype=int32), tf.Tensor(3, shape=(), dtype=int32)\

dataset.BATCH group the above tensors created into the batches of batch_size

In [72]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

The shape od each batch creates is equal to (batch_size, max_length)

In [73]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 11]), TensorShape([64, 16]))

## Encoder

Encoder will receive the inputs from input language (English in this case).\
The encoder output will be of the shape (batch_size, max_length, hidden_size)\
The encoder hidden state of shape (batch_size, hidden_size)

In [74]:
class Encoder(tf.keras.Model):
    
    # initialize class variables
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        
        # assign bacth size
        self.batch_sz = batch_sz
        
        # assign number of units in hidden state (enc_units)
        self.enc_units = enc_units
        
        # Define embedding layer.
        # Embeddings class converts the integers into the Dense vector of the size (vocab_size, embedding_dimension)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        
        # A simple GRU layer
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
        
    

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    # initializing hidden states
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [75]:
# initializing encoder class
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 11, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


## Bahdanau Attention

In this excercise, we will use Bahdanau Attention mechanism.\
Attenstion layer forms a context vector where each nunber represents the amount of attention that needs to be payed on the word at that index.
1. Firstly the score is calulated using fully cinnected dense layer and then passing their sum through tanh activation score = FC(tanh(FC(EO) + FC(H))), where FC is fully connected layer, while EO and H is encoder output and hidden states.
2. Pass the score through softmax function to get attention weights.
3. Define context vector by multiplying attention weights to encoder outout.
4. Concatenate context vector with the embedding matrix.

In [76]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [77]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 11, 1)


## Decoder

Decoder will receive the inputs from the attention layer and Embedding layer\
Its output will be of the form (batch_size, vocab_size)\
It will tell the probability of every word that could come next for each batch.\
On each call, decoder will output only for one word. For complete sequence, we need to call decoder repetitively.

In [78]:
class Decoder(tf.keras.Model):
    
    # initialize class variables
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        
        # assign batch size
        self.batch_sz = batch_sz
        
        # number of decoder hidden units
        self.dec_units = dec_units
        
        # Embeddings class converts the integers into the Dense vector of the size (vocab_size, embedding_dimension)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        
        # GRU layer
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        
        # Dense layer
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)
        
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [81]:
# initializing decoder class
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 9414)


## Optimizer

We will use Adams optimizer and Sparse Categorical Cross Entropy. 

In [27]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

## Loss Function

In [28]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

## Defining checkpoints

The weights of encoder and decoder will be stored in seperate files

In [29]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## One step of training

In [83]:

def train_step(inp, targ, enc_hidden):
    loss = 0

    # tf.GradientTape will compute the derivations and record their gradients
    with tf.GradientTape() as tape:
        
        # call encoder class. This will provide output from encoder and hidden states
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        # hidden state passes from encoder to decoder
        # the hidden states to the decoder will be coming from encoder, hence assigning dec_hidden equal to enc_hidden
        dec_hidden = enc_hidden

        # initialize decoder input
        # for a start, it will be start tokens of the batch size
        # [[1], [1], [1], .... ]
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input.
        # This will speed up the training as well as boost performance.
        # Without teacher forcing, the next word that will be predicted will be based on the previous predicted word.
        # This could be totally dofferent prediction as length of the sequence increases.
        # Hence, the next word to be predicted can be based on true values during the training phase.
        
        # Below loos will predict till the max length of the target language
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            # calculating loss
            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    # compute batch loss
    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

## Training

In [84]:
# taking number of epochs = 2 due to time and resource constraints
EPOCHS = 2

# Fror every epoch
for epoch in range(EPOCHS):
    start = time.time()

    # get encoder hidden state
    enc_hidden = encoder.initialize_hidden_state()
    
    # initialize loss
    total_loss = 0

    # Loop through the range of steps_per_epoch and go on every batch
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        
        # compute batch loss 
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 3.2526
Epoch 1 Batch 100 Loss 1.6709
Epoch 1 Batch 200 Loss 1.4759
Epoch 1 Batch 300 Loss 1.5692
Epoch 1 Loss 1.6047
Time taken for 1 epoch 1963.6445171833038 sec

Epoch 2 Batch 0 Loss 1.2740
Epoch 2 Batch 100 Loss 1.3769
Epoch 2 Batch 200 Loss 1.3233
Epoch 2 Batch 300 Loss 1.2448
Epoch 2 Loss 1.3267
Time taken for 1 epoch 1952.1911334991455 sec



## Evaluation

In [85]:
def evaluate(sentence):
    
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    # preprocess the input sentence
    sentence = preprocess_sentence(sentence)

    # convert input words to unique integer
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
    
    # convert the above inputs to tensor
    inputs = tf.convert_to_tensor(inputs)

    # initialize results
    result = ''

    # define hidden units and get encoder output and hidden units
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    # set decoder hidden units equal to encoder hidden units
    dec_hidden = enc_hidden
    
    # initialize decoder input to start token.
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    # Loop till maximum length of the target language
    for t in range(max_length_targ):
        
        # get predictions of the shape (1, vocab_size). Because in this case batch_size is 1
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        # Get the maximum id from prediction matric. This will be the word with highest probability
        predicted_id = tf.argmax(predictions[0]).numpy()

        # Append it to the result
        result += targ_lang.index_word[predicted_id] + ' '

        # Append end of sentence token
        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [86]:
def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    #attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    #plot_attention(attention_plot, sentence.split(' '), result.split(' '))

In [87]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x23e8781de50>

In [88]:
translate(u'how are you?')

Input: <start> how are you ? <end>
Predicted translation: ¿ quien es un buen . <end> 


In [53]:
translate(u'esta es mi vida.')

Input: <start> esta es mi vida . <end>
Predicted translation: this is my life . <end> 


In [54]:
translate(u'¿todavia estan en casa?')

Input: <start> ¿ todavia estan en casa ? <end>
Predicted translation: are you home ? <end> 
