<img style="float: left;" src="./images/PallenceAI-Final.png" width="800">

# Machine Translation: Sequence to Sequence Models 

## Transformers: Encoder - Decoder

<img style="float: left;" src="./images/mt2.webp" width="500">

### Import needed libraries 

In [1]:
#Basic Python packages for data wrangling
import numpy as np
import pandas as pd
import random
import re
import string
import matplotlib.pyplot as plt

from pathlib import Path

#Tensorflow & Keras related packages
import tensorflow as tf
from tensorflow import keras
from keras import layers

from utils import plot_history

### Load the Dataset

In [2]:
# First download the dataset from 
#...http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip

In [3]:
path = Path("data/spa-eng/spa.txt")

In [4]:
with open(path, encoding="utf-8") as f:
    lines = f.read().split("\n")[:-1]

sentence_pairs = []
for line in lines:
    english, spanish = line.split("\t")
    sentence_pairs.append((english, spanish))

### Understand & Preprocess the Data

In [5]:
sentence_pairs[0:10]

[('Go.', 'Ve.'),
 ('Go.', 'Vete.'),
 ('Go.', 'Vaya.'),
 ('Go.', 'Váyase.'),
 ('Hi.', 'Hola.'),
 ('Run!', '¡Corre!'),
 ('Run.', 'Corred.'),
 ('Who?', '¿Quién?'),
 ('Fire!', '¡Fuego!'),
 ('Fire!', '¡Incendio!')]

In [6]:
random.shuffle(sentence_pairs)

**Convert the dataset into a tf.data.Dataset**

In [7]:
#Convert the dataset into a tf.data.Dataset
english = [pairs[0] for pairs in sentence_pairs]
spanish = [pairs[1] for pairs in sentence_pairs]
all_data = tf.data.Dataset.from_tensor_slices((english,spanish))

In [8]:
len(all_data) # Total 118964 sentence pairs 

118964

In [9]:
# Displaying a sample. First two sentence pairs
for en,sp in all_data.take(2):
    print("English:", en.numpy().decode('utf-8'))
    print("Spanish:", sp.numpy().decode('utf-8'))

English: Tom always keeps his word.
Spanish: Tom siempre mantiene su palabra.
English: Tom should do the same thing I do.
Spanish: Tom debería hacer lo mismo que yo.


**Add [start], [end] to target sentences (spanish)**

In [10]:
# We need to add [start], [end] tokens to target sentences 
#..so that the model will start predicting when [start] is given as first token, 
#..and when it predicts [end] or reaches max sequence length it will stop

def add_tokens(source_sentence, target_sentence):
    target_sentence = tf.strings.join(["[START] ", target_sentence, " [END]"])
    return source_sentence, target_sentence

# Apply the token addition to the datasets
all_data = all_data.map(add_tokens)

# Displaying a sample
for en,sp in all_data.take(3):
    print("English:", en.numpy().decode('utf-8'))
    print("Spanish:", sp.numpy().decode('utf-8'))

English: Tom always keeps his word.
Spanish: [START] Tom siempre mantiene su palabra. [END]
English: Tom should do the same thing I do.
Spanish: [START] Tom debería hacer lo mismo que yo. [END]
English: Can I speak with the teacher?
Spanish: [START] ¿Puedo hablar con el profesor? [END]


**Split the data into train, val, test**

In [11]:
# Get the total number of samples
total_samples = all_data.cardinality().numpy()

# Calculate the sizes of new splits
test_size = val_size = int(0.15 * total_samples)
train_size = total_samples - test_size - val_size

# Split the dataset
train_data = all_data.take(train_size)
val_data = all_data.skip(train_size).take(val_size)
test_data = all_data.skip(train_size + val_size).take(test_size)

# Print the sizes of the new splits
print("Train size:", train_size)
print("Validation size:", val_size)
print("Test size:", test_size)

Train size: 83276
Validation size: 17844
Test size: 17844


In [12]:
for en,sp in train_data.take(3):
    print("English:", en.numpy().decode('utf-8'))
    print("Spanish:", sp.numpy().decode('utf-8'))

English: Tom always keeps his word.
Spanish: [START] Tom siempre mantiene su palabra. [END]
English: Tom should do the same thing I do.
Spanish: [START] Tom debería hacer lo mismo que yo. [END]
English: Can I speak with the teacher?
Spanish: [START] ¿Puedo hablar con el profesor? [END]


**Vectorize the data**

In [13]:
# Going forward source will be english language, target will be spanish.

In [14]:
# Define constants 
VOCAB_SIZE = 15000 # Max tokens
MAX_SEQ_LEN = 20 # Max sequence length
EMBED_DIM=256 # Embedding dimension
HIDDEN_DIM = 1024 # Hidden dimension for dense layers
BATCH_SIZE=64 # Batch size
NUM_HEADS=8 # Number of heads for Multiheaded attention layer

In [15]:
# Define the vectorizers
source_vectorizer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_SEQ_LEN,
)

strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(
        lowercase, f"[{re.escape(strip_chars)}]", "")

target_vectorizer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    # we add an extra token since target will be offset by one token
    output_sequence_length=MAX_SEQ_LEN + 1, 
    standardize=custom_standardization,
)

In [16]:
# Adapt the vectorizers to the dataset. Meaning creating the vocabulary for respective languages.
source_vectorizer.adapt(train_data.map(lambda x,y: x)) # for english sentences
target_vectorizer.adapt(train_data.map(lambda x,y: y)) # for spanish sentences

#or..
# source_vectorizer.adapt(english) # for english sentences created earlier
# target_vectorizer.adapt(spanish) # for spanish sentences created earlier

#This will take a while

In [17]:
source_vocab =source_vectorizer.get_vocabulary()
source_vocab

['',
 '[UNK]',
 'the',
 'i',
 'to',
 'you',
 'tom',
 'a',
 'is',
 'he',
 'in',
 'of',
 'that',
 'it',
 'was',
 'do',
 'this',
 'have',
 'me',
 'my',
 'for',
 'she',
 'dont',
 'are',
 'what',
 'his',
 'mary',
 'we',
 'your',
 'on',
 'be',
 'with',
 'want',
 'not',
 'im',
 'and',
 'like',
 'at',
 'know',
 'him',
 'can',
 'go',
 'her',
 'has',
 'will',
 'its',
 'there',
 'they',
 'time',
 'were',
 'very',
 'as',
 'how',
 'did',
 'had',
 'all',
 'about',
 'here',
 'think',
 'up',
 'didnt',
 'get',
 'out',
 'when',
 'from',
 'if',
 'cant',
 'an',
 'no',
 'one',
 'going',
 'by',
 'why',
 'doesnt',
 'would',
 'come',
 'see',
 'good',
 'ill',
 'youre',
 'please',
 'who',
 'just',
 'been',
 'need',
 'more',
 'so',
 'help',
 'than',
 'tell',
 'but',
 'where',
 'never',
 'now',
 'am',
 'got',
 'us',
 'too',
 'some',
 'something',
 'last',
 'ive',
 'take',
 'much',
 'day',
 'could',
 'should',
 'money',
 'car',
 'people',
 'work',
 'well',
 'home',
 'back',
 'really',
 'went',
 'our',
 'said',
 'a

In [18]:
source_index = {i:word for i,word in enumerate(source_vocab)}
source_index

{0: '',
 1: '[UNK]',
 2: 'the',
 3: 'i',
 4: 'to',
 5: 'you',
 6: 'tom',
 7: 'a',
 8: 'is',
 9: 'he',
 10: 'in',
 11: 'of',
 12: 'that',
 13: 'it',
 14: 'was',
 15: 'do',
 16: 'this',
 17: 'have',
 18: 'me',
 19: 'my',
 20: 'for',
 21: 'she',
 22: 'dont',
 23: 'are',
 24: 'what',
 25: 'his',
 26: 'mary',
 27: 'we',
 28: 'your',
 29: 'on',
 30: 'be',
 31: 'with',
 32: 'want',
 33: 'not',
 34: 'im',
 35: 'and',
 36: 'like',
 37: 'at',
 38: 'know',
 39: 'him',
 40: 'can',
 41: 'go',
 42: 'her',
 43: 'has',
 44: 'will',
 45: 'its',
 46: 'there',
 47: 'they',
 48: 'time',
 49: 'were',
 50: 'very',
 51: 'as',
 52: 'how',
 53: 'did',
 54: 'had',
 55: 'all',
 56: 'about',
 57: 'here',
 58: 'think',
 59: 'up',
 60: 'didnt',
 61: 'get',
 62: 'out',
 63: 'when',
 64: 'from',
 65: 'if',
 66: 'cant',
 67: 'an',
 68: 'no',
 69: 'one',
 70: 'going',
 71: 'by',
 72: 'why',
 73: 'doesnt',
 74: 'would',
 75: 'come',
 76: 'see',
 77: 'good',
 78: 'ill',
 79: 'youre',
 80: 'please',
 81: 'who',
 82: 'just

In [19]:
target_vocab = target_vectorizer.get_vocabulary()
target_vocab

['',
 '[UNK]',
 '[start]',
 '[end]',
 'de',
 'que',
 'a',
 'no',
 'tom',
 'la',
 'el',
 'en',
 'es',
 'un',
 'me',
 'se',
 'por',
 'lo',
 'una',
 'los',
 'su',
 'Él',
 'está',
 'con',
 'mi',
 'le',
 'ella',
 'qué',
 'te',
 'para',
 'mary',
 'y',
 'las',
 'más',
 'al',
 'yo',
 'tu',
 'estoy',
 'muy',
 'eso',
 'tiene',
 'este',
 'esta',
 'del',
 'él',
 'estaba',
 'tengo',
 'quiero',
 'fue',
 'si',
 'aquí',
 'casa',
 'como',
 'hacer',
 'puedo',
 'algo',
 'todo',
 'esto',
 'hay',
 'tiempo',
 'ha',
 'gusta',
 'todos',
 'tan',
 'son',
 'nada',
 'cuando',
 'favor',
 'ir',
 'vez',
 'era',
 'puede',
 'bien',
 'mucho',
 'he',
 'ellos',
 'nos',
 'sé',
 'solo',
 'mañana',
 'nunca',
 'ser',
 'dos',
 'ya',
 'creo',
 'sus',
 'trabajo',
 'estás',
 'tienes',
 'dónde',
 'cómo',
 'dinero',
 'dijo',
 'ahora',
 'hablar',
 'tomás',
 'quién',
 'están',
 'pero',
 'soy',
 'día',
 'había',
 'hace',
 'ese',
 'siempre',
 'tú',
 'libro',
 'puedes',
 'poco',
 'hoy',
 'quiere',
 'esa',
 'verdad',
 'nadie',
 'años',


In [20]:
target_index = {i:word for i,word in enumerate(target_vocab)}
target_index

{0: '',
 1: '[UNK]',
 2: '[start]',
 3: '[end]',
 4: 'de',
 5: 'que',
 6: 'a',
 7: 'no',
 8: 'tom',
 9: 'la',
 10: 'el',
 11: 'en',
 12: 'es',
 13: 'un',
 14: 'me',
 15: 'se',
 16: 'por',
 17: 'lo',
 18: 'una',
 19: 'los',
 20: 'su',
 21: 'Él',
 22: 'está',
 23: 'con',
 24: 'mi',
 25: 'le',
 26: 'ella',
 27: 'qué',
 28: 'te',
 29: 'para',
 30: 'mary',
 31: 'y',
 32: 'las',
 33: 'más',
 34: 'al',
 35: 'yo',
 36: 'tu',
 37: 'estoy',
 38: 'muy',
 39: 'eso',
 40: 'tiene',
 41: 'este',
 42: 'esta',
 43: 'del',
 44: 'él',
 45: 'estaba',
 46: 'tengo',
 47: 'quiero',
 48: 'fue',
 49: 'si',
 50: 'aquí',
 51: 'casa',
 52: 'como',
 53: 'hacer',
 54: 'puedo',
 55: 'algo',
 56: 'todo',
 57: 'esto',
 58: 'hay',
 59: 'tiempo',
 60: 'ha',
 61: 'gusta',
 62: 'todos',
 63: 'tan',
 64: 'son',
 65: 'nada',
 66: 'cuando',
 67: 'favor',
 68: 'ir',
 69: 'vez',
 70: 'era',
 71: 'puede',
 72: 'bien',
 73: 'mucho',
 74: 'he',
 75: 'ellos',
 76: 'nos',
 77: 'sé',
 78: 'solo',
 79: 'mañana',
 80: 'nunca',
 81: 's

In [21]:
# Functions to vectorize the source and target sentences and to prepare the final dataset
def vectorize_dataset(source, target):
    source = source_vectorizer(source)
    target = target_vectorizer(target)
    return ({
        "encoder_inputs": source,
        "decoder_inputs": target[:, :-1],  # Exclude the last token for decoder input
    }, target[:, 1:])  # Shift by one for the target output

def make_dataset(data):
    dataset = data.batch(BATCH_SIZE) # batches the data
    dataset = dataset.map(vectorize_dataset, num_parallel_calls=4)
    
    return dataset.shuffle(2048).prefetch(16).cache()

In [22]:
# Prepare the final vectorized data which basically takes the sequences of words for each language and convert them into integers
train_int = make_dataset(train_data)
val_int = make_dataset(val_data)
test_int = make_dataset(test_data)

# Display a sample
for inputs, targets in train_int.take(1):
    print(f"encoder_inputs shape: {inputs['encoder_inputs'].shape}")
    print(f"decoder_inputs shape: {inputs['decoder_inputs'].shape}")
    print(f"targets shape: {targets.shape}")

# because of vectorization, the original dataset, 
# ..where each element was a sentence pair (english,spanish) is now 
# batches of data, with 64samples in each batch. 

# each sample is a tuple where first element is a dict of inputs, second element is targets
# sequence length for inputs and targets is 20. 

encoder_inputs shape: (64, 20)
decoder_inputs shape: (64, 20)
targets shape: (64, 20)


In [23]:
for inputs, targets in train_int.take(1):
    print ("Encoder Inputs:", inputs['encoder_inputs'][0])
    print ("Decoder Inputs:",inputs['decoder_inputs'][0])
    print ("Targets:",targets[0])

# As you see decoder_inputs is nothing but target sequence beginning with token [start] (index=2)
# targets is also the target sequence but offset by one token and 
#...begins with element next to [start] token

Encoder Inputs: tf.Tensor(
[  26 2622    4 3945   42  468 4488    0    0    0    0    0    0    0
    0    0    0    0    0    0], shape=(20,), dtype=int64)
Decoder Inputs: tf.Tensor(
[   2   30 2511    1   10  467    1    3    0    0    0    0    0    0
    0    0    0    0    0    0], shape=(20,), dtype=int64)
Targets: tf.Tensor(
[  30 2511    1   10  467    1    3    0    0    0    0    0    0    0
    0    0    0    0    0    0], shape=(20,), dtype=int64)


### Build a Transformer based encoder-decoder model

<img style="float: left;" src="./images/transformer_ed.jpg" width="800">

In [24]:
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [25]:
# Define constants or parameters
VOCAB_SIZE = 15000 # Max tokens
MAX_SEQ_LEN = 20 # Max sequence length

EMBED_DIM = 256 # Embedding dimension
HIDDEN_DIM = 1024 # Hidden dimension for dense layers
BATCH_SIZE = 64 # Batch size
NUM_HEADS = 8 # Number of heads for Multiheaded attention layer

#### Embeddings class

In [26]:
class CombinedEmbedding(layers.Layer):
    def __init__(self, MAX_SEQ_LEN, VOCAB_SIZE, EMBED_DIM, **kwargs):
        super().__init__(**kwargs)
        
        self.token_embeddings = layers.Embedding(input_dim = VOCAB_SIZE, output_dim=EMBED_DIM, 
                                                 mask_zero=True) # input embedding layer
        
        self.position_embeddings = layers.Embedding(input_dim = MAX_SEQ_LEN, 
                                                    output_dim = EMBED_DIM) # position embedding layer
        # both the above embeddings are initialized randomly first 
        #....and will be calculated as part of training process.
        
        self.sequence_length = MAX_SEQ_LEN
        self.max_tokens = VOCAB_SIZE
        self.embed_dim = EMBED_DIM

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        
        # Word or token embeddings
        embedded_tokens = self.token_embeddings(inputs)
        
        # Positional Embeddings
        embedded_positions = self.position_embeddings(positions)
        
        return embedded_tokens + embedded_positions # Return combined embeddings

    def compute_mask(self, inputs, mask=None):
        return self.token_embeddings.compute_mask(inputs, mask)
    # The compute_mask method in a custom layer ensures that the masking information 
    #...is correctly propagated through the layers.
    # without this the mask may not be propagated properly through the sebsequent layers.

    # whenever we use custom layers, mainly for saving and loading the model
    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "sequence_length": self.sequence_length,
            "max_tokens": self.max_tokens,
        })
        return config

#### Transformer Encoder class

<img style="float: left;" src="./images/transformer_ed.jpg" width="800">

In [27]:
class TransformerEncoder(layers.Layer):
    def __init__(self, EMBED_DIM, NUM_HEADS, HIDDEN_DIM,**kwargs):
        super().__init__(**kwargs)
        self.embed_dim = EMBED_DIM
        self.num_heads = NUM_HEADS
        self.ff_dim = HIDDEN_DIM
        
        # Define Multiheaded Attention Layer
        self.attention = layers.MultiHeadAttention(num_heads=NUM_HEADS, key_dim=EMBED_DIM)
        
        # Define Feed forward dense layers
        self.ffn = keras.Sequential([
            layers.Dense(HIDDEN_DIM, activation="relu"),
            layers.Dense(EMBED_DIM),])
        
        # Define Normalization Layers
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)

        
    # Actual computation in the call method below.
    def call(self, inputs, mask=None):
        if mask is not None:
            # reshape since the attention layer expects 3d or 4d: 
            # (batch_size, num_heads, seq_length, seq_length)
            mask = mask[:, tf.newaxis, :] 
        
        # Multiheaded Attention Layer
        attn_output = self.attention(inputs, inputs, inputs, attention_mask=mask)
        
        # Normalization Layer
        out1 = self.layernorm1(inputs + attn_output)
        
        # Dense feed forward Layer
        ffn_output = self.ffn(out1)
        
        # 2nd Normalization Layer
        out2 = self.layernorm2(out1 + ffn_output)
        
        return out2

    # this method is mainly for loading the saved model, when custom layers are used.
    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
        })
        return config

#### Transformer Decoder class

In [28]:
class TransformerDecoder(layers.Layer):
    def __init__(self, EMBED_DIM, NUM_HEADS, HIDDEN_DIM, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = EMBED_DIM
        self.num_heads = NUM_HEADS
        self.ff_dim = HIDDEN_DIM
        
        #Attention Layers
        self.attention1 = layers.MultiHeadAttention(num_heads=NUM_HEADS, key_dim=EMBED_DIM)
        self.attention2 = layers.MultiHeadAttention(num_heads=NUM_HEADS, key_dim=EMBED_DIM)
        
        #Feedforward Dense Layer
        self.ffn = keras.Sequential([
            layers.Dense(HIDDEN_DIM, activation="relu"),
            layers.Dense(EMBED_DIM),])
        
        #Normalization
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        self.supports_masking=True
        

    # Causal mask for the Decoder Inputs. because we dont want attention on future tokens
    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)

# 1 0 0 0 0
# 1 1 0 0 0
# 1 1 1 0 0
# 1 1 1 1 0
# 1 1 1 1 1

    # Main computation inside the call method
    def call(self, inputs, encoder_outputs, mask=None):
        
        # Causal mask for decoder inputs: self attention
        causal_mask = self.get_causal_attention_mask(inputs)
        
        if mask is not None:
            padding_mask = tf.cast(
                mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask) # combined both masks
        else:
            padding_mask = mask
            
        # Self Attention for Decoder inputs
        attention_output1 = self.attention1(query=inputs,
                                            value=inputs,
                                            key=inputs,
                                            attention_mask=causal_mask)
        out1 = self.layernorm1(inputs + attention_output1)
        
        # Cross Attention between self attended Decoder inputs & encoder outputs.
        attention_output2 = self.attention2(query=out1,
                                            value=encoder_outputs,
                                            key=encoder_outputs,
                                            attention_mask=padding_mask)
        out2 = self.layernorm2(out1 + attention_output2)
        
        # Feed forward dense layers
        ffn_output = self.ffn(out2)
        
        out3 = self.layernorm3(out2 + ffn_output)
        
        return out3

    #for loading the saved model, with custom layers.    
    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
        })
        return config

<img style="float: left;" src="./images/transformer_ed.jpg" width="800">

**Encoder**

In [30]:
# Build an encoder, which takes an input source sentence, and encodes it as vector with 
#.. EMBED_DIM units.
# basically it captured the essense of the sentence.
# Returns a context aware sequence of vectors, unlike RNN that returns a single vector at last time step

# Encoder Input 
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")

# Embedding class (for Both token + Positional Embeddings) for Embedding layer
x = CombinedEmbedding(MAX_SEQ_LEN, VOCAB_SIZE, EMBED_DIM)(encoder_inputs)

# Transformer Encoder Layer
encoder_outputs= TransformerEncoder(EMBED_DIM, NUM_HEADS, HIDDEN_DIM)(x)

**Decoder**

In [31]:
# Build the decoder now

# Inputs
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")

# Embedding layer (combined word embeddings + positional embeddings)
x = CombinedEmbedding(MAX_SEQ_LEN, VOCAB_SIZE, EMBED_DIM)(decoder_inputs)

# Transformer Decoder Block
x = TransformerDecoder(EMBED_DIM, NUM_HEADS, HIDDEN_DIM)(x,encoder_outputs)

# Dropout Layer
x = layers.Dropout(0.4)(x)

# Final dense layer mapping probability distribution over spanish vocabulary
decoder_outputs= layers.Dense(VOCAB_SIZE, activation="softmax")(x) 

In [32]:
model_transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model_transformer.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 combined_embedding_1 (Combined  (None, None, 256)   3845120     ['encoder_inputs[0][0]']         
 Embedding)                                                                                       
                                                                                                  
 combined_embedding_2 (Combined  (None, None, 256)   3845120     ['decoder_inputs[0][0]']     

### Compile & Train the Model

In [33]:
model_transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
path = Path("./models/model_transformer.keras")

In [34]:
# We will reuse this function to train and evaluate for convenience
def train_evaluate(model,path,train,val,test):
    
    #call backs
    checkpoint_cb = keras.callbacks.ModelCheckpoint(filepath = path,
                                                       save_best_only=True) # Save only best model
    
    earlystop_cb = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    callbacks = [checkpoint_cb, earlystop_cb]

    #Compile the model
    model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy",  metrics = ["accuracy"])
    
    #Train the model
    history = model.fit(train, validation_data = val, callbacks = callbacks, epochs=50)
    
    # Evaluate the model
    test_loss, test_accuracy = model.evaluate(test)
    
    return (history,test_accuracy)

In [35]:
(history_transformer,test_accuracy_transformer) = train_evaluate(model_transformer,path,
                                                                 train_int,
                                                                 val_int,
                                                                 test_int)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


In [36]:
print (f"Accuracy on the test data set is {test_accuracy_transformer}")

Accuracy on the test data set is 0.6651646494865417


In [40]:
# from tensorflow.keras.models import load_model

# # Define all custom layers
# custom_objects = {
#     "CombinedEmbedding": CombinedEmbedding,
#     "TransformerEncoder": TransformerEncoder,
#     "TransformerDecoder": TransformerDecoder
# }

# # Load the model with all custom layers specified
# loaded_model = load_model("model_transformer.keras", custom_objects=custom_objects)


<img style="float: left;" src="./images/transformer_ed.jpg" width="800">

### Inference: Translate Few sentences

In [41]:
def translate_sequence(input_sentence):
    vectorized_input_sentence = source_vectorizer([input_sentence])
    target_sentence = "[start]"
    for i in range(MAX_SEQ_LEN):
        vectorized_target_sentence = target_vectorizer([target_sentence])[:,:-1]
        #The [:,:-1] slicing is essential in Transformer-based architectures to ensure that the 
        # ...target sequence fed into the decoder does not include the [END] token or padding
        
        next_token_predictions = model_transformer.predict(
            [vectorized_input_sentence, vectorized_target_sentence])
        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
        sampled_token = target_index[sampled_token_index]
        target_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return target_sentence


source_sentences = [en for en,sp in test_data.take(20)]

for _ in range(20):
    input_sentence = random.choice(source_sentences)
    print("\n")
    print(input_sentence)
    print(translate_sequence(input_sentence))



tf.Tensor(b'I caught Tom cheating.', shape=(), dtype=string)
[start] yo [UNK] a tom [UNK] [end]


tf.Tensor(b'I caught Tom cheating.', shape=(), dtype=string)
[start] yo [UNK] a tom [UNK] [end]


tf.Tensor(b"Let's find something to sit on.", shape=(), dtype=string)
[start] [UNK] algo para [UNK] [end]


tf.Tensor(b'I came here when I was a kid.', shape=(), dtype=string)
[start] he venido aquí cuando era niño [end]


tf.Tensor(b'You must be very hungry now.', shape=(), dtype=string)
[start] tienes que tener mucha hambre ahora [end]


tf.Tensor(b'A pair of gloves is a nice gift.', shape=(), dtype=string)
[start] algunos días [UNK] es un regalo [end]


tf.Tensor(b'What did you do in there?', shape=(), dtype=string)
[start] qué hiciste con aquí [end]


tf.Tensor(b'I saw him naked.', shape=(), dtype=string)
[start] lo vi a las [UNK] [end]


tf.Tensor(b'What time do you walk the dog?', shape=(), dtype=string)
[start] a qué hora vas a caminar al perro [end]


tf.Tensor(b'Tom is very blunt.',

[start] ellos están problemas con su palabra de [UNK] [UNK] [end]


tf.Tensor(b'He kept on writing stories about animals.', shape=(), dtype=string)
[start] Él decidió a escribir sobre los animales [end]
