In [None]:
!pip install tensorflow_datasets
!pip install tensorflow-text



In [None]:
import logging
import os
import time
import numpy as np
import tensorflow as tf
import tensorflow_text
import tensorflow_datasets as tfds
# Raise the 'tensorflow' logger threshold to ERROR so that lower-severity records
# (e.g. warnings) are not emitted
logging.getLogger('tensorflow').setLevel(logging.ERROR)

# **Loading The Portuguese To English Translation Dataset**

In [None]:
# Load the Portuguese -> English TED talk dataset together with its metadata;
# as_supervised = True is requested so each example is a 2-tuple
translation_examples, translation_metadata = tfds.load(
  'ted_hrlr_translate/pt_to_en',
  with_info=True,
  as_supervised=True,
)

# Split handles used by the rest of the notebook
translation_training_examples = translation_examples['train']
translation_validation_examples = translation_examples['validation']

In [None]:
# Download and extract the tokenizer archive; the extracted directory is loaded
# with tf.saved_model.load in the exploration cell
tf.keras.utils.get_file(
  fname='ted_hrlr_translate_pt_en_converter.zip',
  origin='https://storage.googleapis.com/download.tensorflow.org/models/ted_hrlr_translate_pt_en_converter.zip',
  cache_dir='.',
  cache_subdir='../Data',
  extract=True,
)

'./Data/ted_hrlr_translate_pt_en_converter.zip'

# **Exploring The Dataset**

In [None]:
# Load the exported tokenizers; the notebook accesses them as .pt and .en
translation_tokenizers = tf.saved_model.load('../Data/ted_hrlr_translate_pt_en_converter')

# Show the first batch of three sentence pairs from the training split.
# english_examples is kept at notebook scope because the tokenizer demo reuses it.
for portuguese_examples, english_examples in translation_training_examples.batch(3).take(1):
  previews = (
    ('> Examples in Portuguese', portuguese_examples),
    ('> Examples in English', english_examples),
  )
  for heading, sentences in previews:
    print(heading)
    for raw_sentence in sentences.numpy():
      print(raw_sentence.decode('utf-8'))

> Examples in Portuguese
e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
mas e se estes fatores fossem ativos ?
mas eles não tinham a curiosidade de me testar .
> Examples in English
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
but what if it were active ?
but they did n't test for curiosity .


In [None]:
# Encode the English preview sentences into token ids and print one id list per sentence
english_tokenizer_encoded = translation_tokenizers.en.tokenize(english_examples)
token_id_rows = english_tokenizer_encoded.to_list()
for token_ids in token_id_rows:
  print(token_ids)

[2, 72, 117, 79, 1259, 1491, 2362, 13, 79, 150, 184, 311, 71, 103, 2308, 74, 2679, 13, 148, 80, 55, 4840, 1434, 2423, 540, 15, 3]
[2, 87, 90, 107, 76, 129, 1852, 30, 3]
[2, 87, 83, 149, 50, 9, 56, 664, 85, 2512, 15, 3]


In [None]:
# Map the token ids back to text and print each decoded sentence
english_tokenizer_decoded = translation_tokenizers.en.detokenize(english_tokenizer_encoded)
for sentence_bytes in english_tokenizer_decoded.numpy():
  print(sentence_bytes.decode('utf-8'))

and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
but what if it were active ?
but they did n ' t test for curiosity .


# **Creating The Input Pipeline**

In [None]:
def filter_max_tokens(pt_example, en_example):
  '''
  Dataset predicate: True when the larger of the two axis-1 sizes is below 128
  Args
    pt_example : Tokenized Portuguese batch (axis 1 is read as the token axis)
    en_example : Tokenized English batch (axis 1 is read as the token axis)
  '''
  pt_length = tf.shape(pt_example)[1]
  en_length = tf.shape(en_example)[1]
  longest = tf.maximum(pt_length, en_length)
  return longest < 128

def tokenize_pairs(pt_example, en_example):
  '''
  Tokenizes a (Portuguese, English) text batch with the matching tokenizer of
  translation_tokenizers and returns both results as dense, zero-padded tensors
  '''
  # to_tensor() converts the tokenizer output to a dense tensor, padded with zeros
  pt_tokens = translation_tokenizers.pt.tokenize(pt_example).to_tensor()
  en_tokens = translation_tokenizers.en.tokenize(en_example).to_tensor()
  return pt_tokens, en_tokens

def make_batches(ds):
  '''
  Builds the input pipeline for one dataset split:
  cache -> shuffle (buffer of 20000) -> batch (64) -> tokenize -> drop over-long batches -> prefetch
  '''
  shuffled = ds.cache().shuffle(20000)
  batched = shuffled.batch(64)
  tokenized = batched.map(tokenize_pairs, num_parallel_calls=tf.data.AUTOTUNE)
  within_limit = tokenized.filter(filter_max_tokens)
  return within_limit.prefetch(tf.data.AUTOTUNE)

# Run both splits through the same batching pipeline (training first, then validation)
translation_training_batches, translation_validation_batches = map(
  make_batches,
  (translation_training_examples, translation_validation_examples),
)

# **Adding The Positional Encodings**

In [None]:
def get_angles(position, i, d_dimensional_model):
  '''
  Computes position / 10000^(2*(i//2) / d_dimensional_model)
  Args
    position : Position index (or array of indices)
    i : Embedding dimension index (or array of indices)
    d_dimensional_model : Size of the embedding dimension
  Both members of an (even, odd) index pair share the same rate because of the i//2
  '''
  exponent = (2 * (i // 2)) / np.float32(d_dimensional_model)
  angle_rates = 1 / np.power(10000, exponent)
  return position * angle_rates

def positional_encoding(position, d_dimensional_model):
  '''
  Builds the sinusoidal positional encoding table
  Args
    position : Number of positions (rows) to generate
    d_dimensional_model : Size of the embedding dimension (columns)
  Returns a float32 tensor of shape (1, position, d_dimensional_model)
  '''
  positions = np.arange(position)[:, np.newaxis]
  dimensions = np.arange(d_dimensional_model)[np.newaxis, :]
  encoding = get_angles(positions, dimensions, d_dimensional_model)
  # Even columns (2i) hold the sine of the angle
  encoding[:, 0::2] = np.sin(encoding[:, 0::2])
  # Odd columns (2i+1) hold the cosine of the angle
  encoding[:, 1::2] = np.cos(encoding[:, 1::2])
  # Prepend a singleton axis so the table can be added to batched embeddings
  return tf.cast(encoding[np.newaxis, ...], dtype=tf.float32)

# **Look-Ahead Mask - Masking Future Tokens In A Sequence**

In [None]:
def create_padding_mask(sequence):
  '''
  Marks the entries of sequence that equal 0 with 1.0 (everything else with 0.0) and
  inserts two singleton axes so the mask can be added to the attention logits
  '''
  is_padding = tf.math.equal(sequence, 0)
  padding_mask = tf.cast(is_padding, tf.float32)
  return padding_mask[:, tf.newaxis, tf.newaxis, :] # (batch_size, 1, 1, sequence_len)

def create_lookahead_mask(size):
  '''
  Returns a (size, size) matrix holding 1 strictly above the main diagonal and 0 elsewhere,
  i.e. row r flags every column greater than r
  '''
  # band_part(..., -1, 0) keeps the lower triangle including the diagonal
  lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return 1 - lower_triangle # (sequence_len, sequence_len)

In [None]:
def scaled_dot_product_attention(q, k, v, mask = None):
  '''
  Computes softmax(q . k^T / sqrt(depth_k) + mask * -1e9) . v
  Args
    q : query shape == (..., sequence_len_q, depth_q)
    k : key shape == (..., sequence_len_k, depth_k)
    v : value shape == (..., sequence_len_v, depth_v)
    mask : Float tensor broadcastable to (..., sequence_len_q, sequence_len_k). Default = None
  Returns
    output : shape == (..., sequence_len_q, depth_v)
    attention_weights : shape == (..., sequence_len_q, sequence_len_k)
  q, k, v must share their leading dimensions and sequence_len_k must equal sequence_len_v
  '''
  key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)
  if mask is not None:
    # Entries flagged with 1 receive a large negative logit and therefore ~0 weight after softmax
    scaled_attention_logits = scaled_attention_logits + (mask * -1e9)
  # Softmax over the key axis so that the weights of every query add up to 1
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
  return tf.matmul(attention_weights, v), attention_weights

# **Implementing The Multi-Head Attention Mechanism**

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
  '''
  Multi-head attention: q, k and v are projected with Dense layers, split into num_heads heads of
  size depth, passed through scaled_dot_product_attention, merged again and projected once more
  '''
  def __init__(self, *, d_dimensional_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_dimensional_model = d_dimensional_model

    # The model dimension has to divide evenly across the heads
    assert d_dimensional_model % num_heads == 0
    self.depth = d_dimensional_model // num_heads

    # Input projections for query, key and value
    self.wq = tf.keras.layers.Dense(d_dimensional_model)
    self.wk = tf.keras.layers.Dense(d_dimensional_model)
    self.wv = tf.keras.layers.Dense(d_dimensional_model)
    # Output projection applied to the merged heads
    self.dense = tf.keras.layers.Dense(d_dimensional_model)

  def split_heads(self, x, batch_size):
    '''
    Reshapes (batch_size, sequence_len, d_dimensional_model) into (batch_size, num_heads, sequence_len, depth)
    '''
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q , mask):
    '''
    Note the argument order: value, key, query, mask
    Returns
      output : shape == (batch_size, sequence_len_q, d_dimensional_model)
      attention_weights : shape == (batch_size, num_heads, sequence_len_q, sequence_len_k)
    '''
    batch_size = tf.shape(q)[0]

    # Project, then split: each result has shape (batch_size, num_heads, sequence_len, depth)
    q = self.split_heads(self.wq(q), batch_size)
    k = self.split_heads(self.wk(k), batch_size)
    v = self.split_heads(self.wv(v), batch_size)

    # per_head_attention.shape == (batch_size, num_heads, sequence_len_q, depth)
    per_head_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)

    # Move the head axis next to depth and merge both back into d_dimensional_model
    per_head_attention = tf.transpose(per_head_attention, perm=[0, 2, 1, 3])
    merged_attention = tf.reshape(per_head_attention, (batch_size, -1, self.d_dimensional_model))
    return self.dense(merged_attention), attention_weights

# **Defining The Point-Wise Feed-Forward Network (2 Fully-Connected Layers)**

In [None]:
def pointwise_feed_forward_network(d_dimensional_model, dff):
  '''
  Returns a Sequential model of two Dense layers: dff units with ReLU, then d_dimensional_model units
  '''
  hidden_layer = tf.keras.layers.Dense(dff, activation='relu')
  output_layer = tf.keras.layers.Dense(d_dimensional_model)
  return tf.keras.Sequential([hidden_layer, output_layer])

# **Defining The Encoder And Decoder Layers**

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  '''
  One encoder block: self-attention followed by a point-wise feed-forward network.
  Each sub-layer is followed by dropout, a residual connection and layer normalization.
  '''
  def __init__(self, *, d_dimensional_model, num_heads, dff, dropout_rate = 0.1):
    super(EncoderLayer, self).__init__()
    self.MHA = MultiHeadAttention(d_dimensional_model=d_dimensional_model, num_heads=num_heads)
    self.FFN = pointwise_feed_forward_network(d_dimensional_model, dff)

    self.LayerNorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.LayerNorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, training, mask):
    '''
    Every intermediate tensor and the result have shape (batch_size, input_sequence_len, d_dimensional_model)
    '''
    # Self-attention: x serves as value, key and query
    attended, _ = self.MHA(x, x, x, mask)
    attended = self.dropout1(attended, training=training)
    attention_block = self.LayerNorm1(x + attended)

    # Feed-forward sub-layer
    transformed = self.FFN(attention_block)
    transformed = self.dropout2(transformed, training=training)
    return self.LayerNorm2(attention_block + transformed)

class DecoderLayer(tf.keras.layers.Layer):
  '''
  One decoder block: masked self-attention, attention over the encoder output and a point-wise
  feed-forward network. Each sub-layer is followed by dropout, a residual connection and layer normalization.
  '''
  def __init__(self, *, d_dimensional_model, num_heads, dff, dropout_rate = 0.1):
    super(DecoderLayer, self).__init__()
    self.MHA1 = MultiHeadAttention(d_dimensional_model=d_dimensional_model, num_heads=num_heads)
    self.MHA2 = MultiHeadAttention(d_dimensional_model=d_dimensional_model, num_heads=num_heads)

    self.FFN = pointwise_feed_forward_network(d_dimensional_model, dff)

    self.LayerNorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.LayerNorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.LayerNorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout3 = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, encoder_output, training, lookahead_mask, padding_mask):
    '''
    Args
      x : shape == (batch_size, target_sequence_len, d_dimensional_model)
      encoder_output : shape == (batch_size, input_sequence_len, d_dimensional_model)
    Returns the block output, shape (batch_size, target_sequence_len, d_dimensional_model),
    plus the attention weights of the first and of the second attention block
    '''
    # Block 1: self-attention over the target sequence, restricted by lookahead_mask
    self_attended, attention_weights_block1 = self.MHA1(x, x, x, lookahead_mask)
    self_attended = self.dropout1(self_attended, training=training)
    self_block = self.LayerNorm1(self_attended + x)

    # Block 2: value and key come from the encoder, the query from block 1
    cross_attended, attention_weights_block2 = self.MHA2(encoder_output, encoder_output, self_block, padding_mask)
    cross_attended = self.dropout2(cross_attended, training=training)
    cross_block = self.LayerNorm2(cross_attended + self_block)

    # Block 3: feed-forward sub-layer
    transformed = self.FFN(cross_block)
    transformed = self.dropout3(transformed, training=training)
    decoded = self.LayerNorm3(transformed + cross_block)
    return decoded, attention_weights_block1, attention_weights_block2

In [None]:
class Encoder(tf.keras.layers.Layer):
  '''
  Token embedding + positional encoding followed by a stack of num_layers EncoderLayer blocks
  Args
    num_layers : Number of EncoderLayer blocks
    d_dimensional_model : Size of the embedding/model dimension
    num_heads : Number of attention heads per block
    dff : Hidden size of the point-wise feed-forward networks
    input_vocab_size : Vocabulary size of the embedding layer
    dropout_rate : Dropout rate. Default = 0.1
    maximum_position_encoding : Number of positions in the positional encoding table. Default = 128
      (the value that used to be hard-coded); longer input sequences cannot be encoded
  '''
  def __init__(self, *, num_layers, d_dimensional_model, num_heads, dff, input_vocab_size, dropout_rate = 0.1,
               maximum_position_encoding = 128):
    super(Encoder, self).__init__()
    self.d_dimensional_model = d_dimensional_model
    self.num_layers = num_layers
    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_dimensional_model)
    self.positional_encoding = positional_encoding(maximum_position_encoding, self.d_dimensional_model)
    self.encoder_layers = [EncoderLayer(d_dimensional_model = d_dimensional_model, num_heads = num_heads, dff = dff, dropout_rate = dropout_rate)
                           for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, training, mask):
    '''
    Encodes a batch of token ids; returns shape (batch_size, input_sequence_len, d_dimensional_model)
    '''
    sequence_len = tf.shape(x)[1]
    # Adding embedding and positional encoding
    x = self.embedding(x) # Shape: (batch_size, input_sequence_len, d_dimensional_model)
    # Scale the embeddings by sqrt(d_dimensional_model) before the positional encoding is added
    x *= tf.math.sqrt(tf.cast(self.d_dimensional_model, tf.float32))
    x += self.positional_encoding[:, :sequence_len, :]
    x = self.dropout(x, training = training)
    for encoder_layer in self.encoder_layers:
      x = encoder_layer(x, training, mask)
    return x # Shape: (batch_size, input_sequence_len, d_dimensional_model)

class Decoder(tf.keras.layers.Layer):
  '''
  Token embedding + positional encoding followed by a stack of num_layers DecoderLayer blocks
  Args
    num_layers : Number of DecoderLayer blocks
    d_dimensional_model : Size of the embedding/model dimension
    num_heads : Number of attention heads per block
    dff : Hidden size of the point-wise feed-forward networks
    target_vocab_size : Vocabulary size of the embedding layer
    dropout_rate : Dropout rate. Default = 0.1
    maximum_position_encoding : Number of positions in the positional encoding table. Default = 128
      (the value that used to be hard-coded); longer target sequences cannot be decoded
  '''
  def __init__(self, *, num_layers, d_dimensional_model, num_heads, dff, target_vocab_size, dropout_rate = 0.1,
               maximum_position_encoding = 128):
    super(Decoder, self).__init__()
    self.d_dimensional_model = d_dimensional_model
    self.num_layers = num_layers
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_dimensional_model)
    self.positional_encoding = positional_encoding(maximum_position_encoding, d_dimensional_model)
    self.decoder_layers = [DecoderLayer(d_dimensional_model = d_dimensional_model, num_heads = num_heads, dff = dff, dropout_rate = dropout_rate)
                           for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, encoder_output, training, lookahead_mask, padding_mask):
    '''
    Returns
      x : shape == (batch_size, target_sequence_len, d_dimensional_model)
      attention_weights : dict with keys 'decoder_layer{n}_block1' and 'decoder_layer{n}_block2' (n starts at 1)
    '''
    sequence_len = tf.shape(x)[1]
    attention_weights = {}
    x = self.embedding(x) # Shape: (batch_size, target_sequence_len, d_dimensional_model)
    # Scale the embeddings by sqrt(d_dimensional_model) before the positional encoding is added
    x *= tf.math.sqrt(tf.cast(self.d_dimensional_model, tf.float32))
    x += self.positional_encoding[:, :sequence_len, :]
    x = self.dropout(x, training = training)

    for layer_number, decoder_layer in enumerate(self.decoder_layers, start = 1):
      x, block1, block2 = decoder_layer(x, encoder_output, training, lookahead_mask, padding_mask)
      attention_weights['decoder_layer{}_block1'.format(layer_number)] = block1
      attention_weights['decoder_layer{}_block2'.format(layer_number)] = block2
    return x, attention_weights # x.shape: (batch_size, target_sequence_len, d_dimensional_model)

# **Defining The Transformer**

In [None]:
class Transformer(tf.keras.Model):
  '''
  Encoder-decoder Transformer: Encoder -> Decoder -> Dense projection onto the target vocabulary
  '''
  def __init__(self, *, num_layers, d_dimensional_model, num_heads, dff, input_vocab_size, target_vocab_size, dropout_rate = 0.1):
    super().__init__()
    # Hyper-parameters that encoder and decoder have in common
    shared_arguments = dict(num_layers=num_layers, d_dimensional_model=d_dimensional_model, num_heads=num_heads,
                            dff=dff, dropout_rate=dropout_rate)
    self.encoder = Encoder(input_vocab_size=input_vocab_size, **shared_arguments)
    self.decoder = Decoder(target_vocab_size=target_vocab_size, **shared_arguments)
    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs, training):
    '''
    Args
      inputs : Pair (input token ids, target token ids)
      training : Forwarded to the encoder and the decoder
    Returns
      final_output : shape == (batch_size, target_sequence_len, target_vocab_size)
      attention_weights : Attention weights collected by the decoder
    '''
    source_tokens, target_tokens = inputs
    encoder_padding_mask, lookahead_mask, decoder_padding_mask = self.create_masks(source_tokens, target_tokens)
    # encoder_output.shape: (batch_size, input_sequence_len, d_dimensional_model)
    encoder_output = self.encoder(source_tokens, training, encoder_padding_mask)
    # decoder_output.shape: (batch_size, target_sequence_len, d_dimensional_model)
    decoder_output, attention_weights = self.decoder(target_tokens, encoder_output, training, lookahead_mask, decoder_padding_mask)
    return self.final_layer(decoder_output), attention_weights

  def create_masks(self, input, target):
    '''
    Returns (encoder_padding_mask, lookahead_mask, decoder_padding_mask)
    '''
    # Padding of the input, as seen by the encoder
    encoder_padding_mask = create_padding_mask(input)
    # Padding of the input again, used by the second attention block of the decoder on the encoder outputs
    decoder_padding_mask = create_padding_mask(input)
    # First attention block of the decoder: a position is masked when it lies in the future OR is padding
    future_mask = create_lookahead_mask(tf.shape(target)[1])
    target_padding_mask = create_padding_mask(target)
    combined_mask = tf.maximum(target_padding_mask, future_mask)
    return encoder_padding_mask, combined_mask, decoder_padding_mask

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  '''
  Learning rate schedule with warm-up:
    rate(step) = d_dimensional_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
  Args
    d_dimensional_model : Model dimension the rate is scaled by
    warmup_steps : Step at which both arguments of the minimum are equal. Default = 4000
  '''
  def __init__(self, d_dimensional_model, warmup_steps = 4000):
    super(CustomSchedule, self).__init__()
    # The plain constructor value is kept for get_config(); the float32 cast is what __call__ uses
    self._d_dimensional_model_config = d_dimensional_model
    self.d_dimensional_model = tf.cast(d_dimensional_model, tf.float32)
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    '''
    Returns the learning rate for the given optimizer step
    '''
    step = tf.cast(step, tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** (-1.5))
    return tf.math.rsqrt(self.d_dimensional_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
    '''
    Returns the constructor arguments so that the schedule (and an optimizer using it) can be serialized
    '''
    return {'d_dimensional_model': self._d_dimensional_model_config, 'warmup_steps': self.warmup_steps}

# 128 is the same value the Transformer receives as d_dimensional_model
learning_rate = CustomSchedule(d_dimensional_model=128)
translation_optimizer = tf.keras.optimizers.Adam(
  learning_rate,
  beta_1=0.9,
  beta_2=0.98,
  epsilon=1e-9,
)

In [None]:
def loss_function(actual, prediction):
  '''
  Sparse categorical cross-entropy (from logits) averaged over the positions where actual != 0
  Args
    actual : Target token ids; id 0 is excluded from the loss
    prediction : Logits over the vocabulary for every position
  '''
  is_token = tf.math.logical_not(tf.math.equal(actual, 0))
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
  per_position_loss = loss_object(actual, prediction)
  # Zero out the loss of the excluded positions, then average over the remaining ones
  token_weights = tf.cast(is_token, dtype=per_position_loss.dtype)
  per_position_loss = per_position_loss * token_weights
  return tf.reduce_sum(per_position_loss) / tf.reduce_sum(token_weights)

def accuracy_function(actual, prediction):
  '''
  Fraction of positions with actual != 0 whose arg-max prediction (over axis 2) equals actual
  '''
  predicted_ids = tf.argmax(prediction, axis=2)
  is_match = tf.equal(actual, predicted_ids)
  is_token = tf.math.logical_not(tf.math.equal(actual, 0))
  # A match only counts when the position is not excluded
  is_correct = tf.math.logical_and(is_token, is_match)

  correct_count = tf.reduce_sum(tf.cast(is_correct, dtype=tf.float32))
  token_count = tf.reduce_sum(tf.cast(is_token, dtype=tf.float32))
  return correct_count / token_count

# Mean metrics fed with the per-batch loss and accuracy by train_step
training_loss = tf.keras.metrics.Mean(name='train_loss')
training_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

In [None]:
# Build the model; both vocabulary sizes are read from the loaded tokenizers
translation_transformer = Transformer(
  num_layers=4,
  d_dimensional_model=128,
  num_heads=8,
  dff=512,
  input_vocab_size=translation_tokenizers.pt.get_vocab_size().numpy(),
  target_vocab_size=translation_tokenizers.en.get_vocab_size().numpy(),
  dropout_rate=0.1,
)

# Checkpoint model and optimizer together; the manager is configured with max_to_keep = 5
translation_checkpoint_path = 'Checkpoints/train'
translation_checkpoint = tf.train.Checkpoint(transformer=translation_transformer, optimizer=translation_optimizer)
translation_checkpoint_manager = tf.train.CheckpointManager(
  translation_checkpoint,
  translation_checkpoint_path,
  max_to_keep=5,
)

# If a checkpoint exists, restore the latest checkpoint
if translation_checkpoint_manager.latest_checkpoint:
  translation_checkpoint.restore(translation_checkpoint_manager.latest_checkpoint)
  print('Latest checkpoint restored!')

In [None]:
# Both arguments of train_step are declared as int64 tensors with two unspecified dimensions
train_step_signature = [
  tf.TensorSpec(shape=(None, None), dtype=tf.int64),
  tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]

@tf.function(input_signature=train_step_signature)
def train_step(input, target):
  '''
  Runs one optimization step on a batch and updates training_loss and training_accuracy
  Args
    input : Token ids of the source sentences
    target : Token ids of the target sentences
  The decoder receives target without its last token and is scored against target without its first token
  '''
  decoder_input, expected_tokens = target[:, :-1], target[:, 1:]
  with tf.GradientTape() as tape:
    predictions, _ = translation_transformer([input, decoder_input], training=True)
    loss = loss_function(expected_tokens, predictions)

  trainable_variables = translation_transformer.trainable_variables
  gradients = tape.gradient(loss, trainable_variables)
  translation_optimizer.apply_gradients(zip(gradients, trainable_variables))

  training_loss(loss)
  training_accuracy(accuracy_function(expected_tokens, predictions))

# **Training The Transformer**

In [None]:
for epoch in range(20):
  start = time.perf_counter()
  # Start every epoch with fresh running means
  # NOTE(review): newer Keras releases name this method reset_state() - confirm against the installed version
  training_loss.reset_states()
  training_accuracy.reset_states()

  # Input: Portuguese; Target: English
  # (the loop variables avoid the name 'input' so the builtin is not shadowed at notebook scope)
  for batch, (pt_batch, en_batch) in enumerate(translation_training_batches):
    train_step(pt_batch, en_batch)

    if batch % 50 == 0:
      print('Epoch {}, Batch {}: Loss = {:.4f}, Accuracy = {:.4f}'.format(epoch + 1, batch, training_loss.result(), training_accuracy.result()))

  # Save a checkpoint after every 5th epoch
  if (epoch + 1) % 5 == 0:
    checkpoint_save_path = translation_checkpoint_manager.save()
    print('Saving checkpoint for epoch {} at {}'.format(epoch + 1, checkpoint_save_path))

  # Summary and timing belong to every epoch, not only to the checkpointed ones
  print('Epoch {}: Loss = {:.4f}, Accuracy = {:.4f}'.format(epoch + 1, training_loss.result(), training_accuracy.result()))
  print('Time taken to complete 1 epoch: {:.2f}s\n'.format(time.perf_counter() - start))

Epoch 1, Batch 0: Loss = 8.8896, Accuracy = 0.0000
Epoch 1, Batch 50: Loss = 8.8225, Accuracy = 0.0012
Epoch 1, Batch 100: Loss = 8.7163, Accuracy = 0.0229
Epoch 1, Batch 150: Loss = 8.6014, Accuracy = 0.0353
Epoch 1, Batch 200: Loss = 8.4596, Accuracy = 0.0417
Epoch 1, Batch 250: Loss = 8.2898, Accuracy = 0.0466
Epoch 1, Batch 300: Loss = 8.0973, Accuracy = 0.0524
Epoch 1, Batch 350: Loss = 7.8965, Accuracy = 0.0607
Epoch 1, Batch 400: Loss = 7.7058, Accuracy = 0.0687
Epoch 1, Batch 450: Loss = 7.5375, Accuracy = 0.0754
Epoch 1, Batch 500: Loss = 7.3899, Accuracy = 0.0815
Epoch 1, Batch 550: Loss = 7.2555, Accuracy = 0.0883
Epoch 1, Batch 600: Loss = 7.1323, Accuracy = 0.0955
Epoch 1, Batch 650: Loss = 7.0171, Accuracy = 0.1024
Epoch 1, Batch 700: Loss = 6.9101, Accuracy = 0.1088
Epoch 2, Batch 0: Loss = 5.6769, Accuracy = 0.1845
Epoch 2, Batch 50: Loss = 5.4008, Accuracy = 0.1988
Epoch 2, Batch 100: Loss = 5.3558, Accuracy = 0.2005
Epoch 2, Batch 150: Loss = 5.3169, Accuracy = 0.2036

# **Translation**

In [None]:
class Translator(tf.Module):
  '''
  Translates a Portuguese sentence to English by repeatedly picking the arg-max next token
  '''
  def __init__(self, tokenizers, transformer):
    self.tokenizers = tokenizers
    self.transformer = transformer

  def __call__(self, sentence, max_length = 128):
    '''
    Args
      sentence : String tf.Tensor (a scalar is expanded to a batch with one entry)
      max_length : Maximum number of decoding iterations. Default = 128
    Returns
      text : Detokenized English output
      tokens : Looked-up tokens of the output
      attention_weights : Attention weights recomputed on the finished output
    '''
    assert isinstance(sentence, tf.Tensor)
    if len(sentence.shape) == 0:
      sentence = sentence[tf.newaxis]
    # The tokenized Portuguese sentence is the encoder input
    encoder_input = self.tokenizers.pt.tokenize(sentence).to_tensor()

    # Tokenizing an empty string is used to obtain the English start token (index 0) and end token (index 1)
    start_end = self.tokenizers.en.tokenize([''])[0]
    start, end = start_end[0][tf.newaxis], start_end[1][tf.newaxis]

    # A 'tf.TensorArray' is required here, instead of a Python list, to ensure that the dynamic loop is traceable by 'tf.function'
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for a in tf.range(max_length):
      # Everything generated so far is fed back as the decoder input
      decoder_input = tf.transpose(output_array.stack())
      predictions, _ = self.transformer([encoder_input, decoder_input], training=False)
      # Only the last position of the sequence_len dimension is of interest: (batch_size, 1, vocab_size)
      last_position = predictions[:, -1:, :]
      prediction_id = tf.argmax(last_position, axis=-1)
      # Append the predicted id so that it is part of the next decoder input
      output_array = output_array.write(a + 1, prediction_id[0])
      if prediction_id == end:
        break

    output = tf.transpose(output_array.stack()) # Shape: (1, tokens)
    text = self.tokenizers.en.detokenize(output)[0]
    tokens = self.tokenizers.en.lookup(output)[0]
    # 'tf.function' prevents the use of attention weights that were calculated during the last iteration, hence they are recalculated outside of the loop
    _, attention_weights = self.transformer([encoder_input, output[:, :-1]], training=False)
    return text, tokens, attention_weights

In [None]:
# Combine the tokenizers and the Transformer model into one callable translator
translator = Translator(tokenizers=translation_tokenizers, transformer=translation_transformer)

def translation_printer(sentence, tokens, ground_truth):
  '''
  Prints the input sentence, the predicted translation and the reference translation, one per line
  Args
    sentence : Input sentence (str)
    tokens : Object whose .numpy() returns the UTF-8 encoded prediction
    ground_truth : Reference translation (str)
  Each value is left-aligned in a field at least 15 characters wide
  '''
  print('Input: {:15s}'.format(sentence))
  predicted_text = tokens.numpy().decode('utf-8')
  print('Prediction: {:15s}'.format(predicted_text))
  print('Ground truth: {:15s}'.format(ground_truth))

# Translate one example sentence and show it next to its reference translation
input_sentence = 'os meus vizinhos ouviram sobre esta ideia.'
ground_truth = 'and my neighbouring homes heard about this idea.'

sentence_tensor = tf.constant(input_sentence)
translated_text, translated_tokens, attention_weights = translator(sentence_tensor)
translation_printer(sentence=input_sentence, tokens=translated_text, ground_truth=ground_truth)

Input: os meus vizinhos ouviram sobre esta ideia.
Prediction: my neighbors have heard about this idea .
Ground truth: and my neighbouring homes heard about this idea.
