# Setup

In [2]:
from packaging import version
import tensorflow as tf

assert version.parse(tf.__version__) >= version.parse("2.8.0")

In [3]:
if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. Neural nets can be very slow without a GPU.")
    if "google.colab" in sys.modules:
        print("Go to Runtime > Change runtime and select a GPU hardware "
              "accelerator.")
    if "kaggle_secrets" in sys.modules:
        print("Go to Settings > Accelerator and select GPU.")

Let's download the Shakespeare data from Andrej Karpathy's [char-rnn project](https://github.com/karpathy/char-rnn/)

In [4]:
import tensorflow as tf

shakespeare_url = "https://homl.info/shakespeare"  # shortcut URL
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

# An Encoder–Decoder Network for Neural Machine Translation

In [26]:
## Download the spanish/english dataset

url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, extract=True) # cache_dir defaults to ~/.keras/datasets


from pathlib import Path
# The file might be extracted to a subdirectory within ~/.keras/datasets
# Search for 'spa.txt' in subdirectories:
dataset_path = Path(path).parent
for file_path in dataset_path.rglob('spa.txt'):
    if file_path.is_file():
        dataset_path = file_path
        break

text = dataset_path.read_text()

In [27]:
import numpy as np

text = text.replace("¡", "").replace("¿", "")
pairs = [line.split("\t") for line in text.splitlines()]
np.random.shuffle(pairs)
sentences_english, sentences_spanish = zip(*pairs)  # separates the pairs into 2 lists

In [28]:
for i in range(3):
    print(sentences_english[i], "=>", sentences_spanish[i])

He will get better little by little. => Mejorará poco a poco.
He has only four pesos. => No tiene más que cuatro pesos.
I meet her once a week. => Me encuentro con ella una vez a la semana.


In [29]:
vocab_size = 10000
max_length = 50

# create a keras TextVectorization layer using the above parameters
# then call .adapt on that layer, passing in the english sentences as parameter

layer_english_vectorization = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length, pad_to_max_tokens=False, ragged=False, name = "English_Vectorize")
layer_english_vectorization.adapt(sentences_english)

# now do the same with the spanish -- make and adapt a layer
# BUT for the spanish add 'starttoken' and 'endtoken' to each sentence first

layer_spanish_vectorization = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length,  pad_to_max_tokens=False, ragged=False, name = "Spanish_Vectorize")
layer_spanish_vectorization.adapt([f"starttoken {s} endtoken" for s in sentences_spanish])

In [30]:
layer_english_vectorization.get_vocabulary()[:10]

['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']

In [31]:
layer_spanish_vectorization.get_vocabulary()[:10]

['', '[UNK]', 'starttoken', 'endtoken', 'de', 'que', 'a', 'no', 'tom', 'la']

In [32]:
X_train_encoder = tf.constant(sentences_english[:100_000])
X_valid_encoder = tf.constant(sentences_english[100_000:])
X_train_decoder = tf.constant([f"starttoken {s}" for s in sentences_spanish[:100_000]])
X_valid_decoder = tf.constant([f"starttoken {s}" for s in sentences_spanish[100_000:]])
Y_train = layer_spanish_vectorization([f"{s} endtoken" for s in sentences_spanish[:100_000]])
Y_valid = layer_spanish_vectorization([f"{s} endtoken" for s in sentences_spanish[100_000:]])

In [33]:
encoder_input_layer = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_input_layer = tf.keras.layers.Input(shape=[], dtype=tf.string)

In [34]:
# Change this or leave it -- its the vector embedding dimension
embed_size = 512

# define encoder_vectors as the english vectorization layer applied to the encoder input layer
encoder_vectors = layer_english_vectorization(encoder_input_layer)

# define decoder vectors
decoder_vectors = layer_spanish_vectorization(decoder_input_layer)

# define an encoder_embedding_layer as a keras Embedding layer, with mask_zero = True
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=False, name = "Encoder_Embedding")

# define an decoder_encoder_embedding_layer as a keras Embedding layer, with mask_zero = True

decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,
                                                    mask_zero=True, name = "Decoder_Embedding")

# finally define encoder_embeddings and decoder_embeddings as the output of the embedding layers applied to the vectors
encoder_embeddings = encoder_embedding_layer(encoder_vectors)
decoder_embeddings = decoder_embedding_layer(decoder_vectors)

In [35]:
encoder = tf.keras.models.Sequential([
            tf.keras.layers.LSTM(512, return_state=True, dropout=0.2)]                       )
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

In [36]:
decoder = tf.keras.layers.LSTM(512, return_sequences=True, dropout=0.2)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [37]:
# create a Dense output_layer of the appropriate size for the spanish vocabulary
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax", name = "Output_Dense")\


# create a dropout_layer
dropout = tf.keras.layers.Dropout(0.5, name = "Output_Dropout")

# define Y_proba. take the decoder_outputs through the dropout layer and into the dense layer
Y_proba = output_layer(dropout(decoder_outputs))

## Custom loss and accuracy

In [38]:
class MaskedSparseCategoricalCrossentropy(tf.keras.losses.Loss):
    def __init__(self, from_logits=False, name="masked_sparse_categorical_crossentropy"):
        super().__init__(reduction="none", name=name)
        self.from_logits = from_logits

    @tf.function  # Compiles into a TensorFlow graph for speed
    def call(self, y_true, y_pred):
        loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=self.from_logits)
        mask = tf.cast(y_true != 0, dtype=tf.float32)  # Ignore padding (0 tokens)

        loss *= mask  # Apply mask
        loss_per_sequence = tf.reduce_sum(loss, axis=-1) / tf.maximum(tf.reduce_sum(mask, axis=-1), 1.0)
        
        return tf.reduce_mean(loss_per_sequence)  # Average over batch


In [39]:
class MaskedSparseCategoricalAccuracy(tf.keras.metrics.Metric):
    def __init__(self, name="masked_sparse_categorical_accuracy", **kwargs):
        super().__init__(name=name, **kwargs)
        self.correct_predictions = self.add_weight(name="correct", initializer="zeros")
        self.total_valid = self.add_weight(name="total", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        # Compute predicted class (argmax for probabilities or logit inputs)
        y_pred_classes = tf.argmax(y_pred, axis=-1, output_type=tf.int64)

        # Create mask where y_true != 0 (ignoring zero values)
        mask = tf.cast(y_true != 0, dtype=tf.float32)

        # Compare predictions with ground truth
        correct = tf.cast(tf.equal(y_true, y_pred_classes), dtype=tf.float32)

        # Apply mask
        correct *= mask

        # Update total count
        self.correct_predictions.assign_add(tf.reduce_sum(correct))
        self.total_valid.assign_add(tf.reduce_sum(mask))

    def result(self):
        return self.correct_predictions / tf.maximum(self.total_valid, 1.0)  # Avoid division by zero

    def reset_state(self):
        self.correct_predictions.assign(0)
        self.total_valid.assign(0)


In [40]:
class SaveBestModelWithEarlyStopping(tf.keras.callbacks.Callback):
    def __init__(self, save_path="best_model.keras", patience=5):
        """
        Custom callback to:
        - Save the best model based on val_loss
        - Stop training if val_loss doesn't improve for 'patience' epochs

        Args:
            save_path (str): Path to save the best model.
            patience (int): Number of epochs to wait before stopping training if no improvement.
        """
        super().__init__()
        self.save_path = save_path
        self.patience = patience
        self.best_val_loss = float("inf")  # Initialize with a large value
        self.wait = 0  # Counter for patience

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        val_loss = logs.get("val_loss")

        if val_loss is not None:
            if val_loss < self.best_val_loss:
                self.best_val_loss = val_loss
                self.model.save(self.save_path)  # Save the entire model
                print(f"\nEpoch {epoch+1}: val_loss improved to {val_loss:.4f}. Model saved to {self.save_path}")
                self.wait = 0  # Reset patience counter
            else:
                self.wait += 1  # No improvement, increase patience counter
                print(f"\nEpoch {epoch+1}: val_loss did not improve. Patience: {self.wait}/{self.patience}")

            # Stop training if patience limit is reached
            if self.wait >= self.patience:
                print(f"\nEarly stopping triggered! No improvement for {self.patience} epochs.")
                self.model.stop_training = True

# Example Usage:
save_best_early_stop = SaveBestModelWithEarlyStopping(save_path="snapshots/translator_A.keras", patience=2)

# Add this when fitting your model
# model.fit(..., callbacks=[save_best_early_stop])


## Model and Training

In [41]:
from tensorflow.keras.utils import plot_model

model_A = tf.keras.Model(inputs=[encoder_input_layer, decoder_input_layer],
                       outputs=[Y_proba])
model_A.compile(loss=MaskedSparseCategoricalCrossentropy(from_logits=False), optimizer="nadam",
              metrics=[MaskedSparseCategoricalAccuracy()])
model_A.fit((X_train_encoder, X_train_decoder), Y_train, epochs=5, batch_size=64,
          validation_data=((X_valid_encoder, X_valid_decoder), Y_valid),
         callbacks = [save_best_early_stop])

Epoch 1/5
[1m 171/1563[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1:36[0m 69ms/step - loss: 0.8938 - masked_sparse_categorical_accuracy: 0.1501

KeyboardInterrupt: 

In [None]:
X_train[10:11],X_train_dec[10:11],np.array(Y_train[10])

In [42]:
pred = model_A.predict((X_train[10:11],X_train_dec[10:11]))[0]
print(pred.shape)
y_indices = np.argmax(pred,axis=1)
[text_vec_layer_es.get_vocabulary()[i] for i in y_indices]

NameError: name 'X_train' is not defined

In [43]:
[text_vec_layer_es.get_vocabulary()[i] for i in Y_train[10]]

NameError: name 'text_vec_layer_es' is not defined

In [44]:
Y_train

<tf.Tensor: shape=(100000, 50), dtype=int64, numpy=
array([[6360,  111,    6, ...,    0,    0,    0],
       [   7,   40,   33, ...,    0,    0,    0],
       [  14, 1357,   24, ...,    0,    0,    0],
       ...,
       [  17,  717,    4, ...,    0,    0,    0],
       [  20,   15,   17, ...,    0,    0,    0],
       [  21, 9812,    4, ...,    0,    0,    0]])>

In [112]:
tf.random.categorical([[0.01,0.01,0.01,0.97]], num_samples=10)

<tf.Tensor: shape=(1, 10), dtype=int64, numpy=array([[3, 3, 3, 1, 2, 1, 3, 2, 2, 1]])>

In [125]:
def translate(a_model, sentence_en, temperature = 3):
    translation = ""
    for word_idx in range(max_length):
        X =  tf.convert_to_tensor([sentence_en])  # encoder input 
        X_dec = tf.convert_to_tensor(["starttoken " + translation])
        y_proba = a_model.predict((X, X_dec))[0, word_idx]  # last token's probas
        y_proba = np.log(y_proba) / temperature
        predicted_word_id = tf.random.categorical([y_proba], num_samples=1)[0,0]
        print(predicted_word_id)
        predicted_word = layer_spanish_vectorization.get_vocabulary()[predicted_word_id]
        if predicted_word == "endtoken":
            break
        translation += " " + predicted_word
    return translation.strip()

In [126]:
translate(model_B, "I like soccer")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
tf.Tensor(47, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
tf.Tensor(8053, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
tf.Tensor(21, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(744, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(167, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(93, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(13, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(6, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
tf.Tensor(210, shape=(), dtype=int64)
[1m1/1[0m [3

'tengo publicado los deberes así dinero un a otro piso'

Nice! However, the model struggles with longer sentences:

In [127]:
translate(model_A, "I like soccer and also going to the beach")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
tf.Tensor(852, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
tf.Tensor(755, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
tf.Tensor(8399, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(419, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(207, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(1448, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
tf.Tensor(87, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
tf.Tensor(717, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(1788, shape=(), dtype=int64)
[1m1/1

'hablo final indicaciones edad les billetera creo acabo harías lanza esfuerzos entre encantan griego colón japón convirtieron visitarte van huevos pacientes hizo intente rendimiento pecas empeorar fácilmente cerca que cables asesinado viene coche atrapados un evacuar mayor temprano preguntémosle médica'

In [128]:
translate(model_A, "When I fell before the game I broke my leg")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(7053, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(8191, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
tf.Tensor(4095, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(9982, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
tf.Tensor(9, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
tf.Tensor(1790, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
tf.Tensor(8097, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(4001, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(213, shape=(), dtype=int64)
[1m1

'preguntaron opuso ciertas llegasteis la estómago preciada modales desde extranjera conocía competente pasada hacer opinión actuar última vigilado respirar especiales seriamente aparentemente [UNK] enviarme logro pasado de ordinario observando campo dólares movió perro realidad profesora dólares sobredosis instalar adelante extraterrestres colectivo'

## Bidirectional RNNs

To create a bidirectional recurrent layer, just wrap a regular recurrent layer in a `Bidirectional` layer:

In [49]:
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_state=True))

In [50]:
from tensorflow.keras.layers import Lambda

encoder_outputs, *encoder_state = encoder(encoder_embeddings)

# Use a Lambda layer to handle the concatenation
def concatenate_states(states):
    short_term = tf.concat(states[::2], axis=-1)  # short-term (0 & 2)
    long_term = tf.concat(states[1::2], axis=-1)   # long-term (1 & 3)
    return [short_term, long_term]

# Apply the Lambda layer to the encoder state
encoder_state = Lambda(concatenate_states)(encoder_state)

**Warning**: the following cell will take a while to run (possibly a couple hours if you are not using a GPU).

In [52]:
# extra code — completes the model and trains it
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)
save_best_early_stop = SaveBestModelWithEarlyStopping(save_path="snapshots/translator_B.keras", patience=2)

model_B = tf.keras.Model(inputs=[encoder_input_layer, decoder_input_layer],
                       outputs=[Y_proba])
model_B.compile(loss=MaskedSparseCategoricalCrossentropy(from_logits=False), optimizer="nadam",
              metrics=[MaskedSparseCategoricalAccuracy()])
model_B.fit((X_train_encoder, X_train_decoder), Y_train, epochs=6, batch_size=64,
          validation_data=((X_valid_encoder, X_valid_decoder), Y_valid),
           callbacks = [save_best_early_stop])

Epoch 1/6
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step - loss: 0.6867 - masked_sparse_categorical_accuracy: 0.2579
Epoch 1: val_loss improved to 0.4450. Model saved to snapshots/translator_B.keras
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 98ms/step - loss: 0.6866 - masked_sparse_categorical_accuracy: 0.2579 - val_loss: 0.4450 - val_masked_sparse_categorical_accuracy: 0.4262
Epoch 2/6
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step - loss: 0.3947 - masked_sparse_categorical_accuracy: 0.4653
Epoch 2: val_loss improved to 0.3345. Model saved to snapshots/translator_B.keras
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 95ms/step - loss: 0.3947 - masked_sparse_categorical_accuracy: 0.4653 - val_loss: 0.3345 - val_masked_sparse_categorical_accuracy: 0.5280
Epoch 3/6
[1m1562/1563[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 40ms/step - loss: 0.2746 - masked_sparse_categ

<keras.src.callbacks.history.History at 0x7afd9b5d53a0>

In [53]:
translate(model_B, "I like soccer")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


'me gusta el fútbol'

In [54]:
translate(model_B, "I like soccer and also going to the beach")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step


'me gusta el fútbol y no fui a la playa'

In [55]:
translate(model_B, "When I fell on the football field I broke my leg")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step


'cuando me [UNK] el domingo me rasguñó la pierna'

In [85]:
translate(model_B, "We can order dinner but I don't want pizza.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step


'podemos vernos pero no es más importante'

## Beam Search

This is a very basic implementation of beam search. I tried to make it readable and understandable, but it's definitely not optimized for speed! The function first uses the model to find the top _k_ words to start the translations (where _k_ is the beam width). For each of the top _k_ translations, it evaluates the conditional probabilities of all possible words it could add to that translation. These extended translations and their probabilities are added to the list of candidates. Once we've gone through all top _k_ translations and all words that could complete them, we keep only the top _k_ candidates with the highest probability, and we iterate over and over until they all finish with an EOS token. The top translation is then returned (after removing its EOS token).

* Note: If p(S) is the probability of sentence S, and p(W|S) is the conditional probability of the word W given that the translation starts with S, then the probability of the sentence S' = concat(S, W) is p(S') = p(S) * p(W|S). As we add more words, the probability gets smaller and smaller. To avoid the risk of it getting too small, which could cause floating point precision errors, the function keeps track of log probabilities instead of probabilities: recall that log(a\*b) = log(a) + log(b), therefore log(p(S')) = log(p(S)) + log(p(W|S)).

In [81]:
# extra code – a basic implementation of beam search

def beam_search(a_model, sentence_en, beam_width, verbose=False):
    X = tf.convert_to_tensor([sentence_en])  # encoder input
    X_dec = tf.convert_to_tensor(["startofseq"])  # decoder input
    y_proba = a_model.predict((X, X_dec))[0, 0]  # first token's probas
    top_k = tf.math.top_k(y_proba, k=beam_width)
    top_translations = [  # list of best (log_proba, translation)
        (np.log(word_proba), layer_spanish_vectorization.get_vocabulary()[word_id])
        for word_proba, word_id in zip(top_k.values, top_k.indices)
    ]
    
    # extra code – displays the top first words in verbose mode
    if verbose:
        print("Top first words:", top_translations)

    for idx in range(1, max_length):
        candidates = []
        for log_proba, translation in top_translations:
            print(".")
            print(translation)
            if translation.endswith("endofseq"):
                candidates.append((log_proba, translation))
                continue  # translation is finished, so don't try to extend it
            X = tf.convert_to_tensor([sentence_en])  # encoder input
            X_dec = tf.convert_to_tensor(["startofseq " + translation])  # decoder input
            y_proba = a_model.predict((X, X_dec))[0, idx]  # last token's proba
            for word_id, word_proba in enumerate(y_proba):
                word = layer_spanish_vectorization.get_vocabulary()[word_id]
                candidates.append((log_proba + np.log(word_proba),
                                   f"{translation} {word}"))
        top_translations = sorted(candidates, reverse=True)[:beam_width]

        # extra code – displays the top translation so far in verbose mode
        if verbose:
            print("Top translations so far:", top_translations)

        if all([tr.endswith("endofseq") for _, tr in top_translations]):
            return top_translations[0][1].replace("endofseq", "").strip()

In [129]:
# extra code – shows how the model making an error

sentence_en = "I like sleeping and steak"
translate(model_B, sentence_en)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
tf.Tensor(6, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(89, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
tf.Tensor(4944, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(676, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(2697, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(31, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(1405, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(2678, shape=(), dtype=int64)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
tf.Tensor(2325, shape=(), dtype=int64)
[1m1/1[

'a ahora consume treinta electrónico y crimen lento adora eso'

In [83]:
# extra code – shows how beam search can help
beam_search(model_B, sentence_en, beam_width=3, verbose=True)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Top first words: [(-0.34797168, 'me'), (-1.6177223, 'a'), (-3.414055, 'yo')]
.
me
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
.
a
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
.
yo
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Top translations so far: [(-0.38936737, 'me gusta'), (-2.0557368, 'a mí'), (-3.6431463, 'yo me')]
.
me gusta
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
.
a mí
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
.
yo me
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Top translations so far: [(-1.7894331, 'me gusta beber'), (-2.068506, 'me gusta dormir'), (-2.0925288, 'a mí me')]
.
me gusta beber
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step


KeyboardInterrupt: 

The correct translation is in the top 3 sentences found by beam search, but it's not the first. Since we're using a small vocabulary, the \[UNK] token is quite frequent, so you may want to penalize it (e.g., divide its probability by 2 in the beam search function): this will discourage beam search from using it too much.

We need to feed all the encoder's outputs to the `Attention` layer, so we must add `return_sequences=True` to the encoder: