# Sequence to sequence models

In [1]:
import os
import re
import string
import random

from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn import preprocessing
from sklearn.utils import shuffle

## Import data

We'll use a language dataset provided by http://www.manythings.org/anki/ which contains Spanish sentences along with their English translations.

In [2]:
tf.keras.utils.get_file('spa-eng.zip',
                        'http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
                        cache_dir='./',
                        cache_subdir='datasets',
                        extract=True)

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


'./datasets/spa-eng.zip'

In [3]:
text_file = "datasets/spa-eng/spa.txt"
with open(text_file) as text:
    lines = text.read().split("\n")[:-1]
text_pairs = []
for line in lines:
    english, spanish = line.split("\t")
    spanish = "[start] " + spanish + " [end]"
    text_pairs.append((english, spanish))

In [4]:
text_pairs[310:315]

[("I'm free!", '[start] ¡Soy libre! [end]'),
 ("I'm free.", '[start] Yo soy libre. [end]'),
 ("I'm full.", '[start] Estoy lleno. [end]'),
 ("I'm full.", '[start] Estoy llena. [end]'),
 ("I'm full.", '[start] Ya me llené. [end]')]

In [5]:
text_parirs = shuffle(text_pairs)

num_train = int(0.7 * len(text_pairs))
num_valid = int(0.2 * len(text_pairs))

train_pairs = text_pairs[:num_train]
valid_pairs = text_pairs[num_train:num_train + num_valid]
test_pairs = text_pairs[num_train + num_valid:]

## Preprocessing

### Standardization & vectorization

In [6]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(
        lowercase, f"[{re.escape(strip_chars)}]", "")

vocab_size = 15000
sequence_length = 20

source_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)
target_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    # target sentences are longer since they start with [start].
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)
train_english_texts = [pair[0] for pair in train_pairs]
train_spanish_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)

In [7]:
batch_size = 64

def format_dataset(eng, spa):
    eng = source_vectorization(eng)
    spa = target_vectorization(spa)
    return ({
        "english": eng,
        "spanish": spa[:, :-1],
    }, spa[:, 1:])

def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    return dataset.shuffle(2048).prefetch(16).cache()

train_ds = make_dataset(train_pairs)
valid_ds = make_dataset(valid_pairs)

In [8]:
for inputs, targets in train_ds.take(1):
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print(f"inputs['spanish'].shape: {inputs['spanish'].shape}")
    print(f"targets.shape: {targets.shape}")

inputs['english'].shape: (64, 20)
inputs['spanish'].shape: (64, 20)
targets.shape: (64, 20)


## Sequence to sequence model

### Encoder

In [9]:
embed_dim = 256
latent_dim = 1024

encoder_input = keras.Input(shape=(None,), dtype="int64", name="english")
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(encoder_input)
encoder_output = layers.Bidirectional(layers.GRU(latent_dim), merge_mode="sum")(x)

### Decoder

In [10]:
decoder_input = keras.Input(shape=(None,), dtype="int64", name="spanish")
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(decoder_input)
decoder_gru = layers.GRU(latent_dim, return_sequences=True)
x = decoder_gru(x, initial_state=encoder_output)
x = layers.Dropout(0.5)(x)
next_target = layers.Dense(vocab_size, activation="softmax")(x)
seq2seq_rnn = keras.Model(inputs = [encoder_input, decoder_input], 
                          outputs = next_target)

In [11]:
seq2seq_rnn.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 english (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 spanish (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    3840000     ['english[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    3840000     ['spanish[0][0]']                
                                                                                              

In [12]:
seq2seq_rnn.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])
seq2seq_rnn.fit(train_ds, epochs=4, validation_data=valid_ds)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fe6b9af3cd0>

In [13]:
spa_vocab = target_vectorization.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])
        next_token_predictions = seq2seq_rnn.predict(
            [tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
        sampled_token = spa_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(5):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))

-
I only wish there was some way I could repay you.
[start] solo necesito verte lo que te haya dicho [end]
-
The desire to fly in the sky like a bird inspired the invention of the airplane.
[start] las hojas se ponen el avión al 5 [end]
-
A person views things differently according to whether they are rich or poor.
[start] una persona es más o menos una persona pobre [end]
-
You shouldn't say such a thing in the presence of children.
[start] no deberías decir que tal gente tan ocupado [end]
-
How could you just walk out the door without saying goodbye?
[start] cómo puedes hacer adiós y adiós [end]
