In [1]:
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization


In [2]:
text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [4]:
with open(text_file) as f:
    lines = f.read().split("\n")[:-1]
text_pairs = []
for line in lines:
    eng, spa = line.split("\t")
    spa = "[start] " + spa + " [end]"
    text_pairs.append((eng, spa))


In [5]:
for _ in range(5):
    print(random.choice(text_pairs))


('My room is upstairs on the left.', '[start] Mi cuarto está subiendo las escaleras a la izquierda. [end]')
('The day when we first met was a rainy day.', '[start] El día en que nos conocimos fue un día lluvioso. [end]')
('It would take forever for me to explain everything.', '[start] Me tomaría una eternidad explicarte todo. [end]')
('The boy broke the window with a baseball last weekend.', '[start] El niño rompió la ventana con una pelota de béisbol el fin de semana pasado. [end]')
('They spoke quietly so as not to wake the baby.', '[start] Ellos hablan en voz baja para no despertar al bebé. [end]')


In [6]:
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")


118964 total pairs
83276 training pairs
17844 validation pairs
17844 test pairs


In [None]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 15000
sequence_length = 20
batch_size = 64


def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")


eng_vectorization = TextVectorization(
    max_tokens=vocab_size, output_mode="int", output_sequence_length=sequence_length,
)
spa_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)
train_eng_texts = [pair[0] for pair in train_pairs]
train_spa_texts = [pair[1] for pair in train_pairs]
eng_vectorization.adapt(train_eng_texts)
spa_vectorization.adapt(train_spa_texts)


In [8]:
train_eng_texts[0]

'I want her in my office.'

In [12]:
eng_vectorization(train_eng_texts[0])

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([  3,  32,  42,  10,  19, 426,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])>

In [15]:
eng_vectorization("I I")

<tf.Tensor: shape=(20,), dtype=int64, numpy=array([3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])>

In [79]:
def format_dataset(eng, spa):
    eng = eng_vectorization(eng)
    spa = spa_vectorization(spa)
    return ({"encoder_inputs": eng, "decoder_inputs": spa[:, :-1],}, spa[:, 1:])

def make_dataset(pairs):
    eng_texts, spa_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()
    return dataset


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)


In [80]:
for inputs, targets in train_ds.take(1):
    print(inputs["encoder_inputs"])
    print(inputs["encoder_inputs"])

tf.Tensor(
[[  2  97  68 ...   0   0   0]
 [ 13   8  10 ...   0   0   0]
 [  6  35  26 ...   0   0   0]
 ...
 [  3 216 209 ...   0   0   0]
 [ 19 652  71 ...   0   0   0]
 [  5  41 260 ...   0   0   0]], shape=(64, 20), dtype=int64)
tf.Tensor(
[[  2  97  68 ...   0   0   0]
 [ 13   8  10 ...   0   0   0]
 [  6  35  26 ...   0   0   0]
 ...
 [  3 216 209 ...   0   0   0]
 [ 19 652  71 ...   0   0   0]
 [  5  41 260 ...   0   0   0]], shape=(64, 20), dtype=int64)


2022-05-18 05:51:22.507853: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [93]:
dataset_formatted[0]['encoder_inputs'][0]

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([  3,  32,  42,  10,  19, 426,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])>

In [94]:
dataset_formatted[0]['decoder_inputs'][0]

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([  2,   9,  46,  11,  23, 388,   3,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])>

In [95]:
dataset_formatted[1][0]

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([  9,  46,  11,  23, 388,   3,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])>