# Capítulo 7. Procesamiento de Lenguaje Natural

## <span style="color:green">0. Preparar el libro de trabajo </span>


In [1]:
#Importaciones comunes
import numpy as np
import pandas as pd

In [2]:
#Importar TF y Keras
import tensorflow as tf

In [3]:
#Preparar Matplotlib
# Bind the pyplot plotting interface (not the top-level matplotlib package) to `plt`.
import matplotlib.pyplot as plt

In [5]:
#Semillas a 42
np.random.seed(42)
tf.random.set_seed(42)

In [6]:
#Print de versión
print(tf.__version__)

2.9.0


## <span style="color:green">1. Cargar el set de Datos e importar el dataset</span>

In [7]:
#Carga los datos a tu archivo de texto
# Read the whole book into a single string.
# No explicit encoding is passed, so open() decodes with the platform default.
# NOTE(review): confirm that default matches the encoding quijote.txt was saved in.
with open("quijote.txt") as f:
    quijote_text = f.read()

In [8]:
#Imprime los primeros 100 caracteres del archivo
print(quijote_text[:100])

En un lugar de la Mancha, de cuyo nombre no quiero acordarme, no ha mucho
tiempo que vivía un hidalg


In [9]:
#Vamos viendo todos los caracteres (en "minúsculas") que trae el archivo
"".join(sorted(set(quijote_text.lower())))

'\n !"\'(),-.01234567:;?]abcdefghijlmnopqrstuvwxyz¡«»¿àáéíïñóùúü'

In [15]:
#Vamos a usar la capa de TextVectorization para mapear el quijote a caracteres
text_vec_layer = tf.keras.layers.TextVectorization(split = "character", standardize = "lower")

In [16]:
text_vec_layer.adapt([quijote_text])

In [17]:
encoded = text_vec_layer([quijote_text])[0]

In [18]:
#Revisa los primeros 100 tokens (IDs de caracter) del quijote ya codificado
encoded[:100]

<tf.Tensor: shape=(100,), dtype=int64, numpy=
array([ 3,  7,  2, 11,  7,  2,  9, 11, 24,  4,  8,  2, 10,  3,  2,  9,  4,
        2, 15,  4,  7, 14, 22,  4, 16,  2, 10,  3,  2, 14, 11, 20,  5,  2,
        7,  5, 15, 21,  8,  3,  2,  7,  5,  2, 19, 11, 12,  3,  8,  5,  2,
        4, 14,  5,  8, 10,  4,  8, 15,  3, 16,  2,  7,  5,  2, 22,  4,  2,
       15, 11, 14, 22,  5, 17, 13, 12,  3, 15, 18,  5,  2, 19, 11,  3,  2,
       23, 12, 23, 25,  4,  2, 11,  7,  2, 22, 12, 10,  4,  9, 24],
      dtype=int64)>

In [19]:
#Abandona los tokens 0 y 1 (padding), no los usaremos
# Derive the shifted IDs from the vectorizer output instead of updating
# `encoded` from itself, so re-running this cell cannot subtract 2 twice.
encoded = text_vec_layer([quijote_text])[0] - 2

In [22]:
#Número total de caracteres distintos
n_tokens = text_vec_layer.vocabulary_size()-2

In [23]:
n_tokens

66

In [24]:
#Número total de caracteres en total
dataset_size = len(encoded)
dataset_size

2071088

In [None]:
#Usemos la clase de tokenizer de keras para codificar cada caracter como entero


## <span style="color:green">2. Separar el set en entrenamiento, validación y prueba</span>

In [26]:
#Crea una función que va a convertir el quijote en un dataset barajeado
def to_dataset(sequence, length, shuffle = False, seed = None, batch_size = 32):
    """Turn a token sequence into batches of (inputs, targets) windows.

    Every window holds ``length + 1`` consecutive tokens and consecutive
    windows start one token apart. Inputs are the first ``length`` tokens of
    a window; targets are the same window shifted one step ahead.

    Args:
        sequence: token IDs to slice into windows.
        length: number of time steps in each input window.
        shuffle: when true, shuffle the windows with a 100,000-element buffer.
        seed: random seed forwarded to the shuffle.
        batch_size: number of windows per batch.

    Returns:
        A prefetched ``tf.data.Dataset`` of ``(inputs, targets)`` batches.
    """
    window_size = length + 1

    def split_inputs_targets(batch):
        # Drop the last step for the inputs and the first step for the targets.
        return batch[:, :-1], batch[:, 1:]

    windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=1, drop_remainder=True)
    # window() yields nested datasets; flatten each one into a single tensor.
    windows = windows.flat_map(lambda nested: nested.batch(window_size))
    if shuffle:
        windows = windows.shuffle(100_000, seed=seed)
    batches = windows.batch(batch_size)
    return batches.map(split_inputs_targets).prefetch(1)

In [27]:
#Vamos a dividir el dataset en Train, valid y test usando la función anterior
length = 100
train_set = to_dataset(encoded[:1_000_000], length = length, shuffle = True, seed = 42 )
valid_set = to_dataset(encoded[1_000_000:1_060_000], length = length)
test_set = to_dataset(encoded[1_060_000:], length=length)

In [28]:
#Creamos un modelo sencillo
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim = n_tokens, output_dim = 16),
    tf.keras.layers.GRU(128, return_sequences = True),
    tf.keras.layers.Dense(n_tokens, activation = "softmax")
])

In [29]:
#Compilamos 
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])

In [31]:
#Dado que el modelo es tan grande, vamos a necesitar checkpoints
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "quijote_modelo", monitor="val_accuracy", save_best_only=True)

In [None]:
#Arma el checkpoint para guardar el modelo del quijote


## <span style="color:green">3. Construir y Entrenar el modelo Char-RNN</span>

In [None]:
#Ejecutamos el modelo. Nota que se puede tomar hasta 10 minutos por epoca, tal vez más
history = model.fit(train_set, validation_data = valid_set, epochs = 10, callbacks = [model_ckpt])

In [39]:
# Código para cargar el modelo ya entrenado (por si no lo quisiste entrenar)
model = tf.keras.models.load_model("quijote_modelo")

In [41]:
#Código para armar el modelo final
# End-to-end pipeline: raw text -> character IDs -> IDs shifted by -2 (the same
# shift applied to the encoded training text) -> trained Char-RNN.
quijote_modelo = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda X: X-2),
    model
])

## <span style="color:green">4. Generando Texto Falso</span>  

In [42]:
#Creamos una función llamada next char para crear texto nuevo 
def next_char(text, temperature = 1):
    """Sample the character that follows `text` according to the Char-RNN.

    Args:
        text: the string generated so far.
        temperature: divisor applied to the log-probabilities before sampling.
            Values close to 0 favour the most likely character; large values
            flatten the distribution towards uniform.

    Returns:
        The sampled character, looked up in the vectorizer's vocabulary.
    """
    # verbose=0: this runs once per generated character, so the default
    # progress bar would flood the cell output.
    y_proba = quijote_modelo.predict([text], verbose=0)[0, -1:]
    rescaled_logits = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]
    # +2 undoes the ID shift applied by the Lambda layer in quijote_modelo.
    return text_vec_layer.get_vocabulary()[char_id + 2]

In [44]:
#Creamos una función llamada extend_text para crear la secuencia completa
def extend_text(text, n_chars = 50, temperature = 1):
    """Append `n_chars` sampled characters to `text` and return the result.

    Each new character is drawn by `next_char` from everything generated so
    far, using the given sampling `temperature`.
    """
    generated = text
    for _step in range(n_chars):
        generated = generated + next_char(generated, temperature)
    return generated

In [45]:
#Cambia la semilla a 42
tf.random.set_seed(42)

In [47]:
#Probamos nuestra función nueva
print(extend_text("La virtud más es perseguida de los malos", temperature = 0.01))

La virtud más es perseguida de los malos de la mano de alguna de mi casa de la mano de alg


In [48]:
#Prueba 2
print(extend_text("La virtud más es perseguida de los malos", temperature=1))

La virtud más es perseguida de los malos hijos incledo el dovea ser su ignore, y no deje e


In [49]:
#Prueba 3
print(extend_text("La virtud más es perseguida de los malos", temperature=100))

La virtud más es perseguida de los malosgeóÍÁ,3óütó-4',úéh,;4)íÉ2"6v(ósá-!c¿üw.md?!.z"à?](


## <span style="color:green">5. Stateful RNN</span>

In [50]:
#Comenzamos definiendo una función que prepara el dataset para nuestro stateful RNN
def to_dataset_for_stateful_rnn(sequence,length):
    """Build batch-size-1 (inputs, targets) windows for a stateful RNN.

    Windows hold ``length + 1`` tokens and start ``length`` tokens apart, so
    each input window begins where the previous one ended. Targets are the
    inputs shifted one step ahead.

    Args:
        sequence: token IDs to slice into windows.
        length: number of time steps in each input window.

    Returns:
        A prefetched ``tf.data.Dataset`` of ``(inputs, targets)`` batches,
        one window per batch.
    """
    window_size = length + 1

    def to_tensor(nested):
        # window() yields nested datasets; collapse each into one tensor.
        return nested.batch(window_size)

    def split_inputs_targets(batch):
        return batch[:, :-1], batch[:, 1:]

    windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=length, drop_remainder=True)
    batches = windows.flat_map(to_tensor).batch(1)
    return batches.map(split_inputs_targets).prefetch(1)

In [52]:
#Ahora separamos en train, valid y test
# Use the same boundaries as the stateless split (train: first 1,000,000
# characters, validation: next 60,000, test: the rest) so the three sets do
# not overlap and the training set does not contain validation/test text.
stateful_train_set = to_dataset_for_stateful_rnn(encoded[:1_000_000], length)
stateful_valid_set = to_dataset_for_stateful_rnn(encoded[1_000_000:1_060_000], length)
stateful_test_set = to_dataset_for_stateful_rnn(encoded[1_060_000:], length)

In [54]:
#OK sigue aplicar el modelo secuencial en keras - embedding, GRU, Dense
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16,
                              batch_input_shape=[1, None]),
    tf.keras.layers.GRU(128, return_sequences=True, stateful=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])

In [53]:
#Al inicio de cada época debemos resetear los estados, antes de volver al inicio del texto
class ResetStatesCallback(tf.keras.callbacks.Callback):
    """Keras callback that clears the model's recurrent states when an epoch begins."""

    def on_epoch_begin(self, epoch, logs=None):
        # `logs` defaults to None, matching the base Callback hook signature,
        # so the hook also works when it is invoked without logs.
        self.model.reset_states()

In [55]:
#Usa una celda diferente para guardar los checkpoints
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "qujote_stateful_modelo",
    monitor="val_accuracy",
    save_best_only=True)

In [56]:
#Y ahora compilamos y Ejecutamos el modelo

model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])

In [None]:
#Y ahora compilamos y ajustamos el modelo
history = model.fit(stateful_train_set, validation_data=stateful_valid_set,
                    epochs=10, callbacks=[ResetStatesCallback(), model_ckpt])

## <span style="color:green">6. Sentiment Analysis</span>

In [57]:
#Importa TensorFlow Datasets como tfds
import tensorflow_datasets as tfds

In [58]:
#Carga los datos de train, valid y test
raw_train_set, raw_valid_set, raw_test_set = tfds.load(
    name = "imdb_reviews",
    split =["train[:90%]", "train[90%:]", "test"],
    as_supervised = True
)

In [59]:
#Arma el dataset de train - barajea con buffer de 5000, bachealo con 32 y prefecth
train_set = raw_train_set.shuffle(5000, seed = 42).batch(32).prefetch(1)

In [60]:
#Validación y test con batch de 32 y prefetch
valid_set = raw_valid_set.batch(32).prefetch(1)
test_set = raw_test_set.batch(32).prefetch(1)

In [65]:
#Vamos viendo unos reviews de muestra - arma un loop que agarre 4 reviews y las decodifique
for review, label in raw_train_set.take(4):
    print(review.numpy().decode("utf-8"))
    print("label", label.numpy())

This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.
label 0
I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development 

In [67]:
#Define 1000 como tamaño de vocabulario
vocab_size = 1000

In [68]:
#Arma tu capa de textVectorization
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens = vocab_size)

In [69]:
#Aplica tu .adapt
text_vec_layer.adapt(train_set.map(lambda reviews, labels: reviews))

In [71]:
#Arma el modelo, capa de text_vec_layer, embedding, GRU y dense, embedding de 128
embed_size = 128
# Baseline sentiment classifier: raw review text -> token IDs -> embeddings ->
# GRU -> single sigmoid unit. The embedding layer does not set mask_zero here.
model = tf.keras.Sequential([
    text_vec_layer,
    # Use the embed_size constant instead of repeating the literal 128.
    tf.keras.layers.Embedding(vocab_size, embed_size),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation = "sigmoid")
])

In [72]:
#Compila con crossentropy binaria y nadam
model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])

In [73]:
#Ejecuta tu modelo
history = model.fit(train_set, validation_data=valid_set, epochs=2)

Epoch 1/2
Epoch 2/2


In [75]:
#Evalúa tu modelo
model.evaluate(test_set)



[0.6941766142845154, 0.5]

In [77]:
#Predice un test set
model.predict(test_set.take(1))



array([[0.51518875],
       [0.51518875],
       [0.45218506],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875],
       [0.51518875]], dtype=float32)

In [78]:
#Checa los resultados que sí eran de verdad
for review,label in test_set.take(1):
    print(label)

tf.Tensor([1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 1 0 1 0 0 1 0 0 1 1 0 0 0 1 1 1], shape=(32,), dtype=int64)


### <span style="color:blue">6.1 Masking</span>

In [84]:
#Arma tu modelo con Masking
embed_size=128
model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero = True),
    tf.keras.layers.GRU(128),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

In [85]:
#Compilalo
model.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])

In [86]:
#Ajustalo
history = model.fit(train_set, validation_data=valid_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [87]:
#Evalualo
model.evaluate(test_set)



[0.3535629212856293, 0.8510400056838989]

In [88]:
#Predict al primer bache vs
model.predict(test_set.take(1))




array([[0.92258024],
       [0.978152  ],
       [0.05177713],
       [0.37396833],
       [0.9917114 ],
       [0.992811  ],
       [0.99797446],
       [0.9949511 ],
       [0.8283124 ],
       [0.9814276 ],
       [0.01334537],
       [0.64912254],
       [0.8617164 ],
       [0.14778893],
       [0.77455235],
       [0.8525385 ],
       [0.9904516 ],
       [0.41010854],
       [0.9301605 ],
       [0.00269966],
       [0.0019085 ],
       [0.98036754],
       [0.02791318],
       [0.72845066],
       [0.99120444],
       [0.30135474],
       [0.07820202],
       [0.516478  ],
       [0.26912013],
       [0.9400048 ],
       [0.8926644 ],
       [0.9854929 ]], dtype=float32)

In [89]:
#La realidad de los labels
for review,label in test_set.take(1):
    print(label)

tf.Tensor([1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 1 0 1 0 0 1 0 0 1 1 0 0 0 1 1 1], shape=(32,), dtype=int64)


## <span style="color:green">7. Encoder - Decoder para Traducción</span>

In [90]:
#Importa IO y lee el texto de traducciones inglés y español
import io
# Read the tab-separated English/Spanish sentence file. The context manager
# closes the file handle as soon as the text has been read.
with io.open("spa.txt", "r", encoding = "utf8") as read_file:
    text = read_file.read()

In [91]:
text[:100]

'Go.\tVe.\nGo.\tVete.\nGo.\tVaya.\nGo.\tVáyase.\nHi.\tHola.\nRun!\t¡Corre!\nRun.\tCorred.\nWho?\t¿Quién?\nFire!\t¡Fueg'

In [92]:
#Traite numpy y quita los caracteres superfluos en español. Arma las parejas y barajealas. Separalas en oraciones en, oraciones es
import numpy as np
# Strip the Spanish opening punctuation marks from the whole text.
text = text.replace("¡", "").replace("¿","")
# One list per line, split on the tab that separates English from Spanish.
pairs = [line.split("\t") for line in text.splitlines()]
# Seed right before shuffling so the pair order is reproducible.
np.random.seed(42)
np.random.shuffle(pairs)
# Transpose the pairs into two parallel tuples.
# NOTE(review): this unpacking assumes every line has exactly two tab-separated fields.
sentences_en, sentences_es = zip(*pairs)

In [97]:
#Imprime 3 oraciones de inglés/español
for i in range(3):
    print(sentences_en[i], "=>", sentences_es[i])

How boring! => Qué aburrimiento!
I love sports. => Adoro el deporte.
Would you like to swap jobs? => Te gustaría que intercambiemos los trabajos?


In [99]:
#Vocab Size de 1000 y max length de 50
vocab_size = 1000
max_length = 50

In [100]:
#Convierte ingles a números
text_vec_layer_en = tf.keras.layers.TextVectorization(vocab_size, output_sequence_length=max_length)

In [101]:
#Convierte Español a números
text_vec_layer_es = tf.keras.layers.TextVectorization(vocab_size, output_sequence_length = max_length)

In [102]:
#Usa Adapt para computar un vocabulario de strings desde los tokens en tu vocabulario vectorizado (se tarda)
text_vec_layer_en.adapt(sentences_en)

In [103]:
text_vec_layer_es.adapt(([f"startofseq {s} endofseq" for s in sentences_es]))

In [104]:
#Regresa los 10 tokens mas usados de la capa de ingles
text_vec_layer_en.get_vocabulary()[:10]

['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']

In [105]:
#Regresa los 10 tokens mas usados de la capa de español
text_vec_layer_es.get_vocabulary()[:10]

['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']

In [106]:
#Define tus sets de train, validación, train y validación para el decoder, y las Ys
# Encoder inputs: raw English sentences (first 100,000 pairs train, the rest validate).
X_train = tf.constant(sentences_en[:100_000])
X_valid = tf.constant(sentences_en[100_000:])
# Decoder inputs: Spanish sentences prefixed with the start-of-sequence token.
X_train_dec = tf.constant([f"startofseq {s}" for s in sentences_es[:100_000]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentences_es[100_000:]])
# Targets: Spanish sentences followed by the end-of-sequence token, converted
# to token IDs by the Spanish vectorization layer.
Y_train = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[:100_000]])
Y_valid = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[100_000:]])

In [107]:
#Define las capas de entradas del encoder y del decoder
tf.random.set_seed(42)
encoder_inputs = tf.keras.layers.Input(shape=[], dtype = tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

In [108]:
#Ahora asignale tus textos vectorizados a esas capas nuevas
embed_size = 128
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)

In [109]:
#Prepara una capa de embedding cada uno(decoder y encoder)
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero = True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)

In [116]:
#Y ahora mete tus input_ids a la capa de embedding
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [112]:
#Define el encoder como 1 sola capa LSTM de 512 neuronas, pasa tus embeddings por esa capa y escupe los outputs y el state
encoder = tf.keras.layers.LSTM(512, return_state = True)
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

In [117]:
#Repite el proceso correspondiente para el decoder
decoder = tf.keras.layers.LSTM(512, return_sequences = True)
decoder_outputs = decoder(decoder_embeddings, initial_state = encoder_state)

In [118]:
#La capa de salida será una capa Densa del mismo tamaño que el vocabulario con activacion softmax, nos escupira la probabilidad Y de alguna palabra tomando los decoder outputs como entrada
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

In [120]:
# Arma tu modelo
model = tf.keras.Model(inputs = [encoder_inputs, decoder_inputs], outputs = [Y_proba])

In [121]:
#Compila
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])

In [122]:
#Arma un checkpoint para guardar el modelo
tmodel_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "traductor_modelo", monitor="val_accuracy", save_best_only=True)

In [124]:
#Ejecuta a 10 épocas, nota que tienes que meter 2 sets de entrenamiento y validación para X (enc y dec) - se va a tardar como 20 minutos por época sin GPU! (1 min con GPU)
history=model.fit((X_train, X_train_dec), Y_train, epochs = 10, validation_data=((X_valid, X_valid_dec), Y_valid), callbacks = [tmodel_ckpt])

Epoch 1/10



INFO:tensorflow:Assets written to: traductor_modelo\assets


INFO:tensorflow:Assets written to: traductor_modelo\assets


Epoch 2/10



INFO:tensorflow:Assets written to: traductor_modelo\assets


INFO:tensorflow:Assets written to: traductor_modelo\assets


Epoch 3/10



INFO:tensorflow:Assets written to: traductor_modelo\assets


INFO:tensorflow:Assets written to: traductor_modelo\assets


Epoch 4/10



INFO:tensorflow:Assets written to: traductor_modelo\assets


INFO:tensorflow:Assets written to: traductor_modelo\assets


Epoch 5/10



INFO:tensorflow:Assets written to: traductor_modelo\assets


INFO:tensorflow:Assets written to: traductor_modelo\assets


Epoch 6/10



INFO:tensorflow:Assets written to: traductor_modelo\assets


INFO:tensorflow:Assets written to: traductor_modelo\assets


Epoch 7/10



INFO:tensorflow:Assets written to: traductor_modelo\assets


INFO:tensorflow:Assets written to: traductor_modelo\assets


Epoch 8/10
Epoch 9/10
Epoch 10/10


In [135]:
# Vamos ahora a armar el traductor
def translate(sentence_en):
    """Greedily translate an English sentence into Spanish, one word at a time.

    At each step the notebook-level `model` is fed the English sentence plus
    the translation produced so far, and the most probable next word is
    appended. Decoding stops when the model emits "endofseq" or after
    `max_length` words.

    Args:
        sentence_en: the English sentence to translate.

    Returns:
        The Spanish translation as a single space-separated string.
    """
    # Both values are the same on every step, so build them once.
    X = np.array([sentence_en])
    vocabulary_es = text_vec_layer_es.get_vocabulary()
    translation = ""
    for word_idx in range(max_length):
        X_dec = np.array(["startofseq" + translation])
        # verbose=0: predict runs once per word; skip the per-call progress bar.
        y_proba = model.predict((X, X_dec), verbose=0)[0, word_idx]
        predicted_word_id = np.argmax(y_proba)
        predicted_word = vocabulary_es[predicted_word_id]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()

In [136]:
#Prueba con i like soccer
translate("I like soccer")



'me gusta el fútbol'

In [138]:
#Prueba con una oración larga como i like soccer and going to the beach
translate("I like soccer and also going to the beach")



'[UNK] el fútbol y también ir a la playa'

### <span style="color:blue">7.1 Capas Recurrentes Bidireccionales</span>

In [None]:
#Envuelve una capa LSTM (con return_state) en una capa bidireccional para usarla como encoder
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_state=True))


## <span style="color:green">8. Atención</span>

In [128]:
#Envuelve tu encoder en una capa bidireccional, como hace ratito
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_sequences = True, return_state = True))

In [129]:
#Vamos a volver a armar esta parte del modelo, casi igualita a la anterior, para que cache nuestra nueva onda bidireccional
encoder_outputs, *encoder_state = encoder(encoder_embeddings)
# The bidirectional LSTM returns four state tensors (two per direction).
# Pair the even-indexed and the odd-indexed ones and concatenate each pair,
# giving two 512-wide states (2 x 256) that fit the 512-unit decoder.
encoder_state = [tf.concat(encoder_state[::2], axis=-1),tf.concat(encoder_state[1::2], axis=-1)]
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
# Must be named decoder_outputs: the attention layer reads this variable, and
# any other name leaves it pointing at the previous model's decoder output.
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [131]:
#Armamos las capas de atención en keras
attention_layer = tf.keras.layers.Attention()
attention_outputs = attention_layer([decoder_outputs, encoder_outputs])
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(attention_outputs)

In [132]:
#Y terminamos el modelo
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])

In [133]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])

In [134]:
model.fit((X_train, X_train_dec), Y_train, epochs=10,
          validation_data=((X_valid, X_valid_dec), Y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ebc13b49d0>