## CH 16. Natural Language Processing with RNNs and Attention

### Generating Shakespearean Text Using a Character RNN

In [1]:
import tensorflow as tf

def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """Turn a flat sequence into batches of (input, target) windows.

    Every window holds ``length + 1`` consecutive elements of ``sequence``;
    consecutive windows are offset by one element and incomplete trailing
    windows are dropped. Within each batch, the inputs are every window
    without its last element and the targets are every window without its
    first element, so each target is its input shifted one step ahead.

    Args:
        sequence: data accepted by ``tf.data.Dataset.from_tensor_slices``
            (the callers in this notebook pass a 1-D tensor of token ids).
        length: number of elements in each input (and each target) sequence.
        shuffle: when true, shuffle the windows with a 100,000-element buffer.
        seed: seed forwarded to the shuffle; unused when ``shuffle`` is false.
        batch_size: number of windows per batch.

    Returns:
        A ``tf.data.Dataset`` of ``(inputs, targets)`` pairs that prefetches
        one batch.
    """
    window_size = length + 1
    windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=1, drop_remainder=True
    )
    # window() yields nested datasets; batching each one to its full size
    # flattens it back into a single tensor per window.
    windows = windows.flat_map(lambda nested: nested.batch(window_size))
    if shuffle:
        windows = windows.shuffle(buffer_size=100_000, seed=seed)

    def split_inputs_targets(batch):
        return batch[:, :-1], batch[:, 1:]

    return windows.batch(batch_size).map(split_inputs_targets).prefetch(1)

def next_char(text, temperature=1):
    """Sample one character to follow ``text``.

    Relies on the module-level names ``shakespeare_model`` and
    ``text_vec_layer``. ``shakespeare_model.predict`` is called with a
    one-element list containing ``text`` and is assumed to return per-step
    probabilities -- TODO confirm against wherever that model is defined.

    Args:
        text: the text generated so far.
        temperature: divisor applied to the log-probabilities before sampling.

    Returns:
        The vocabulary entry selected for the sampled id.
    """
    # First batch item, last time step; the -1: slice keeps a leading axis so
    # the result is still 2-D, as tf.random.categorical requires.
    last_step_proba = shakespeare_model.predict([text])[0, -1:]
    scaled_logits = tf.math.log(last_step_proba) / temperature
    sampled = tf.random.categorical(scaled_logits, num_samples=1)
    vocabulary = text_vec_layer.get_vocabulary()
    # The notebook shifts encoded ids down by 2 (``encoded -= 2``), so shift
    # the sampled id back up before indexing the vocabulary.
    return vocabulary[sampled[0, 0] + 2]

def extend_text(text, n_chars=50, temperature=1):
    """Append ``n_chars`` sampled characters to ``text`` and return the result.

    Each new character is drawn by ``next_char`` from the text built so far,
    so every sampled character becomes context for the following one.

    Args:
        text: the seed text to extend.
        n_chars: how many characters to append.
        temperature: forwarded unchanged to ``next_char``.
    """
    generated = text
    for _step in range(n_chars):
        generated += next_char(generated, temperature)
    return generated

shakespeare_url = "http://homl.info/shakespeare"
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

# Character-level tokenizer: the text is lowercased and split per character.
text_vec_layer = tf.keras.layers.TextVectorization(split="character", standardize="lower")

text_vec_layer.adapt([shakespeare_text])
encoded = text_vec_layer([shakespeare_text])[0]

# TextVectorization reserves ids 0 (padding) and 1 (unknown). Neither is
# needed for character-level text, so shift the ids down by 2 and shrink the
# token count accordingly.
encoded -= 2
n_tokens = text_vec_layer.vocabulary_size() - 2

length = 100
tf.random.set_seed(42)

# Only the training windows are shuffled; validation and test keep their order.
train_set = to_dataset(encoded[:1_000_000], length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[1_000_000:1_000_000 + 60_000], length)
test_set = to_dataset(encoded[1_000_000 + 60_000:], length)

# return_sequences=True so the model predicts the next character at every
# time step, matching the (inputs, targets) windows built by to_dataset.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])

# The keyword is `save_best_only`; `best_only` is not a ModelCheckpoint
# argument, so the "keep only the best epoch" behaviour was never enabled.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "shakespeare_rnn.h5",
    monitor="val_accuracy",
    save_best_only=True
)

# Wrapper used by next_char(): takes raw strings, tokenizes them, applies the
# same "-2" id shift as the training data, then runs the character model.
shakespeare_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda X: X - 2),
    model
])

# history = model.fit(train_set, epochs=10, validation_data=valid_set, callbacks=[model_ckpt])

2025-04-07 08:04:06.833576: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-07 08:04:06.886372: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-07 08:04:06.886463: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-07 08:04:06.887769: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-07 08:04:06.893825: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-07 08:04:06.894905: I tensorflow/core/platform/cpu_feature_guard.cc:1

Exception: URL fetch failure on http://homl.info/shakespeare: 503 -- Unavailable, the server is paused.

### Sentiment Analysis for IMDb Reviews

In [1]:
!pip install tensorflow_datasets
import tensorflow as tf
import tensorflow_datasets as tfds

# Carve the official training split 90/10 into train/validation and keep the
# official test split; as_supervised=True yields 2-tuples rather than dicts.
raw_train_set, raw_valid_set, raw_test_set = tfds.load(
    name="imdb_reviews",
    split=["train[:90%]", "train[90%:]", "test"],
    as_supervised=True,
)

tf.random.set_seed(42)
# Only the training data is shuffled.
train_set = raw_train_set.shuffle(5000, seed=42).batch(32).prefetch(1)
valid_set, test_set = (
    split.batch(32).prefetch(1) for split in (raw_valid_set, raw_test_set)
)

vocab_size = 1000
embed_size = 128

# Learn the vocabulary from the training reviews only (labels are dropped).
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
text_vec_layer.adapt(train_set.map(lambda review, label: review))

tf.random.set_seed(42)
model = tf.keras.Sequential()
model.add(text_vec_layer)
# mask_zero=True asks the embedding to emit a mask for id 0.
model.add(tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True))
model.add(tf.keras.layers.GRU(128))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])
history = model.fit(train_set, epochs=2, validation_data=valid_set)

Collecting tensorflow_datasets
  Downloading tensorflow_datasets-4.9.8-py3-none-any.whl (5.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.3/5.3 MB 77.8 MB/s eta 0:00:00
Collecting etils[edc,enp,epath,epy,etree]>=1.6.0
  Downloading etils-1.12.2-py3-none-any.whl (167 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 167.6/167.6 kB 40.2 MB/s eta 0:00:00
Collecting tensorflow-metadata
  Downloading tensorflow_metadata-1.16.1-py3-none-any.whl (28 kB)
Collecting immutabledict
  Downloading immutabledict-4.2.1-py3-none-any.whl (4.7 kB)
Collecting toml
  Downloading toml-0.10.2-py2.py3-none-any.whl (16 kB)
Collecting promise
  Downloading promise-2.3.tar.gz (19 kB)
  Preparing metadata (setup.py) ... done
Collecting array_record>=0.5.0
  Downloading array_record-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

### An Encoder–Decoder Network for Neural Machine Translation

In [3]:
import tensorflow as tf
import numpy as np
from pathlib import Path

url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, extract=True, cache_dir=".")
# The corpus contains accented Spanish text: decode it explicitly as UTF-8
# instead of relying on the platform's default encoding.
text = (Path(path).with_name("spa-eng") / "spa.txt").read_text(encoding="utf-8")

# Drop the Spanish opening punctuation marks, then split every line into a
# tab-separated (English, Spanish) pair.
text = text.replace("¡", "").replace("¿", "")
pairs = [line.split("\t") for line in text.splitlines()]
np.random.seed(42)
np.random.shuffle(pairs)
sentences_en, sentences_es = zip(*pairs)

vocab_size = 1000
max_length = 50
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length
)
text_vec_layer_es = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length
)
text_vec_layer_en.adapt(sentences_en)
# The Spanish vocabulary must also contain the start/end markers.
text_vec_layer_es.adapt([f"startofseq {s} endofseq" for s in sentences_es])

# First 100,000 pairs for training, the rest for validation.
# Decoder inputs begin with "startofseq"; targets end with "endofseq", so the
# target at each step is the decoder input shifted one token ahead.
X_train = tf.constant(sentences_en[:100_000])
X_valid = tf.constant(sentences_en[100_000:])
X_train_dec = tf.constant([f"startofseq {s}" for s in sentences_es[:100_000]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentences_es[100_000:]])
Y_train = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[:100_000]]).numpy()
Y_valid = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[100_000:]]).numpy()

# Seed TensorFlow before any weights are created, as the other sections of
# this notebook do, so weight initialisation is repeatable.
tf.random.set_seed(42)

encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

embed_size = 128
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)
encoder_embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    embed_size,
    mask_zero=True
)
decoder_embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    embed_size,
    mask_zero=True
)
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

# return_state=True: the encoder's final states are what seed the decoder;
# its per-step output is not used further.
encoder = tf.keras.layers.LSTM(512, return_state=True)
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

# One softmax over the Spanish vocabulary per decoder time step.
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=Y_proba)

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="nadam",
    metrics=["accuracy"]
)

model.fit(
    (X_train, X_train_dec),
    Y_train,
    epochs=1,
    validation_data=((X_valid, X_valid_dec), Y_valid)
)



<keras.src.callbacks.History at 0x7fcd250caaa0>

### Attention Is All You Need: The Original Transformer

In [3]:
import tensorflow as tf
import numpy as np
from pathlib import Path

url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, extract=True, cache_dir=".")
# The corpus contains accented Spanish text: decode it explicitly as UTF-8
# instead of relying on the platform's default encoding.
text = (Path(path).with_name("spa-eng") / "spa.txt").read_text(encoding="utf-8")

# Drop the Spanish opening punctuation marks, then split every line into a
# tab-separated (English, Spanish) pair.
text = text.replace("¡", "").replace("¿", "")
pairs = [line.split("\t") for line in text.splitlines()]
np.random.seed(42)
np.random.shuffle(pairs)
sentences_en, sentences_es = zip(*pairs)

vocab_size = 1000
max_length = 50
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length
)
text_vec_layer_es = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length
)
text_vec_layer_en.adapt(sentences_en)
# The Spanish vocabulary must also contain the start/end markers.
text_vec_layer_es.adapt([f"startofseq {s} endofseq" for s in sentences_es])

# First 100,000 pairs for training, the rest for validation.
# Decoder inputs begin with "startofseq"; targets end with "endofseq", so the
# target at each step is the decoder input shifted one token ahead.
X_train = tf.constant(sentences_en[:100_000])
X_valid = tf.constant(sentences_en[100_000:])
X_train_dec = tf.constant([f"startofseq {s}" for s in sentences_es[:100_000]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentences_es[100_000:]])
Y_train = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[:100_000]]).numpy()
Y_valid = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[100_000:]]).numpy()

# Seed TensorFlow before any weights are created, as the other sections of
# this notebook do, so weight initialisation is repeatable.
tf.random.set_seed(42)

encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

embed_size = 128
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)
encoder_embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    embed_size,
    mask_zero=True
)
decoder_embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    embed_size,
    mask_zero=True
)
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

# Trainable positional embeddings, one vector per position up to max_length,
# shared by the encoder and the decoder and added to the token embeddings.
pos_embed_layer = tf.keras.layers.Embedding(max_length, embed_size)
batch_max_len_enc = tf.shape(encoder_embeddings)[1]
encoder_in = encoder_embeddings + pos_embed_layer(tf.range(batch_max_len_enc))
batch_max_len_dec = tf.shape(decoder_embeddings)[1]
decoder_in = decoder_embeddings + pos_embed_layer(tf.range(batch_max_len_dec))

# Build the encoder for the transformer architecture
N = 2
num_heads = 2
dropout_rate = 0.1
n_units = 128
# True where the token id is not 0; the inserted axis lets the mask broadcast
# over the query positions.
encoder_pad_mask = tf.math.not_equal(encoder_input_ids, 0)[:, tf.newaxis]
Z_enc = encoder_in

for _ in range(N):
    # Self-attention sub-block with a residual connection and layer norm.
    skip = Z_enc
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=embed_size,
        dropout=dropout_rate
    )
    Z_enc = attn_layer(Z_enc, value=Z_enc, attention_mask=encoder_pad_mask)
    Z_enc = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z_enc, skip]))
    # Feed-forward sub-block with a residual connection and layer norm.
    skip = Z_enc
    Z_enc = tf.keras.layers.Dense(n_units, activation="relu")(Z_enc)
    Z_enc = tf.keras.layers.Dense(embed_size)(Z_enc)
    Z_enc = tf.keras.layers.Dropout(dropout_rate)(Z_enc)
    Z_enc = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z_enc, skip]))

# Build the decoder for the transformer architecture
decoder_pad_mask = tf.math.not_equal(decoder_input_ids, 0)[:, tf.newaxis]
# Lower-triangular boolean matrix: position i may attend to positions <= i.
causal_mask = tf.linalg.band_part(tf.ones((batch_max_len_dec, batch_max_len_dec), dtype=tf.bool), -1, 0)

encoder_outputs = Z_enc
Z_dec = decoder_in
for _ in range(N):
    # Masked self-attention: query, key and value all come from the decoder.
    # Passing the encoder output as `value` here would turn this layer into a
    # second cross-attention and the decoder would never see its own sequence.
    skip = Z_dec
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=embed_size,
        dropout=dropout_rate
    )
    Z_dec = attn_layer(Z_dec, value=Z_dec, attention_mask=causal_mask & decoder_pad_mask)
    Z_dec = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z_dec, skip]))
    # Cross-attention: decoder queries attend over the encoder's final outputs.
    skip = Z_dec
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=embed_size,
        dropout=dropout_rate
    )
    Z_dec = attn_layer(Z_dec, value=encoder_outputs, attention_mask=encoder_pad_mask)
    Z_dec = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z_dec, skip]))
    # Feed-forward sub-block with a residual connection and layer norm.
    skip = Z_dec
    Z_dec = tf.keras.layers.Dense(n_units, activation="relu")(Z_dec)
    Z_dec = tf.keras.layers.Dense(embed_size)(Z_dec)
    Z_dec = tf.keras.layers.Dropout(dropout_rate)(Z_dec)
    Z_dec = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z_dec, skip]))

# One softmax over the Spanish vocabulary per decoder time step.
Y_proba = tf.keras.layers.Dense(vocab_size, activation="softmax")(Z_dec)

model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="nadam",
    metrics=["accuracy"]
)

model.fit(
    (X_train, X_train_dec),
    Y_train,
    epochs=1,
    validation_data=((X_valid, X_valid_dec), Y_valid)
)




<keras.src.callbacks.History at 0x7f6670074910>

### Hugging Face’s Transformers Library

In [19]:
!pip install --upgrade pip
!pip install transformers
!pip install torch

from transformers import pipeline

# Build a sentiment-analysis pipeline (no model is named, so the library
# picks one) and run it on a single sentence.
classifier = pipeline(task="sentiment-analysis")
result = classifier("I love you")
# Bare last expression so the notebook displays the result.
result



RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
cannot import name 'Backend' from 'torch._C._distributed_c10d' (unknown location)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=968d3c27-50e7-4d42-bdd9-442f6904c1c2' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>