# Lab 8: Text generation with transformer

* [Keras example](https://keras.io/examples/generative/text_generation_with_miniature_gpt/)
* [Tensorflow example](https://www.tensorflow.org/text/tutorials/transformer)
* [mlnuggets](https://www.machinelearningnuggets.com/transformer-decoder/)

* [tensorflow GPU for cluster](https://www.tensorflow.org/install/pip)

In [2]:
# import setuptools.dist
import keras
import tensorflow as tf
from keras import layers, Model, Sequential, ops
from keras.layers import TextVectorization
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import re

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

2025-01-07 20:08:57.765573: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736244537.785412 3622431 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736244537.791619 3622431 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-07 20:08:57.812198: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')]


In [4]:
name_url = ("Crime and Punishment", 'https://www.gutenberg.org/files/2554/2554-0.txt')

filepath = keras.utils.get_file(f'{name_url[0]}.txt', origin=name_url[1])
text_f = ''
with open(filepath, encoding='utf-8') as f:
    text_f = f.read()[10000:] # skip preface +-

text = text_f

text = re.sub(r"[\"\`\'\’\“\”]", r"", text_f)
text = re.sub(r"[\(\)]", r"", text_f)
text = re.sub(r"[\.\!\?]", "!", text)
text = re.sub(r"\s+", " ", text)

text_list = text.replace('\n', ' ').split('!')
text_list = list(map(lambda x: x.strip(), text_list))
print( len(text_list) )

text_list = list(filter(None, text_list))

import random
random.shuffle(text_list)

length = len(text_list)
text_train = text_list[:int(0.8*length)]
text_valid = text_list[int(0.8*length):]

20594


In [5]:
words = sorted([(len(a:=line.split(" ")), a, line) for line in text_list], reverse=1)
MAX_TOKENS = MAX_LEN = max([len(line.split(" ")) for line in text_list])

vectorize_layer = TextVectorization(
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

vectorize_layer.adapt(text_list)
vocab = vectorize_layer.get_vocabulary()
print(len(vocab))

word_from_id = tf.keras.layers.StringLookup(vocabulary=vocab, mask_token="", oov_token="[UNK]",  invert=True)

I0000 00:00:1736244541.868732 3622431 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 31490 MB memory:  -> device: 0, name: NVIDIA A100-PCIE-40GB, pci bus id: 0000:04:00.0, compute capability: 8.0
I0000 00:00:1736244541.871781 3622431 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 32374 MB memory:  -> device: 1, name: NVIDIA A100-PCIE-40GB, pci bus id: 0000:08:00.0, compute capability: 8.0
I0000 00:00:1736244541.874649 3622431 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 32362 MB memory:  -> device: 2, name: NVIDIA A100-PCIE-40GB, pci bus id: 0000:85:00.0, compute capability: 8.0
I0000 00:00:1736244541.877101 3622431 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 32374 MB memory:  -> device: 3, name: NVIDIA A100-PCIE-40GB, pci bus id: 0000:89:00.0, compute capability: 8.0


11697


In [6]:
BATCH_SIZE = 64
BUFFER_SIZE = 256

def preprocess(text_l: list):

    def preprocess_text(text):
        text = tf.expand_dims(text, -1)
        tokenized_sentences = vectorize_layer(text)
        x = tokenized_sentences[:, :-1]
        y = tokenized_sentences[:, 1:]
        return x, y
    
    return (
        tf.data.Dataset.from_tensor_slices(text_l)
            .shuffle(BUFFER_SIZE)
            .batch(BATCH_SIZE)
            .map(preprocess_text)
            .prefetch(tf.data.AUTOTUNE)
    )

dataset_train = preprocess(text_train)
dataset_valid = preprocess(text_valid)

In [7]:
for X_train, y_train in dataset_train.take(1):
    pass
print(X_train.shape)
print(y_train.shape)

(64, 102)
(64, 102)


2025-01-07 20:09:02.732329: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [8]:
def positional_encoding(length, depth):
    depth = depth / 2

    positions = np.arange(length)[:, np.newaxis]  # (seq, 1)
    depths = np.arange(depth)[np.newaxis, :] / depth  # (1, depth)

    angle_rates = 1 / (10000**depths)  # (1, depth)
    angle_rads = positions * angle_rates  # (pos, depth)

    pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1)

    return tf.cast(pos_encoding, dtype=tf.float32)

class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)
        # This factor sets the relative scale of the embedding and positonal_encoding.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

In [9]:
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

class CausalSelfAttention(BaseAttention):
    def call(self, x):
        attn_output, attn_scores = self.mha(
            query=x, value=x, key=x, return_attention_scores=True, use_causal_mask=True
        )
        # Cache the attention scores for plotting later.
        self.last_attn_scores = attn_scores
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x

In [10]:
class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(dff, activation="relu"),
                tf.keras.layers.Dense(d_model),
                tf.keras.layers.Dropout(dropout_rate),
            ]
        )
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        x = self.add([x, self.seq(x)])
        x = self.layer_norm(x)
        return x

class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1):
        super(DecoderLayer, self).__init__()

        self.causal_self_attention = CausalSelfAttention(
            num_heads=num_heads, key_dim=d_model, dropout=dropout_rate
        )

        self.ffn = FeedForward(d_model, dff)

    def call(self, x):
        x = self.causal_self_attention(x=x)
        # Cache the last attention scores for plotting later
        self.last_attn_scores = self.causal_self_attention.last_attn_scores
        x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
        return x

In [11]:
class Decoder(tf.keras.layers.Layer):
    def __init__(
        self, *, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate=0.1
    ):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.dec_layers = [
            DecoderLayer(
                d_model=d_model, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate
            )
            for _ in range(num_layers)
        ]

        self.last_attn_scores = None

    def call(self, x):
        # `x` is token-IDs shape (batch, target_seq_len)
        x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

        x = self.dropout(x)

        for i in range(self.num_layers):
            x = self.dec_layers[i](x)

        self.last_attn_scores = self.dec_layers[-1].last_attn_scores

        # The shape of x is (batch_size, target_seq_len, d_model).
        return x

In [12]:
X_train_emb = PositionalEmbedding(len(vocab), 512)(X_train)
sample_decoder = Decoder(
    num_layers=4, d_model=512, num_heads=8, dff=2048, vocab_size=len(vocab)
)

# Print the shapes.
print(X_train.shape)
print(X_train_emb.shape)
output = sample_decoder(x=X_train)
print(output.shape)

(64, 102)
(64, 102, 512)
(64, 102, 512)


In [13]:
class Transformer(tf.keras.Model):
    def __init__(
        self, *, num_layers, d_model, num_heads, dff, input_vocab_size, dropout_rate=0.1
    ):
        super().__init__()

        self.decoder = Decoder(
            num_layers=num_layers,
            d_model=d_model,
            num_heads=num_heads,
            dff=dff,
            vocab_size=input_vocab_size,
            dropout_rate=dropout_rate,
        )

        self.final_layer = tf.keras.layers.Dense(input_vocab_size)

    def call(self, inputs):
        # To use a Keras model with `.fit` you must pass all your inputs in the
        # first argument.
        x = inputs

        x = self.decoder(x)  # (batch_size, target_len, d_model)

        # Final linear layer output.
        logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

        try:
            # Drop the keras mask, so it doesn't scale the losses/metrics.
            # b/250038731
            del logits._keras_mask
        except AttributeError:
            pass

        # Return the final output and the attention weights.
        return logits

In [14]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
learning_rate = CustomSchedule(512)

optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9
)

def masked_loss(label, pred):
    mask = label != 0
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction="none"
    )
    loss = loss_object(label, pred)

    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask
    loss = tf.reduce_sum(loss) / tf.reduce_sum(mask)
    return loss


def masked_accuracy(label, pred):
    pred = tf.argmax(pred, axis=2)
    label = tf.cast(label, pred.dtype)
    match = label == pred

    mask = label != 0

    match = match & mask

    match = tf.cast(match, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(match) / tf.reduce_sum(mask)

In [15]:
transformer = Transformer(num_layers=4, d_model=512, num_heads=8, dff=2048, input_vocab_size=len(vocab))

In [16]:
transformer.summary()

In [17]:
transformer.compile(loss=masked_loss, optimizer=optimizer, metrics=[masked_accuracy])
history = transformer.fit(dataset_train, epochs=10, validation_data=dataset_valid)

Epoch 1/10


I0000 00:00:1736244570.840616 3624437 service.cc:148] XLA service 0x7f4098013000 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1736244570.840662 3624437 service.cc:156]   StreamExecutor device (0): NVIDIA A100-PCIE-40GB, Compute Capability 8.0
I0000 00:00:1736244570.840669 3624437 service.cc:156]   StreamExecutor device (1): NVIDIA A100-PCIE-40GB, Compute Capability 8.0
I0000 00:00:1736244570.840673 3624437 service.cc:156]   StreamExecutor device (2): NVIDIA A100-PCIE-40GB, Compute Capability 8.0
I0000 00:00:1736244570.840679 3624437 service.cc:156]   StreamExecutor device (3): NVIDIA A100-PCIE-40GB, Compute Capability 8.0
2025-01-07 20:09:31.055100: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
W0000 00:00:1736244571.610406 3624437 assert_op.cc:38] Ignoring Assert operator compile_loss/masked_loss/sparse_categorical_crosse

[1m  1/204[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:14:20[0m 22s/step - loss: 9.4069 - masked_accuracy: 0.0000e+00

I0000 00:00:1736244581.367369 3624437 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m203/204[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 95ms/step - loss: 8.9801 - masked_accuracy: 0.0205

W0000 00:00:1736244601.276260 3624440 assert_op.cc:38] Ignoring Assert operator compile_loss/masked_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert








[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step - loss: 8.9776 - masked_accuracy: 0.0205

W0000 00:00:1736244610.910983 3624440 assert_op.cc:38] Ignoring Assert operator compile_loss/masked_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
W0000 00:00:1736244613.819643 3624438 assert_op.cc:38] Ignoring Assert operator compile_loss/masked_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert











[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 175ms/step - loss: 8.9751 - masked_accuracy: 0.0206 - val_loss: 7.3582 - val_masked_accuracy: 0.0405
Epoch 2/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 103ms/step - loss: 6.9115 - masked_accuracy: 0.0538 - val_loss: 6.2171 - val_masked_accuracy: 0.1102
Epoch 3/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 103ms/step - loss: 5.9794 - masked_accuracy: 0.1213 - val_loss: 5.7610 - val_masked_accuracy: 0.1393
Epoch 4/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 105ms/step - loss: 5.4300 - masked_accuracy: 0.1528 - val_loss: 5.5649 - val_masked_accuracy: 0.1518
Epoch 5/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 105ms/step - loss: 5.0438 - masked_accuracy: 0.1773 - val_loss: 5.5241 - val_masked_accuracy: 0.1557
Epoch 6/10
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 105ms/step - loss: 4.7135 - masked_acc

In [18]:
transformer.summary()

In [20]:
class Generator(tf.Module):
    def __init__(
        self,
        tokenizer,
        vocabulary,
        transformer,
        max_new_tokens,
        temperature=0.0,
    ):
        self.tokenizer = tokenizer
        self.transformer = transformer
        self.vocabulary = vocabulary
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature

    def __call__(self, sentence, max_length=MAX_TOKENS):
        sentence = self.tokenizer(sentence)
        sentence = tf.expand_dims(sentence, axis=0)
        encoder_input = sentence
        # `tf.TensorArray` is required here (instead of a Python list), so that the
        # dynamic-loop can be traced by `tf.function`.
        output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)

        print(f"Generating {self.max_new_tokens} tokens")
        for i in tf.range(self.max_new_tokens):
            output = tf.transpose(output_array.stack())
            predictions = self.transformer(encoder_input, training=False)

            # Select the last token from the `seq_len` dimension.
            predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.
            if self.temperature == 0.0:
                # greedy sampling, output always the same
                predicted_id = tf.argmax(predictions, axis=-1)
            else:
                predictions = predictions / self.temperature
                predicted_id = tf.random.categorical(predictions[0], num_samples=1)

            # Concatenate the `predicted_id` to the output which is given to the
            # decoder as its input.
            output_array = output_array.write(i + 1, predicted_id[0])
            encoder_input = tf.experimental.numpy.append(encoder_input, predicted_id[0])
            encoder_input = tf.expand_dims(encoder_input, axis=0)

        output = tf.transpose(output_array.stack())
        # The output shape is `(1, tokens)`.
        id_to_word = tf.keras.layers.StringLookup(
            vocabulary=self.vocabulary, mask_token="", oov_token="[UNK]", invert=True
        )

        print(f"Using temperature of {self.temperature}")
        text = id_to_word(output)
        tokens = output

        # `tf.function` prevents us from using the attention_weights that were
        # calculated on the last iteration of the loop.
        # So, recalculate them outside the loop.
        self.transformer(output[:, :-1], training=False)
        attention_weights = self.transformer.decoder.last_attn_scores

        return text, tokens, attention_weights

In [21]:
max_new_tokens = 50
temperature = 0.8
generator = Generator(
    vectorize_layer, vocab, transformer, max_new_tokens, temperature, 
)
def print_generation(sentence, generated_text):
    print(f'{"Input:":15s}: {sentence}')
    print(f'{"Generation":15s}: {generated_text}')
    print(f'{"Text":15s}: {sentence} {" ".join(list(map(lambda x: x.numpy().decode("utf-8"), generated_text[0])))}')


In [22]:
sentence = "Raskolnikov was a student who"
generated_text, generated_tokens, attention_weights = generator(sentence)
print_generation(sentence, generated_text)

Generating 50 tokens
Using temperature of 0.8
Input:         : Raskolnikov was a student who
Generation     : [[b'' b'than' b'a' b'moment' b'of' b'lizaveta' b'who' b'he' b'had' b'on'
  b'his' b'first' b'place' b'he' b'had' b'even' b'brought' b'him' b'a'
  b'hundred' b'roubles' b'on' b'his' b'heart' b'even' b'more' b'even'
  b'he' b'could' b'not' b'whether' b'he' b'blow' b'blow' b'he' b'was'
  b'simply' b'a' b'new' b'fellow' b'at' b'once' b'into' b'there' b'on'
  b'that' b'question' b'i' b'sum' b'of' b'everything']]
Text           : Raskolnikov was a student who  than a moment of lizaveta who he had on his first place he had even brought him a hundred roubles on his heart even more even he could not whether he blow blow he was simply a new fellow at once into there on that question i sum of everything
