In [20]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import os
from tensorflow.keras import layers
import string

In [21]:
# Hyper-parameters shared by the data pipeline and the model cells below.
# Number of examples per batch (passed to Dataset.batch in make_dataset).
BATCH_SIZE = 128
# Attention heads in every TransformerDecoder block.
NUM_HEADS = 8
# Number of TransformerDecoder blocks stacked in the model.
NUM_BLOCKS = 4
# Width of the token/position embeddings and of each block's output.
EMBED_DIM = 512
# Hidden width of the feed-forward projection inside each block.
DENSE_DIM = 2048
# Dropout rate handed to every TransformerDecoder block.
DROPOUT_RATE = 0.2
# Tokens per model input; text chunks are cut with CHUNK_LENGTH + 1 words.
CHUNK_LENGTH = 150

In [22]:
# Load the training CSV. The path is a hard-coded Kaggle input location, so
# this cell needs the dataset mounted at exactly this path.
df = pd.read_csv('/kaggle/input/the-bards-best-a-character-modeling-dataset/train.csv')

In [23]:
# Use the first cell of the first row as the corpus.
# NOTE(review): assumes the whole text is stored in that single cell — confirm against the CSV layout.
text = df.values[0][0]
# Collapse every run of whitespace into a single space and trim both ends.
text = re.sub(r'\s+', ' ', str(text)).strip()

In [24]:
import re

# Whitespace-delimited words of the corpus; case and punctuation are still
# intact here, so "The" and "the," count as different words.
words = text.split()

# Count unique words
unique_words = set(words)
print(f"Total words: {len(words)}")
print(f"Unique words: {len(unique_words)}")


Total words: 182499
Unique words: 23841


In [25]:
def chunk_text_by_words(text, max_words, stride=None):
    """Split ``text`` into overlapping windows of ``max_words`` words.

    Args:
        text: Source string; it is split on whitespace with ``str.split()``.
        max_words: Number of words in every returned chunk (must be >= 1).
        stride: Number of words between the starts of consecutive windows.
            Defaults to ``max_words // 2``, but never less than 1.

    Returns:
        A list of strings, each made of exactly ``max_words`` words joined by
        single spaces. The list is empty when the text holds fewer than
        ``max_words`` words.

    Raises:
        ValueError: If ``max_words`` or ``stride`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError(f"max_words must be >= 1, got {max_words}")
    if stride is None:
        # max_words // 2 is 0 for max_words == 1, which would be an invalid step.
        stride = max(1, max_words // 2)
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")

    words = text.split()
    # Largest start index that still leaves room for a full window. The stop
    # of range() is exclusive, hence the +1 so that this last window is kept.
    last_start = len(words) - max_words
    return [
        ' '.join(words[start:start + max_words])
        for start in range(0, last_start + 1, stride)
    ]

In [26]:
# Every chunk holds CHUNK_LENGTH + 1 words so that make_dataset can split it
# into an input sequence and a target sequence shifted by one token.
# Consecutive windows start 15 words apart, so neighbouring chunks overlap.
chunks = chunk_text_by_words(text, CHUNK_LENGTH+1, 15)

In [27]:
# chunks is a flat list of strings, so the shape is (number_of_chunks,).
print(np.shape(chunks))

(12157,)


In [28]:
# Characters deleted by custom_standardization: ASCII punctuation plus "¿".
strip_chars = string.punctuation + "¿"
# "[" and "]" are taken out of the strip set, so square brackets survive
# standardization.
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

def custom_standardization(input_string):
    """Lower-case the input and delete every character in ``strip_chars``.

    Args:
        input_string: String tensor handed over by the TextVectorization layer.

    Returns:
        A string tensor with the same structure, lower-cased and with the
        characters listed in the module-level ``strip_chars`` removed.
    """
    # Regex character class that matches any single character to be removed.
    removal_pattern = f"[{re.escape(strip_chars)}]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, removal_pattern, "")

# Cap on the vectorizer's vocabulary (passed as max_tokens); it is also the
# size of the model's token embedding table and of its output layer.
vocab_size = 12050
# Length of every vectorized row: one more than the model input length so a
# row can be split into input and shifted target.
sequence_length = CHUNK_LENGTH+1

# Word-level tokenizer producing integer ids, using the custom standardization
# defined above instead of the layer's default.
vectorizer = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
    standardize=custom_standardization
)

# Build the vocabulary from the chunked corpus.
vectorizer.adapt(chunks)

In [29]:
# Vocabulary learned by adapt(), as a list indexed by token id.
vocab = vectorizer.get_vocabulary()
print("Total unique tokens in vocabulary:", len(vocab))

Total unique tokens in vocabulary: 12050


In [30]:
def make_dataset(chunks):
    """Build the training ``tf.data.Dataset`` of (input, target) token rows.

    Every chunk is vectorized with the module-level ``vectorizer``. The input
    is each row without its last token and the target is the same row shifted
    left by one, so target position ``t`` is the token that follows input
    position ``t``.

    Args:
        chunks: Sequence of text chunks accepted by ``vectorizer``.

    Returns:
        A dataset yielding ``(inputs, targets)`` batches of up to
        ``BATCH_SIZE`` rows, reshuffled at example level on every epoch.
    """
    tokens = vectorizer(chunks)
    tokens_inp = tokens[:, :-1]
    tokens_out = tokens[:, 1:]
    num_examples = int(tokens.shape[0])

    ds = tf.data.Dataset.from_tensor_slices((tokens_inp, tokens_out))
    # Stage order matters:
    #  1. cache the deterministic, un-shuffled examples (caching after shuffle
    #     would freeze the first epoch's order for all later epochs);
    #  2. shuffle individual examples before batching, otherwise each batch is
    #     a fixed run of consecutive, heavily overlapping chunks;
    #  3. batch, then prefetch last so it overlaps with training.
    ds = ds.cache()
    ds = ds.shuffle(max(num_examples, 1), reshuffle_each_iteration=True)
    ds = ds.batch(BATCH_SIZE)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

In [31]:
# Training dataset of (input ids, next-token target ids) batches.
ds = make_dataset(chunks)

In [32]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        sequence_length: Number of rows in the position table; inputs must not
            be longer than this because positions ``0..length-1`` are looked
            up in it.
        vocab_size: Number of rows in the token embedding table.
        output_dim: Size of every embedding vector.
        **kwargs: Standard ``Layer`` keyword arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, sequence_length, vocab_size, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.positional_embedding = tf.keras.layers.Embedding(input_dim = sequence_length, output_dim = output_dim, mask_zero=False)
        # NOTE(review): mask_zero=True is set here, but this layer defines no
        # compute_mask, so whether the padding mask reaches later layers
        # depends on the Keras version — confirm if masking is relied upon.
        self.token_embedding = tf.keras.layers.Embedding(input_dim = vocab_size, output_dim= output_dim, mask_zero=True)
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.output_dim = output_dim

    def call(self, inputs):
        """Embed integer token ids and add the embedding of their positions."""
        length = tf.shape(inputs)[-1]
        # One position index per time step; the resulting (length, output_dim)
        # table broadcasts over the leading batch axis in the addition below.
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embedding(inputs)
        embedded_positions = self.positional_embedding(positions)
        return embedded_tokens + embedded_positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "vocab_size": self.vocab_size,
            "output_dim": self.output_dim,
        })
        return config

In [33]:
class TransformerDecoder(tf.keras.layers.Layer):
    """Decoder block: causal self-attention followed by a feed-forward net.

    Both sub-layers use the post-norm residual form
    ``LayerNorm(x + Dropout(SubLayer(x)))``.

    Args:
        num_heads: Number of attention heads.
        embed_dim: Width of the block's input and output; each head uses
            ``embed_dim // num_heads`` dimensions.
        dense_dim: Hidden width of the feed-forward projection.
        dropout_rate: Dropout rate applied to each sub-layer's output.
    """

    def __init__(self, num_heads, embed_dim, dense_dim, dropout_rate):
        super().__init__()
        self.attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads,
                                                           key_dim=embed_dim//num_heads)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dense_proj = tf.keras.models.Sequential([
            tf.keras.layers.Dense(dense_dim, activation='gelu'),
            tf.keras.layers.Dense(embed_dim)
        ])
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs):
        """Apply causal self-attention and the feed-forward net to ``inputs``."""
        attn_out = self.attention(query=inputs,
                                  key=inputs,
                                  value=inputs,
                                  use_causal_mask=True)
        # Dropout goes on the sub-layer output *before* the residual add.
        # Dropping the normalised sum instead would zero parts of the skip
        # path itself, and that loss compounds across stacked blocks.
        attn_out = self.dropout1(attn_out)
        norm1_out = self.layernorm1(inputs + attn_out)

        dense_proj_out = self.dense_proj(norm1_out)
        dense_proj_out = self.dropout2(dense_proj_out)
        return self.layernorm2(norm1_out + dense_proj_out)

In [34]:
# Assemble the language model: embeddings -> NUM_BLOCKS decoder blocks ->
# dropout -> per-position logits over the vocabulary.
inputs = tf.keras.layers.Input(shape=(None,))
embeddings = PositionalEmbedding(sequence_length, vocab_size, EMBED_DIM)(inputs)
x = embeddings
for _ in range(NUM_BLOCKS):
    x = TransformerDecoder(NUM_HEADS, EMBED_DIM, DENSE_DIM, DROPOUT_RATE)(x)
# Use the shared config constant instead of a second hard-coded 0.2.
x = tf.keras.layers.Dropout(DROPOUT_RATE)(x)
# Linear activation: the model emits raw logits, matching from_logits=True in
# the loss and the explicit softmax applied at generation time.
output = tf.keras.layers.Dense(vocab_size, activation='linear', kernel_initializer='glorot_uniform')(x)
transformer = tf.keras.models.Model(inputs, output)

In [35]:
# Print the layer-by-layer structure and parameter counts of the model.
transformer.summary()

In [None]:
import tensorflow.keras.backend as K

# Targets are integer token ids and the model's final Dense layer is linear,
# hence the sparse loss with from_logits=True.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# NOTE(review): the recorded run below stalls near loss 6.78 from epoch 2 on.
# A constant 1e-3 Adam rate with no warm-up may be too aggressive for this
# model — worth verifying with a lower rate or a schedule.
opt = tf.keras.optimizers.Adam(learning_rate=1e-3)

def perplexity(y_true, y_pred):
    """Perplexity metric: exponential of the mean token cross-entropy.

    Args:
        y_true: Integer target token ids.
        y_pred: Raw logits produced by the model for the same positions.

    Returns:
        A scalar tensor, ``exp(mean(cross_entropy))`` over everything passed in.
    """
    token_nll = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    mean_nll = K.mean(token_nll)
    return K.exp(mean_nll)

# Train with the loss/optimizer defined above, reporting accuracy and the
# custom perplexity metric after every step.
transformer.compile(loss = loss_fn,
                    metrics = ['accuracy', perplexity],
                    optimizer=opt)
# NOTE(review): 500 epochs with no callbacks — there is no early stopping and
# no checkpointing, so a stalled run has to be interrupted by hand.
transformer.fit(ds, epochs = 500)

Epoch 1/500


W0000 00:00:1762599395.367847      99 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
W0000 00:00:1762599395.367984      99 assert_op.cc:38] Ignoring Assert operator SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m63/95[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m11s[0m 365ms/step - accuracy: 0.0257 - loss: 7.5184 - perplexity: 2873.5398

W0000 00:00:1762599432.217085      96 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
W0000 00:00:1762599432.217170      96 assert_op.cc:38] Ignoring Assert operator SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 510ms/step - accuracy: 0.0265 - loss: 7.3784 - perplexity: 2388.6340
Epoch 2/500
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 366ms/step - accuracy: 0.0292 - loss: 6.8409 - perplexity: 940.9731
Epoch 3/500
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 368ms/step - accuracy: 0.0290 - loss: 6.8076 - perplexity: 909.0232
Epoch 4/500
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 369ms/step - accuracy: 0.0299 - loss: 6.7864 - perplexity: 889.9170
Epoch 5/500
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 369ms/step - accuracy: 0.0302 - loss: 6.7770 - perplexity: 881.4621
Epoch 6/500
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 369ms/step - accuracy: 0.0304 - loss: 6.7746 - perplexity: 880.1837
Epoch 7/500
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 369ms/step - accuracy: 0.0302 - loss: 6.7905 - perplexity: 893.8

In [None]:
import numpy as np
import tensorflow as tf

def sample_with_temperature(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Values below 1 sharpen the distribution and values above
            1 flatten it. A value <= 0 means greedy decoding: the index of the
            largest probability is returned without sampling.

    Returns:
        The sampled index as a Python ``int``.
    """
    preds = np.asarray(preds).astype("float64")
    if temperature <= 0:
        # Dividing by zero would turn every weight into NaN; the limit of the
        # scaled distribution as temperature -> 0 is the arg-max.
        return int(np.argmax(preds))

    scaled = np.log(preds + 1e-9) / temperature
    # Shift so the largest term is exp(0) == 1. The normalised result is
    # unchanged, but small temperatures can no longer underflow every weight
    # to 0 (which would yield 0/0 == NaN probabilities).
    scaled -= np.max(scaled)
    weights = np.exp(scaled)
    probs = weights / np.sum(weights)
    return int(np.random.choice(len(probs), p=probs))

def generate_text(prompt, max_length=50, temperature=0.8):
    """Extend ``prompt`` word by word by sampling from ``transformer``.

    Args:
        prompt: Seed text.
        max_length: Maximum number of words to append.
        temperature: Sampling temperature forwarded to
            ``sample_with_temperature``.

    Returns:
        The prompt followed by the generated words, separated by spaces.
        Generation stops early when the padding id (0) or the word "eos" is
        sampled; the empty padding token is not appended.
    """
    vocab = vectorizer.get_vocabulary()  # hoisted: identical on every step

    # The vectorizer truncates on the right, so keep only the most recent
    # words before vectorizing; otherwise a long prompt would lose its tail.
    recent_words = " ".join(prompt.split()[-CHUNK_LENGTH:])
    # Drop the zero padding the vectorizer appends. With padding left in, the
    # last position of the model output would belong to a padding slot rather
    # than to the last real token.
    token_ids = [int(t) for t in vectorizer([recent_words])[0].numpy() if t != 0]
    if not token_ids:
        # Nothing survived standardization: fall back to one padding token so
        # the model still receives a non-empty sequence.
        token_ids = [0]

    for _ in range(max_length):
        # Feed at most CHUNK_LENGTH ids, the input length used in training.
        window = tf.constant([token_ids[-CHUNK_LENGTH:]], dtype=tf.int64)
        logits = transformer(window)
        # Last position == prediction for the token after the last real one.
        probs = tf.nn.softmax(logits[0, -1, :]).numpy()
        next_id = int(sample_with_temperature(probs, temperature))
        if next_id == 0:
            break
        next_word = vocab[next_id]
        token_ids.append(next_id)
        prompt += " " + next_word
        if next_word == "eos":
            break
    return prompt

# Python has no printf(); the original call raised NameError.
print(generate_text('my'))