In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import os
from tensorflow.keras import layers
import string

In [None]:
# ---- Hyperparameters -------------------------------------------------------
# Data pipeline
BATCH_SIZE = 64            # examples per batch produced by make_dataset

# Transformer architecture
NUM_HEADS = 12             # attention heads inside every decoder block
NUM_BLOCKS = 12            # number of stacked decoder blocks
EMBED_DIM = 768            # width of the token/position embeddings
DENSE_DIM = 4 * EMBED_DIM  # hidden width of each block's feed-forward net (3072)
DROPOUT_RATE = 0.1         # dropout rate used inside the decoder blocks

# Model input length in tokens; chunks are cut one word longer than this so
# that inputs and next-token targets can be sliced from the same chunk.
CHUNK_LENGTH = 150

In [None]:
# Location of the training corpus CSV (Kaggle input directory).
TRAIN_CSV_PATH = '/kaggle/input/the-bards-best-a-character-modeling-dataset/train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# The corpus is taken from the first cell of the first row of the CSV.
corpus = df.iloc[0, 0]

# Put whitespace between the boundary markers and the corpus. Without it the
# markers fuse with the first and last word (e.g. "<sos>First"), so they never
# appear as separate whitespace-delimited words.
text = '<sos> ' + corpus + ' <eos>'

# Collapse every run of whitespace (newlines, tabs, repeated spaces) into a
# single space and trim both ends.
text = re.sub(r'\s+', ' ', text).strip()

In [None]:
# Whitespace-delimited word statistics for the cleaned corpus.
# (`re` is already imported in the notebook's import cell and is not needed here.)
words = text.split()

# Distinct surface forms: this count is case- and punctuation-sensitive because
# it is taken before any standardization is applied.
unique_words = set(words)

print(f"Total words: {len(words)}")
print(f"Unique words: {len(unique_words)}")


In [None]:
def chunk_text_by_words(text, max_words, stride=None):
    """Split ``text`` into overlapping windows of exactly ``max_words`` words.

    Words are whitespace-delimited (``str.split``) and every chunk is re-joined
    with single spaces. A trailing remainder shorter than ``max_words`` is not
    emitted, so a text with fewer than ``max_words`` words yields ``[]``.

    Args:
        text: The string to split.
        max_words: Number of words in every chunk; must be >= 1.
        stride: Step, in words, between the starts of consecutive chunks.
            Defaults to ``max_words // 2`` (half-overlapping windows), but
            never less than 1.

    Returns:
        list[str]: The chunks, in document order.

    Raises:
        ValueError: If ``max_words`` or ``stride`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError(f"max_words must be >= 1, got {max_words}")
    if stride is None:
        # max_words == 1 would otherwise give a stride of 0, which range() rejects.
        stride = max(1, max_words // 2)
    elif stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")

    words = text.split()
    # Last index at which a full window still fits. The stop bound is
    # inclusive of it (+1); the previous exclusive bound silently dropped
    # the final full window.
    last_start = len(words) - max_words
    return [
        ' '.join(words[start:start + max_words])
        for start in range(0, last_start + 1, stride)
    ]

In [None]:
# Each chunk holds one word more than CHUNK_LENGTH: make_dataset slices the
# tokenized chunk into an input (first CHUNK_LENGTH tokens) and a target that
# is shifted by one position.
chunks = chunk_text_by_words(text, max_words=CHUNK_LENGTH + 1)

In [None]:
# Characters deleted during text standardization: ASCII punctuation plus the
# inverted question mark. The previous literal was a two-character
# mis-decoding of that symbol (U+00C2 followed by U+00BF), which also caused
# the unrelated letter U+00C2 to be stripped from the text.
strip_chars = string.punctuation + "\u00bf"  # U+00BF INVERTED QUESTION MARK

# Square brackets are deliberately left out of the strip set.
# NOTE(review): "<" and ">" remain in the strip set, so the "<sos>"/"<eos>"
# markers added to the corpus lose their angle brackets during standardization.
strip_chars = strip_chars.replace("[", "").replace("]", "")

def custom_standardization(input_string):
    """Lowercase ``input_string`` and delete every character in ``strip_chars``.

    Used as the ``standardization`` callable of the ``TextVectorization``
    layer, so it is written with ``tf.strings`` ops.

    Args:
        input_string: String tensor to normalize.

    Returns:
        The lowercased input with all ``strip_chars`` characters removed.
    """
    # Character class matching any single character listed in strip_chars.
    strip_pattern = f"[{re.escape(strip_chars)}]"
    lowered = tf.strings.lower(input_string)
    stripped = tf.strings.regex_replace(lowered, strip_pattern, "")
    return stripped

# Upper bound on the vocabulary size; also reused as the embedding-table size
# and the width of the model's output layer.
vocab_size = 23850

# One token longer than the model input so that input/target pairs shifted by
# one position can be sliced from the same vectorized chunk.
sequence_length = CHUNK_LENGTH + 1

# Maps raw text to fixed-length rows of integer token ids.
vectorizer = layers.TextVectorization(
    standardization=custom_standardization,
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Learn the vocabulary from the training chunks.
vectorizer.adapt(chunks)

In [None]:
def make_dataset(chunks):
    """Build the next-token-prediction training dataset from text chunks.

    Each chunk is vectorized to ``CHUNK_LENGTH + 1`` token ids. The first
    ``CHUNK_LENGTH`` ids form the model input and the same row shifted left
    by one position forms the target.

    Pipeline order is cache -> shuffle -> batch -> prefetch:
      * ``cache`` comes before ``shuffle`` so that a fresh order is drawn on
        every epoch (caching after shuffling would freeze one order).
      * ``shuffle`` comes before ``batch`` so that individual examples, not
        whole batches, are shuffled.
      * ``prefetch`` is last so it overlaps the entire input pipeline with
        training.

    Args:
        chunks: Text chunks accepted by the module-level ``vectorizer``.

    Returns:
        tf.data.Dataset: Batches of ``(inputs, targets)`` token-id tensors,
        each with ``CHUNK_LENGTH`` columns and up to ``BATCH_SIZE`` rows.
    """
    tokens = vectorizer(chunks)
    tokens_inp = tokens[:, :CHUNK_LENGTH]
    tokens_out = tokens[:, 1:]

    # Use a buffer covering the whole dataset for a uniform shuffle; fall back
    # to the previous fixed size when the example count is unknown or zero.
    num_examples = tokens.shape[0]
    buffer_size = num_examples if num_examples else 1024

    ds = tf.data.Dataset.from_tensor_slices((tokens_inp, tokens_out))
    ds = ds.cache()
    ds = ds.shuffle(buffer_size, reshuffle_each_iteration=True)
    ds = ds.batch(BATCH_SIZE)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

In [None]:
# Training dataset of (input, target) token-id batches built from every chunk.
ds = make_dataset(chunks)

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        sequence_length: Number of rows in the position table, i.e. the
            largest input length the layer can embed.
        vocab_size: Number of rows in the token embedding table.
        output_dim: Embedding width shared by both tables.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, sequence_length, vocab_size, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.positional_embedding = tf.keras.layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim, mask_zero=False)
        # mask_zero=True: token id 0 is treated as padding by this embedding.
        # NOTE(review): this layer defines no compute_mask, so confirm whether
        # the padding mask actually reaches the downstream layers.
        self.token_embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=output_dim, mask_zero=True)
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.output_dim = output_dim

    def call(self, inputs):
        """Embed integer token ids and add the per-position embedding.

        Args:
            inputs: Integer token ids; the last axis is the sequence axis.

        Returns:
            Token embeddings plus position embeddings (the position term is
            broadcast over the leading axes).
        """
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embedding(inputs)
        embedded_positions = self.positional_embedding(positions)
        return embedded_tokens + embedded_positions

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "vocab_size": self.vocab_size,
            "output_dim": self.output_dim,
        })
        return config

In [None]:
class TransformerDecoder(tf.keras.layers.Layer):
    """Decoder-only Transformer block: causal self-attention + feed-forward.

    Both sub-layers use the post-norm residual form
    ``LayerNorm(x + Dropout(SubLayer(x)))``.

    Args:
        num_heads: Number of attention heads.
        embed_dim: Width of the block's input and output; each head uses
            ``embed_dim // num_heads`` as its key dimension.
        dense_dim: Hidden width of the feed-forward network.
        dropout_rate: Dropout rate applied to each sub-layer's output.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, num_heads, embed_dim, dense_dim, dropout_rate, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config can reproduce the constructor call.
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.dropout_rate = dropout_rate

        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim // num_heads)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dense_proj = tf.keras.models.Sequential([
            tf.keras.layers.Dense(dense_dim, activation='relu'),
            tf.keras.layers.Dense(embed_dim)
        ])
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs):
        """Apply causal self-attention and the feed-forward net to ``inputs``.

        Args:
            inputs: Sequence of embeddings; used as query, key and value.

        Returns:
            A tensor produced by the second layer normalization.
        """
        # use_causal_mask=True stops each position attending to later ones.
        attn_out = self.attention(query=inputs,
                                  key=inputs,
                                  value=inputs,
                                  use_causal_mask=True)
        # Dropout is applied to the sub-layer output before the residual add.
        # Previously it was applied after LayerNorm, which dropped elements of
        # the residual stream itself in every block.
        attn_out = self.dropout1(attn_out)
        norm1_out = self.layernorm1(inputs + attn_out)

        dense_proj_out = self.dense_proj(norm1_out)
        dense_proj_out = self.dropout2(dense_proj_out)
        return self.layernorm2(norm1_out + dense_proj_out)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update({
            "num_heads": self.num_heads,
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "dropout_rate": self.dropout_rate,
        })
        return config

In [None]:
# Variable-length sequences of token ids.
inputs = tf.keras.layers.Input(shape=(None,))

# Token + position embeddings.
embedding_layer = PositionalEmbedding(sequence_length, vocab_size, EMBED_DIM)
embeddings = embedding_layer(inputs)

# Stack NUM_BLOCKS identical decoder blocks on top of the embeddings.
x = embeddings
for block_index in range(NUM_BLOCKS):
    decoder_block = TransformerDecoder(NUM_HEADS, EMBED_DIM, DENSE_DIM, DROPOUT_RATE)
    x = decoder_block(x)

# Extra dropout in front of the output head.
x = tf.keras.layers.Dropout(0.5)(x)

# The head applies softmax, so the model emits per-position probabilities over
# the vocabulary; any loss consuming this output must treat it as
# probabilities rather than logits.
output = tf.keras.layers.Dense(vocab_size, activation='softmax')(x)
transformer = tf.keras.models.Model(inputs, output)

In [None]:
# Print the layer-by-layer summary of the assembled model.
transformer.summary()

In [None]:
# The model's final Dense layer already applies softmax, so the loss receives
# probabilities. from_logits must therefore be False: with from_logits=True
# the loss would apply a second softmax to those probabilities, flattening the
# distribution and crippling the gradients.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
opt = tf.keras.optimizers.Adam(learning_rate=3e-4)

transformer.compile(optimizer=opt,
                    loss=loss_fn,
                    metrics=['accuracy'])

# Keep the History object so the loss/accuracy curves can be inspected later.
history = transformer.fit(ds, epochs=50)