# Building LLMs from scratch

In [2]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
import keras.ops as K
import re
import tiktoken # Byte Pair Encoding 
import matplotlib.pyplot as plt
import plotly.graph_objects as go
fig = go.Figure(
    data=[go.Bar(y=[2, 1, 3])],
    layout_title_text="A Figure Displayed with fig.show()"
)
fig.show()

Load the shakespeare text dataset: contains 100,000 characters!

In [2]:
dataset = tfds.load(name='tiny_shakespeare')

train = dataset['train']
for text in train:
    x = text['text'].numpy().decode('utf-8')
print(x[:100])
print(f"\nlength of the entire text file: {len(x)}")

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

length of the entire text file: 1003854


2025-03-30 16:49:04.984277: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2025-03-30 16:49:04.984313: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:137] retrieving CUDA diagnostic information for host: ragab
2025-03-30 16:49:04.984317: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:144] hostname: ragab
2025-03-30 16:49:04.984475: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:168] libcuda reported version is: 560.35.3
2025-03-30 16:49:04.984497: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:172] kernel reported version is: 560.35.3
2025-03-30 16:49:04.984500: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:259] kernel version seems to match DSO: 560.35.3
2025-03-30 16:49:05.108383: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:376] The default buffer size is 2621

## Setting up a custom simple tokenizer 
Converts normal text into tokens using regex ----> then converts from tokens into token ids using a custom class

In [3]:
tokens = re.split(r'([,.:;?_!"()\']|--|\s)', x)
print(tokens[:100])

vocabulary = sorted(set(tokens))

# Create a Dictionary with additional special tokens ("<|unk|>", "<|eos|>") 
# for an unkown word or the end of text (incase I train with multiple text sources).\

dictionary = {item:value for value, item in enumerate(vocabulary)}
dictionary["<|unk|>"] = len(dictionary)
dictionary["<|eos|>"] = len(dictionary)
#dictionary["<|bos|>"] = len(dictionary)
#dictionary["<|pad|>"] = len(dictionary)

['First', ' ', 'Citizen', ':', '', '\n', 'Before', ' ', 'we', ' ', 'proceed', ' ', 'any', ' ', 'further', ',', '', ' ', 'hear', ' ', 'me', ' ', 'speak', '.', '', '\n', '', '\n', 'All', ':', '', '\n', 'Speak', ',', '', ' ', 'speak', '.', '', '\n', '', '\n', 'First', ' ', 'Citizen', ':', '', '\n', 'You', ' ', 'are', ' ', 'all', ' ', 'resolved', ' ', 'rather', ' ', 'to', ' ', 'die', ' ', 'than', ' ', 'to', ' ', 'famish', '?', '', '\n', '', '\n', 'All', ':', '', '\n', 'Resolved', '.', '', ' ', 'resolved', '.', '', '\n', '', '\n', 'First', ' ', 'Citizen', ':', '', '\n', 'First', ',', '', ' ', 'you', ' ', 'know', ' ']


In [4]:
class SimpleTockenizer:
    """
    A simple tokenizer which using a dictionary converts the text into token ids 
    """
    def __init__(self, dictionary):
        self.dictionary = dictionary
        self.dictionary_reverse = {value:item for item, value in dictionary.items()}

    def encode(self, text):
        split = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = list()
        for item in split:
            try: 
                tokens.append(self.dictionary[item])
            except:
                tokens.append(self.dictionary["<|unk|>"])
                
        return tokens

    def decode(self, tokens):
        text = "".join([self.dictionary_reverse[token] for token in tokens])
        return text

In [5]:
tokenizer = SimpleTockenizer(dictionary)
tokens = tokenizer.encode(x)
print(tokens[:10])

text = tokenizer.decode(tokens)
print(text[:100])

[864, 2, 457, 11, 0, 1, 249, 2, 12630, 2]
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


## Using the GPT2 tokenizer form the tiktoken library
The GPT2 tokenizer uses byte pair encoding which creates tokens for entire words and for sub-word characters

In [6]:
tiktok = tiktoken.get_encoding("gpt2")
integers = tiktok.encode(x, allowed_special={"<|eos|>"})
print(integers[:50])

strings = tiktok.decode(integers[:50])
print(strings)

[5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13, 198, 198, 3237, 25, 198, 5248, 461, 11, 2740, 13, 198, 198, 5962, 22307, 25, 198, 1639, 389, 477, 12939, 2138, 284, 4656, 621, 284, 1145, 680, 30, 198, 198, 3237, 25, 198, 4965, 5634, 13]
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved.


## Setting up the input and target values using the Windowing technique

In [7]:
context_size = 4
for i in range(1, context_size+1):
    inputs = integers[:i]
    target = integers[i]
    print(tiktok.decode(inputs) + '------->' + tiktok.decode([target]))
    

First-------> Citizen
First Citizen------->:
First Citizen:------->

First Citizen:
------->Before


## Create a Custom DataLoader to load the Data into TensorFlow

In [8]:
class DataLoader:
    """
    A custom data loader which loads data using the tiktoken tokenizer
    into Tensorflow, creating input and target values using the windowing technique
    """
    def __init__(self, text, stride, max_length):
        self.input_ids = []
        self.target_ids = []

        tokenizer = tiktoken.get_encoding("gpt2")
        token_ids = tokenizer.encode(text, allowed_special={"<|eos|>"})
        
        for i in range(0, len(token_ids) - max_length, stride):
            self.input_ids.append(token_ids[i:i+max_length])
            self.target_ids.append(token_ids[i+1:i+1+max_length])
            
        # Convert lists to TensorFlow tensors
        self.input_ids = tf.convert_to_tensor(self.input_ids, dtype=tf.int32)
        self.target_ids = tf.convert_to_tensor(self.target_ids, dtype=tf.int32)
            
    def __len__(self):
        return len(self.input_ids)
        
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
    
    def load_data(self, batch_size=8, shuffle=False):
        dataset = tf.data.Dataset.from_tensor_slices((self.input_ids, self.target_ids)) 
        # Shuffle the dataset if required
        if shuffle:
            dataset = dataset.shuffle(buffer_size=buffer_size)

        # Batch the dataset
        dataset = dataset.batch(batch_size)

        # Prefetch the dataset for better performance
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        return dataset

In [9]:
data = DataLoader(x, 2, 10)
dataset = data.load_data(batch_size=1)

In [10]:
print(next(iter(dataset)))

(<tf.Tensor: shape=(1, 10), dtype=int32, numpy=
array([[ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,
           11]], dtype=int32)>, <tf.Tensor: shape=(1, 10), dtype=int32, numpy=
array([[22307,    25,   198,  8421,   356,  5120,   597,  2252,    11,
         3285]], dtype=int32)>)


## Creating a token embedding 

In [32]:
max_length = 100
stride = 4
batch_size = 8

data = DataLoader(x, stride, max_length)
dataset = data.load_data(batch_size)
next(iter(dataset))[0]

<tf.Tensor: shape=(8, 100), dtype=int32, numpy=
array([[ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,
           11,  3285,   502,  2740,    13,   198,   198,  3237,    25,
          198,  5248,   461,    11,  2740,    13,   198,   198,  5962,
        22307,    25,   198,  1639,   389,   477, 12939,  2138,   284,
         4656,   621,   284,  1145,   680,    30,   198,   198,  3237,
           25,   198,  4965,  5634,    13, 12939,    13,   198,   198,
         5962, 22307,    25,   198,  5962,    11,   345,   760,   327,
         1872,   385,  1526, 28599,   318,  4039,  4472,   284,   262,
          661,    13,   198,   198,  3237,    25,   198,  1135,   760,
          470,    11,   356,   760,   470,    13,   198,   198,  5962,
        22307,    25,   198,  5756,   514,  1494,   683,    11,   290,
          356],
       [ 8421,   356,  5120,   597,  2252,    11,  3285,   502,  2740,
           13,   198,   198,  3237,    25,   198,  5248,   461,    11,
         2740

In [34]:
vocab_size = 50257          # Size of the token dictionary
output_dim = 256            # Size of the embedding
context_len = max_length    # To create the positional embedding

# Create an embedding for each token (A vector representation of that token)
token_embedding = keras.layers.Embedding(vocab_size, output_dim)(next(iter(dataset))[0])

# Create a positional embedding which has information of the word position
pos_idx = tf.range(context_len)
pos_embedding = keras.layers.Embedding(context_len, output_dim)(pos_idx)

# Add the positional information to the original token embedding
input_embedding = token_embedding + pos_embedding

## Creating a Simple Self Attention Layer

In [13]:
def Attention(inputs):
    """
    A very simple implementation of the Self Attention Layer
    """

    # 1. Calculate the relationship between each input and all other inputs in the sequence
    attention_scores = tf.matmul(inputs, tf.transpose(inputs))

    # 2. Normalize the attention scores for better learning (better for gradient descent)
    norm_as = keras.layers.Softmax()(attention_scores)

    # 3. generate the final context vector by multiplying each attention score with 
    # its corresponding input and suming them up
    final_context_vec = tf.matmul(norm_as, inputs)

    return final_context_vec


In [14]:
input_embedding.shape


TensorShape([8, 100, 256])

In [None]:
class SelfAttention(keras.Layer):
    def __init__(self, dim, bias=True):
        super().__init__()

        # The dimention of the query, key and value weights
        self.dim = dim
        self.bias = bias

    def build(self, input_shape):
        # Initializing the query, key and value weights
        """
        Use the keras weight matrices to initialize the weights 
            self.Wq = self.add_weight((input_shape[-1], self.dim), name="Wq")
            self.Wk = self.add_weight((input_shape[-1], self.dim), name="Wk")
            self.Wv = self.add_weight((input_shape[-1], self.dim), name="Wv")
        """        

        # Use the keras dense layer for the weights initialization
        self.Wq = keras.layers.Dense(self.dim, use_bias=self.bias)
        self.Wk = keras.layers.Dense(self.dim, use_bias=self.bias)
        self.Wv = keras.layers.Dense(self.dim, use_bias=self.bias)

    def call(self, inputs):
        #Calculate keys, queries and values vectors 
        """
        Calculate the vectors using dot product
            keys = K.dot(inputs, self.Wk)
            queries = K.dot(inputs, self.Wq)
            values = K.dot(inputs, self.Wv)
        """
        # Calculate the vectors using the dense layer which is functionaly the same as doing a dot product
        keys = self.Wk(inputs)
        queries = self.Wq(inputs)
        values = self.Wv(inputs)

        attention_score = K.dot(queries, K.transpose(keys))

        attention_weights = K.softmax(attention_score / self.dim**0.5, axis=-1)

        context_vector = K.dot(attention_weights, values)

        return context_vector

In [81]:
class CasualSelfAttention(keras.Layer):
    def __init__(self, dim, bias=True):
        super().__init__()

        # The dimention of the query, key and value weights
        self.dim = dim
        self.bias = bias

    def build(self, input_shape):
        # Initializing the query, key and value weights
        # Use the keras dense layer for the weights initialization
        self.Wq = keras.layers.Dense(self.dim, use_bias=self.bias)
        self.Wk = keras.layers.Dense(self.dim, use_bias=self.bias)
        self.Wv = keras.layers.Dense(self.dim, use_bias=self.bias)

    def mask(self, attention_scores):
        # Create & Apply a mask on the attention scores
        context_length = attention_scores.shape[0]
        mask = K.triu(K.ones((context_length,context_length)), k=1)
        mask = K.cast(mask, tf.bool)
        
        # Apply the mask to the attention scores
        # Areas where the mask is true is set to -inf to ensure 0 when calculating the softmax
        masked_attention = K.where(mask, -np.inf, attention_scores)
        return masked_attention
        
    def call(self, inputs):
        #Calculate keys, queries and values vectors 
        # Calculate the vectors using the dense layer which is functionaly the same as doing a dot product
        keys = self.Wk(inputs)
        queries = self.Wq(inputs)
        values = self.Wv(inputs)
        
        attention_scores = K.dot(queries, K.transpose(keys))
        masked_attention = self.mask(attention_scores)

        attention_weights = K.softmax(masked_attention / self.dim**0.5, axis=-1)
        
        context_vector = K.dot(attention_weights, values)

        return context_vector 

In [82]:
self_attention = CasualSelfAttention(256)
self_attention(input_embedding[0])

<tf.Tensor: shape=(100, 256), dtype=float32, numpy=
array([[-0.02131266,  0.06561862,  0.0551003 , ..., -0.02843301,
         0.00802045, -0.02614996],
       [ 0.01608673,  0.06855912,  0.02218414, ..., -0.01544351,
        -0.01108414, -0.02163525],
       [ 0.03635468,  0.04834025,  0.01675467, ..., -0.00699243,
        -0.00565207, -0.01421074],
       ...,
       [ 0.01456432,  0.00418853, -0.01226066, ...,  0.01061585,
        -0.01198308, -0.00576923],
       [ 0.01436179,  0.00448983, -0.0117574 , ...,  0.01000056,
        -0.01217003, -0.00531628],
       [ 0.01350324,  0.00466125, -0.01120132, ...,  0.00958462,
        -0.01139006, -0.00473438]], dtype=float32)>

In [68]:
tf.bool()


TypeError: 'DType' object is not callable