# Building LLMs from scratch

In [44]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
import keras.backend as K
import re
import tiktoken # Byte Pair Encoding 

Load the shakespeare text dataset: contains 100,000 characters!

In [55]:
dataset = tfds.load(name='tiny_shakespeare')

train = dataset['train']
for text in train:
    x = text['text'].numpy().decode('utf-8')
print(x[:100])
print(f"\nlength of the entire text file: {len(x)}")

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

length of the entire text file: 1003854


## Setting up a custom simple tokenizer 
Converts normal text into tokens using regex ----> then converts from tokens into token ids using a custom class

In [46]:
tokens = re.split(r'([,.:;?_!"()\']|--|\s)', x)
print(tokens[:100])

vocabulary = sorted(set(tokens))

# Create a Dictionary with additional special tokens ("<|unk|>", "<|eos|>") 
# for an unkown word or the end of text (incase I train with multiple text sources).\

dictionary = {item:value for value, item in enumerate(vocabulary)}
dictionary["<|unk|>"] = len(dictionary)
dictionary["<|eos|>"] = len(dictionary)
#dictionary["<|bos|>"] = len(dictionary)
#dictionary["<|pad|>"] = len(dictionary)

['First', ' ', 'Citizen', ':', '', '\n', 'Before', ' ', 'we', ' ', 'proceed', ' ', 'any', ' ', 'further', ',', '', ' ', 'hear', ' ', 'me', ' ', 'speak', '.', '', '\n', '', '\n', 'All', ':', '', '\n', 'Speak', ',', '', ' ', 'speak', '.', '', '\n', '', '\n', 'First', ' ', 'Citizen', ':', '', '\n', 'You', ' ', 'are', ' ', 'all', ' ', 'resolved', ' ', 'rather', ' ', 'to', ' ', 'die', ' ', 'than', ' ', 'to', ' ', 'famish', '?', '', '\n', '', '\n', 'All', ':', '', '\n', 'Resolved', '.', '', ' ', 'resolved', '.', '', '\n', '', '\n', 'First', ' ', 'Citizen', ':', '', '\n', 'First', ',', '', ' ', 'you', ' ', 'know', ' ']


In [47]:
class SimpleTockenizer:
    """
    A simple tokenizer which using a dictionary converts the text into token ids 
    """
    def __init__(self, dictionary):
        self.dictionary = dictionary
        self.dictionary_reverse = {value:item for item, value in dictionary.items()}

    def encode(self, text):
        split = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = list()
        for item in split:
            try: 
                tokens.append(self.dictionary[item])
            except:
                tokens.append(self.dictionary["<|unk|>"])
                
        return tokens

    def decode(self, tokens):
        text = "".join([self.dictionary_reverse[token] for token in tokens])
        return text

In [48]:
tokenizer = SimpleTockenizer(dictionary)
tokens = tokenizer.encode(x)
print(tokens[:10])

text = tokenizer.decode(tokens)
print(text[:100])

[864, 2, 457, 11, 0, 1, 249, 2, 12630, 2]
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


## Using the GPT2 tokenizer form the tiktoken library
The GPT2 tokenizer uses byte pair encoding which creates tokens for entire words and for sub-word characters

In [49]:
tiktok = tiktoken.get_encoding("gpt2")
integers = tiktok.encode(x, allowed_special={"<|eos|>"})
print(integers[:50])

strings = tiktok.decode(integers[:50])
print(strings)

[5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13, 198, 198, 3237, 25, 198, 5248, 461, 11, 2740, 13, 198, 198, 5962, 22307, 25, 198, 1639, 389, 477, 12939, 2138, 284, 4656, 621, 284, 1145, 680, 30, 198, 198, 3237, 25, 198, 4965, 5634, 13]
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved.


## Setting up the input and target values using the Windowing technique

In [50]:
context_size = 4
for i in range(1, context_size+1):
    inputs = integers[:i]
    target = integers[i]
    print(tiktok.decode(inputs) + '------->' + tiktok.decode([target]))
    

First-------> Citizen
First Citizen------->:
First Citizen:------->

First Citizen:
------->Before


## Create a Custom DataLoader to load the Data into TensorFlow

In [85]:
class DataLoader:
    """
    A custom data loader which loads data using the tiktoken tokenizer
    into Tensorflow, creating input and target values using the windowing technique
    """
    def __init__(self, text, stride, max_length):
        self.input_ids = []
        self.target_ids = []

        tokenizer = tiktoken.get_encoding("gpt2")
        token_ids = tokenizer.encode(text, allowed_special={"<|eos|>"})
        
        for i in range(0, len(token_ids) - max_length, stride):
            self.input_ids.append(token_ids[i:i+max_length])
            self.target_ids.append(token_ids[i+1:i+1+max_length])
            
        # Convert lists to TensorFlow tensors
        self.input_ids = tf.convert_to_tensor(self.input_ids, dtype=tf.int32)
        self.target_ids = tf.convert_to_tensor(self.target_ids, dtype=tf.int32)
            
    def __len__(self):
        return len(self.input_ids)
        
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
    
    def load_data(self, batch_size=8, shuffle=False):
        dataset = tf.data.Dataset.from_tensor_slices((self.input_ids, self.target_ids)) 
        # Shuffle the dataset if required
        if shuffle:
            dataset = dataset.shuffle(buffer_size=buffer_size)

        # Batch the dataset
        dataset = dataset.batch(batch_size)

        # Prefetch the dataset for better performance
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        return dataset

In [90]:
data = DataLoader(x, 2, 10)
dataset = data.load_data(batch_size=1)

In [91]:
print(next(iter(dataset)))

(<tf.Tensor: shape=(1, 10), dtype=int32, numpy=
array([[ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,
           11]], dtype=int32)>, <tf.Tensor: shape=(1, 10), dtype=int32, numpy=
array([[22307,    25,   198,  8421,   356,  5120,   597,  2252,    11,
         3285]], dtype=int32)>)


## Creating a token embedding 

In [108]:
max_length = 100
stride = 4
batch_size = 8

data = DataLoader(x, stride, max_length)
dataset = data.load_data(batch_size)

In [113]:
vocab_size = 50257
output_dim = 256
context_len = max_length

token_embedding = keras.layers.Embedding(vocab_size, output_dim)(next(iter(dataset))[0])
# print(embedding(next(iter(dataset))[0].numpy()))


pos_idx = tf.range(context_len)
pos_embedding = keras.layers.Embedding(context_len, output_dim)(pos_idx)
print(pos_embedding)

input_embedding = token_embedding + pos_embedding

tf.Tensor(
[[-0.04819577  0.03086002  0.04165006 ... -0.03921388  0.03153236
  -0.02050692]
 [-0.03224732  0.00572438 -0.0472178  ...  0.02273817  0.01425034
  -0.03599213]
 [ 0.0080161   0.0394375  -0.04151511 ...  0.00200493  0.04924547
   0.03099601]
 ...
 [ 0.01490137  0.0039796  -0.01900916 ... -0.04527375  0.01180929
  -0.02467388]
 [ 0.04875417 -0.00100584  0.03920699 ...  0.02465645  0.02856841
  -0.00609944]
 [ 0.04975339  0.03046734 -0.04690708 ...  0.02372551 -0.00453681
   0.02626635]], shape=(100, 256), dtype=float32)


## Creating a Simple Self Attention Layer

In [117]:
def Attention(inputs):
    """
    A very simple implementation of the Self Attention Layer
    """

    # 1. Calculate the relationship between each input and all other inputs in the sequence
    attention_scores = tf.matmul(inputs, tf.transpose(inputs))

    # 2. Normalize the attention scores for better learning (better for gradient descent)
    norm_as = keras.layers.Softmax()(attention_scores)

    # 3. general the final context vector by multiplying each attention score with its corresponding input and suming them up
    final_context_vec = tf.matmul(norm_as, inputs)

    return final_context_vec


In [119]:
input_embedding.shape


TensorShape([8, 100, 256])

In [2]:
class SelfAttention(keras.Layer):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def build(self, input_shape):
        self.Wq = self.add_weight((input_shape[-1], self.dim), name="Wq")
        self.Wk = self.add_weight((input_shape[-1], self.dim), name="Wk")
        self.Wv = self.add_weight((input_shape[-1], self.dim), name="Wv")

    def call(self, inputs):
        self.keys = K.dot(self.Wk, inputs)
        self.queries = K.dot(self.Wq, inputs)
        self.values = K.dot(self.Wv, inputs)

        self.attention_score = K.dot(K.T)

SyntaxError: incomplete input (2688713549.py, line 11)