<a href="https://colab.research.google.com/github/AliAch04/build-llm-from-scratch/blob/main/LLMScratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading a short story as a text sample in Python

**Step 1 : Create Tokens**

In [1]:
import os

path1 = "/content/sample_data/the-verdict.txt"
path2 = "/content/drive/MyDrive/llm/the-verdict.txt"

# Prefer the first path when it exists; otherwise fall back to the second one.
if os.path.exists(path1):
    file_path = path1
    print(f"Using primary path: {file_path}")
else:
    file_path = path2
    print(f"Primary path not found. Using secondary path: {file_path}")

# Read the whole story into memory. On failure, report the problem and then
# re-raise: swallowing the error would leave `raw_text` undefined and the
# next statement would fail with an unrelated NameError that hides the cause.
try:
    with open(file_path, "r", encoding="utf-8") as f:
        raw_text = f.read()
except FileNotFoundError:
    print(f"Error: The file at {file_path} was not found.")
    raise
except Exception as e:
    print(f"An error occurred while reading the file: {e}")
    raise

print('Number of characters : ', len(raw_text))
# Print the first 99 characters as a quick sanity check
print(raw_text[:99])

Primary path not found. Using secondary path: /content/drive/MyDrive/llm/the-verdict.txt
Number of characters :  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


- Tokenize the whole text (20,479 characters) into individual words and special characters, which will later be turned into embeddings

- Split the text into a list of tokens on whitespace and special characters

In [2]:
import re  # regular expressions

# Split on commas, periods, exclamation marks and whitespace. The capturing
# group makes re keep each delimiter in the output instead of discarding it.
text_test = "Hello, world. this, is a test!"
result = re.compile(r'([,.!]|\s)').split(text_test)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'this', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '!', '']


- Remove the redundant empty and whitespace-only strings safely

In [3]:
# Keep only the items that are non-empty after stripping whitespace; this
# drops both the empty strings and the whitespace-only tokens.
result = list(filter(str.strip, result))
print(result)

['Hello', ',', 'world', '.', 'this', ',', 'is', 'a', 'test', '!']


- In our context we remove the whitespace because our text structure doesn't need it (we are working on a simple text sample)

- Full process

In [4]:
text_test = 'I HAD always! thought Jack Gisburn rather? a cheap genius--though a good fellow enough--so it was no! '
# Tokenization scheme: split on punctuation, double dashes and whitespace
# (delimiters are kept by the capturing group), then discard the pieces that
# are empty or whitespace-only.
result = [
    piece
    for piece in re.split(r'([.,:;!_?"]|--|\s)', text_test)
    if piece.strip()
]
print(result)

['I', 'HAD', 'always', '!', 'thought', 'Jack', 'Gisburn', 'rather', '?', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', '!']


- Apply the tokenizer on the Story

In [5]:
# Tokenize the whole story: split on punctuation, double dashes and
# whitespace, keeping the delimiters but dropping empty/whitespace-only pieces.
preprocessed = [
    piece
    for piece in re.split(r'([,.:;_!?"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(preprocessed[:20])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was']


In [6]:
# Count every token produced by splitting the story (duplicates included)
print('Total number of tokens : ', len(preprocessed))

Total number of tokens :  4690


**Step 2 : Create Tokens IDs**

- Create a list of the unique tokens and sort it alphabetically to determine the vocabulary size

In [7]:
# Deduplicate the tokens, then sort the unique ones in place
all_words = list(set(preprocessed))
all_words.sort()
print(all_words[:15])
print(len(all_words))

['!', '"', "'", '(', ')', ',', '--', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And']
1130


- Create the Vocabulary itself

In [8]:
# Give each unique token the integer id equal to its position in the sorted list
vocab = dict(zip(all_words, range(len(all_words))))


In [9]:
# Preview the first 26 (token, id) pairs of the vocabulary
for pair in list(vocab.items())[:26]:
  print(pair)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)


- Implement the Tokenizer class

In [10]:
class SimpleTokenizer:
  """Minimal word-level tokenizer backed by a fixed vocabulary.

  Text is split on punctuation, double dashes and whitespace. Every resulting
  token must exist in the vocabulary; unknown tokens raise ``KeyError``.
  """

  def __init__(self, vocab=None):
    """Build the token -> id and id -> token lookup tables.

    Args:
      vocab: Mapping of token string to integer id. When omitted, the
        notebook-level ``vocab`` variable is used, so the argument-less
        ``SimpleTokenizer()`` call keeps working.
    """
    if vocab is None:
      # The parameter shadows the global of the same name, so the
      # notebook-level vocabulary has to be fetched explicitly.
      vocab = globals()["vocab"]
    self.str_to_int = vocab
    self.int_to_str = {idx: tok for tok, idx in vocab.items()}

  def encoder(self, txt):
    """Convert ``txt`` into a list of token ids.

    Raises:
      KeyError: If a token of ``txt`` is not in the vocabulary.
    """
    pieces = re.split(r'([.,;?()!_:\'"]|--|\s)', txt)
    # Strip each piece once and drop the empty / whitespace-only ones
    tokens = [stripped for stripped in map(str.strip, pieces) if stripped]
    return [self.str_to_int[tok] for tok in tokens]

  def decoder(self, ids):
    """Convert a sequence of token ids back into text.

    Raises:
      KeyError: If an id is not in the vocabulary.
    """
    txt = " ".join(self.int_to_str[idx] for idx in ids)
    # Remove the whitespace before punctuation marks. ';' and ':' are part of
    # the set so that every punctuation mark the encoder splits on is handled,
    # as in SimpleTokenizerV2.decoder.
    txt = re.sub(r'\s+([,.;:"\'?!()])', r'\1', txt)
    # Remove the whitespace between a quote character and the word after it
    txt = re.sub(r'(["\'])\s+(\w+)', r'\1\2', txt)
    return txt

# Instantiate the tokenizer and try it on text built only from vocabulary tokens
token = SimpleTokenizer()
# Text -> token ids
print(token.encoder('Ah! At "Among" !'))
# Round trip: text -> ids -> text. The decoder rebuilds the spacing itself,
# so the result need not match the input character for character.
print(token.decoder(token.encoder('Ah! \'At\':"Among " !')))


[12, 0, 18, 1, 13, 1, 0]
Ah!'At' :"Among"!


- To address words not in the vocabulary, the vocabulary needs to be extended.

**Adding Special Context Tokens**

- Modify the tokenizer to handle unknown words by adding special tokens (`<|unk|>`, `<|endoftext|>`)

In [11]:
# Unique tokens in sorted order, followed by the two special tokens so that
# they receive the two highest ids.
all_tokens = sorted(set(preprocessed)) + ['<|unk|>', '<|endoftext|>']

# Rebuild the vocabulary: each token maps to its position in the list
vocab = dict(zip(all_tokens, range(len(all_tokens))))
print(len(vocab))

1132


In [12]:
# Show the last five vocabulary entries to confirm that the two special
# tokens were appended at the end.
for item in list(vocab.items())[-5:]:
  print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|unk|>', 1130)
('<|endoftext|>', 1131)


- Implementing SimpleTokenizerV2

In [13]:
class SimpleTokenizerV2:
  """Word-level tokenizer that maps out-of-vocabulary tokens to '<|unk|>'."""

  def __init__(self, vocab):
    """Store ``vocab`` (token -> id) and build the reverse id -> token table."""
    self.str_to_int = vocab
    self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

  def encoder(self, txt):
    """Convert ``txt`` into a list of token ids.

    Tokens missing from the vocabulary are replaced by '<|unk|>' before the
    lookup, so a ``KeyError`` is only possible when an unknown token occurs
    and '<|unk|>' itself is absent from the vocabulary.
    """
    ids = []
    # Split on punctuation, double dashes and whitespace, keeping delimiters
    for piece in re.split(r'([.,;?()!_:\'"]|--|\s)', txt):
      piece = piece.strip()
      if not piece:
        # Empty or whitespace-only fragment: not a token
        continue
      if piece not in self.str_to_int:
        piece = '<|unk|>'
      ids.append(self.str_to_int[piece])
    return ids

  def decoder(self, ids):
    """Convert a sequence of token ids back into text."""
    spaced = " ".join(self.int_to_str[idx] for idx in ids)
    # Remove the whitespace that the join put in front of punctuation marks
    return re.sub(r'\s+([,.;:!?()"\'])', r'\1', spaced)

In [14]:
text1 = 'Salam! I love tea so much. What about you?'
text2 = 'Yes! i love tea and coffee.'
# Put the end-of-text marker between the two independent samples
text = text1 + " <|endoftext|> " + text2

tokenizer = SimpleTokenizerV2(vocab)

print(text)

Salam! I love tea so much. What about you? <|endoftext|> Yes! i love tea and coffee.


In [15]:
# Encode the joined sample; words missing from the vocabulary get the <|unk|> id
print(tokenizer.encoder(text))
# Round trip back to text to see which words were replaced by <|unk|>
print(tokenizer.decoder(tokenizer.encoder(text)))

[1130, 0, 53, 1130, 975, 908, 691, 7, 109, 118, 1126, 10, 1131, 112, 0, 1130, 1130, 975, 157, 1130, 7]
<|unk|>! I <|unk|> tea so much. What about you? <|endoftext|> Yes! <|unk|> <|unk|> tea and <|unk|>.


- More Special tokens : [BOS] [EOS] [PAD]

- GPT models don't use an `<|unk|>` token. Instead, they use a tokenization scheme called **Byte Pair Encoding**, which breaks words into subword units.

**Byte Pair Encoding**

- Use the open-source Python library **tiktoken**: https://github.com/openai/tiktoken

In [16]:
#!pip3 install tiktoken

In [17]:
# Import the tiktoken package and check its version.
# importlib.metadata is a submodule: a bare `import importlib` does not
# guarantee that it is loaded, so it is imported explicitly here.
import importlib.metadata
import tiktoken

print('Tiktoken version : ', importlib.metadata.version("tiktoken"))

Tiktoken version :  0.12.0


In [18]:
# Instantiate BPE tokenizer

In [19]:
# Load tiktoken's "gpt2" encoding. NOTE: this rebinds `tokenizer`, which until
# now held the SimpleTokenizerV2 instance; the cells below call encode/decode on it.
tokenizer = tiktoken.get_encoding("gpt2")

In [20]:
# The two adjacent string literals are concatenated by Python, so the text
# contains "morning.what" with no space in between.
text = ("Hello! do you like tea? <|endoftext|> The sun rose quickly this morning."
"what do you thinkofme me?")

# allowed_special permits the literal "<|endoftext|>" marker in the input
ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(ids)
# Decode two of the ids on their own to see which pieces of text they stand for
print(tokenizer.decode([1326, 502]))

[15496, 0, 466, 345, 588, 8887, 30, 220, 50256, 383, 4252, 8278, 2952, 428, 3329, 13, 10919, 466, 345, 892, 1659, 1326, 502, 30]
me me


In [21]:
# Decode the ids back into text; this overwrites `text` with the decoded string
text = tokenizer.decode(ids)

print(text)

Hello! do you like tea? <|endoftext|> The sun rose quickly this morning.what do you thinkofme me?


- Another example to illustrate how the BPE tokenizer deals with unknown words

In [22]:
# Encode a made-up phrase
ids =  tokenizer.encode('Awki ingze')
print(ids)

# Decode the ids back into text to compare with the input
txt = tokenizer.decode(ids)
print(txt)

[23155, 4106, 5347, 2736]
Awki ingze


**Creating Input-target pairs**

*Implement a data loader that fetches the input-target pairs using a sliding window approach*

- Tokenize the whole dataset (The verdict) using BPE tokenizer

In [23]:
# Encode the whole story with the BPE tokenizer
enc_text = tokenizer.encode(raw_text)
# Compare the number of tokens with the number of characters
print(len(enc_text))
print(len(raw_text))

5145
20479


- Define the context size (how many tokens in the input)



In [24]:
# Skip the first 50 tokens; the passage that follows gives a more interesting example
enc_example = enc_text[50:]
# Number of tokens the model looks at when predicting the next token
context_size = 4

x = enc_example[:context_size]
# The target is the input shifted by one token, so y[k] is the token that follows x[k]
y = enc_example[1:context_size+1]

print(f'{x}')
# Print the targets indented beneath the inputs
print(f'     {y}')

[290, 4920, 2241, 287]
     [4920, 2241, 287, 257]


- Create the next-word prediction tasks

In [25]:
# Each prefix of the window is a context; the token right after it is the target
for i in range(1, context_size+1):
  context = enc_example[:i]
  next_word = enc_example[i] # the token the model is supposed to predict
  print(f'{context}->{next_word}')

[290]->4920
[290, 4920]->2241
[290, 4920, 2241]->287
[290, 4920, 2241, 287]->257


In [26]:
# Same context -> target pairs, decoded back to text for readability
for i in range(1, context_size+1):
  context = enc_example[:i]
  next_word = enc_example[i]
  # The single target id is wrapped in a list before being passed to decode
  print(tokenizer.decode(context),'->',tokenizer.decode([next_word]))

 and ->  established
 and established ->  himself
 and established himself ->  in
 and established himself in ->  a


- Turn the inputs and targets into PyTorch tensors (required for the upcoming optimization procedures)

**Implementing a Data Loader**

- The methodology is:


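1.   Firstly, tokenize the text
2.   Use a sliding window to make two overlapping sequences of the text (the length of each row is the context size); the target sequence is the input sequence shifted by one token
3.   Convert each input chunk and target chunk into a PyTorch tensor
4.   Wrap the resulting dataset in a `DataLoader` to serve the input-target pairs in batches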





In [27]:
from torch.utils.data import DataLoader, Dataset
import torch

class GPTDataset(Dataset):
  """Sliding-window dataset of (input, target) token-id tensors.

  The text is tokenized once; a window of ``max_lenght`` tokens then moves
  over the ids in steps of ``stride``. Each window is an input, and the same
  window shifted by one token is its target.
  """

  def __init__(self, txt, tokenizer, max_lenght, stride):
    """Tokenize ``txt`` and pre-compute every input/target pair.

    Args:
      txt: Text to tokenize.
      tokenizer: Object whose ``encode(txt, allowed_special=...)`` returns
        a sequence of token ids.
      max_lenght: Number of tokens in each input (and target) window.
      stride: Distance, in tokens, between the starts of two windows.
    """
    token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})

    # Stop early enough that the target window (one token further to the
    # right) still fits inside token_ids.
    window_starts = range(0, len(token_ids) - max_lenght, stride)

    self.input_ids = [
        torch.tensor(token_ids[start:start + max_lenght])
        for start in window_starts
    ]
    self.target_ids = [
        torch.tensor(token_ids[start + 1:start + max_lenght + 1])
        for start in window_starts
    ]

  def __len__(self):
    """Return the number of input/target pairs."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the ``(input, target)`` tensors stored at position ``idx``."""
    return self.input_ids[idx], self.target_ids[idx]

- Implement a function to generate input-target pairs for our Dataset

In [29]:
def create_dataloader(txt, batch_size=4, max_lenght=256,
                      stride=128, shuffle=False, drop_last=True,
                      num_workers=0):
  """Build a DataLoader of input/target pairs over ``txt``.

  Args:
    txt: Text to tokenize with tiktoken's "gpt2" encoding.
    batch_size: Forwarded to ``DataLoader``.
    max_lenght: Window length, in tokens, forwarded to ``GPTDataset``.
    stride: Step between windows, in tokens, forwarded to ``GPTDataset``.
    shuffle: Forwarded to ``DataLoader``.
    drop_last: Forwarded to ``DataLoader``.
    num_workers: Forwarded to ``DataLoader``.

  Returns:
    A ``DataLoader`` wrapping a ``GPTDataset`` built from ``txt``.
  """
  # Tokenize with the GPT-2 encoding and slice the ids into windows
  dataset = GPTDataset(txt, tiktoken.get_encoding("gpt2"), max_lenght, stride)

  loader_options = dict(
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )
  return DataLoader(dataset, **loader_options)

- Test the dataloader

In [40]:
import torch

# Read the text again from the path that was resolved when the story was
# first loaded (`file_path`), instead of hard-coding one of the two locations;
# this keeps the primary/secondary path fallback in effect here as well.
with open(file_path, 'r', encoding='utf-8') as f:
  raw_text = f.read()

print('PyTorch version : ', torch.__version__)

# batch_size=1, max_lenght=4 and stride=1 keep the batches small enough to read
dataloader = create_dataloader(raw_text, batch_size=1, max_lenght=4, stride=1)

# Convert the DataLoader into an iterator so that batches can be pulled one at a time
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

PyTorch version :  2.8.0+cu126
[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [41]:
# Pull the next batch from the same iterator
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]
