##1.Tokenizing text
* The text that we will tokenize for LLM training is "Notes from the underground" a short story by Fyodor Dostoevsky.

In [9]:
import urllib.request
url = "https://www.gutenberg.org/cache/epub/600/pg600.txt"
input_file ="pg600.txt"
urllib.request.urlretrieve(url, input_file)

('pg600.txt', <http.client.HTTPMessage at 0x79311d47c590>)

In [10]:
##looking into the contents of the short story
with open('pg600.txt','r') as f:
  raw_text = f.read()
print("Total number of characters:",len(raw_text))
print(raw_text[:99])

Total number of characters: 259118
﻿The Project Gutenberg eBook of Notes from the Underground
    
This ebook is for the use of anyone


* Our goal is to tokenize the 259118 character story into individual words and special characters that we can turn into an embedding for LLM training.

In [11]:
##splitting text on whitespace commas and periods characters
import re
text = "Hello, world. Is this-- a test?"
result = re.split(r'[,:;_!"()\'] | -- |\s',text)
result = [item.strip()for item in result if item.strip()] #removing redundant characters
print(result)

['Hello', 'world.', 'Is', 'this--', 'a', 'test?']


In [12]:
##applying tokenization to the Notes from the underground story
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

55343


* `55343` represents the number of text in this text(without whitespaces)

In [13]:
#printing the first 40 tokens
print(preprocessed[:40])

['\ufeffThe', 'Project', 'Gutenberg', 'eBook', 'of', 'Notes', 'from', 'the', 'Underground', 'This', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'United', 'States', 'and', 'most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', '.', 'You']


## 1.1 Converting tokens into token IDs.


* This step is an intermediate step before converting the token Ids into embedding vectors.
* First we build a vocabulary, the vocabulary defines how we can map each unique word and special character to a unique integer.

In [14]:
#creating a list of all unique tokens and sorting them alphabeitcally to determine the vocabulary size
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 5972


In [15]:
##creating a vocabulary
vocab = {token:integer for integer,token in enumerate(all_words)}
for i,item in enumerate(vocab.items()):
  print(item)
  if i >= 50:
    break

('!', 0)
('#600]', 1)
('$1', 2)
('$5', 3)
('(', 4)
(')', 5)
('*', 6)
('***', 7)
(',', 8)
('-', 9)
('.', 10)
('000', 11)
('1', 12)
('108', 13)
('1500', 14)
('1996', 15)
('2', 16)
('20%', 17)
('2001', 18)
('2021', 19)
('26', 20)
('3', 21)
('30', 22)
('4', 23)
('5', 24)
('50', 25)
('501', 26)
('596-1887', 27)
('6', 28)
('60', 29)
('64-6221541', 30)
('7', 31)
('8', 32)
('801', 33)
('809', 34)
('84116', 35)
('9', 36)
('90', 37)
(':', 38)
(';', 39)
('?', 40)
('A', 41)
('ACTUAL', 42)
('AGREE', 43)
('AGREEMENT', 44)
('AND', 45)
('ANY', 46)
('ANYTHING', 47)
('ASCII”', 48)
('About', 49)
('Additional', 50)


* The vocab dictionary contains individual tokens assocaited with unique integer labels.

In [16]:
##encode class that splits text into tokens and carries out string to integer mapping to produce token id via it's vocabular
class Encoder():
  def __init__(self,vocab):
    self.str_to_int = vocab
    self.int_to_str = {i:s for s,i in vocab.items()}

  def encode(self,text):
    preprocessed = re.split(r'([,.:;"?\()] |--|\s)',text)
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    ids =[self.str_to_int[s] for s in preprocessed]
    return ids

  def decode(self,ids):
    text = " ".join([self.int_to_str[i] for i in ids])
    text = re.sub(r'([,.:;"?\()] |--)',r'\1',text)
    return text

In [17]:
tokenizer = Encoder(vocab)
text = raw_text[:100]
ids = tokenizer.encode(text)
print(ids)

[5971, 417, 234, 1993, 3673, 365, 2473, 5167, 554, 529, 2014, 3060, 2408, 5167, 5459, 3673, 826]


In [18]:
#decoding the Ids
print(tokenizer.decode(ids))

﻿The Project Gutenberg eBook of Notes from the Underground This ebook is for the use of anyone


##1.3 Adding Special context tokens.

* We add special tokens to modify our tokenizer to handle unknown words and usage of special context tokens.
* These special tokens can include markers for unknown words and documents boundaries.

In [19]:
#modifying the tokenizer to handle two special tokens <unk> <|endoftext|>, by adding them to our list of all unique words.
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<unk>","<|endoftext|>"])
vocab = {token:integer for integer,token in enumerate(all_tokens)}
print(len(vocab.items()))

5974


In [20]:
#checking if they have been added
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('”', 5969)
('•', 5970)
('\ufeffThe', 5971)
('<unk>', 5972)
('<|endoftext|>', 5973)


In [21]:
class Encoder2:
  def __init__(self,vocab):
    self.str_to_int = vocab
    self.int_to_str = {integer:token for token,integer in vocab.items()}

  def encode(self,text):
    preprocessed = re.split(r'([,.:;!?"()\]|--|\s)',text)
    preprocessed = [item.strip() for item in preprocessed if item is not None and item.strip()]
    preprocessed = [item if item in self.str_to_int else "<unk>" for item in preprocessed]

    ids = [self.str_to_int[s] for s in preprocessed ]
    return ids

  def decode(self,ids):
    text = " ".join([self.int_to_str[i] for i in ids])

    # Remove space before punctuation characters including ']' and '\'
    text = re.sub(r'\s+([,.:;!?"()\\])', r'\1', text)
    # Replace ' -- ' with '--' to correctly re-form the token
    text = text.replace(" -- ", "--")
    return text

In [22]:
text1 = "Hello, do you like GPUs?"
text2 = "In sunset i like sleeping."
text = "<|endoftext|>".join((text1,text2))
print(text)

Hello, do you like GPUs?<|endoftext|>In sunset i like sleeping.


In [58]:
import re

class Encoder2:
  def __init__(self,vocab):
    self.str_to_int = vocab
    self.int_to_str = {integer:token for token,integer in vocab.items()}

  def encode(self,text):
    # Corrected regex pattern: `--` is now treated as a separate alternation.
    # `]` is escaped to be treated as a literal character.
    # The single quote ' is now escaped to avoid 'unterminated string literal' error.
    preprocessed = re.split(r'([,.:;!?\'"()\\]|\s|--)',text)
    preprocessed = [item.strip() for item in preprocessed if item is not None and item.strip()]
    preprocessed = [item if item in self.str_to_int else "<unk>" for item in preprocessed]

    ids = [self.str_to_int[s] for s in preprocessed ]
    return ids

  def decode(self,ids):
    text = " ".join([self.int_to_str[i] for i in ids])

    # Remove space before punctuation characters including ']' and '\'
    text = re.sub(r'\s+([,.:;!?\'"()\\])', r'\1', text)
    # Replace ' -- ' with '--' to correctly re-form the token
    text = text.replace(" -- ", "--")
    return text

# Now instantiate and use the corrected tokenizer
tokenizer = Encoder2(vocab)
print(tokenizer.encode(text))

[5972, 8, 1898, 5748, 3243, 3505, 40, 5973, 273, 5167, 5972, 5972, 5972, 10]


In [26]:
print(tokenizer.decode(tokenizer.encode(text)))

<unk>, do you like <unk>? <unk> <unk> <unk> like <unk>.


##1.4 Byte Pair Encoding

* The BPE tokenizer was used to train LLMs such as GPT-2,GPT-3 and the original models used in ChatGPT

In [27]:
##installing tiktoken algorithm that implements BPE
!pip install tiktoken



In [31]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

text = (
    "Hello, do you like morning? <|endoftext|> In the sunlit terraces"
    "of someunkownPlace."
)
integers = tokenizer.encode(text,allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 3329, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 2954, 593, 27271, 13]


In [33]:
##decoding integers back to strings
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like morning? <|endoftext|> In the sunlit terracesof someunkownPlace.


##1.5 Data Sampling with a sliding window


* Now we implement a data loader that fetches the input-target pairs from the training dataset using a sliding window approach.

In [36]:
with open('pg600.txt','r',encoding='utf-8') as f:
  raw_text = f.read()
encoded_text = tokenizer.encode(raw_text)
print(f"Number of token is:{len(encoded_text)}")

Number of token is:67308


In [37]:
## Removing first 50 tokens from the dataset
encoded_sample = encoded_text[50:]

In [38]:
##creating input-targett pairs by creating variables x(input) and y(output)
context_size = 4 #number of tokens to be included in the input
x = encoded_sample[:context_size]
y = encoded_sample[1:context_size+1]
print(f"X:{x}")
print(f"y:      {y}")

X:[13, 921, 743, 4866]
y:      [921, 743, 4866, 340]


In [39]:
##creating a next word prediction loop
for i in range(1,context_size+1):
  context = encoded_sample[:i]
  desired = encoded_sample[i]
  print(context,"----->", desired)

[13] -----> 921
[13, 921] -----> 743
[13, 921, 743] -----> 4866
[13, 921, 743, 4866] -----> 340


In [42]:
##sliding window task on decode token Ids
for i in range(1,context_size+1):
  context = encoded_sample[:i]
  desired = encoded_sample[i]
  print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

. ---->  You
. You ---->  may
. You may ---->  copy
. You may copy ---->  it


In [43]:
## creating a dataset for batched inputs and targets
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
  def __init__(self,txt,tokenizer,max_length,stride):
    self.input_ids = []
    self.target_ids = []

    token_ids = tokenizer.encode(txt)##tokenizes text

    for i in range(0,len(token_ids) - max_length,stride):#using sliding window to chuk the book into overlapping sequences of max_length
      input_chunk = token_ids[i:i+max_length]
      target_chunk = token_ids[i+1:i+max_length+1]
      self.input_ids.append(torch.tensor(input_chunk))
      self.target_ids.append(torch.tensor(target_chunk))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self,idx):
    return self.input_ids[idx], self.target_ids[idx]


In [44]:
## creating a dataloader to generate batches with input-with pairs
def create_dataloader_v1(txt,batch_size=4,max_length=256,stride=128,shuffle=True,drop_last=True,num_workers=0):
  tokenizer = tiktoken.get_encoding("gpt2")
  dataset = GPTDatasetV1(txt,tokenizer,max_length,stride)
  dataloader = DataLoader(
      dataset,
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers
  )
  return dataloader

In [47]:
with open("pg600.txt","r",encoding="utf-8") as f:
  raw_text = f.read()

dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=8,
    stride=2,
    shuffle=False
)
data_iter = iter(dataloader) #converts dataloader into a python iterator to fetch the next entry
first_batch = next(data_iter)
print(first_batch)

[tensor([[  171,   119,   123,   464,  4935, 20336, 46566,   286]]), tensor([[  119,   123,   464,  4935, 20336, 46566,   286, 11822]])]


* The `first_batch` variable contains two tensors: the first tensor stores the input token ID, and the second tensor stores the target token IDs.
* Note that an input size4 is quite small and onl chosen for simplicity since LLMs are trained with input size atleast 256.

In [48]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[  123,   464,  4935, 20336, 46566,   286, 11822,   422]]), tensor([[  464,  4935, 20336, 46566,   286, 11822,   422,   262]])]


* We have used low batch_size because high batch sizes require high memory during training.

In [49]:
##using batch size greater than 1
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,max_length=4,
    stride=2,
    shuffle=False
)
data_iter = iter(dataloader)
inputs,targets = next(data_iter)
print("Inputs:\n",inputs)
print("Target:\n",targets)

Inputs:
 tensor([[  171,   119,   123,   464],
        [  123,   464,  4935, 20336],
        [ 4935, 20336, 46566,   286],
        [46566,   286, 11822,   422],
        [11822,   422,   262, 22153],
        [  262, 22153,   198,   220],
        [  198,   220,   220,   220],
        [  220,   220,   220,   198]])
Target:
 tensor([[  119,   123,   464,  4935],
        [  464,  4935, 20336, 46566],
        [20336, 46566,   286, 11822],
        [  286, 11822,   422,   262],
        [  422,   262, 22153,   198],
        [22153,   198,   220,   220],
        [  220,   220,   220,   220],
        [  220,   220,   198,  1212]])


##1.6 Creating token embeddings

In [50]:
input_ids = torch.tensor([2,3,5,1])

In [51]:
vocab_size = 6
output_dim = 3

In [52]:
##using vocab_size and output_dim to instantiate an embedding layer
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size,output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [53]:
##Embedding token id 3
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [55]:
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


##1.7 encoding positions of words

* Embedding vectors on their own don't capture the full information about tokens like their position.
* To fix this use position-aware embedding either relative or absolute positional embeddings.
* The positonal embedding will be concatenated with our token embeddings to create an enriched input embedding which not only shows token identity but its position

In [56]:
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size,output_dim)


In [59]:
## instantiating the dataloader
max_length =4
dataloader = create_dataloader_v1(
    raw_text,batch_size=8,max_length=max_length,
    stride=max_length,shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs) # Corrected from 'input' to 'inputs'
print("\nInputs shape:\n",inputs.shape)

Token IDs:
 tensor([[  171,   119,   123,   464],
        [ 4935, 20336, 46566,   286],
        [11822,   422,   262, 22153],
        [  198,   220,   220,   220],
        [  220,   198,  1212, 47179],
        [  318,   329,   262,   779],
        [  286,  2687,  6609,   287],
        [  262,  1578,  1829,   290]])

Inputs shape:
 torch.Size([8, 4])


* The token ID tensor is 8X4 dimensional, meaning that the data batch consists of 8 text samples with four tokens each

In [60]:
##using the embedding laer to embed these token IDs into 256 dimensional
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [61]:
##coding the absolute embedding approach
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length,output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [62]:
##enriched input embedding
input_embeddings = token_embeddings +pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
