In [1]:
with open("the-verdict.txt","r",encoding="utf-8") as f : 
    raw_text = f.read()
print("number of characters : " , len(raw_text))

number of characters :  20479


In [2]:
import re 

text = "hello there, nice place! is it yours?"
result = re.split(r'([,.!?:;"()\']|--|\s)',text)
result = [item.strip() for item in result if item.strip()]
print(result)

['hello', 'there', ',', 'nice', 'place', '!', 'is', 'it', 'yours', '?']


In [3]:
preprocessed = re.split(r'([,.!?:;"(_)\']|--|\s)',raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--', 'deploring', 'his', 'unaccountable', 'abdication', '.', '"', 'Of', 'course', 'it', "'", 's', 'going', 'to', 'send', 'the', 'value', 'of', 'my', 'picture', "'", 'way', 'up', ';', 'but', 'I', 'don', "'", 't', 'think', 'of', 'that', ',

In [4]:
print(len(preprocessed))

4690


In [5]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


In [6]:
vocab = {token:integer for integer,token in enumerate(all_words)}

In [7]:
for i,item in enumerate(vocab.items()) : 
    print(item)
    if i>3 : 
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)


In [8]:
class SimpleTokenizerV1:
    def __init__(self,vocab) : 
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self,text):
        preprocessed = re.split(r'([,.!?:;"(_)\']|--|\s)',text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        
        return ids
    
    def decode(self,ids):
        text= "".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"\'()])',r'\1',text)
        return text

In [9]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)  

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [10]:
print(tokenizer.decode(ids=ids))

"It'sthelasthepainted,youknow,"Mrs.Gisburnsaidwithpardonablepride.


In [11]:
text = "Hello , how are you ?"
ids = tokenizer.encode(text) ## the word "Hello" does'nt exist in the original vocabulary , so an error would occur 

KeyError: 'Hello'

In [12]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>" , "<|unk|>"])
vocab = {token:integer for integer,token in enumerate(all_tokens)}
print(len(vocab.items()))

1132


In [13]:
for i,item in enumerate(list(vocab.items())[-5:]) : 
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [14]:
class SimpleTokenizerV2:
    def __init__(self,vocab) : 
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self,text) : 
        preprocessed = re.split(r'([,.!?:;"(_)\']|--|\s)',text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int
            else "<|unk|>" for item in preprocessed
        ]
        
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = text = re.sub(r'\s+([,.;?!"\'()])',r'\1',text)
        return text

In [15]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

tokenizer = SimpleTokenizerV2(vocab)
print(tokenizer.encode(text))
print(tokenizer.decode(tokenizer.encode(text)))

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [16]:
text = "Hello , how are you ?"
ids = tokenizer.encode(text) ## the word "Hello" does'nt exist in the original vocabulary , with the second version of the tokenizer 
                             ##it will be replaced by UNK special token
print(ids)
print(tokenizer.decode(ids))

[1131, 5, 560, 169, 1126, 10]
<|unk|>, how are you?


Byte pair Encoding - BPE

In [17]:
import importlib
import importlib.metadata
import tiktoken
print("tiktoken library version : "  , importlib.metadata.version("tiktoken"))

tiktoken library version :  0.9.0


In [18]:
tokenizer = tiktoken.get_encoding("gpt2")

In [19]:
text = (
"Hello, do you like tea? <|endoftext|> In the sunlit terraces"
"of someunknownPlace."
)
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)
print(" ")
strings = tokenizer.decode(integers)
print(strings)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
 
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [20]:
## The BPE tokenizer can handle unfamiliar words without the need of the <unk> token
## if it encounters an unfamiliar word during tokenization it breaks it down into sub-word and character tokens that it can handles 

text_unk = "jqsdfjkjb klkzefoii"
print(text_unk)
encoded = tokenizer.encode(text_unk)
print(encoded)
decoded = tokenizer.decode(encoded)
print(decoded)

jqsdfjkjb klkzefoii
[73, 48382, 7568, 73, 42421, 65, 479, 75, 74, 89, 891, 78, 4178]
jqsdfjkjb klkzefoii


Input-target Pairs

In [21]:
with open("the-verdict.txt", "r" , encoding="utf-8") as f : 
    raw_text = f.read()

encoded_text = tokenizer.encode(raw_text)
print(len(encoded_text)) ## length of the vocabulary

5145


In [22]:
encoded_sample = encoded_text[50:]

context_size = 10 

x = encoded_sample[:context_size]
y = encoded_sample[1:context_size+1]

print("X :" , x)
print("Y :", y)

X : [290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686]
Y : [4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976]


In [23]:
## sliding window strategy 
for i in range(1,context_size+1): 
    context = encoded_sample[:i]
    desired = encoded_sample[i]
    
    print(context , "----->" , desired)

[290] -----> 4920
[290, 4920] -----> 2241
[290, 4920, 2241] -----> 287
[290, 4920, 2241, 287] -----> 257
[290, 4920, 2241, 287, 257] -----> 4489
[290, 4920, 2241, 287, 257, 4489] -----> 64
[290, 4920, 2241, 287, 257, 4489, 64] -----> 319
[290, 4920, 2241, 287, 257, 4489, 64, 319] -----> 262
[290, 4920, 2241, 287, 257, 4489, 64, 319, 262] -----> 34686
[290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686] -----> 41976


In [24]:
for i in range(1,context_size+1): 
    context = encoded_sample[:i]
    desired = encoded_sample[i]
    
    print(tokenizer.decode(context) , "----->" , tokenizer.decode([desired]))

 and ----->  established
 and established ----->  himself
 and established himself ----->  in
 and established himself in ----->  a
 and established himself in a ----->  vill
 and established himself in a vill -----> a
 and established himself in a villa ----->  on
 and established himself in a villa on ----->  the
 and established himself in a villa on the ----->  Riv
 and established himself in a villa on the Riv -----> iera


Coding the DataLoader

In [25]:
from torch.utils.data import Dataset ,DataLoader
import torch

class GPTDatasetV1(Dataset):
    def __init__(self,txt,tokenizer,max_length,stride) : ## we need the stride to determine where to start the next input-output pair
          
        self.input_ids=[]
        self.target_ids=[]
        
        token_ids = tokenizer.encode(txt,allowed_special={"<|endoftext|>"})
        for i in range(0,len(token_ids)-max_length,stride): # We are using sliding window to chunk the book into overlapping sequences of max_length
            input_chunk = token_ids[i:i+max_length]        
            target_chunk = token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
        
    def __len__(self): ## total number of raws in the dataset   
        return len(self.input_ids)
    
    def __getitem__(self,idx) : ## return a single raw from the dataset 
        return self.input_ids[idx] , self.target_ids[idx]
            

In [26]:
## primarily we need the DataLoader to perform parallel processing (analyze multiple batches at one time)
def create_dataloader_v1(txt,batch_size=4,max_length=256,stride=128,shuffle=True,drop_last=True,num_workers=0) :
    ## for the batch_size : it is used for the model to update its parameters after each batch_size inputs being processed
    ## num_workers is for parallel processing over multiple threads of the cpu
    
    tokenizer = tiktoken.get_encoding("gpt2")
    ## creating the dataset
    dataset = GPTDatasetV1(txt,tokenizer,max_length,stride)
    
    ## creating the dataloader 
    
    Data_Loader = DataLoader( ## this method will check the __getitem__() and __len__() method in the  GPTDatasetV1 dataset class 
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    
    return Data_Loader ## input-output pairs 

In [27]:
data_loader = create_dataloader_v1(raw_text)
print(len(data_loader))
for idx , (x,y) in enumerate(data_loader) : 
    print(f"Batch {idx+1} : ",x,y)

9
Batch 1 :  tensor([[   13,   198,   198,  ...,   286,   262,  5385],
        [ 3347, 27846,   503,  ...,  2951,   625,     0],
        [10197,   832,   262,  ...,  9074,    13,   402],
        [   11,   508,   550,  ...,   526,   198,   198]]) tensor([[  198,   198,     1,  ...,   262,  5385, 41186],
        [27846,   503,  2048,  ...,   625,     0, 19672],
        [  832,   262, 46475,  ...,    13,   402,   271],
        [  508,   550, 18459,  ...,   198,   198,  3347]])
Batch 2 :  tensor([[1092,  517,  621,  ...,  423, 1309,  257],
        [2612, 4369,   11,  ...,  655, 4030,  465],
        [1310, 1165,  881,  ...,   13,  366, 2215],
        [ 383, 8631, 3872,  ..., 1813,  284,  423]]) tensor([[  517,   621,   611,  ...,  1309,   257,  1310],
        [ 4369,    11,   523,  ...,  4030,   465,  2951],
        [ 1165,   881, 40642,  ...,   366,  2215,   673],
        [ 8631,  3872,   373,  ...,   284,   423,   520]])
Batch 3 :  tensor([[  673,  1908,   329,  ...,   514,  2474,   198],

In [28]:
dataloader = create_dataloader_v1(raw_text,batch_size=2,max_length=4,stride=1,shuffle=False)
## it is common to train LLMs with much bigger input sizes (at least 256)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)
second_batch = next(data_iter)
print(second_batch)
## small batches require less memory resources but lead to more noisy model updates 
## the batch_size is a trade-off and a hyperparameter to experiment with when training LLMs 

[tensor([[  40,  367, 2885, 1464],
        [ 367, 2885, 1464, 1807]]), tensor([[ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619]])]
[tensor([[2885, 1464, 1807, 3619],
        [1464, 1807, 3619,  402]]), tensor([[1464, 1807, 3619,  402],
        [1807, 3619,  402,  271]])]


In [29]:
dataloader = create_dataloader_v1(raw_text,batch_size=7,max_length=4,stride=1,shuffle=False)
## it is common to train LLMs with much bigger input sizes (at least 256)
data_iter = iter(dataloader)
first_batch_input , first_batch_target = next(data_iter)
print(f"input : \n")
print(first_batch_input)
print(f"target : \n")
print(first_batch_target)

####Be Careful with the stride !!! => more bigger stride leads to overfitting (because of the overlapping between inputs!!!!!!)

input : 

tensor([[   40,   367,  2885,  1464],
        [  367,  2885,  1464,  1807],
        [ 2885,  1464,  1807,  3619],
        [ 1464,  1807,  3619,   402],
        [ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138]])
target : 

tensor([[  367,  2885,  1464,  1807],
        [ 2885,  1464,  1807,  3619],
        [ 1464,  1807,  3619,   402],
        [ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138],
        [  271, 10899,  2138,   257]])


Word Embedding

In [30]:
import torch.nn as nn
input_ids = torch.tensor([2,3,5,1])
vocab_size = 6
output_dim = 3 

torch.manual_seed(123)
embedding_layer = nn.Embedding(vocab_size,output_dim)

In [31]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [32]:
print(embedding_layer(torch.tensor([3]))) ## the embedding matrix is like a look up table 

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [33]:
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


In [34]:
vocab_size = 50257
output_dim = 256 ## size of the vector after embedding
##this id 3 times smaller than the GPT2-small embedding size = 768
 
 ## embedding layer 
token_embedding_layer = nn.Embedding(vocab_size,output_dim)

In [35]:
## initializing the DataLoader
context_length = 4 
dataloader = create_dataloader_v1(raw_text,batch_size=8,
            max_length=context_length,stride=context_length,shuffle=False)

data_iter = iter(dataloader)
inputs , targets = next(data_iter)
print(f"token IDs : \n",inputs)
print(f"Inputs shape : \n",inputs.shape)

token IDs : 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Inputs shape : 
 torch.Size([8, 4])


In [36]:
inputs , targets = next(data_iter)
print(f"token IDs : \n",inputs)
print(f"Inputs shape : \n",inputs.shape)

token IDs : 
 tensor([[  287,   262,  6001,   286],
        [  465, 13476,    11,   339],
        [  550,  5710,   465, 12036],
        [   11,  6405,   257,  5527],
        [27075,    11,   290,  4920],
        [ 2241,   287,   257,  4489],
        [   64,   319,   262, 34686],
        [41976,    13,   357, 10915]])
Inputs shape : 
 torch.Size([8, 4])


In [37]:
## ==> now for the embedding
embedded_batch = token_embedding_layer(inputs[0:3])
print(embedded_batch.shape)

torch.Size([3, 4, 256])


In [38]:
## for the positional encoding , we need another embedding layer with the same size as the previous one
pos_embedding_layer = nn.Embedding(context_length,output_dim)
print(pos_embedding_layer.weight)

Parameter containing:
tensor([[ 1.7375, -0.5620, -0.6303,  ..., -0.2277,  1.5748,  1.0345],
        [ 1.6423, -0.7201,  0.2062,  ...,  0.4118,  0.1498, -0.4628],
        [-0.4651, -0.7757,  0.5806,  ...,  1.4335, -0.4963,  0.8579],
        [-0.6754, -0.4628,  1.4323,  ...,  0.8139, -0.7088,  0.4827]],
       requires_grad=True)


In [39]:
pos_embeddings = pos_embedding_layer(torch.arange(context_length)) ## ==> look up table
print(pos_embeddings.shape)

torch.Size([4, 256])
