In [18]:
import urllib.request

In [19]:
## Download a sample text

In [20]:
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")

file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x1df142b0b80>)

In [21]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [22]:
## Tokenize the text

In [23]:
import re

In [24]:
text = "Hello, world. This, is a test life?"
result = re.split(r"(\s)", text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test', ' ', 'life?']


In [25]:
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', ' ', 'life', '?', '']


In [26]:
result = [item for item in result if item.strip()]

print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', 'life', '?']


In [27]:
# apply to the verdict text
preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

print(len(preprocessed))
print(preprocessed[:30])


4649
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [28]:
## Converting tokens into token IDs

In [29]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

1159


In [30]:
vocab = {token:integer for integer,token in enumerate(all_words)}

for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Carlo;', 25)
('Chicago', 26)
('Claude', 27)
('Come', 28)
('Croft', 29)
('Destroyed', 30)
('Devonshire', 31)
('Don', 32)
('Dubarry', 33)
('Emperors', 34)
('Florence', 35)
('For', 36)
('Gallery', 37)
('Gideon', 38)
('Gisburn', 39)
('Gisburns', 40)
('Grafton', 41)
('Greek', 42)
('Grindle', 43)
('Grindle:', 44)
('Grindles', 45)
('HAD', 46)
('Had', 47)
('Hang', 48)
('Has', 49)
('He', 50)


In [31]:
## Simple tokenizer class

class SimpleTokenizerV1:
    def __init__(self, vocab) -> None:
        self.str_to_int = vocab
        self.int_to_str = {vocab[key]:key for key in vocab.keys()}

    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids


    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids]) 
        
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [32]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," 
       Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)


[1, 58, 2, 872, 1013, 615, 541, 763, 5, 1155, 608, 5, 1, 69, 7, 39, 873, 1136, 773, 812, 7]


In [33]:
print(tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [34]:
text = "Hello, do you like tea?"
print(tokenizer.encode(text))

KeyError: 'Hello'

In [35]:
## Add special tokens:
##### unknown word token
##### end of sentence token

all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer,token in enumerate(all_tokens)}
 
print(len(vocab.items()))

1161


In [36]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1156)
('your', 1157)
('yourself', 1158)
('<|endoftext|>', 1159)
('<|unk|>', 1160)


In [37]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        preprocessed = [item if item in self.str_to_int
                        else "<|unk|>" for item in preprocessed]
 
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
 
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text


In [38]:
tokenizerv2 = SimpleTokenizerV2(vocab)

In [39]:
text = "Hello, do you like tea? What about you?"
print(tokenizerv2.encode(text))

[1160, 5, 362, 1155, 642, 1000, 10, 113, 122, 1155, 10]


In [40]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))

print(text)
print(tokenizerv2.encode(text))
print(tokenizerv2.decode(tokenizerv2.encode(text)))

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
[1160, 5, 362, 1155, 642, 1000, 10, 1159, 57, 1013, 981, 1009, 738, 1013, 1160, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [41]:
"""
[BOS] (beginning of sequence)—This token marks the start of a text. It signifies to the LLM where a piece of content begins.

[EOS] (end of sequence)—This token is positioned at the end of a text and is especially useful when concatenating multiple unrelated texts, 
        similar to <|endoftext|>. For instance, when combining two different Wikipedia articles or books, the [EOS] token indicates where one article ends and
        the next one begins.

[PAD] (padding)—When training LLMs with batch sizes larger than one, the batch might contain texts of varying lengths. 
        To ensure all texts have the same length, the shorter texts are extended or “padded” using the [PAD] token, up to the length of the longest text in the batch.

<|endoftext|> token that we can use to separate two unrelated text sources.


Note that the tokenizer used for GPT models does not need any of these tokens mentioned here but only uses an <|endoftext|> token for simplicity. 
The <|endoftext|> is analogous to the [EOS] token mentioned earlier. Also, <|endoftext|> is used for padding as well. 
However, as we'll explore in subsequent chapters, when training on batched inputs, we typically use a mask, meaning we don’t attend to padded tokens. 
Thus, the specific token chosen for padding becomes inconsequential.
"""

"\n[BOS] (beginning of sequence)—This token marks the start of a text. It signifies to the LLM where a piece of content begins.\n\n[EOS] (end of sequence)—This token is positioned at the end of a text and is especially useful when concatenating multiple unrelated texts, \n        similar to <|endoftext|>. For instance, when combining two different Wikipedia articles or books, the [EOS] token indicates where one article ends and\n        the next one begins.\n\n[PAD] (padding)—When training LLMs with batch sizes larger than one, the batch might contain texts of varying lengths. \n        To ensure all texts have the same length, the shorter texts are extended or “padded” using the [PAD] token, up to the length of the longest text in the batch.\n\n<|endoftext|> token that we can use to separate two unrelated text sources.\n\n\nNote that the tokenizer used for GPT models does not need any of these tokens mentioned here but only uses an <|endoftext|> token for simplicity. \nThe <|endoftext

In [42]:
## Byte pair encoding

"""
The BPE tokenizer was used to train LLMs such as GPT-2, GPT-3, and the original model used in ChatGPT.
"""

'\nThe BPE tokenizer was used to train LLMs such as GPT-2, GPT-3, and the original model used in ChatGPT.\n'

In [43]:
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.7.0


In [44]:
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer)

<Encoding 'gpt2'>


In [45]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

strings = tokenizer.decode(integers)

print(strings)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [46]:
## Data sampling with a sliding window

In [47]:
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))
print(enc_text[:50])

5145
[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075, 11]


In [51]:
context_size = 4

enc_sample = enc_text[50:]
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [54]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "---->", desired)
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))
    print('------')

[290] ----> 4920
 and ---->  established
------
[290, 4920] ----> 2241
 and established ---->  himself
------
[290, 4920, 2241] ----> 287
 and established himself ---->  in
------
[290, 4920, 2241, 287] ----> 257
 and established himself in ---->  a
------


In [56]:
## Dataloader for pytorch - efficient and makes tensors
import torch
from torch.utils.data import Dataset, DataLoader
 
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
 
        token_ids = tokenizer.encode(txt)
 
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
 
    def __len__(self):
        return len(self.input_ids)
 
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [57]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    
    ## tokenizer definition
    tokenizer = tiktoken.get_encoding("gpt2")
    
    ## dataset definition
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    ## create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=0
    )
 
    return dataloader

In [65]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
 
dataloader = create_dataloader_v1(raw_text, batch_size=2, max_length=5, stride=1, shuffle=False)

data_iter = iter(dataloader)
first_batch = next(data_iter)

print(first_batch)

[tensor([[  40,  367, 2885, 1464, 1807],
        [ 367, 2885, 1464, 1807, 3619]]), tensor([[ 367, 2885, 1464, 1807, 3619],
        [2885, 1464, 1807, 3619,  402]])]


In [70]:
print(first_batch[0])
print(first_batch[1])

tensor([[  40,  367, 2885, 1464, 1807],
        [ 367, 2885, 1464, 1807, 3619]])
tensor([[ 367, 2885, 1464, 1807, 3619],
        [2885, 1464, 1807, 3619,  402]])


In [71]:
## Create token embeddings

In [73]:
torch.manual_seed(123)

# test purposes
vocab_size = 6
output_size = 3



embedding_layer = torch.nn.Embedding(vocab_size, output_size)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [75]:
embedding_layer(torch.tensor([3, 1, 5, 2]))

tensor([[-0.4015,  0.9666, -1.1481],
        [ 0.9178,  1.5810,  1.3010],
        [-2.8400, -0.7849, -1.4096],
        [ 1.2753, -0.2010, -0.1606]], grad_fn=<EmbeddingBackward0>)

In [78]:
## Encoding word positions
##   Two options: relative positional embeddings and absolute positional embeddings

##   Absolute: For each position in the input sequence, a unique embedding is added to the token’s embedding to convey its exact location.
##   Relational: The emphasis of relative positional embeddings is on the relative position or distance between tokens. 
#                This means the model learns the relationships in terms of “how far apart” rather than "at which exact position." 
#                The advantage here is that the model can generalize better to sequences of varying lengths, even if it hasn’t seen such lengths during training.

In [79]:
output_dim = 256
vocab_size = 50257

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [83]:
max_length = 4

dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
   stride=max_length, shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)


Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [84]:
token_embeddings = token_embedding_layer(inputs)

print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [89]:
## Positional embeddings
context_length = max_length

pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

## positional embedding tensor
pos_embeddings = pos_embedding_layer(torch.arange(context_length))


print(pos_embeddings.shape)

torch.Size([4, 256])


In [90]:
## Final Stage
##    Embedding + Positional Embedding

In [91]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


In [97]:
from IPython.core.display import Image, display
display(Image(url='Tokenization_Overview.png', width=600, unconfined=True));

  from IPython.core.display import Image, display
