In [2]:
import re

In [3]:
with open("Harry Potter and the Sorcerers Stone.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

In [4]:
print("Total number of characters:", len(raw_text))
print(raw_text[:100])

Total number of characters: 263976
M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly norm


# Classi Tokenizer

In [5]:
class Tokenizer:

    TOKEN_PATTERN = r'([,.:;?\-!"()\']|\s)'
    END_OF_TEXT = "<|endoftext|>"
    UNKNOWN_TOKEN = "<|unk|>"

    def __init__(self, raw_text):
        self.raw_text = raw_text
        self.tokens = self.get_tokens(self.raw_text)

    def get_tokens(self, text):
        tokens = re.split(self.TOKEN_PATTERN, text)
        tokens =[t.strip() for t in tokens if t.strip()]
        self.tokens = tokens + [self.END_OF_TEXT, self.UNKNOWN_TOKEN]
        self.idx_to_token = {i: t for i, t in enumerate(self.tokens)}
        self.token_to_idx = {t: i for i, t in enumerate(self.tokens)}

    def encode(self, text):
        tokens = re.split(self.TOKEN_PATTERN, text)
        tokens = [t.strip() for t in tokens if t.strip()]
        return [
            self.token_to_idx.get(t, self.token_to_idx[self.UNKNOWN_TOKEN])
            for t in tokens
        ] + [self.token_to_idx[self.END_OF_TEXT]]

    def decode(self, indices):
        tokens = [self.idx_to_token[i] for i in indices]
        text = " ".join(tokens)
        text = re.sub(r'\s([,.:;?\-!"()\'])', r"\1", text)
        return text

In [6]:
tokenizer = Tokenizer(raw_text)

text = "Harry Potter is a wizard."
encoded = tokenizer.encode(text)

print("Encoded text:", encoded)

decoded = tokenizer.decode(encoded)
print("Decoded text:", decoded)

Encoded text: [54275, 50599, 51951, 54345, 40688, 54362, 54363]
Decoded text: Harry Potter is a wizard. <|endoftext|>


In [7]:
text = "Harry Potter is in the palace."
encoded = tokenizer.encode(text)
print("Encoded text:", encoded)

decoded = tokenizer.decode(encoded)
print("Decoded text:", decoded)

Encoded text: [54275, 50599, 51951, 54263, 54354, 54364, 54362, 54363]
Decoded text: Harry Potter is in the <|unk|>. <|endoftext|>


# Byte pair encoding (BPE)

We have three types of tokenization:

**1. Word-level Tokenization:** Splitting text into words based on spaces and punctuation. the problem with this type is out-of-vocabulary (OOV), as well as we might diffrent meaning of similar words for example `[play, played]` or `[boy, boys]`

**2. Character based tokenization:** one the advantages of this approach is having small vocabulary size (~256 for english), however, the problem is that we lose the meaning associated with words, and the tokenized sequence is much longer   

**3. SubWord-based tokenization:** in this approach we have initialy two rules
- Rule 1: do not split frequently used words into smaller subwords.
- Rule 2: split rare words into smaller meaningful subwords. (e.g played will be `played` and `ed`)

So in general, sub-word tokenization helps the model learn that words with the same root word are similar in meaning e.g `token`, `tokens` and `tokenization`

Also, It helps the model learn that `colenization` and `specialization` are made of diffrent root words but have same suffix `ization` and are used in similar syntactic situations.

**Byte pair encoding (BPE)** is a sub-word tokenization algorithm

*Example*: 
let's say we have a dataset consists of these words
`{"old": 7, "older": 3, "lowest": 4, "finest": 9}`
- For the preprocessing step we add `<\w>` at the end of each word:
    - `{"old<\w>": 7, "older<\w>": 3, "lowest<\w>": 9, "finest<\w>": 13}`
    - then make a character count table 
    
    <img src="token-table-1.png" width="400">

    then we look for the most frequent pair of tokens, we merge them and perform the same iteration again & again, until we reach the token limit or iteration limit, in the image below we found that the pair 'es' occur frequent times, so we add it as a new token then we update the occurences

    <img src="token-table-2.png" width="400">

    then we found that 'est' is most common

    <img src="token-table-3.png" width="400">

    then 'est<\w>' *(Note that <\w> helps the algorithm understand the diffrence between **est**imate and high**est**)*

    <img src="token-table-4.png" width="400">  


**(To be continued)**



In [8]:
import tiktoken

In [9]:
tokenizer = tiktoken.get_encoding("gpt2")

In [10]:
text = "Hello how are you? <|endoftext|> the sky is blue."

encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print("Encoded text:", encoded)

Encoded text: [15496, 703, 389, 345, 30, 220, 50256, 262, 6766, 318, 4171, 13]


In [11]:
tokenizer.decode(encoded)

'Hello how are you? <|endoftext|> the sky is blue.'

In [12]:
tokenizer.encode("Akwirw ier")

[33901, 86, 343, 86, 220, 959]

In [13]:
tokenizer.decode([959])

'ier'

## Data sampling with a sliding window

In [14]:
with open("Harry Potter and the Sorcerers Stone.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

raw_text = raw_text[53:]
enc_text = tokenizer.encode(raw_text)

In [15]:
context_size = 4
x = enc_text[:context_size]
y = enc_text[1:context_size+1]
print(f"x : {x}")
print(f"y :      {y}")

x : [22474, 6613, 284, 910]
y :      [6613, 284, 910, 326]


In [16]:
for i in range(1, context_size + 1):
    context = enc_text[:i]
    target = enc_text[i]
    print(f"context: {context} ----> target: {target}") 

context: [22474] ----> target: 6613
context: [22474, 6613] ----> target: 284
context: [22474, 6613, 284] ----> target: 910
context: [22474, 6613, 284, 910] ----> target: 326


In [17]:
for i in range(1, context_size + 1):
    context = enc_text[:i]
    target = enc_text[i]
    print(f"context: {tokenizer.decode(context)} ----> target: {tokenizer.decode([target])}")

context: were ----> target:  proud
context: were proud ----> target:  to
context: were proud to ----> target:  say
context: were proud to say ----> target:  that


In [18]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDataset(Dataset):
    def __init__(self, text, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text) # Tokenize the entire text

        # Create input-target pairs using sliding window
        for i in range(0, len(token_ids) - max_length, stride):
            input_seq = token_ids[ i : i+max_length ]
            target_seq = token_ids[ i+1 : i+max_length+1 ]
            self.input_ids.append(torch.tensor(input_seq))
            self.target_ids.append(torch.tensor(target_seq))

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]

In [19]:
def create_data_loader(text, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDataset(text, tokenizer, max_length, stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        drop_last=drop_last # drops the last batch if it is shorter than the specified batch_size to prevent loss spikes during training
    )

    return dataloader

In [20]:
with open("Harry Potter and the Sorcerers Stone.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

# NOTE: If we set the stride equal to the input window size, we can prevent overlaps between the batches
dataloader = create_data_loader(raw_text, max_length=4, stride=2, shuffle=False)

data_iter = iter(dataloader)

first_batch = next(data_iter)

print(f"first batch: {first_batch}")

first batch: [tensor([[  44,  374,   13,  290],
        [  13,  290, 9074,   13],
        [9074,   13,  360, 1834],
        [ 360, 1834, 1636,   11]]), tensor([[ 374,   13,  290, 9074],
        [ 290, 9074,   13,  360],
        [  13,  360, 1834, 1636],
        [1834, 1636,   11,  286]])]


## Creating token embeddings

We initialize the embedding weights with random values as a preliminary step. This initialization serves as the starting point for the LLM's learning process. We will optimize the embedding weights later on.

In [21]:
inputs_ids = torch.tensor([2, 5, 3, 5])

vocab_size = 6 
output_dim = 3 # embedding size

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [22]:
embedding_layer(inputs_ids)

tensor([[ 1.2753, -0.2010, -0.1606],
        [-2.8400, -0.7849, -1.4096],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096]], grad_fn=<EmbeddingBackward0>)

## Encoding word positions

The self-attention mechanism , doesn't have a notion of position or order for the tokens within a sequence.
The way the previously introduced embedding layer works is that the same token ID always gets mapped to the same vector representation, regardless of where the token ID is
positioned in the input sequence.

Absolute positional embeddings are directly associated with specific positions in a sequence. For each position in the input sequence, a unique embedding is added to the
token's embedding to convey its exact location. For instance, the first token will have a specific positional embedding, the second token another distinct embedding, and so on

Instead of focusing on the absolute position of a token, the emphasis of relative positional embeddings is on the relative position or distance between tokens. **This means the model
learns the relationships in terms of "how far apart" rather than "at which exact position.**

In [24]:
vocab_size = tokenizer.n_vocab
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

max_length = 4

dataloader = create_data_loader(
    text=raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False
)

data_iter = iter(dataloader)

inputs, targets = next(data_iter)

print(f"Token IDs:\n {inputs}")
print(f"Input shape: {inputs.shape}")

Token IDs:
 tensor([[   44,   374,    13,   290],
        [ 9074,    13,   360,  1834],
        [ 1636,    11,   286,  1271],
        [ 1440,    11,  4389, 16809],
        [ 9974,    11,   547,  6613],
        [  284,   910,   326,   484],
        [  547,  7138,  3487,    11],
        [ 5875,   345,   845,   881]])
Input shape: torch.Size([8, 4])


In [25]:
token_embedding = token_embedding_layer(inputs)
token_embedding.shape

torch.Size([8, 4, 256])

The input to the pos_embeddings is usually a placeholder vector torch.arange(context_length), which contains a sequence of
numbers 0, 1, ..., up to the maximum input length âˆ’ 1. The context_length is a variable
that represents the supported input size of the LLM. Here, we choose it similar to the
maximum length of the input text. In practice, input text can be longer than the supported
context length, in which case we have to truncate the text.

In [26]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


As we can see, the positional embedding tensor consists of four 256-dimensional vectors.
We can now add these directly to the token embeddings, where PyTorch will add the 4x256-
dimensional pos_embeddings tensor to each 4x256-dimensional token embedding tensor in
each of the 8 batches:

In [27]:
input_embeddings = token_embedding + pos_embeddings

**While token embeddings provide consistent vector representations for
each token, they lack a sense of the token's position in a sequence. To
rectify this, two main types of positional embeddings exist: absolute and
relative. OpenAI's GPT models utilize absolute positional embeddings that
are added to the token embedding vectors and are optimized during the
model training.**