In [4]:
# Read the whole short story into memory as a single string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Quick sanity check: overall size and the opening characters.
print("Total number of characters:", len(raw_text))
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


# Tokenizing text

In [5]:
import re

# Split on single whitespace characters; the capture group keeps every
# separator in the resulting list next to the words.
text = "Hello, world. This, is a test."
whitespace_splitter = re.compile(r"(\s)")
result = whitespace_splitter.split(text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


split by punctuation as well

In [6]:
result = re.split(r"([,.]|\s)", text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


Remove empty and whitespace-only entries

In [7]:
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


handling all types of special characters

In [8]:
# Split on the full punctuation set, on "--" and on whitespace, then drop the
# empty / whitespace-only pieces that the split leaves behind.
text = "Hello, world. Is this-- a test?"
pieces = re.split(r"([,.:;?_!\"()\']|--|\s)", text)
result = [stripped for stripped in map(str.strip, pieces) if stripped]
result

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']

Apply this tokenization on the text of interest

In [9]:
# Tokenize the whole story: split on punctuation, "--" and whitespace, then
# keep only the non-blank pieces (stripped of surrounding whitespace).
raw_pieces = re.split(r"([,;.:?_!\"()']|--|\s)", raw_text)
preprocessed = [piece for piece in map(str.strip, raw_pieces) if piece]
print(len(preprocessed))

4690


In [10]:
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


# Converting tokens into token IDs

Building vocabulary for token to integer mapping

In [11]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


In [12]:
# Map every unique token to its index in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

# Preview the first 11 vocabulary entries (keys only).
for item in list(vocab)[:11]:
    print(item)

!
"
'
(
)
,
--
.
:
;
?


*Apply vocabulary to convert new text into token ids*
**Tokenizer Class**

In [13]:
class SimpleTokenizerV1:
    """
    Minimal regex-based tokenizer backed by a fixed vocabulary.

    ``encode`` splits text into tokens and carries out the string-to-integer
    mapping to produce token IDs via the vocabulary; ``decode`` maps token IDs
    back to tokens and joins them into a string.

    Args:
        vocab: dict mapping token strings to integer IDs.
    """

    # Split on punctuation (including ':' and ';', which the vocabulary-building
    # split also uses), on "--", and on whitespace. The capture group keeps the
    # separators so punctuation becomes tokens of its own.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Whitespace directly before punctuation; removed again when decoding.
    _SPACE_BEFORE_PUNCT = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Return the list of token IDs for ``text``.

        Raises:
            KeyError: if a token of ``text`` is not in the vocabulary.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        # Drop empty strings and whitespace-only separators.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces, with
        the space before punctuation removed.

        Raises:
            KeyError: if an ID is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove spaces before the specified punctuation.
        return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)

In [14]:
tokenizer = SimpleTokenizerV1(vocab=vocab)
text = """"it's the last he painted, you know,"
        Mrs. Gisburn said with pardonable pride"""

ids = tokenizer.encode(text)
print(ids)

[1, 585, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793]


In [15]:
print(tokenizer.decode(ids))

" it' s the last he painted, you know," Mrs. Gisburn said with pardonable pride


In [16]:
# Apply encoding to a new text sample not contained in the training set.
# "Hello" is not in the vocabulary, so encode() raises a KeyError; catch and
# display it so the notebook still runs from top to bottom.
text = "Hello, do you like tea?"
try:
    print(tokenizer.encode(text))
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'Hello'

# Adding special context tokens

We need to modify the tokenizer to handle **unknown words**. We also need to address the usage and addition of **special context tokens** that can enhance a model's understanding of context or other relevant information in the text

In [17]:
# Rebuild the vocabulary with two special tokens appended after the sorted
# corpus tokens, so they receive the two largest IDs.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
vocab = {token: token_id for token_id, token in enumerate(all_tokens)}

print(len(vocab))

1132


In [18]:
# Show the last five (token, id) pairs of the vocabulary; the loop index was
# unused, so iterate over the items directly.
for item in list(vocab.items())[-5:]:
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [19]:
class SimpleTokenizerV2:
    """
    Regex-based tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    Args:
        vocab: dict mapping token strings to integer IDs. It must contain
            "<|unk|>" for texts with unknown tokens to be encodable.
    """

    # Split on punctuation (including ':' and ';', which the vocabulary-building
    # split also uses), on "--", and on whitespace. The capture group keeps the
    # separators so punctuation becomes tokens of its own.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Whitespace directly before punctuation; removed again when decoding.
    _SPACE_BEFORE_PUNCT = r'\s+([,.;:?!"()\'])'

    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Return the list of token IDs for ``text``.

        Tokens that are not in the vocabulary are replaced by "<|unk|>"
        before the lookup.

        Raises:
            KeyError: if an unknown token occurs and the vocabulary has no
                "<|unk|>" entry.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        # Drop empty strings and whitespace-only separators.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        tokens = [
            token if token in self.str_to_int else "<|unk|>"
            for token in tokens
        ]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces, with
        the space before punctuation removed.

        Raises:
            KeyError: if an ID is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove spaces before the specified punctuation.
        return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)

In [20]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [21]:
tokenizer = SimpleTokenizerV2(vocab=vocab)
print(tokenizer.encode(text))

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


In [22]:
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


Additional special tokens:
- <code>[BOS]</code> *Beginning of sequence*
- <code>[EOS]</code> *End of sequence*
- <code>[PAD]</code> *Padding*

# Byte Pair Encoding

In [23]:
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.9.0


In [24]:
# instantiate the BPE tokenizer
tokenizer = tiktoken.get_encoding("gpt2")

In [25]:
# NOTE: the two adjacent string literals below are concatenated as-is, with no
# space between "terraces" and "of", so the text contains "terracesof".
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

# allowed_special lets "<|endoftext|>" in the text be encoded as the special token
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [26]:
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


2 Observations:
1. The <|endoftext|> token is assigned a relatively large token ID (50256).
   The original model used in ChatGPT has a total vocabulary size of 50,257, with <|endoftext|> being assigned the largest token ID.
2. The BPE tokenizer encodes and decodes unknown words correctly.
   

In [27]:
# exercise
temp = "Akwirw ier."
print(tokenizer.encode(temp))
print(tokenizer.decode(tokenizer.encode(temp)))

[33901, 86, 343, 86, 220, 959, 13]
Akwirw ier.


# Data sampling with a sliding window

In [28]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text)) 

5145


In [29]:
# we remove the first 50 tokens because Sebastian said so
enc_sample = enc_text[50:]

In [30]:
context_size = 4 # this determines how many tokens are included in the input
x = enc_sample[:context_size]
y = enc_sample[1: context_size+1]
print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [31]:
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "---->", desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [32]:
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


Before we can turn the tokens into embeddings, we need to implement an efficient data loader that iterates over the input dataset and returns the inputs and targets as PyTorch tensors.
Note: for an efficient data loader implementation, we will use PyTorch's built-in *Dataset* and *DataLoader* classes.

In [33]:
import torch
from torch.utils.data import Dataset, DataLoader

In [38]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID chunks.

    The text is tokenized once. Each sample is a window of ``max_length``
    token IDs, and its target is the same window shifted one position to the
    right.

    Args:
        txt: text to tokenize.
        tokenizer: object with an ``encode(txt)`` method returning a sequence
            of integer token IDs.
        max_length: number of token IDs per chunk; must be positive.
        stride: offset between the starts of consecutive windows; must be
            positive.

    Raises:
        ValueError: if ``max_length`` or ``stride`` is not positive.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Fail loudly instead of silently building an empty or malformed
        # dataset (or hitting range()'s zero-step error).
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)  # tokenize the entire text

        # Stop early enough that the shifted target window stays in range.
        for start in range(0, len(token_ids) - max_length, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [39]:
def create_dataloader_v1(txt, batch_size=4, max_lenght=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0, *, max_length=None):
    """Build a DataLoader of sliding-window (input, target) batches from ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        txt: raw text to tokenize.
        batch_size: number of samples per batch.
        max_lenght: number of token IDs per sample. The misspelled name is
            kept so existing keyword callers keep working.
        stride: offset between the starts of consecutive samples.
        shuffle: forwarded to ``DataLoader``.
        drop_last: forwarded to ``DataLoader``.
        num_workers: forwarded to ``DataLoader``.
        max_length: keyword-only, correctly spelled alias for ``max_lenght``;
            when given, it takes precedence.

    Returns:
        The ``DataLoader`` over the resulting dataset.
    """
    if max_length is None:
        max_length = max_lenght

    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt=txt, tokenizer=tokenizer,
                           max_length=max_length, stride=stride)
    return DataLoader(dataset=dataset,
                      batch_size=batch_size,
                      shuffle=shuffle,
                      drop_last=drop_last,
                      num_workers=num_workers)

In [40]:
batch_size = 1
max_length = 4

with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

dataloader = create_dataloader_v1(txt=raw_text,
                                  batch_size=batch_size,
                                  max_lenght=max_length,
                                  stride=1,
                                  shuffle=False)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [42]:
# understand the meaning of stride
second_batch = next(data_iter)
print(second_batch)
# the stride setting dictates the number of positions the inputs shift across batches,
# emulating a sliding window approach

[tensor([[2885, 1464, 1807, 3619]]), tensor([[1464, 1807, 3619,  402]])]


In [44]:
# how can we use the dataloader to sample with a batch size greater than 1
dataloader = create_dataloader_v1(
    txt=raw_text,
    batch_size=8,
    max_lenght=4,
    stride=4,
    shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs: \n", inputs)
print("\nTargets: \n", targets)

# We now increase the stride to 4 to utilize the dataset fully (we don't skip a single word).
# This also avoids any overlap between the batches, since more overlap could lead to increased overfitting.

Inputs: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets: 
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


# Create token embeddings

In [46]:
# raw_text --> tokenization --> converting tokens to IDs --> converting IDs into embedding vectors
# NOTE: A minor shortcoming of LLMs is that their self-attention mechanism doesn't have a notion
# of position or order for the tokens within a sequence.
# The way the previously introduced embedding layer works is that the same token ID always gets mapped to the same vector representation,
# regardless of where the token ID is positioned in the input sequence.

In [47]:
vocab_size = 50_257 # BPE tokenizer has a vocabulary size of 50,257 IDs
output_dim = 256 # embedding size, GPT-3 embedding size is 12,288
token_embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dim)

In [49]:
# using the previous token_embedding_layer, if we sample data from the data loader, 
# we embed each token in each batch into a 256-dimensional vector
# batch_size of 8, with 4 tokens each, the result will be an 8 X 4 X 256 tensor

max_length = 4

dataloader = create_dataloader_v1(txt=raw_text,
                                  batch_size=8,
                                  max_lenght=max_length,
                                  stride=max_length,
                                  shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs: \n", inputs)
print("\nInputs shape: \n", inputs.shape)

Token IDs: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape: 
 torch.Size([8, 4])


In [51]:
# Let's now use the embedding layer to embed these token IDs into 256-dimensional vectors

token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [53]:
# For a GPT model's absolute embedding approach, we just need to create another embedding layer that has the same embedding dimension as the token_embedding_layer
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(num_embeddings=context_length, embedding_dim=output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [54]:
# We can now add these directly to the token embeddings, where PyTorch will add the 4x256-dimensional pos_embeddings tensor
# to each 4x256-dimensional token embedding tensor in each of the eight batches
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
