In this notebook we will convert the raw text data into tokens

Tokens into embeddings

<img src="images/01.png" height="300px">

In [81]:
from importlib.metadata import version
import torch
import tiktoken
from torch.utils.data import Dataset, DataLoader
import re

print("Version of torch is :" , version("torch"))
print("Version of tiktoken is :" , version("tiktoken"))

Version of torch is : 2.5.1
Version of tiktoken is : 0.8.0


In [82]:
class SimpleTokenizer:
    def __init__ (self,path):
        self.path=path
        self.vocab=None
        self.generate_vocab()
        self.str_to_int = self.vocab
        self.int_to_str = {i:s for s,i in self.vocab.items()}
        
    def generate_vocab(self):
        raw_text=self.generate_rawtext()
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        all_words = sorted(set(preprocessed))
        self.vocab = {token:integer for integer,token in enumerate(all_words)}

    def generate_rawtext(self):
        with open(self.path,'r',encoding='utf-8') as file:
            raw_text=file.read()
            return raw_text
        
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'([,.:;?_!"()\']|--|\s)', r'\1', text)
        return text

In [83]:
costum_tokenizer = SimpleTokenizer("the-verdict.txt")

text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""

ids = costum_tokenizer.encode(text)
print(ids)
print(costum_tokenizer.decode(ids))

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
" It ' s the last he painted , you know , " Mrs . Gisburn said with pardonable pride .


BytePair Encoding

- GPT-2 uses BytePair encoding (BPE) as its tokenizer
- it allows the model to break down words that aren't in its predefined vocabulary into smaller subword units or even individual characters, enabling it to handle out-of-vocabulary words
- For instance, if GPT-2's vocabulary doesn't have the word "unfamiliarword," it might tokenize it as ["unfam", "iliar", "word"] or some other subword breakdown, depending on its trained BPE merges
- we are using the BPE tokenizer from OpenAI's open-source tiktoken library, which implements its core algorithms in Rust to improve computational performance


<img src="images/02.png" height="300px">

<img src="images/03.png" height="300px" width="600px">

In [87]:

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt)

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1] 
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [88]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=False,
                         num_workers=0):

    # Initialize the tokenizer
    #tokenizer = tiktoken.get_encoding("gpt2")
    tokenizer=costum_tokenizer

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [89]:
with open("the-verdict.txt",'r',encoding='utf-8') as file:
    raw_text=file.read()
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[  53,   44,  149, 1003],
        [  57,   38,  818,  115],
        [ 256,  486,    6, 1002],
        [ 115,  500,  435,  392],
        [   6,  908,  585, 1077],
        [ 709,  508,  961, 1016],
        [ 663, 1016,  535,  987],
        [   5,  568,  988,  538]])

Targets:
 tensor([[  44,  149, 1003,   57],
        [  38,  818,  115,  256],
        [ 486,    6, 1002,  115],
        [ 500,  435,  392,    6],
        [ 908,  585, 1077,  709],
        [ 508,  961, 1016,  663],
        [1016,  535,  987,    5],
        [ 568,  988,  538,  722]])
