#Creating input target pairs

In [6]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to an unknown token.

    Args:
        vocab: dict mapping each token string to its integer id. To encode
            text containing unseen tokens it must include an unknown-token
            entry, either "|unk|" or "<|unk|>".
    """

    def __init__(self,vocab):
        self.str_to_int = vocab
        # Inverse lookup used by decode()
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self,text):
        """Split ``text`` into tokens and return their ids as a list of ints.

        Text is split on punctuation, "--" and whitespace; empty and
        whitespace-only pieces are dropped. Tokens missing from the
        vocabulary are replaced by the unknown token.

        Raises:
            KeyError: if an unknown token is needed but the vocabulary has
                no unknown-token entry.
        """
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = [piece.strip() for piece in pieces if piece.strip()]

        # Keep "|unk|" when the vocabulary defines it; otherwise fall back to
        # the conventional "<|unk|>" spelling instead of failing outright.
        unk = "|unk|" if "|unk|" in self.str_to_int else "<|unk|>"

        return [
            self.str_to_int[tok if tok in self.str_to_int else unk]
            for tok in tokens
        ]

    def decode(self,ids):
        """Join the tokens for ``ids`` with spaces and tidy up punctuation spacing.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # Remove the whitespace that the join put in front of punctuation.
        # ':' and ';' are included because encode() splits on them as well.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

In [7]:
# `import importlib` alone does not guarantee that the `metadata` submodule is
# loaded, so import the submodule explicitly before calling into it.
import importlib.metadata
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.11.0


In [8]:
# Load tiktoken's "gpt2" encoding; the cells below use it to encode and decode text
tokenizer = tiktoken.get_encoding("gpt2")

In [9]:
# Read the whole text file into memory, then report how many tokens it encodes to
with open(r"D:\Machine Learning\LLM from Scratch\the-verdict.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))


5145


#We will remove the first 50 tokens from the dataset for demonstration purposes, as it results in something more interesting

In [10]:
# Skip the first 50 token ids; the examples below work on the remainder
enc_sample = enc_text[50:]

In [11]:
#context_size is the number of tokens in each input sequence
context_size = 4

#The targets are the inputs shifted by one position:
#x covers positions [0, 1, 2, 3] of the sample and y covers positions [1, 2, 3, 4]
x, y = enc_sample[:context_size], enc_sample[1:context_size + 1]

print(f"x: {x}")
print(f"y:    {y}")

x: [290, 4920, 2241, 287]
y:    [4920, 2241, 287, 257]


# By processing the inputs along with the targets, which are the inputs shifted by one position, we can create the next-word prediction tasks:

In [12]:
# Each prefix of the window is a context; the token id right after it is the target
print("Four prediction Tasks: ")
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(context, "--->", desired)

Four prediction Tasks: 
[290] ---> 4920
[290, 4920] ---> 2241
[290, 4920, 2241] ---> 287
[290, 4920, 2241, 287] ---> 257


In [13]:
# The same prediction tasks, with the token ids decoded back to text
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(tokenizer.decode(context), "--->", tokenizer.decode([desired]))

 and --->  established
 and established --->  himself
 and established himself --->  in
 and established himself in --->  a


#Implementing the dataloader

In [14]:
#For an efficient data loader implementation we will use PyTorch's built-in Dataset and DataLoader classes

#Tokenize the entire text
#Use a sliding window to chunk the book into overlapping sequences of max_length
#Return the total number of rows in the dataset
#Return a single row from the dataset

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Dataset of (input, target) token-id windows cut from one text.

    The text is tokenized once. A window of ``max_length`` tokens is taken
    every ``stride`` tokens; the target for each window is the same window
    shifted one token to the right.

    Args:
        txt: text to tokenize.
        tokenizer: object whose ``encode(txt, allowed_special=...)`` returns
            a sequence of token ids.
        max_length: number of tokens in every input/target window.
        stride: distance in tokens between the starts of consecutive windows.
    """

    def __init__(self,txt,tokenizer,max_length,stride):
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window stays in bounds
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self,idx):
        """Return the ``(input_ids, target_ids)`` tensors for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]
    


In [33]:
#Build a PyTorch DataLoader that serves the GPTDatasetV1 input/target windows in batches
def create_dataloader_v1(txt,batch_size = 4,max_length = 256,stride = 128, shuffle = True, drop_last = True, num_worker = 0):
    """Tokenize ``txt`` with the "gpt2" encoding and wrap it in a DataLoader.

    Args:
        txt: text to tokenize.
        batch_size: passed to the DataLoader as ``batch_size``.
        max_length: window length passed to GPTDatasetV1.
        stride: window step passed to GPTDatasetV1.
        shuffle: passed to the DataLoader as ``shuffle``.
        drop_last: passed to the DataLoader as ``drop_last``.
        num_worker: passed to the DataLoader as ``num_workers``.

    Returns:
        A DataLoader over a GPTDatasetV1 built from ``txt``.
    """
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_worker,
    )
    windows = GPTDatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride)
    return DataLoader(windows, **loader_options)



In [34]:
# Read the full text of the-verdict.txt into raw_text
with open(r"D:\Machine Learning\LLM from Scratch\the-verdict.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

In [36]:
#Turn the dataloader into a Python iterator and fetch the first batch with the built-in next()
import torch
from torch.utils.data import Dataset, DataLoader
print("Pytorch version:", torch.__version__)
# Lowercase name: `Dataloader` differed from the imported DataLoader class only by case
dataloader = create_dataloader_v1(
    raw_text, batch_size = 1, max_length = 4, stride = 1, shuffle = False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

Pytorch version: 2.5.1+cu121
[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [37]:
#With batch_size > 1 each batch stacks several windows; stride equal to max_length means the inputs do not overlap
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n",inputs)
print("\nTargets:\n",targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
