# Sample Data Preprocessing

In [None]:
import tiktoken

# Load the sample dataset: read the whole text file into memory as one string.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Initialize the tokenizer using tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

# Encode the full sample text into token IDs and print how many IDs were produced.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [None]:
# Implement the dataset class.
# This class implementation is part of creating "input-target" pairs.
# This class creates input and output tensors (multi-dimensional matrices).

import torch 
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensor pairs.

    The text is encoded once with ``tokenizer``. Each sample pairs a window
    of ``max_length`` token IDs with the same window shifted one position to
    the right. Consecutive windows start ``stride`` tokens apart, and only
    windows whose shifted target is also full-length are kept.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # "<|endoftext|>" is passed to the encoder as an allowed special token.
        encoded = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stopping before len(encoded) - max_length leaves room for the
        # one-token look-ahead that every target window needs.
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of stored windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input, target) tensor pair stored at position ``idx``."""
        inputs = self.input_ids[idx]
        targets = self.target_ids[idx]
        return inputs, targets

In [5]:
# Implement the data loader function.
# To return input and output tensors, DataLoader calls the __getitem__ method of GPTDatasetV1.
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over sliding-window samples of ``txt``.

    The text is encoded with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1`` using ``max_length`` and ``stride``. The remaining
    arguments (``batch_size``, ``shuffle``, ``drop_last``, ``num_workers``)
    are forwarded unchanged to ``DataLoader``.

    Returns:
        The configured ``DataLoader`` instance.
    """
    # Tokenizer used to turn the raw text into token IDs.
    gpt2_encoding = tiktoken.get_encoding("gpt2")

    # Dataset of (input, target) windows cut from the encoded text.
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    # Batching options, passed through exactly as given by the caller.
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)