# Chapter 2, part 2 - Data loader for LLM pretraining
In the previous part we implemented tokenizers that transform raw text into integer token IDs for further processing. In this part we look at how to build a PyTorch dataset from a text and how to create data loaders that produce self-labeled (input, target) pairs.


In [6]:
# Use tiktoken as the tokenizer and print the installed package version.
import tiktoken
from importlib.metadata import version
print("tiktoken version:", version("tiktoken"))

# Fetch the encoding named "gpt2"; later cells use it to encode and decode text.
tokenizer = tiktoken.get_encoding("gpt2")


tiktoken version: 0.7.0


In [7]:
# Read the whole text file and convert it to token IDs.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
# Print how many tokens the encoded text contains.
print(len(enc_text))

5146


In [8]:
# Drop the first 50 tokens so the sample starts at token index 50
# (the original note considers the part after them more interesting).
enc_sample = enc_text[50:]
print(len(enc_sample))

5096


One of the easiest and most intuitive ways to create the input-target pairs for the next-word prediction task is to create two variables, x and y, where x contains the input tokens and y contains the targets, which are the inputs shifted by 1:

In [13]:
# Number of tokens in one input window.
context_size = 4
# x is the first context_size tokens; y is the same window shifted right by one token.
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")

# Visualize the prediction tasks contained in one (x, y) pair:
# each growing prefix of the window is the context, and the token
# immediately after that prefix is the desired prediction.
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]
 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


A PyTorch Dataset class and a DataLoader method to load training data

In [17]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once, then cut into windows of ``max_length``
    tokens whose start positions are ``stride`` tokens apart. For each
    window the target is the same window shifted right by one token, so
    ``target[k]`` is the token that follows ``input[k]`` in the text.

    Args:
        txt: Text to tokenize.
        tokenizer: Object with an ``encode(txt)`` method that returns a
            sequence of integer token IDs.
        max_length: Number of tokens in each input/target window
            (the ``context_size`` of the earlier cell). Must be >= 1.
        stride: Distance in tokens between the starts of consecutive
            windows. Must be >= 1.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.

    Note:
        A text with ``max_length`` tokens or fewer yields an empty dataset,
        because every window needs one extra token for the shifted target.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject sizes that would otherwise fail obscurely (stride == 0 makes
        # range() raise) or silently produce empty/meaningless windows.
        if max_length < 1:
            raise ValueError(
                f"max_length must be a positive integer, got {max_length}"
            )
        if stride < 1:
            raise ValueError(
                f"stride must be a positive integer, got {stride}"
            )

        self.input_ids = []
        self.target_ids = []

        # Convert the whole text to token IDs once.
        token_ids = tokenizer.encode(txt)

        # Stop max_length tokens before the end so that the shifted target
        # window (start + 1 .. start + max_length) is always complete.
        for start in range(0, len(token_ids) - max_length, stride):
            end = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:end]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:end + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors of window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]
    

def create_dataloader_v1(txt,
                         batch_size=4,
                         max_length=256,
                         stride=128,
                         shuffle=True,
                         drop_last=True,
                         num_workers=0) -> DataLoader:
    """Build a DataLoader over sliding-window samples of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``; the resulting dataset is then served by a PyTorch
    ``DataLoader``.

    Args:
        txt: Text to turn into training samples.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length in tokens, forwarded to ``GPTDatasetV1``.
        stride: Step in tokens between consecutive windows, forwarded to
            ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(
        GPTDatasetV1(txt, gpt2_encoding, max_length, stride),
        **loader_options,
    )

Now let's use the above code

In [21]:
# Re-read the text so this cell can run on its own.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# batch_size=1 and shuffle=False keep the printed batches easy to follow;
# with stride=1 consecutive windows start one token apart.
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
# Print the first two batches; each one holds an input tensor and a target tensor.
first_batch = next(data_iter)
print(first_batch)
second_batch = next(data_iter)
print(second_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]
