In [25]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
with open('./archive/theverdict.txt') as f:
    raw_text = f.read()
    
tiktokenizer = tiktoken.encoding_for_model("gpt2")
tokens = tiktokenizer.encode(raw_text)
len(tokens)

5145

In [None]:
context_limit = 10

# Preview the next-token prediction task: for each prefix length i, the first
# i tokens form the context (X) and the token right after them is the target (y).
for i in range(1, context_limit + 1):
    X = tokens[:i]
    y = tokens[i]
    context_text = tiktokenizer.decode(X)
    # decode() takes a list of ids, so the single target id is wrapped in a list.
    target_text = tiktokenizer.decode([y])
    print(context_text, "  >  ", target_text)

I   >    H
I H   >   AD
I HAD   >    always
I HAD always   >    thought
I HAD always thought   >    Jack
I HAD always thought Jack   >    G
I HAD always thought Jack G   >   is
I HAD always thought Jack Gis   >   burn
I HAD always thought Jack Gisburn   >    rather
I HAD always thought Jack Gisburn rather   >    a


In [13]:
class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs.

    The text is tokenized once and then cut into windows of ``max_length``
    tokens whose start positions are ``stride`` tokens apart. Each target
    window is its input window shifted one token to the right, so position
    ``k`` of the target is the token that follows position ``k`` of the input.

    A text that encodes to ``max_length`` tokens or fewer yields an empty
    dataset, because no window has a full shifted target.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` and
            returning a sliceable sequence of integer token ids.
        max_length: Number of tokens in every input/target window (>= 1).
        stride: Distance between the starts of consecutive windows (>= 1).

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.
    """

    def __init__(self, txt: str, tokenizer, max_length: int, stride: int):
        # Reject sizes that would otherwise silently build empty or
        # degenerate samples (e.g. max_length=0 yields empty tensors).
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that the target slice (shifted by one) never runs
        # past the end of token_ids.
        for start in range(0, len(token_ids) - max_length, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self) -> int:
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [26]:
# Windows of 4 tokens with a stride of 4, so consecutive inputs do not overlap.
dataset = GPTDataset(
    raw_text,
    tiktokenizer,
    max_length=4,
    stride=4,
)

# Keep the original order (no shuffling), drop a trailing batch that has fewer
# than 8 samples, and load in the main process (no worker subprocesses).
dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=False,
    drop_last=True,
    num_workers=0,
)

In [33]:
# Peek at the first batch only: show its first input window next to the
# matching target window, then stop iterating.
for chunk in dataloader:
    input_data, target = chunk
    first_input, first_target = input_data[0], target[0]
    print(first_input, first_target)
    break

tensor([  40,  367, 2885, 1464]) tensor([ 367, 2885, 1464, 1807])
