In [29]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

In [30]:
tokenizer = tiktoken.get_encoding("gpt2")

In [31]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
enc_sample = enc_text[50:]

In [32]:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]
print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [33]:
enc_sample = torch.tensor(enc_sample)
enc_sample

tensor([ 290, 4920, 2241,  ...,  286, 1242,  526])

In [34]:
enc_sample.unfold(0, context_size, 1)

tensor([[ 290, 4920, 2241,  287],
        [4920, 2241,  287,  257],
        [2241,  287,  257, 4489],
        ...,
        [ 803,  674, 1611,  286],
        [ 674, 1611,  286, 1242],
        [1611,  286, 1242,  526]])

In [35]:
# Create token sequences using PyTorch unfold
token_sequences = enc_sample.unfold(0, context_size, 1)

# Create data pairs
data = list(zip(token_sequences[:-1], token_sequences[1:]))

In [36]:
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(txt)
        token_ids = token_ids
        token_ids = torch.tensor(token_ids)
        token_sequences = token_ids.unfold(0, max_length, stride)
        self.input_ids = token_sequences[:-1]
        self.target_ids = token_sequences[1:]

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [37]:
def create_dataloader_v1(
        txt, batch_size=4, max_length=256, 
        stride=128, shuffle=True, drop_last=True,
        num_workers=0,
    ):
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    return DataLoader(
        dataset, 
        batch_size=batch_size, 
        shuffle=shuffle,
        drop_last=drop_last,
    )

In [38]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

dataloader = create_dataloader_v1(raw_text[50:], batch_size=8, max_length=4, stride=2, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[ 268, 3754,  438, 2016],
        [ 438, 2016,  257,  922],
        [ 257,  922, 5891, 1576],
        [5891, 1576,  438,  568],
        [ 438,  568,  340,  373],
        [ 340,  373,  645, 1049],
        [ 645, 1049, 5975,  284],
        [5975,  284,  502,  284]]), tensor([[ 438, 2016,  257,  922],
        [ 257,  922, 5891, 1576],
        [5891, 1576,  438,  568],
        [ 438,  568,  340,  373],
        [ 340,  373,  645, 1049],
        [ 645, 1049, 5975,  284],
        [5975,  284,  502,  284],
        [ 502,  284, 3285,  326]])]


In [23]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[4920, 2241,  287,  257]]), tensor([[2241,  287,  257, 4489]])]
