In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

import tiktoken

In [2]:
# Report whether CUDA is usable and, if so, the name of the current device.
# get_device_name() raises when no CUDA device is present, so it is only
# queried when is_available() is True; otherwise None is shown in its place.
cuda_available = torch.cuda.is_available()
cuda_available, (torch.cuda.get_device_name() if cuda_available else None)

(True, 'NVIDIA T1200 Laptop GPU')

In [3]:
# Read the whole text file into memory as a single UTF-8 decoded string.
with open("./the_things.txt", encoding="utf-8") as file:
    raw_text = file.read()

# Bare last expression: the cell displays the number of characters read.
len(raw_text)

39181

In [4]:
# Load tiktoken's "gpt2" encoding; its n_vocab is later used as the model's vocab_size in GPT_CONFIG.
tokenizer = tiktoken.get_encoding("gpt2")

In [5]:
# Encode the full text into token ids with the tokenizer loaded above.
token_ids = tokenizer.encode(raw_text)
# Bare last expression: the cell displays the total number of tokens.
len(token_ids)

9275

In [6]:
class BookDataset(Dataset):
    """Sliding-window next-token dataset over a flat sequence of token ids.

    Each item is an ``(input_ids, target_ids)`` pair of 1-D tensors of length
    ``max_length``, where ``target_ids`` is the input window shifted one
    position to the right.

    Args:
        token_ids: Sequence of integer token ids supporting ``len()`` and
            slicing (e.g. a list).
        max_length: Number of tokens in every input/target window; must be > 0.
        stride: Offset between the starts of consecutive windows; must be > 0.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.

    Note:
        Only full windows are kept, so a sequence with fewer than
        ``max_length + 1`` tokens produces an empty dataset.
    """

    def __init__(self, token_ids, max_length, stride):
        # Fail loudly: a negative stride would otherwise silently yield an empty
        # dataset, and a non-positive max_length empty or misaligned windows.
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Stopping before len(token_ids) - max_length guarantees that the
        # one-token-shifted target window never runs past the end.
        for start in range(0, len(token_ids) - max_length, stride):
            end = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:end]))
            self.target_ids.append(torch.tensor(token_ids[start + 1 : end + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]

In [7]:
# Windows of 256 tokens whose starts are 128 tokens apart, so consecutive windows overlap by half.
book_ds = BookDataset(token_ids, max_length=256, stride=128)

In [8]:
# Batches of 4 windows in shuffled order; drop_last discards a trailing
# batch with fewer than 4 items so every batch has the same size.
# A dedicated seeded generator makes the shuffle order reproducible across
# kernel restarts without touching the global torch RNG state.
book_loader = DataLoader(
    book_ds,
    batch_size=4,
    shuffle=True,
    drop_last=True,
    generator=torch.Generator().manual_seed(42),
)

In [10]:
# Hyperparameters for the GPT model. The code that consumes this dict is not
# visible in this part of the notebook, so the per-key notes are hedged.
GPT_CONFIG = {
    "vocab_size": tokenizer.n_vocab,  # taken from the tokenizer so the two stay in sync
    # NOTE(review): BookDataset above is built with 256-token windows while this
    # is 1024 — confirm the mismatch is intentional.
    "context_length": 1024,
    "embed_dim": 768,  # presumably split across n_heads (768 / 12 = 64 per head) — TODO confirm
    "n_heads": 12,
    "n_layers": 12,
    "dropout_rate": 0.1,
    "qkv_bias": False,  # presumably toggles bias on the query/key/value projections — TODO confirm
}