In [1]:
import PyPDF2

# Pull the text out of every page of the PDF and stitch it into one string.
with open("The_Name_of_the_Wind.pdf", "rb") as f:
    reader = PyPDF2.PdfReader(f)
    page_texts = []
    for page in reader.pages:
        page_texts.append(page.extract_text())
    raw_text = "".join(page_texts)

# Quick sanity check: total size plus a peek at the opening characters.
print("Total number of characters:", len(raw_text))
print(raw_text[:99])


Total number of characters: 1394574
Table of Contents
 
 
Title Page
Copyright Page
Dedication
 
CHAPTER ONE - A Place for Demons
CHAPT


In [2]:
import tiktoken
# Load tiktoken's "gpt2" encoding; later cells use it to encode and decode text.
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
# Encode the whole extracted text into token ids and report how many there are.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

419127


In [4]:
# Drop the first 50 token ids; the examples below work on the remainder.
# NOTE(review): 50 is an unexplained offset — presumably it skips front matter; confirm.
enc_sample = enc_text[50:]


In [5]:
# Number of tokens the model receives as input for one example.
context_size = 4

# The target is the input shifted one position to the right, so every input
# token is paired with the token that follows it. With token positions
# [1, 2, 3, 4, 5]: x covers [1, 2, 3, 4] and y covers [2, 3, 4, 5].
x = enc_sample[0:context_size]
y = enc_sample[1:1 + context_size]

print(f"x: {x}")
print(f"y:      {y}")

x: [3791, 9624, 198, 41481]
y:      [9624, 198, 41481, 376]


In [6]:
# Grow the context one token at a time; the token right after it is the label.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(context, "---->", desired)

[3791] ----> 9624
[3791, 9624] ----> 198
[3791, 9624, 198] ----> 41481
[3791, 9624, 198, 41481] ----> 376


In [7]:
# Same context/label pairs as above, decoded back to text for readability.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    # The single target id is wrapped in a list before being decoded.
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

New ----> arre
Newarre ----> 

Newarre
 ----> CHAPTER
Newarre
CHAPTER ---->  F


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs.

    The text is tokenized once. A window of ``max_length`` tokens is then
    moved across the token ids in steps of ``stride``; each sample's target
    is its input shifted one token to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object whose ``encode(txt, allowed_special=...)`` returns
            a sequence of integer token ids.
        max_length: Number of tokens in every input and target; must be >= 1.
        stride: Distance between the starts of consecutive windows; must be
            >= 1. A stride smaller than ``max_length`` gives overlapping
            samples.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is less than 1.

    Note:
        A text with ``max_length`` tokens or fewer produces an empty dataset,
        because no window leaves room for the shifted target.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Fail early with a clear message instead of silently building an
        # empty or degenerate dataset (or hitting range()'s zero-step error).
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Window starts stop early enough that the target (input shifted by
        # one token) never runs past the end of token_ids.
        for start in range(0, len(token_ids) - max_length, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of (input, target) samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]