In [2]:
import tiktoken

In [3]:
tokenizer=tiktoken.get_encoding("gpt2")

In [4]:
with open("Data/02 Harry Potter and the Chamber of Secrets.txt","r",encoding="utf-8") as f:
    raw_text=f.read()

In [5]:
enc_text=tokenizer.encode(raw_text)

In [6]:
enc_text[:50]

[3673,
 329,
 262,
 717,
 640,
 11,
 281,
 4578,
 550,
 5445,
 503,
 625,
 12607,
 379,
 1271,
 1440,
 11,
 4389,
 16809,
 9974,
 13,
 1770,
 13,
 27820,
 360,
 1834,
 1636,
 550,
 587,
 266,
 4233,
 287,
 262,
 1903,
 2250,
 286,
 262,
 3329,
 416,
 257,
 7812,
 11,
 289,
 12494,
 7838,
 422,
 465,
 26301,
 5850,
 447]

In [7]:
tokenizer.decode(enc_text[:50])

'Not for the first time, an argument had broken out over breakfast at number four, Privet Drive. Mr. Vernon Dursley had been woken in the early hours of the morning by a loud, hooting noise from his nephew Harryï¿½'

In [8]:
context_size=4

x=enc_text[0:context_size]
y=enc_text[1:context_size+1]

print(f"x: {x}")
print(f"y: {y}")

x: [3673, 329, 262, 717]
y: [329, 262, 717, 640]


In [9]:
for i in range(0,context_size):
    print(f"{enc_text[:i+1]} ==> {enc_text[i+1]}")

[3673] ==> 329
[3673, 329] ==> 262
[3673, 329, 262] ==> 717
[3673, 329, 262, 717] ==> 640


In [10]:
for i in range(0,context_size):
    current=enc_text[:i+1]
    next_=enc_text[i+1]
    print(f"{tokenizer.decode(current)} ==> {tokenizer.decode([next_])}")

Not ==>  for
Not for ==>  the
Not for the ==>  first
Not for the first ==>  time


In [11]:
import torch

In [12]:
from torch.utils.data import Dataset

class GptDataset(Dataset):
    def __init__(self,text,tokenizer,max_length,stride):
        self.input_ids=[]
        self.target_ids=[]

        tokens=tokenizer.encode(text,allowed_special={"<|endoftext|>"})

        for i in range(0,len(tokens)-max_length,stride):
            input_id=tokens[i:i+max_length]
            target_id=tokens[i+1:i+max_length+1]

            self.input_ids.append(torch.Tensor(input_id))
            self.target_ids.append(torch.Tensor(target_id))
        
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):
        return self.input_ids[index],self.target_ids[index]



In [13]:
from torch.utils.data import DataLoader

def create_dataloaders(text,tokenizer,batch_size=4,max_length=256,stride=128,
                       shuffle=True,drop_last=True,num_workers=0):
        dataset=GptDataset(text=text,tokenizer=tokenizer,max_length=max_length,stride=stride)

        dataloader=DataLoader(dataset,
                              batch_size=batch_size,
                              shuffle=shuffle,
                              drop_last=drop_last,
                              num_workers=num_workers)
        
        return dataloader


In [14]:
with open("Data/03 Harry Potter and the Prisoner of Azkaban.txt","r",encoding="utf-8") as f:
        text_corpus=f.read()

In [15]:
dataloader=create_dataloaders(text_corpus,tokenizer, batch_size=1, max_length=4, stride=1, shuffle=False)

In [16]:
iter_data=iter(dataloader)

In [17]:
iter_data

<torch.utils.data.dataloader._SingleProcessDataLoaderIter at 0x25427659310>

In [18]:
first_batch=next(iter_data)

In [19]:
first_batch

[tensor([[18308., 14179.,   373.,   257.]]),
 tensor([[14179.,   373.,   257.,  4047.]])]