In [1]:
import torch
import tiktoken

In [2]:
# Read the story with an explicit encoding so decoding does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes Amontillado.txt is UTF-8 (or plain ASCII) — confirm.
with open("Amontillado.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Display the first 50 characters as a quick sanity check of the load.
raw_text[:50]

'The thousand injuries of Fortunato I had borne as '

In [3]:
# Load tiktoken's "gpt2" encoding; later cells use it to encode and decode text.
tokenizer = tiktoken.get_encoding("gpt2")


In [4]:
# Encode the full text into a sequence of integer token IDs.
enc_text = tokenizer.encode(raw_text)

In [5]:
# Show the first 20 token IDs.
print(enc_text[:20])

[464, 7319, 6821, 286, 376, 1922, 5549, 314, 550, 28068, 355, 314, 1266, 714, 26, 475, 618, 339, 44716, 2402]


In [6]:
# Decode the first 20 token IDs back into text.
print(tokenizer.decode(enc_text[:20]))

The thousand injuries of Fortunato I had borne as I best could; but when he ventured upon


In [13]:
# Illustrate next-token prediction: for each position, the context is every
# token before it and the target is the single token at that position.
for pos in range(1, 10):
    context_text = tokenizer.decode(enc_text[:pos])
    target_text = tokenizer.decode([enc_text[pos]])
    print("INPUT:", context_text, "TARGET:", target_text)

INPUT: The TARGET:  thousand
INPUT: The thousand TARGET:  injuries
INPUT: The thousand injuries TARGET:  of
INPUT: The thousand injuries of TARGET:  F
INPUT: The thousand injuries of F TARGET: ortun
INPUT: The thousand injuries of Fortun TARGET: ato
INPUT: The thousand injuries of Fortunato TARGET:  I
INPUT: The thousand injuries of Fortunato I TARGET:  had
INPUT: The thousand injuries of Fortunato I had TARGET:  borne


## Data Set 

In [9]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once, then cut into windows of ``max_length``
    tokens whose start positions are ``stride`` tokens apart. For every
    window the target is the same span shifted one token to the right, so
    ``target[k]`` is the token that follows ``input[k]`` in the text.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token IDs.
        max_length: Number of tokens in each input/target window; must be > 0.
        stride: Distance, in tokens, between the starts of consecutive
            windows; must be > 0.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
        AssertionError: If the text yields ``max_length`` tokens or fewer, so
            not even one full input/target pair can be formed.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Fail fast on window parameters that would otherwise yield a silently
        # empty dataset (negative stride) or empty tensors (max_length == 0).
        if max_length <= 0:
            raise ValueError(f"max_length must be a positive integer, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Use a sliding window to chunk the book into overlapping sequences of max_length.
        # Stopping at len(token_ids) - max_length guarantees the shifted target
        # window never runs past the end of the token list.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]



## Dataloader

In [14]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over sliding-window token chunks of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``; the remaining options are handed to ``DataLoader``.

    Args:
        txt: Raw text to tokenize and chunk.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length in tokens, forwarded to ``GPTDatasetV1``.
        stride: Step between window starts, forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` whose items come from ``GPTDatasetV1``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    return DataLoader(
        windowed_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [11]:
# Non-overlapping windows (stride == max_length) with shuffling disabled, so
# the first batch follows the order of the text.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

# Fetch only the first batch and show both halves of each pair.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Inputs:\n", inputs)
print("\nTargets:\n", targets)


Inputs:
 tensor([[  464,  7319,  6821,   286],
        [  376,  1922,  5549,   314],
        [  550, 28068,   355,   314],
        [ 1266,   714,    26,   475],
        [  618,   339, 44716,  2402],
        [13277,    11,   314, 19982],
        [15827,    13,   921,    11],
        [  508,   523,   880,   760]])

Targets:
 tensor([[ 7319,  6821,   286,   376],
        [ 1922,  5549,   314,   550],
        [28068,   355,   314,  1266],
        [  714,    26,   475,   618],
        [  339, 44716,  2402, 13277],
        [   11,   314, 19982, 15827],
        [   13,   921,    11,   508],
        [  523,   880,   760,   262]])


## Token Embeddings (Vectors)

In [None]:
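# The tiktoken "gpt2" encoding used in this notebook has a vocabulary size of 50257 (see tokenizer.n_vocab); the 100256 figure corresponds to the larger "cl100k_base" encoding, not this tokenizer.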
