## Working with Text Data

### Tokenizing Text

In [59]:
import torch
import tiktoken

In [62]:
from torch.utils.data import Dataset, DataLoader

In [99]:
class GPTDatasetV1(Dataset):
    """Map-style dataset of fixed-length token windows for next-token prediction.

    The text is tokenized once, up front, and cut into windows of
    ``max_length`` token ids whose starting points are ``stride`` tokens
    apart. Every sample pairs a window with the same window shifted one token
    to the right, so ``target[k]`` is the token that follows ``input[k]``.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sliceable sequence of token ids.
        max_length: Number of tokens in every input/target window.
        stride: Offset between the starts of consecutive windows.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        super().__init__()

        encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # A window starting at `start` consumes max_length + 1 tokens (the
        # extra one is the final target), hence the upper bound below.
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]

In [100]:
def create_dataloader_v1(text, batch_size, max_length, stride, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over GPT-2-tokenized sliding windows of ``text``.

    Args:
        text: Raw text to tokenize and window.
        batch_size: Number of windows per batch.
        max_length: Number of tokens in each window.
        stride: Offset between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``text``.
    """
    windows = GPTDatasetV1(
        text,
        tiktoken.get_encoding("gpt2"),
        max_length,
        stride,
    )
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [101]:
# Read the entire training text into memory as one string. The path is
# relative, so the file must be in the current working directory.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [102]:
# Embedding hyperparameters.
output_dims = 256       # width of every embedding vector
context_length = 1024   # number of rows in the positional table

# The token table needs one row per entry in the GPT-2 vocabulary.
tokenizer = tiktoken.get_encoding("gpt2")
vocab_size = tokenizer.n_vocab

# Token table first, then positional table: each layer draws its initial
# weights from the global RNG, so the creation order affects reproducibility.
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dims,
)
pos_embeddings_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dims,
)

In [103]:
# Shape of each mini-batch drawn from the dataloader.
max_length = 4   # tokens per sequence
batch_size = 8   # sequences per batch

In [104]:
# stride == max_length makes consecutive windows start exactly one window
# apart, so the input windows do not overlap.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=batch_size,
    max_length=max_length,
    stride=max_length,
)

In [109]:
# Only the first batch is needed to demonstrate the embedding pipeline.
# next(iter(...)) raises StopIteration on an empty loader instead of silently
# leaving `input_embeddings` undefined (or stale from an earlier cell run).
batch = next(iter(dataloader))
x, y = batch  # x: input token ids; y: the same windows shifted one token right

# Look up one embedding vector per token id.
token_embeddings = token_embedding_layer(x)

# One position index per column of x; deriving the length from the batch
# itself keeps the lookup aligned with the actual sequence width.
pos_embeddings = pos_embeddings_layer(torch.arange(x.shape[1]))

# Broadcasting adds the same positional vectors to every sequence in the batch.
input_embeddings = token_embeddings + pos_embeddings

In [110]:
# Expected shape: (batch_size, max_length, output_dims) = (8, 4, 256).
input_embeddings.shape

torch.Size([8, 4, 256])