# Data Loading Pipeline

- We use a short text instead of thousands of GB of data so that the pipeline can run locally.
- We use PyTorch's `Dataset` and `DataLoader` classes to load the data and process it into batches for further processing.
- We use a tiktoken encoder to encode the text into tokens and token IDs.
- We initialise the token and positional embedding layers used to generate the input embeddings.


In [2]:
# Print the installed versions of the two third-party packages used below.
from importlib.metadata import version
print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))

torch version: 2.3.0+cu118
tiktoken version: 0.11.0


In [4]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is encoded once with ``tokenizer.encode`` and cut into windows
    of ``max_length`` token IDs whose start positions are ``stride`` apart.
    Each target window is the matching input window shifted right by one
    token.

    Args:
        txt: Raw text to encode.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` and
            returning a sequence of token IDs.
        max_length: Number of token IDs in every input/target window.
        stride: Distance, in tokens, between the starts of two consecutive
            windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Encode the full text in one pass; "<|endoftext|>" is let through as
        # a special token.
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop max_length tokens before the end so that the shifted target
        # window never runs past the last token.
        window_starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(txt, batch_size, max_length, stride, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over sliding-window token chunks of ``txt``.

    Args:
        txt: Raw text to tokenize with the tiktoken "gpt2" encoding.
        batch_size: Number of (input, target) pairs per batch.
        max_length: Number of token IDs in each window.
        stride: Distance, in tokens, between consecutive window starts.
        shuffle: Forwarded to ``DataLoader`` as ``shuffle``.
        drop_last: Forwarded to ``DataLoader`` as ``drop_last``.
        num_workers: Forwarded to ``DataLoader`` as ``num_workers``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    # tiktoken's "gpt2" encoding is used to turn the text into token IDs.
    gpt2_encoding = tiktoken.get_encoding("gpt2")

    # Chunk the encoded text into input/target windows.
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    # Hand the windows to a DataLoader for batching.
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )


# Instantiate the embedding layers and the dataloader, then test them on a sample text.

# Read the whole sample text into memory.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    text = f.read()

vocab_size = 50257 # vocabulary size of the GPT-2 tokenizer
output_dim = 256 # embedding dimension we chose
context_length = 1024 # number of positions the positional embedding table can encode

# Lookup table holding one output_dim-sized vector per token ID.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
# Lookup table holding one output_dim-sized vector per position index in the sequence.
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

batch_size = 4
max_length = 4 # number of tokens in each input/target window of the dataset

dataloader = create_dataloader_v1(
    text,
    batch_size=batch_size,
    max_length = max_length,
    stride= max_length # stride equals the window length, so consecutive input windows do not overlap
)


In [5]:
# Build the input embeddings for the first batch only; the loop exits after one iteration.
for batch in dataloader:
    x, y = batch # x: input token IDs, y: target token IDs (the inputs shifted by one token)

    token_embeddings = token_embedding_layer(x) # one embedding vector per token ID in x
    pos_embeddings = pos_embedding_layer(torch.arange(max_length)) # one embedding vector per position 0..max_length-1

    input_embeddings = token_embeddings + pos_embeddings # final input embeddings: positional embeddings are broadcast across the batch and added to the token embeddings

    break

In [6]:
print(input_embeddings.shape) # expected torch.Size([4, 4, 256]), i.e. (batch_size, max_length, output_dim)

torch.Size([4, 4, 256])
