<table style="width:100%">
<tr>
<td style="vertical-align:middle; text-align:left;">
<font size="2">
Supplementary code for the <a href="http://mng.bz/orYv">Build a Large Language Model From Scratch</a> book by <a href="https://sebastianraschka.com">Sebastian Raschka</a><br>
<br>Code repository: <a href="https://github.com/rasbt/LLMs-from-scratch">https://github.com/rasbt/LLMs-from-scratch</a>
</font>
</td>
<td style="vertical-align:middle; text-align:left;">
<a href="http://mng.bz/orYv"><img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/cover-small.webp" width="100px"></a>
</td>
</tr>
</table>


# The Main Data Loading Pipeline Summarized

The complete chapter code is located in [ch02.ipynb](./ch02.ipynb).

This notebook contains the main takeaway, the data loading pipeline without the intermediate steps.

Packages that are being used in this notebook:

In [None]:
# NBVAL_SKIP
from importlib.metadata import version

print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))

torch version: 2.4.0
tiktoken version: 0.7.0


In [None]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size, max_length, stride,
                         shuffle=True, drop_last=True, num_workers=0):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)

    return dataloader


with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

vocab_size = 50257
output_dim = 256
context_length = 1024


token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

batch_size = 8
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=batch_size,
    max_length=max_length,
    stride=max_length
)


for batch in dataloader:
    x, y = batch

    token_embeddings = token_embedding_layer(x)
    print(token_embeddings.shape)

    pos_embeddings = pos_embedding_layer(torch.arange(max_length))
    print(pos_embeddings.shape)

    input_embeddings = token_embeddings + pos_embeddings

    break

print(input_embeddings.shape)

torch.Size([8, 4, 256])
torch.Size([4, 256])
torch.Size([8, 4, 256])


In [14]:
pos_embedding_layer(torch.arange(max_length)).shape

torch.Size([4, 256])

# **My Experiments for understanding...**
* **Especially, `POS Encoding`**

In [228]:
# @17Swagat My Version:

# Absolute Positional Embedding

import tiktoken
from torch.utils.data import Dataset, DataLoader
from torch import tensor

tokenizer = tiktoken.get_encoding('gpt2')

with open('./the-verdict.txt', 'r') as file:
    raw_text = file.read()

TOKENS = tokenizer.encode(raw_text) # 5145

class GPTStyleDataset_V1(Dataset):
    def __init__(self, tokensEncodings, max_length = 4, stride = 1):
        self.input_tokens = []
        self.target_tokens = []

        for i in range(0, len(tokensEncodings) - max_length, stride):
            # Question: Why this logic works?? Need to think deeply
            self.input_tokens.append(tensor(tokensEncodings[i: i+max_length]))
            self.target_tokens.append(tensor(tokensEncodings[i+1: i+max_length + 1]))
    
    def __len__(self):
        return len(self.input_tokens)
    
    def __getitem__(self, index):
        return (self.input_tokens[index], self.target_tokens[index])

def create_dataloader(txt):
    encodings = tokenizer.encode(txt) # [1 2 3 ...]
    dataset = GPTStyleDataset_V1(encodings, max_length=4, stride=1)
    dataloader = DataLoader(dataset, batch_size=4, shuffle=False, drop_last=False)
    return dataloader

dataloader = create_dataloader(raw_text)
data_iter = iter(dataloader)

In [238]:
len(tokenizer.encode(raw_text))

5145

In [243]:
torch.max(tensor(TOKENS))

tensor(50085)

In [271]:
from torch.nn import Embedding

vocab_size = 50086 #59000
token_dim = 128

for batch in dataloader:
    input, target = batch

    # Token-Embedding
    token_embedding_layer = Embedding(vocab_size, token_dim)
    input_token_embedding = token_embedding_layer(input)

    # Positional-Embedding: "Absolute Positional Embedding"
    pos_embedding_layer = Embedding(num_embeddings=4, embedding_dim=token_dim)
    input_pos_embedding = pos_embedding_layer(torch.arange(4))

    # Input Embedding:
    input_embedding = input_token_embedding + input_pos_embedding
    print(input_token_embedding.shape)
    print(input_pos_embedding.shape)
    print(input_embedding.shape)

    # break

torch.Size([8, 4, 128])
torch.Size([4, 128])
torch.Size([8, 4, 128])
torch.Size([8, 4, 128])
torch.Size([4, 128])
torch.Size([8, 4, 128])
torch.Size([8, 4, 128])
torch.Size([4, 128])
torch.Size([8, 4, 128])
torch.Size([8, 4, 128])
torch.Size([4, 128])
torch.Size([8, 4, 128])
torch.Size([8, 4, 128])
torch.Size([4, 128])
torch.Size([8, 4, 128])
torch.Size([8, 4, 128])
torch.Size([4, 128])
torch.Size([8, 4, 128])
torch.Size([8, 4, 128])
torch.Size([4, 128])
torch.Size([8, 4, 128])
torch.Size([8, 4, 128])
torch.Size([4, 128])
torch.Size([8, 4, 128])
torch.Size([8, 4, 128])
torch.Size([4, 128])
torch.Size([8, 4, 128])
torch.Size([8, 4, 128])
torch.Size([4, 128])
torch.Size([8, 4, 128])
torch.Size([8, 4, 128])
torch.Size([4, 128])
torch.Size([8, 4, 128])
torch.Size([8, 4, 128])
torch.Size([4, 128])
torch.Size([8, 4, 128])
torch.Size([8, 4, 128])
torch.Size([4, 128])
torch.Size([8, 4, 128])
torch.Size([8, 4, 128])
torch.Size([4, 128])
torch.Size([8, 4, 128])
torch.Size([8, 4, 128])
torch.Size

In [256]:
# pos_embedding_layer.weight.shape
input_pos_embedding.shape

torch.Size([4, 128])