# Small Language Model (SLM) - Data Preparation & Sampling
This notebook covers loading and preprocessing text data, tokenization using GPT-2 BPE, and creating a PyTorch dataset with sliding-window sampling to generate input-target sequences for training a small language model.


### Simple Tokenizer Class (For Intuition)

In [10]:
import re
from typing import List, Dict


In [12]:
# Sample dataset (tiny text corpus for demo).
# Three short sentences that repeat some words and punctuation marks;
# later cells tokenize these sentences and build a vocabulary from them.
sample_corpus = [
    "Hello, World. I am Akash!",
    "I am learning to tokenize text.",
    "Hello again, World."
]


In [13]:
# Regex-based tokenization
def regex_tokenizer(text: str) -> List[str]:
    """
    Split a string into word tokens and single-character punctuation tokens.

    Whitespace is never returned as a token; it only separates tokens.

    Example:
    Input:  "Hello, World. I am Akash!"
    Output: ['Hello', ',', 'World', '.', 'I', 'am', 'Akash', '!']
    """
    # Two alternatives, tried left to right at every position:
    #   \w+      a run of word characters (letters, digits, underscore)
    #   [^\w\s]  one character that is neither a word character nor whitespace
    # so a word and the punctuation attached to it come out as separate tokens.
    return [match.group(0) for match in re.finditer(r"\w+|[^\w\s]", text)]

# Quick check: print every demo sentence followed by its token list
for sentence in sample_corpus:
    print(f"Input: {sentence}", f"Tokens: {regex_tokenizer(sentence)}\n", sep="\n")


Input: Hello, World. I am Akash!
Tokens: ['Hello', ',', 'World', '.', 'I', 'am', 'Akash', '!']

Input: I am learning to tokenize text.
Tokens: ['I', 'am', 'learning', 'to', 'tokenize', 'text', '.']

Input: Hello again, World.
Tokens: ['Hello', 'again', ',', 'World', '.']



In [7]:
# Simple tokenizer class
class SimpleTokenizer:
    """
    A basic word-level tokenizer:
    - Builds vocabulary from a corpus
    - Encodes text to list of token IDs
    - Decodes IDs back to text
    - Handles unknown tokens with <|unk|>

    The special tokens are registered as soon as the tokenizer is created
    (<|pad|> -> 0, <|unk|> -> 1), so encode() and decode() also work before
    build_vocab() has been called; every token then maps to <|unk|>.
    """

    def __init__(self):
        # special tokens
        self.unk_token = "<|unk|>"
        self.pad_token = "<|pad|>"

        # token mappings (filled with the special tokens right away)
        self.token2id: Dict[str, int] = {}
        self.id2token: Dict[int, str] = {}
        self._set_vocab([])

    def _set_vocab(self, tokens: List[str]) -> None:
        """Rebuild both lookup tables: special tokens first, then `tokens` in order."""
        vocab = [self.pad_token, self.unk_token] + list(tokens)
        self.token2id = {tok: idx for idx, tok in enumerate(vocab)}
        self.id2token = dict(enumerate(vocab))

    def build_vocab(self, corpus: List[str]) -> None:
        """
        Build vocabulary from a list of text strings.
        Uses regex tokenizer to split words/punctuations.

        Any previously built vocabulary is replaced. IDs 0 and 1 are always
        the pad and unknown tokens; the remaining tokens get IDs in sorted
        order.
        """
        unique_tokens = set()
        for text in corpus:
            unique_tokens.update(regex_tokenizer(text))

        # sorted() makes the ID assignment deterministic (set order is not)
        self._set_vocab(sorted(unique_tokens))

    def encode(self, text: str) -> List[int]:
        """
        Convert text into list of token IDs.
        Unknown tokens are mapped to <|unk|>.
        """
        # Look the fallback ID up once instead of once per token
        unk_id = self.token2id[self.unk_token]
        return [self.token2id.get(tok, unk_id) for tok in regex_tokenizer(text)]

    def decode(self, ids: List[int]) -> str:
        """
        Convert list of IDs back to text string.
        Joins tokens with space, but you may customize
        spacing rules depending on your use case.

        IDs that are not in the vocabulary are rendered as <|unk|>.
        """
        tokens = [self.id2token.get(i, self.unk_token) for i in ids]
        return " ".join(tokens)


In [8]:

# Build the demo vocabulary from the sample corpus
tokenizer = SimpleTokenizer()
tokenizer.build_vocab(sample_corpus)

vocab_items = list(tokenizer.token2id.items())
print("Vocabulary size:", len(vocab_items))
print("Sample of vocab items:", vocab_items[:15], "\n")

# Round trip: text -> IDs -> text
sentence = "Hello, I am learning NLP."
encoded = tokenizer.encode(sentence)
decoded = tokenizer.decode(encoded)

print("Original sentence:", sentence)
print("Encoded IDs:", encoded)
print("Decoded back:", decoded)


Vocabulary size: 16
Sample of vocab items: [('<|pad|>', 0), ('<|unk|>', 1), ('!', 2), (',', 3), ('.', 4), ('ChatGPT', 5), ('Hello', 6), ('I', 7), ('World', 8), ('again', 9), ('am', 10), ('is', 11), ('learning', 12), ('text', 13), ('to', 14)] 

Original sentence: Hello, I am learning NLP.
Encoded IDs: [6, 3, 7, 10, 12, 1, 4]
Decoded back: Hello , I am learning <|unk|> .


### Tiktoken - The Byte Pair Encoding Tokenizer

In [6]:
# Install third-party dependencies.
# %pip (rather than !pip) installs into the environment of the running kernel;
# -q keeps pip's output quiet.
%pip install -q torch tiktoken datasets KaggleHub


In [7]:

import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken
from pathlib import Path
import kagglehub


In [18]:
# Download dataset from GitHub
# "The Verdict" dataset
import requests

# Download the text file
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
# timeout: fail with an exception instead of hanging forever if the host does not answer
response = requests.get(url, timeout=30)
# Stop on 4xx/5xx so an error page is never saved and tokenized as if it were the corpus
response.raise_for_status()
text = response.text

# Save locally (optional)
with open("the-verdict.txt", "w", encoding="utf-8") as f:
    f.write(text)

# Check character and word stats
print(f"Number of characters: {len(text)}")
print(f"Number of words: {len(text.split())}")
print(f"First 300 characters preview:\n{text[:300]}")



Number of characters: 20479
Number of words: 3634
First 300 characters preview:
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would ha


In [23]:
# Switch to the GPT-2 byte pair encoding; from here on `tokenizer` is the tiktoken encoder
tokenizer = tiktoken.get_encoding("gpt2")
token_count = len(tokenizer.encode(text))
print(f"Number of tokens: {token_count}")

# If you'd like to experiment with "allowed_special", here's an example
# enc_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

Number of tokens: 5145


### Dataset and Dataloader

In [9]:
# SLMDataset with sliding window

class SLMDataset(Dataset):
    """
    Small Language Model dataset with sliding window tokenization.

    The text is tokenized once, then cut into windows of ``max_length``
    tokens whose start positions are ``stride`` tokens apart. Each window
    yields one sample: the input is the window without its last token and
    the target is the window without its first token, so both tensors hold
    ``max_length - 1`` IDs and the target is the input shifted by one.
    Trailing tokens that do not fill a whole window are dropped.

    Args:
        text (str): full raw text.
        tokenizer (tiktoken.Encoding): tokenizer object; only ``encode(text)`` is called.
        max_length (int): window length in tokens (including input & target); must be >= 2.
        stride (int): step, in tokens, between the starts of consecutive windows;
            must be >= 1. With ``stride < max_length`` consecutive windows
            overlap by ``max_length - stride`` tokens.

    Raises:
        ValueError: if ``max_length < 2`` or ``stride < 1``.
    """
    def __init__(self, text: str, tokenizer, max_length: int, stride: int):
        if max_length < 2:
            # A window needs at least one input token and one target token
            raise ValueError(f"max_length must be >= 2, got {max_length}")
        if stride < 1:
            # A non-positive stride would never advance the window
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Encode full text into token IDs
        token_ids = tokenizer.encode(text)

        # Sliding window chunks. The `+ 1` keeps the final window that ends
        # exactly on the last token; range() is empty when the text is
        # shorter than one window.
        for start in range(0, len(token_ids) - max_length + 1, stride):
            chunk = token_ids[start:start + max_length]

            # Input is everything except the last token;
            # target is everything except the first token (shifted by 1)
            self.input_ids.append(torch.tensor(chunk[:-1], dtype=torch.long))
            self.target_ids.append(torch.tensor(chunk[1:], dtype=torch.long))

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the idx-th sample as a dict of two equally long tensors."""
        return {
            "input_ids": self.input_ids[idx],
            "target_ids": self.target_ids[idx]
        }


In [20]:
# DataLoader creation
def create_dataloader(
    text: str,
    tokenizer,
    batch_size: int = 4,
    max_length: int = 256,
    stride: int = 128,
    shuffle: bool = True,
    drop_last: bool = True,
    num_workers: int = 0
):
    """
    Turn raw text into a DataLoader of sliding-window samples.

    `text`, `tokenizer`, `max_length` and `stride` are passed to SLMDataset;
    `batch_size`, `shuffle`, `drop_last` and `num_workers` are passed to
    torch.utils.data.DataLoader unchanged.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(SLMDataset(text, tokenizer, max_length, stride), **loader_options)

# Example usage
# `text` is the corpus string bound in the download cell (the name `raw_text`
# used here before is not defined anywhere in this notebook).
dataloader = create_dataloader(text, tokenizer, batch_size=4, max_length=128, stride=64)

# Inspect the first batch only
for batch in dataloader:
    print("Batch input shape:", batch["input_ids"].shape)
    print("Batch target shape:", batch["target_ids"].shape)
    print("Example input IDs:", batch["input_ids"][0][:20])
    print("Example target IDs:", batch["target_ids"][0][:20])
    break


Batch input shape: torch.Size([4, 127])
Batch target shape: torch.Size([4, 127])
Example input IDs: tensor([  339,  3947,   284,   307,  1592,  2259,   739,   438, 14363,   898,
         9408,   355,   281,  2134,   329,  5482,  4447,   290,   753,  1072])
Example target IDs: tensor([ 3947,   284,   307,  1592,  2259,   739,   438, 14363,   898,  9408,
          355,   281,  2134,   329,  5482,  4447,   290,   753,  1072,    13])


In [34]:
# visualization utility:  Intuitive
def visualize_sequence_pairs(dataset, tokenizer, idx=0, max_steps=10):
    """
    Print how one dataset example unrolls into next-token prediction steps.

    For each step k the growing prefix input_ids[:k+1] is shown next to
    target_ids[k], first as decoded text and then as raw token IDs.

    Args:
        dataset: SLMDataset object
        tokenizer: tiktoken Encoding
        idx: which dataset example to visualize
        max_steps: number of steps to show (capped at the input length)
    """
    sample = dataset[idx]
    context_ids = sample["input_ids"]
    next_ids = sample["target_ids"]
    n_steps = min(max_steps, len(context_ids))

    print("          Token view \n")
    print("Input Tokens --> Target Tokens")
    for end in range(1, n_steps + 1):
        context_text = tokenizer.decode(context_ids[:end].tolist())
        next_text = tokenizer.decode([next_ids[end - 1].item()])
        print(f"{context_text} --> {next_text}")

    print("\n        ID view \n")
    print("Input Token IDs --> Target Token IDs")
    for end in range(1, n_steps + 1):
        print(f"{context_ids[:end].tolist()} --> {next_ids[end - 1].item()}")

# Example usage
# `text` is the corpus string bound in the download cell (the name `raw_text`
# used here before is not defined anywhere in this notebook).
dataset = SLMDataset(text, tokenizer, max_length=32, stride=16)
print("    Visualization:  Intuitive \n")
visualize_sequence_pairs(dataset, tokenizer, idx=0, max_steps=8)


    Visualization:  Intuitive 

          Token view 

Input Tokens --> Target Tokens
I -->  H
I H --> AD
I HAD -->  always
I HAD always -->  thought
I HAD always thought -->  Jack
I HAD always thought Jack -->  G
I HAD always thought Jack G --> is
I HAD always thought Jack Gis --> burn

        ID view 

Input Token IDs --> Target Token IDs
[40] --> 367
[40, 367] --> 2885
[40, 367, 2885] --> 1464
[40, 367, 2885, 1464] --> 1807
[40, 367, 2885, 1464, 1807] --> 3619
[40, 367, 2885, 1464, 1807, 3619] --> 402
[40, 367, 2885, 1464, 1807, 3619, 402] --> 271
[40, 367, 2885, 1464, 1807, 3619, 402, 271] --> 10899


In [36]:
# PyTorch tensors that come out of the dataset

def show_batch_tensors(dataloader, tokenizer, n_batches=1, decode=False):
    """
    Show raw tensors (input_ids and target_ids) from a dataloader and,
    optionally, decode every row back to text.

    Args:
        dataloader: PyTorch DataLoader from create_dataloader (any iterable of
            batches that are dicts with "input_ids" and "target_ids" tensors)
        tokenizer: tiktoken Encoding; only used when decode=True
        n_batches: how many batches to display
        decode: if True, also print the decoded text of each input row and
            each target row. Defaults to False, which prints tensors only.
    """
    for i, batch in enumerate(dataloader):
        if i >= n_batches:
            break

        input_ids = batch["input_ids"]
        target_ids = batch["target_ids"]

        print(f"                 === Batch {i} ===")
        print("Input tensor shape :", input_ids.shape)
        print("Target tensor shape:", target_ids.shape)
        print("\nInput IDs:\n", input_ids)
        print("\nTarget IDs:\n", target_ids)

        if decode:
            # One line of text per row of the batch
            print("\n=== Decoded input sequences ===")
            for row in input_ids:
                print(tokenizer.decode(row.tolist()))

            print("\n=== Decoded target sequences ===")
            for row in target_ids:
                print(tokenizer.decode(row.tolist()))

            print("=" * 80)


# Print the raw tensors of the first batch from the dataloader created earlier
print("       Visualization:  Actual \n")
show_batch_tensors(dataloader=dataloader, tokenizer=tokenizer, n_batches=1)


       Visualization:  Actual 

                 === Batch 0 ===
Input tensor shape : torch.Size([4, 127])
Target tensor shape: torch.Size([4, 127])

Input IDs:
 tensor([[  550,   587, 11191,   416,  3499,  1466,    25,   484,   550, 26546,
          1068,   465,  1242,    11,   340,   550,   587,   302,  1144,   287,
           262,  3024,    12,  4803,   286,   511,   512,  1741,    13,   843,
           340,   373,  4361,  5048,   425,   284,  3465,   644,  1245,   262,
           366, 25124,  3101,  8137,   286, 16957,  1696,   414,     1,   357,
            40,  9577,  4544,  9325,   701,     8,   373,  1719,   319,   683,
            13,   198,   198,    40,   423,  4750,   326,  9074,    13,   402,
           271, 10899,   373,  5527,    26,   290,   340,   373,  3393, 34953,
           856,   326,   607,  5229,   373, 37895,   422,   428, 25179,   257,
         19217,   475,  8904, 14676,    13,   632,   318,    11,   355,   257,
          3896,    11,   262,   661,   508, 4098

### Input Embeddings [Token Embeddings + Positional Embeddings]

In [51]:
# Initialize word embeddings for a sentence

import torch
import torch.nn as nn

# nn.Embedding weights are initialized randomly; fixing the seed makes the
# values printed below identical on every Restart & Run All.
torch.manual_seed(123)

Toy_sentence = ["I", "live", "in", "India"]
token_ids = torch.tensor([2, 5, 7, 9])  # token IDs for each word
vocab_size = 10
embed_dim = 8

# Every word needs exactly one ID, and every ID must index into the table
assert len(Toy_sentence) == len(token_ids)
assert int(token_ids.max()) < vocab_size

# Word embedding layer: a (vocab_size, embed_dim) lookup table
word_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

# Get embeddings for the sentence
word_embeds = word_embedding(token_ids)
print("Word embeddings shape:", word_embeds.shape)  # (sequence_length, embed_dim)
for word, token_id, embed in zip(Toy_sentence, token_ids, word_embeds):
    print(f"{word} => \n token ID: {token_id} \n Embedding: {embed}")


Word embeddings shape: torch.Size([4, 8])
I => 
 token ID: 2 
 Embedding: tensor([ 2.1047,  0.1839, -0.3299, -0.5873,  0.1215, -0.9598,  0.7810, -1.3862],
       grad_fn=<UnbindBackward0>)
live => 
 token ID: 5 
 Embedding: tensor([-1.0277,  0.5010,  1.1364, -0.2753, -0.5766, -2.0788,  0.4422,  0.5329],
       grad_fn=<UnbindBackward0>)
in => 
 token ID: 7 
 Embedding: tensor([-0.3570,  0.1095, -1.2302,  0.6774, -0.6796, -0.1230,  1.0147, -0.5500],
       grad_fn=<UnbindBackward0>)
India => 
 token ID: 9 
 Embedding: tensor([ 0.9634, -0.5005, -0.6265, -1.0725,  0.7942, -0.0557, -0.9855, -0.9396],
       grad_fn=<UnbindBackward0>)


In [54]:
# Absolute positional embeddings: one vector per position in the toy sentence

seq_length = token_ids.shape[0]

# Lookup table with one row per position
pos_embedding = nn.Embedding(seq_length, embed_dim)

# Position indices 0 .. seq_length - 1, looked up in that table
positions = torch.arange(seq_length)        # [0, 1, 2, 3]
pos_embeds = pos_embedding(positions)

print("Positional embeddings shape:", pos_embeds.shape)  # (sequence_length, embed_dim)

# Model input = token embedding + positional embedding, added element-wise
input_embeds = word_embeds + pos_embeds
print("\n\n Input embeddings shape:", input_embeds.shape)  # (sequence_length, embed_dim)



Positional embeddings shape: torch.Size([4, 8])


 Input embeddings shape: torch.Size([4, 8])


In [55]:
# Full embedding layer: token embeddings + absolute positional embeddings for a whole batch

# Hyperparameters
vocab_size = 50257       # GPT-2 tokenizer vocab size
output_dim = 256         # embedding dimension
batch_size = 8
max_length = 4           # sequence length
context_length = max_length

# Token embeddings: look up one vector per ID in a random batch of token IDs
word_embedding_layer = nn.Embedding(vocab_size, output_dim)
batch_token_ids = torch.randint(0, vocab_size, (batch_size, max_length))
word_embeds = word_embedding_layer(batch_token_ids)  # shape: (8, 4, 256)

# Positional embeddings: one vector per position 0 .. context_length - 1
pos_embedding_layer = nn.Embedding(context_length, output_dim)
pos_indices = torch.arange(context_length)           # [0, 1, 2, 3]
pos_embeds = pos_embedding_layer(pos_indices)        # shape: (4, 256)

# Add a leading batch axis and repeat it (as a view) for every sequence in the batch
pos_embeds = pos_embeds[None].expand(batch_size, -1, -1)  # shape: (8, 4, 256)

# Final input embeddings
input_embeddings = word_embeds + pos_embeds
print("Input embeddings shape:", input_embeddings.shape)  # (8, 4, 256)


Input embeddings shape: torch.Size([8, 4, 256])
