# Setup

Importing the dataset, tokenizer and other stuff

## Load Dataset

In [1]:
from datasets import load_dataset
ds = load_dataset("Salesforce/wikitext", "wikitext-2-v1")

In [2]:
import random
random.seed(42)

# Get train-val-test splits from 'ds'
train_dataset = ds["train"]
validation_dataset = ds["validation"]
test_dataset = ds["test"]

# Get some info in the splits
print(len(train_dataset), len(validation_dataset), len(test_dataset))

# Random samples from the training dataset
print(random.choice(train_dataset["text"]))

# Checking the data-struct of the 'text' column
print(type(train_dataset["text"]))

# Get some generic info the dataset(s)
print(train_dataset)  # Checking the "columns" of the dataset
# Get the total number of words
c = 0
for sample in train_dataset["text"]:
    c += len(sample.split(" "))
print(f"Found ({c}) words in train-dataset")

36718 3760 4358
 <unk> , Ireland is divided between the Republic of Ireland ( officially named Ireland ) , which covers five @-@ <unk> of the island , and Northern Ireland , which is part of the United Kingdom , in the northeast of the island . In 2011 the population of Ireland was about 6 @.@ 4 million , ranking it the second @-@ most populous island in Europe after Great Britain . Just under 4 @.@ 6 million live in the Republic of Ireland and just over 1 @.@ 8 million live in Northern Ireland . 

<class 'list'>
Dataset({
    features: ['text'],
    num_rows: 36718
})
Found (2112395) words in train-dataset


In [4]:
train_dataset, validation_dataset, test_dataset

(Dataset({
     features: ['text'],
     num_rows: 36718
 }),
 Dataset({
     features: ['text'],
     num_rows: 3760
 }),
 Dataset({
     features: ['text'],
     num_rows: 4358
 }))

## Tokenizer

In [19]:
# Setup and load tokenizer class

import regex as re
import warnings
from tqdm.auto import tqdm
import pickle
from typing import List

class BPETokenizerV2:
    def __init__(self, texts: List[str]):
        """
        Creates a BPETokenizerV1 instance using regex-based tokenization.
        Args:
            texts (List[str]): List of input strings.
        """
        self.gpt2_pat = re.compile(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")

        _text = " ".join(texts)
        self.splits = self.gpt2_pat.findall(_text)

        self.split_tokens = [list(tok.encode("utf-8")) for tok in self.splits]

        self.__built = False
        self._vocab = None
        self._merges = None

    def _get_stats(self, tokens):
        """
        Counts occurrences of byte pairs in the tokenized list.
        """
        pairs = {}
        for split in tokens:
            for pair in zip(split, split[1:]):
                pair = tuple(pair)  # FIX: was getting a type error otherwise
                pairs[pair] = pairs.get(pair, 0) + 1
        return pairs

    def _merge(self, tokens, pair, idx):
        """
        Merges a given byte pair in each split separately.
        """
        new_tokens = []
        for split in tokens:
            new_split = []
            i = 0
            while i < len(split):
                if i < len(split) - 1 and (split[i], split[i+1]) == pair:
                    new_split.append(idx)
                    i += 2
                else:
                    new_split.append(split[i])
                    i += 1
            new_tokens.append(new_split)
        return new_tokens

    def __build_tokenizer(self, vocab_size):
        """
        Builds the BPE tokenizer's vocabulary.
        """
        assert vocab_size >= 256, "Vocabulary size must be at least 256 for byte-level tokens."
        
        vocab = {i: bytes([i]) for i in range(256)}
        merges = {}

        n_merges = vocab_size - 256
        ids = self.split_tokens.copy()

        initial_token_count = sum(len(split) for split in ids)

        for i in tqdm(range(n_merges), leave=False, desc="Merging"):
            stats = self._get_stats(ids)
            if not stats:
                self.vocab_size = 256 + i
                break
            top_pair = max(stats, key=stats.get)
            idx = 256 + i
            merges[top_pair] = idx
            ids = self._merge(ids, top_pair, idx)
            vocab[idx] = vocab[top_pair[0]] + vocab[top_pair[1]]

        final_token_count = sum(len(split) for split in ids)

        # Print some info after tokenizer is built
        print(f"Before length: {initial_token_count}")
        print(f"After length: {final_token_count}")
        print(f"Compression ratio: {(initial_token_count / final_token_count):.3f}")

        self._vocab = vocab
        self._merges = merges
        self.__built = True

    def fit(self, vocab_size: int, texts: List[str] = None):
        """
        Builds the tokenizer's vocabulary using the given texts.
        """
        if texts:
            warnings.warn("Using .fit with new texts is discouraged. Pass texts during initialization.")
            _text = " ".join(texts)
            self.splits = self.gpt2_pat.findall(_text)
            self.split_tokens = [list(tok.encode("utf-8")) for tok in self.splits]

        self.__build_tokenizer(vocab_size)

    def encode(self, text: str):
        """
        Encodes a given text into a sequence of token IDs.
        """
        assert self.__built, "Tokenizer must be built using `fit` before encoding."

        # Step 1: Split and encode text using regex and bytes
        splits = self.gpt2_pat.findall(text)
        split_tokens = [list(tok.encode("utf-8")) for tok in splits]

        encoded_ids = []
        for tokens in split_tokens:
            while len(tokens) >= 2:
                stats = self._get_stats([tokens])  # Compute within-split stats
                pair = min(stats, key=lambda p: self._merges.get(p, float('inf')), default=None)
                if pair is None or pair not in self._merges:
                    break
                idx = self._merges[pair]
                tokens = self._merge([tokens], pair, idx)[0]  # Apply merge
            encoded_ids.extend(tokens)  # Append the final tokens to the result

        return encoded_ids

    def decode(self, ids: List[int]):
        """
        Decodes a list of token IDs back into a string.
        """
        assert self.__built, "Tokenizer must be built using `fit` before decoding."

        tokens = b"".join(self._vocab[idx] for idx in ids)
        text = tokens.decode("utf-8", errors="replace")
        return text
    
    def save(self, file_path: str):
        """
        Save the tokenizer's vocab and merges to a file.
        Args:
            file_path: Path to save at.
        """
        with open(file_path, 'wb') as f:
            pickle.dump({'vocab': self._vocab, 'merges': self._merges}, f)
        print(f"[INFO] Tokenizer saved to {file_path}")



def load_BPETokenizerV2(file_path: str = "./bpe_tokenizer_v1_train_dataset.pth"):
    """
    Load the BPE (V2) tokenizer from a saved file without requiring texts in the constructor.
    Args:
        file_path (str): Path to the saved tokenizer file. (default = ./bpe_tokenizer_v1_train_dataset.pth)
    Returns:
        BPETokenizerV2 instance: A loaded tokenizer instance with vocab and merges.
    """
    with open(file_path, 'rb') as f:
        data = pickle.load(f)

    # Uninitialized instance of the tokenizer
    tokenizer = object.__new__(BPETokenizerV2)

    # Set vocab and merges directly
    tokenizer._vocab = data['vocab']
    tokenizer._merges = data['merges']
    setattr(tokenizer, '_BPETokenizerV2__built', True)  # FIX for name mangling

    # Initialize necessary attributes that are otherwise set in '__init__'
    tokenizer.gpt2_pat = re.compile(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")
    print(f"[INFO] Tokenizer loaded from: {file_path}")
    return tokenizer


In [22]:
import os
from pathlib import Path

# Setup tokenizer path
current_dir = Path(os.getcwd())
tokenizer_path = None
for i in (current_dir.parent / "tokenizers").glob("bpe_tokenizer_v1_train_dataset.pth"):
    tokenizer_path = i
    print(f"[INFO] Found tokenizer path: {tokenizer_path}")
    print("---"*5)

# Create a tokenizer instance using the pre-trained tokenizer
tokenizer = load_BPETokenizerV2(tokenizer_path)
tokenizer

[INFO] Found tokenizer path: /Users/dhruvnandigam/Desktop/Dhruv/Programing/NN/Inqueropedia/Inqueropedia/tokenizers/bpe_tokenizer_v1_train_dataset.pth
---------------
[INFO] Tokenizer loaded from: /Users/dhruvnandigam/Desktop/Dhruv/Programing/NN/Inqueropedia/Inqueropedia/tokenizers/bpe_tokenizer_v1_train_dataset.pth


<__main__.BPETokenizerV2 at 0x16840d3a0>

In [23]:
# Testing the tokenizer's outputs
_temp = 'Sample string with words for a natural       language tokenizer '
output = ""
for n, i in enumerate(tokenizer.encode(_temp)):
    col = "\033[101m"
    if n % 2 == 0:
        col = "\033[106m"
    output += f"{col}{tokenizer.decode([i])}"
print(output)

[106mS[101mamp[106mle[101m str[106ming[101m with[106m w[101mords[106m for[101m a[106m n[101mat[106mural[101m [106m [101m [106m [101m [106m [101m l[106mang[101mu[106mage[101m to[106mk[101men[106miz[101mer[106m 


The output from above suggests that the tokenizer hasn't been trained properly, perhaps training it for longer will help.\
But we wil use this version of the tokenizer for now as training the tokenizer takes a long time.

In [27]:
# Check for data integrity with the loaded tokenizer
random.seed(42)
_sample = random.choice(train_dataset["text"])
print(f"Random sample:\n{_sample}")
print("---"*5)
print(f"Encoded sample: {tokenizer.encode(_sample)}")
print("---"*5)
print(f"Decoded match: {tokenizer.decode(tokenizer.encode(_sample)) == _sample}")

Random sample:
 <unk> , Ireland is divided between the Republic of Ireland ( officially named Ireland ) , which covers five @-@ <unk> of the island , and Northern Ireland , which is part of the United Kingdom , in the northeast of the island . In 2011 the population of Ireland was about 6 @.@ 4 million , ranking it the second @-@ most populous island in Europe after Great Britain . Just under 4 @.@ 6 million live in the Republic of Ireland and just over 1 @.@ 8 million live in Northern Ireland . 

---------------
Encoded sample: [286, 284, 62, 263, 1899, 364, 1926, 1108, 758, 260, 682, 112, 710, 295, 279, 1899, 371, 1148, 1106, 1377, 1899, 370, 263, 460, 1599, 115, 1201, 340, 286, 284, 62, 279, 260, 1883, 263, 288, 387, 417, 1783, 1899, 263, 460, 364, 601, 279, 260, 955, 1275, 1247, 263, 282, 260, 1754, 257, 471, 279, 260, 1883, 270, 424, 1659, 260, 1885, 279, 1899, 323, 747, 679, 570, 564, 1116, 263, 402, 863, 290, 381, 260, 827, 340, 719, 950, 378, 509, 1883, 282, 1431, 613, 389, 738

In [30]:
tokenizer.decode([286, 284, 62])

' <unk>'

The above sample of encoding and decoding with the tokenizer also indicates some issues in the way it was 'trained'.\
Tokens such as: "<unk>" and other 'special' tokens need to be assigned with a single id, making the tokenizer more efficient and allow models to perform better.

These are some changes that are to be made for the next tokenizer train run.\
As of now, we will stick to this 'inefficient' tokenizer and update it later.

# Modeling

Running some model tests with the following models and model-specs

|Model|Architecture|Tokenizer|Block-size|Embedding-size|Head-size|Results|
|-|-|-|-|-|-|-|
|Model-1|bi-gram|bpe-tokenizer-v2|-|-|-|-|
|Model-2|transformer|bpe-tokenizer-v2|32|32|16|-|
|Model-3|transformer|bpe-tokenizer-v2|32|64|16|-|
|Model-4|transformer|bpe-tokenizer-v2|32|32|32|-|

## Setup

Setup things like torch and other deps

In [53]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader, Dataset

import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from timeit import default_timer as timer
from typing import List, Dict, Tuple

# Global training params
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")  # Use for Apple silicone
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use for Nvidia GPU's

vocab_size = len(tokenizer._vocab)  # Need to fix: add this property in the tokenizer class



## Model-1 (bi-gram)

In [189]:
# NOTE: this dataset is universal b/w all the models unless the tokenizer changes

class BigramDataset(Dataset):
    """
    Thin wrapper object for a list of ids.
    """
    def __init__(self, texts: List[str], tokenizer, verbose: int = 0):
        """
        Creates a torch-Dataset instance for the given text data.
        Args:
            texts (List[str]): List of strings for dataset.
            tokenizer: Tokenizer to encode the strings.
        """
        if verbose > 0:
            print(f"[INFO] Building dataset...")
            st = timer()
        
        # Build the dataset here
        self.tokenizer = tokenizer
        tokenized_texts = []  # Concat all the samples in 'texts' and tokenize them
        for text in tqdm(texts, desc="Tokenizing texts", leave=False):
            tokenized_texts.extend(tokenizer.encode(text))

        self.data = torch.tensor(tokenized_texts, dtype=torch.long)
        
        if verbose > 0:
            et = timer()
            print(f"{(et-st):.5f} sec to build the dataset.")
            print(f"[INFO] Dataset built")
            print("---"*5)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Create dataset(s)
train_dataset_1 = BigramDataset(train_dataset["text"], tokenizer, verbose=1)
validation_dataset_1 = BigramDataset(validation_dataset["text"], tokenizer, verbose=1)

print(f"[INFO] Created the datasets | train-length: {len(train_dataset_1)} | validation-length: {len(validation_dataset_1)}")

[INFO] Building dataset...


Tokenizing texts:   0%|          | 0/36718 [00:00<?, ?it/s]

12.58152 sec to build the dataset.
[INFO] Dataset built
---------------
[INFO] Building dataset...


Tokenizing texts:   0%|          | 0/3760 [00:00<?, ?it/s]

1.29557 sec to build the dataset.
[INFO] Dataset built
---------------
---------------
[INFO] Created the datasets | train-length: 3519701 | validation-length: 362195


In [190]:
print(train_dataset_1[:].shape)#[:10]
print(train_dataset_1[:].shape)#[:10]
print(train_dataset_1[:10])  # Viwe the first 10 samples

torch.Size([3519701])
torch.Size([3519701])
tensor([ 301,  536, 1655,  121,  406,   97,  493,  820,  295,  691])


In [333]:
def get_batch(dataset: str,
              block_size: int,
              batch_size: int = batch_size,
              seed: int or None = None,
              debug: bool = False):
    """
    Loads a single batch of data from 'split'.
    NOTE: doesn't move the batch to device
    """
    if seed is not None:
        torch.manual_seed(seed)
    
    data = dataset
    idx = torch.randint(len(data) - block_size, (batch_size,))
    
    x = torch.stack([data[i:i+block_size] for i in idx])
    y = torch.stack([data[i+1:i+block_size+1] for i in idx])

    # DEBUG
    # -------------------------------- #
    if debug:
        print("|----- user settings -----|")
        print(idx)
        print(x.shape)
        print(y.shape)
        print(block_size)
        print(batch_size)
        
        print("")
        
        print("|----- output checks -----|")
        print(f"expected: {(batch_size, block_size)} | got: {x.shape}")
        print(f"expected: {(batch_size, block_size-1)} | got: {x.shape}")
        print(f"x elements (single batch): {x[0]}")
        print(f"x elements (single batch): {y[0]}")
        print(f"expected (true) | got: {all(x[0][1:] == y[0][:-1])}")

        print("")
        
        print("|----- debug complete -----|\n")
        
    # -------------------------------- #

    return x, y

get_batch(train_dataset_1,
          block_size=8,
          batch_size=4,
          seed=42,
          debug=True)


|----- user settings -----|
tensor([ 137841, 3504164,  442996, 2954875])
torch.Size([4, 8])
torch.Size([4, 8])
8
4

|----- output checks -----|
expected: (4, 8) | got: torch.Size([4, 8])
expected: (4, 7) | got: torch.Size([4, 8])
x elements (single batch): tensor([ 313,  325, 1122,  731,  289,  285,  321,  276])
x elements (single batch): tensor([ 325, 1122,  731,  289,  285,  321,  276,  324])
expected (true) | got: True

|----- debug complete -----|



(tensor([[ 313,  325, 1122,  731,  289,  285,  321,  276],
         [ 261,  282,  300,  352,   99,  336,   97, 2029],
         [ 296,  262,  331,  116,  717,  348, 1331,  747],
         [ 446,  368,  316, 1854,  293,  946,  431, 1864]]),
 tensor([[ 325, 1122,  731,  289,  285,  321,  276,  324],
         [ 282,  300,  352,   99,  336,   97, 2029,  263],
         [ 262,  331,  116,  717,  348, 1331,  747,  502],
         [ 368,  316, 1854,  293,  946,  431, 1864,  946]]))

In [334]:
class BigramV1(nn.Module):
    """Bigram model for sequence generation"""
    def __init__(self, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.emb = nn.Embedding(vocab_size, vocab_size)

    def forward(self, X: torch.Tensor, targets: torch.Tensor or None = None):
        """
        Performs a forward pass.
        Expected tensor of shape: (B,1) | (batch_size, 1)
        """
        logits = self.emb(X)

        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx: List[int], max_new_tokens: int):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)   # logits: (B,T,C)
            logits = logits[:, -1, :]  # (B,C)
            pred_probs = F.softmax(logits, dim=-1)
            _idx = torch.multinomial(pred_probs, num_samples=1)
            idx = torch.cat((idx, _idx), dim=1)  # (B,T+1)
        return idx


In [335]:
# Generate on a new bi-gram model
model_1 = BigramV1(vocab_size)
_temp = torch.tensor([tokenizer.encode("Hello")])

_temp = model_1.generate(
    _temp,
    max_new_tokens=50
)

tokenizer.decode(_temp.tolist()[0])


'Hello 200ploy b died eff earlyaffivision ex$ Pol use manyron first be Thereason command Can Amorm island Anim extensP real play men October 2011 night Black concn music eightZ June part named Church cour positionchoolains< caus'

In [349]:
# Training the model
# Hyperparms
batch_size = 32
block_size = 64
epochs = 1000

model_1 = BigramV1(vocab_size)
model_1.to(device)
model_1.compile(backend="aot_eager")
optimizer = optim.Adam(model_1.parameters(), lr=5e-2)

# Train loop
for i in tqdm(range(epochs), desc="training", leave=False):
    xb, yb = get_batch(train_dataset_1, block_size=block_size, batch_size=batch_size)
    xb, yb = xb.to(device), yb.to(device)

    logits, loss = model_1(xb, yb)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

training:   0%|          | 0/1000 [00:00<?, ?it/s]

In [350]:
_temp = torch.tensor([tokenizer.encode("Hello")]).to(device)

with torch.no_grad():
    _temp = model_1.generate(_temp, max_new_tokens=100)

tokenizer.decode(_temp.tolist()[0])

'Helloresents to the German line . The Hened it , replace it is estim America . At the authority around the I never baders sentence . However , the flight vessels sud camonial nat as a series was one @-@ cert . <unk> on 15 ; his discussia . Op early September 10 @-@ force Harin gun respons Pan ... " He went on Femelebridge in rare'

In [351]:
@torch.no_grad()
def estimate_loss(model: nn.Module,
                  dataset,
                  iters: int):
    losses = []
    model.eval()
    for _ in tqdm(range(iters), desc="Estimating loss", leave=False):
        xb, yb = get_batch(dataset, block_size, batch_size)
        xb, yb = xb.to(device), yb.to(device)
        logits, loss = model(xb, yb)
        losses.append(loss.item())
    out = np.mean(losses)
    return out

In [354]:
estimate_loss(model_1, validation_dataset_1, iters=100)

Estimating loss:   0%|          | 0/100 [00:00<?, ?it/s]

np.float64(4.298511743545532)

In [332]:
validation_dataset_1[26]

tensor(285)

Will not really try to improve this model from here. This is like a simple baseline model, we will start building the Transformer model from now.

## Setup transformer

All the classes and other deps for building a custom transformer model.\
This model will be built from scratch, and will not use the implementation from PyTorch...

We will start with a single attention head (sa-head) and build the mha from there on...

In [405]:
class Head(nn.Module):
    """Single self-attention head of head-size"""
    def __init__(self, head_size: int, dropout: float=0.2):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Expected shape: (B,T,C)
        B,T,C = x.shape
        
        k = self.key(x)   # (B,T,C) | C: head_size
        q = self.query(x) # (B,T,C) | C: head_size
        v = self.value(x) # (B,T,C) | C: head_size
        
        out = q @ k.transpose(-2, -1)*T**-0.5  # (B,T,C) @ (B,C,T) -> (B,T,T)
        
        out = out.masked_fill(self.tril[:T, :T] == 0, float("-inf"))  # (B,T,T)
        out = F.softmax(out, dim=-1)
        out = self.dropout(out)
        out = out @ v  # (B,T,T) @ (B,T,C) -> (B,T,C) | C: head_size

        return out

n_embed = 32
Head(head_size=16)(torch.randn(4, 8, n_embed)).shape

torch.Size([4, 8, 16])