Byte tokenizer.

Tokenizer → encodes text to IDs.

For training the model, the pipeline must: encode -> batch -> feed the model

In [35]:
class ByteTokenizer:
    """
    UTF-8 byte tokenizer: every byte (0–255) is a token.
    Reserve extra IDs for special tokens (eos, pad).
    """
    def __init__(self):
        self.vocab_size = 258
        self.eos_token_id = 256
        self.pad_token_id = 257

    def encode(self, text: str):
        b = text.encode("utf-8", errors="ignore")
        return list(b) + [self.eos_token_id]

    def decode(self, ids):
        b = bytes([i for i in ids if i < 256])
        return b.decode("utf-8", errors="replace")

In [36]:
# text to see how the ByteTokenizer works - encoding

tokenizer = ByteTokenizer()

tokens = tokenizer.encode("hello world!!!")

print(tokens)

[104, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100, 33, 33, 33, 256]


In [37]:
# text to see how the ByteTokenizer works - decoding

text = tokenizer.decode(tokens)

print(text)

hello world!!!


Prepare the dataset for GPT

Dataset → creates input-target pairs for LM training.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, block_size=16):  #  here, block_size = context_length
        self.tokenizer = tokenizer
        self.block_size = block_size

        # Flatten all tokens into one big sequence
        all_tokens = []
        for t in texts:
            all_tokens.extend(tokenizer.encode(t))

        # chop the tokenized sequence into chunks of block_size
        self.data = []
        for i in range(0, max(1, len(all_tokens) - block_size)):
            x = all_tokens[i : i + block_size]
            y = all_tokens[i+1 : i + block_size+1]
            
            # pad if too short
            if len(x) < block_size:
                pad_len = block_size - len(x)
                x = x + [tokenizer.pad_token_id] * pad_len
                y = y + [tokenizer.pad_token_id] * pad_len
            
            self.data.append((x, y))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x, y = self.data[idx]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)


In [90]:
# test the dataset

dataset = TextDataset('hello world', tokenizer)

In [91]:
print(len(dataset))     # 1
# print(dataset[0])       # (x,y) tensors

6


GPT class

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GPT(nn.Module):
    
    def __init__(self, vocab_size: int, context_length: int, model_dim: int, num_blocks: int, num_heads: int):
        super().__init__()
        torch.manual_seed(0)
        self.token_embeddings = nn.Embedding(vocab_size, model_dim)
        self.pos_embeddings = nn.Embedding(context_length, model_dim)
        # number of iterations trough the transformer block
        self.blocks = nn.Sequential()
        for i in range(num_blocks):
            self.blocks.append(self.TransformerBlock(model_dim, num_heads))
        self.final_ln = nn.LayerNorm(model_dim)
        self.vocabulary_projection = nn.Linear(model_dim, vocab_size)
                   
    def forward(self, context):
        # context: TensorType[int] -> TensorType[float]
        torch.manual_seed(0)
        # 🔍 Debug check
        # print("min:", context.min().item(), "max:", context.max().item(), "vocab_size:", self.token_embeddings.num_embeddings)
        
        token_embeds = self.token_embeddings(context)  # B, T, D
        B, T, D = token_embeds.shape
        pos_embeds = self.pos_embeddings(torch.arange(T))
        total_embeddings = token_embeds + pos_embeds
        
        un_normalized = self.vocabulary_projection(self.final_ln(self.blocks(total_embeddings)))
        probs = nn.functional.softmax(un_normalized, dim = -1)
        return probs
        
    class TransformerBlock(nn.Module):
        
        class MultiHeadedSelfAttention(nn.Module):
        
            class SingleHeadAttention(nn.Module):
                
                def __init__(self, model_dim, head_size):
                    super().__init__()
                    torch.manual_seed(0)
                    # not biases in the linear layers for getting the keys, queries and values of the tokens (for attention, better results)
                    self.get_keys = nn.Linear(model_dim, head_size, bias=False)
                    self.get_queries = nn.Linear(model_dim, head_size, bias=False)
                    self.get_values = nn.Linear(model_dim, head_size, bias=False)

                def forward(self, embedded):
                    k = self.get_keys(embedded)  # BxTxA
                    q = self.get_queries(embedded)
                    v = self.get_values(embedded)

                    scores = q @ torch.transpose(k, 1, 2)
                    b, t, a = k.shape  # batch dim, context dim, attention dim
                    scores = scores/ (a ** 0.5)

                    # lower triangular tensor
                    pre_mask = torch.tril(torch.ones(t, t))
                    mask = pre_mask == 0

                    scores = scores.masked_fill(mask, float('-inf'))  # b, t, t
                    scores = nn.functional.softmax(scores, dim=2)  # dim=2 is the columns
                    transformed = scores @ v

                    return transformed
        
            def __init__(self, model_dim, num_heads):
                super().__init__()
                torch.manual_seed(0)
                self.heads = nn.ModuleList()  # list to store neural network layers
                for i in range(num_heads):
                    # list of single attention layers
                    self.heads.append(self.SingleHeadAttention(model_dim, model_dim // num_heads))

            def forward(self, embedded):
                outputs = []  # each element is B, T, Head_size --> B, T, Attention_sim (after concatenation)
                for head in self.heads:
                    outputs.append(head(embedded))
                cated = torch.cat(outputs, dim = 2)  # dim = 2 the last dimension (attention)
                return cated
    
        def __init__(self, model_dim, num_heads):
            super().__init__()
            torch.manual_seed(0)
            # multi head self attention layer
            self.mhsa = self.MultiHeadedSelfAttention(model_dim, num_heads)
            # layers norm
            self.first_ln = nn.LayerNorm(model_dim)
            self.second_ln = nn.LayerNorm(model_dim)
            # fee forward
            self.ff = self.VanillaNeuralNetwork(model_dim)
        

        def forward(self, embedded):
            # def forward(self, embedded: TensorType[float]) -> TensorType[float]:
            torch.manual_seed(0)
            # add layer after the multi head self attention
            first_part = embedded + self.mhsa(self.first_ln(embedded))
            # add layer after feed forward
            result = first_part + self.ff(self.second_ln(first_part))
            return result
    
        class VanillaNeuralNetwork(nn.Module):
        
            def __init__(self, model_dim, droput=0.1):
                super().__init__()
                self.fc1 = nn.Linear(model_dim, model_dim)
                self.fc2 = nn.Linear(model_dim, model_dim)
                self.dropout = nn.Dropout()

            def forward(self, x):
                x = self.fc1(x)
                x = F.relu(x)
                x = self.dropout(x)
                x = self.fc2(x)
                x = self.dropout(x)
                return x

To train the model, it takes the txt file 'bon_jovi'

In [97]:
with open("bon_jovi.txt", 'r', encoding='utf-8') as file:
    test = file.read()
print(len(test))  # 39604
dataset = TextDataset(test, tokenizer)
loader = DataLoader(dataset, batch_size=8, shuffle=True)


39604


In [98]:
model = GPT(vocab_size=tokenizer.vocab_size, context_length=16, model_dim=12, num_blocks= 4, num_heads=4)

In [99]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-4)

# training loop
for epoch in range(10):
    for batch_x, batch_y in loader:
        
        # forward
        logits = model(batch_x).squeeze()  # [batch, seq, vocab]
        loss = criterion(logits.view(-1, logits.size(-1)), batch_y.view(-1))

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch} | Loss {loss.item():.4f}")

Epoch 0 | Loss 5.0596
Epoch 1 | Loss 5.0596
Epoch 2 | Loss 5.0596
Epoch 3 | Loss 5.0596
Epoch 4 | Loss 5.0596
Epoch 5 | Loss 5.0596
Epoch 6 | Loss 5.0596
Epoch 7 | Loss 5.0596
Epoch 8 | Loss 5.0596
Epoch 9 | Loss 5.0596


In [100]:
# Saving the model’s weights only

# After training
torch.save(model.state_dict(), "gpt_decoder_weights.pth")

"""
This saves only the parameters (weights and biases).

Lightweight and flexible.

To load it, you need to recreate the model architecture first:
"""


'\nThis saves only the parameters (weights and biases).\n\nLightweight and flexible.\n\nTo load it, you need to recreate the model architecture first:\n'

In [None]:
# Recreate model instance and load it
model = GPT(vocab_size=tokenizer.vocab_size, context_length=16, model_dim=12, num_blocks= 4, num_heads=4)  # same arguments as during training
model.load_state_dict(torch.load("gpt_decoder_weights.pth"))
model.eval()  # set to evaluation mode

In [101]:
# 2️⃣ Saving the entire model (architecture + weights)

torch.save(model, "gpt_decoder_full.pth")


In [None]:
# load the model

model = torch.load("gpt_decoder_full.pth")
model.eval()

# Warning: Less flexible; may break if you change your code or PyTorch version.

In [103]:
# 3️⃣ (Optional) Save optimizer state too. If you want to resume training:

torch.save({
    'epoch': epoch,                    # last completed epoch
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss                       # optional, for logging
}, "checkpoint_2025_09_12.pth")


In [None]:
# Load the checkpoint and continue training

model = GPT(vocab_size=tokenizer.vocab_size, context_length=16, model_dim=12, num_blocks= 4, num_heads=4)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

# Load checkpoint
checkpoint = torch.load("checkpoint_2025_09_12.pth")
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1


new_lr = 0.0001  # new learning rate

for param_group in optimizer.param_groups:
    param_group['lr'] = new_lr

model.train()  # make sure to set back to training mode

⚠️ Important Tips

Always call model.train() after loading if you want to train.

If you only want to evaluate, use model.eval().

Make sure the model architecture and optimizer are created exactly as before.

You don’t need pickle — torch.save / torch.load handles everything safely.

You can change the learning rate (or other hyperparameters) after loading a checkpoint. The key is that the optimizer state (like momentum in Adam/SGD) will still be there, but you can override the learning rate.
