In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from datetime import datetime
from byte_tokenizer import ByteTokenizer
from class_gpt import GPT


Prepare the dataset for GPT

Dataset → creates input-target pairs for LM training.

In [3]:
today = datetime.today().strftime('%Y-%m-%d')

# parameters of the model
context_length = 16
model_dim = 12  # dimensionality for embedding and attention
num_blocks = 4  # number of repetitions of the transformer block
num_heads = 4  # number of self attention instances, each with size model_dim // num_heads

tokenizer = ByteTokenizer()

vocab_size = tokenizer.vocab_size
batch_size = 8
epochs = 10
lr=3e-4  # learning rate for the gradient descent method

In [4]:

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, block_size):  #  here, block_size is the same as context_length
        self.tokenizer = tokenizer
        self.block_size = block_size

        # Flatten all tokens into one big sequence
        all_tokens = []
        for t in texts:
            all_tokens.extend(tokenizer.encode(t))

        # chop the tokenized sequence into chunks of block_size
        self.data = []
        for i in range(0, max(1, len(all_tokens) - block_size)):
            x = all_tokens[i : i + block_size]
            y = all_tokens[i+1 : i + block_size+1]
            
            # pad if too short
            if len(x) < block_size:
                pad_len = block_size - len(x)
                x = x + [tokenizer.pad_token_id] * pad_len
                y = y + [tokenizer.pad_token_id] * pad_len
            
            self.data.append((x, y))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x, y = self.data[idx]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)


To train the model, it takes the txt file 'bon_jovi'

In [5]:
with open("bon_jovi.txt", 'r', encoding='utf-8') as file:
    test = file.read()
print(len(test))  # 39604

dataset = TextDataset(test, tokenizer, context_length)

loader = DataLoader(dataset, batch_size, shuffle=True)


39604


In [6]:
model = GPT(vocab_size, context_length, model_dim, num_blocks, num_heads)

In [7]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=lr)

# training loop
for epoch in range(epochs):
    for batch_x, batch_y in loader:
        
        # forward
        logits = model(batch_x).squeeze()  # [batch, seq, vocab]
        loss = criterion(logits.view(-1, logits.size(-1)), batch_y.view(-1))

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch} | Loss {loss.item():.4f}")

Epoch 0 | Loss 5.0596
Epoch 1 | Loss 5.0596
Epoch 2 | Loss 5.0596
Epoch 3 | Loss 5.0596
Epoch 4 | Loss 5.0596
Epoch 5 | Loss 5.0596
Epoch 6 | Loss 5.0596
Epoch 7 | Loss 5.0596
Epoch 8 | Loss 5.0596
Epoch 9 | Loss 5.0596


For a random model with vocab_size = 258, the expected loss is: - log(1/258)≈5.55


In [8]:
# Save model and optimizer, to resume training:
checkpoint_file = f'checkpoint_{today}.pth'

torch.save({
    'epoch': epoch,                    # last completed epoch
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss                       # optional, for logging
}, checkpoint_file)
