In [9]:
import os
import sys
import torch
import tiktoken

from src.utils import *
from src.gpt2 import GPT2
from src.config import GPTConfig
from src.tokenizer import Tokenizer
from src.data_loader import DataLoader

### Constants and hyperparameters

In [10]:
# Location of the training corpus: <current working directory>/dataset/input.txt
# The path is resolved against the working directory the notebook is run from.
dataset_path = os.path.join(
    os.getcwd(),
    'dataset',
    'input.txt',
)

In [None]:
# Hyperparameters

# --- Data ---
sequence_length = 32  # Number of tokens in each training sequence
train_val_split = 0.1  # Fraction of the data held out for validation (0.1 = 10%)

# --- Optimisation ---
epochs = 50  # Number of training epochs
learning_rate = 3e-4  # Learning rate for the optimizer
batch_size = 1024  # Batch size for training

# Micro batch size used for gradient accumulation: when a full batch is too
# large to fit into memory it is split into smaller micro batches and their
# gradients are accumulated.
# NOTE(review): the exact meaning (size of each micro batch vs. number of
# accumulation steps) is decided by GPT2.fit, which is not shown here — confirm.
micro_batch_size = 4

### Initializations

In [12]:
# Allow faster, lower-precision algorithms for float32 matrix multiplications
torch.set_float32_matmul_precision('high')

# Fix the global PyTorch seed so that runs are reproducible
torch.manual_seed(1337)

### Data loading

In [None]:
# Instantiate the project's Tokenizer wrapper, selecting the 'gpt2' encoding by name.
# NOTE(review): presumably backed by tiktoken's GPT-2 BPE encoding (tiktoken is
# imported above but not used directly in this notebook) — confirm in src/tokenizer.
tokenizer = Tokenizer('gpt2')

In [14]:
# Build the data loader from the raw text file; it exposes the tokenized
# training and validation splits as `train_tokens` and `val_tokens`
data_loader = DataLoader(txt_file=dataset_path, tokenizer=tokenizer, train_val_split=train_val_split)

# Report how many tokens ended up in each split
for label, split_attr in (
    ("Training set size: ", "train_tokens"),
    ("Validation set size: ", "val_tokens"),
):
    print(label, len(getattr(data_loader, split_attr)))

Training set size:  304223
Validation set size:  33802


In [15]:
# Model configuration.
# The vocabulary size is 50304 rather than the 50257 entries of the stock gpt2
# tokenizer: the vocabulary is padded so that its size is a multiple of 8
# (50304 = 128 * 393), which improves performance in reduced-precision
# training (the model below is cast to bfloat16).
model_config = GPTConfig(
    context_size=1024,
    vocab_size=50304,
    n_blocks=12,
    n_heads=12,
    n_embed=768,
)

### Building the model

In [16]:
# Build the GPT-2 model from the configuration, cast it to bfloat16 for
# improved performance and move it to the target device.
# NOTE(review): `device` is not defined in this notebook — presumably it is
# provided by `from src.utils import *`; confirm.
gpt2 = GPT2(model_config).to(torch.bfloat16).to(device)

# Wrap the model with torch.compile to optimize performance.
# NOTE(review): custom methods such as `fit` / `generate` called on the
# compiled wrapper are forwarded to the original module, so `self(...)` calls
# inside them may bypass the compiled graph — verify that compilation
# actually takes effect during training.
gpt2 = torch.compile(gpt2)

### Training the model

In [17]:
# Train the model with the hyperparameters defined above; progress and
# losses are printed while training runs
gpt2.fit(
    data_loader=data_loader,
    epochs=epochs,
    lr=learning_rate,
    batch_size=batch_size,
    micro_batch_size=micro_batch_size,
    sequence_length=sequence_length,
)

Epoch: 1/50 | Completion percentage: 11.11% | Step duration 937.23 ms/step --> loss: 11.0000
Epoch: 1/50 | Completion percentage: 22.22% | Step duration 476.07 ms/step --> loss: 9.8125
Epoch: 1/50 | Completion percentage: 33.33% | Step duration 467.13 ms/step --> loss: 10.3125
Epoch: 1/50 | Completion percentage: 44.44% | Step duration 478.22 ms/step --> loss: 9.3125
Epoch: 1/50 | Completion percentage: 55.56% | Step duration 444.64 ms/step --> loss: 8.8750
Epoch: 1/50 | Completion percentage: 66.67% | Step duration 456.91 ms/step --> loss: 8.6250
Epoch: 1/50 | Completion percentage: 77.78% | Step duration 434.14 ms/step --> loss: 8.3125
Epoch: 1/50 | Completion percentage: 88.89% | Step duration 437.26 ms/step --> loss: 8.3750
Epoch: 1/50 | Completion percentage: 100.00% | Step duration 432.81 ms/step --> loss: 7.9688
Epoch 1/50 | Average step duration 507.16 ms/step | Epoch duration 8493.89 ms/epoch --> loss: 9.1875 - val_loss: 7.8894
Epoch: 2/50 | Completion percentage: 11.11% | Ste

In [20]:
# Text prompt the model will continue
prompt = "the state of the"

# Tokenize the prompt, add a leading batch dimension and move the tensor to `device`
prompt_ids = tokenizer.encode(prompt)
context = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0)
context = context.to(device)

# Generate up to 200 new tokens, then decode the resulting ids back to text and display it
generated = gpt2.generate(context, max_new_tokens=200)
print(tokenizer.decode(generated.squeeze().tolist()))

the state of the:
But what I do thet that are him for me.

VOLUMNIA:
What, for you that? and not to mytis I had not been, go to to my death?
LORDAt, and myt stay.
VWtis she?
KINGTORKUS:
LADIUS: When you then,
CORIUS:
KING ED VIIUS:
FirstICHTRY Senator:
KING EDOLESOAB ED VIIOLENIUSUS:
LENAINCANOLENAANTERUS:
FirstIONCEOiser you pray if and betw IV:
KING RAMUS:
ProvUARDENIOLESO IV:
KING RESOUS:
BRESESES
KINGASTENIOLESOUS:
BRAathICHESO IV, and and if myETH:
GL
