In [2]:
import os
import torch

from src.tokenizer import Tokenizer
from src.data_loader import DataLoader
from src.transformer import Transformer
from src.utils import device, load_txt_file

### Constants and hyperparameters

In [3]:
# File locations, both resolved against the current working directory
working_dir = os.getcwd()
dataset_path = os.path.join(working_dir, 'dataset', 'input.txt')  # text file loaded as the training corpus
tokenizer_path = os.path.join(working_dir, 'checkpoints', 'tokenizer.json')  # saved tokenizer state

In [4]:
# Hyperparameters, grouped by the part of the pipeline that consumes them

# --- Data ---
train_val_split = 0.9  # fraction of the data used for training; the remaining 10% is held out for validation
batch_size = 64        # samples per batch
block_size = 256       # sequence length, i.e. the context window

# --- Model architecture ---
n_embed = 384               # dimensionality of the token embeddings
num_attention_heads = 6     # attention heads in the multi-head attention mechanism
num_transformer_blocks = 6  # transformer blocks in the model
dropout = 0.2               # dropout rate

# --- Optimization ---
learning_rate = 1e-3  # optimizer learning rate (handed to train_model as `lr`)
epochs = 500          # number of epochs to train for
eval_iters = 10       # iterations used when evaluating the model; exact meaning is
                      # defined inside Transformer.train_model — TODO confirm there

### Initializations

In [5]:
# Pin PyTorch's default random generator so repeated runs start from the same state.
# The trailing ';' keeps the notebook from echoing the returned Generator object.
random_seed = 1337
torch.manual_seed(random_seed);

### Data loading

In [6]:
# Instantiate the project tokenizer (src.tokenizer.Tokenizer) with its default arguments
tokenizer = Tokenizer()

# Load the tokenizer state stored at tokenizer_path (checkpoints/tokenizer.json under
# the current working directory). Presumably that file is produced by a separate
# tokenizer-training step and must exist before this cell runs — TODO confirm.
tokenizer.load_state(tokenizer_path)

In [7]:
# Read the training text from dataset_path using the project helper load_txt_file
text = load_txt_file(dataset_path)

# Encode the text with the loaded tokenizer
# (assumes encode returns a flat sequence of integer token ids — TODO confirm)
encoded_text = tokenizer.encode(text)

# Convert the encoded text to a 64-bit integer tensor; this tensor is what the
# data handler receives as `data`
data = torch.tensor(encoded_text, dtype=torch.long)

In [8]:
# Build the data handler around the encoded tensor. Note that DataLoader here is the
# project's src.data_loader.DataLoader, not torch.utils.data.DataLoader.
data_handler = DataLoader(data=data, train_val_split=train_val_split)

### Building the model

In [9]:
# Assemble the Transformer language model from the hyperparameters above;
# the vocabulary size is taken from the loaded tokenizer
language_model = Transformer(
    vocab_size=tokenizer.vocab_size,  # type: ignore
    n_embed=n_embed,
    n_heads=num_attention_heads,
    block_size=block_size,
    n_transformer_blocks=num_transformer_blocks,
    dropout=dropout,
)

Model moved to device: mps


### Training the model

In [10]:
# Run training on batches drawn from data_handler; train_model prints the
# train/validation loss as it goes
language_model.train_model(
    data_loader=data_handler,
    epochs=epochs,
    lr=learning_rate,
    batch_size=batch_size,
    eval_iters=eval_iters,
)

Epoch 1/500 - Train Loss: 7.1066, Val Loss: 7.1044
Epoch 11/500 - Train Loss: 5.1603, Val Loss: 5.1907
Epoch 21/500 - Train Loss: 4.7002, Val Loss: 4.7628
Epoch 31/500 - Train Loss: 4.2573, Val Loss: 4.3366
Epoch 41/500 - Train Loss: 3.9934, Val Loss: 4.1140
Epoch 51/500 - Train Loss: 3.7969, Val Loss: 3.9445
Epoch 61/500 - Train Loss: 3.6891, Val Loss: 3.8471
Epoch 71/500 - Train Loss: 3.6000, Val Loss: 3.7789
Epoch 81/500 - Train Loss: 3.5420, Val Loss: 3.7201
Epoch 91/500 - Train Loss: 3.4958, Val Loss: 3.7188
Epoch 101/500 - Train Loss: 3.4752, Val Loss: 3.6707
Epoch 111/500 - Train Loss: 3.4440, Val Loss: 3.6378
Epoch 121/500 - Train Loss: 3.4149, Val Loss: 3.6055
Epoch 131/500 - Train Loss: 3.4050, Val Loss: 3.6024
Epoch 141/500 - Train Loss: 3.3823, Val Loss: 3.5734
Epoch 151/500 - Train Loss: 3.3580, Val Loss: 3.5585
Epoch 161/500 - Train Loss: 3.3267, Val Loss: 3.5495
Epoch 171/500 - Train Loss: 3.3157, Val Loss: 3.5355
Epoch 181/500 - Train Loss: 3.2779, Val Loss: 3.5188
Epoc

### Inference

In [11]:
# Starting context: a 1x1 tensor holding token id 0, placed on the model's device
context = torch.zeros((1, 1), dtype=torch.long, device=device)

# Ask the trained model for 100 new tokens
generated = language_model.generate(context, max_new_tokens=100)

# Drop the size-1 dimensions, convert to a plain list of ids, then decode and display the text
token_ids = generated.squeeze().tolist()
print(tokenizer.decode(token_ids))

 th we cand ourtever trumpeting a should my self,
Whose such, but good all think not be bound:
Go, believe a scheek rejoin,
Anon, sirs what is to make and hopes to be.
Paulince you, lets, and
