In [1]:
import os
import torch

from lib.torch.utils import device
from lib.utils import load_txt_file
from lib.torch.data_loader import DataLoader
from lib.torch.transformer import Transformer

### Constants and hyperparameters

In [2]:
# Constants
# Location of the training corpus: <current working directory>/dataset/input.txt
dataset_path = os.path.join(
    os.getcwd(),
    'dataset',
    'input.txt',
)

In [3]:
# Hyperparameters

# --- Data ---
# Fraction of the data used for training (90%); the remaining 10% is used for validation
train_val_split = 0.9
# Number of samples in each batch
batch_size = 64
# Sequence length, i.e. the size of the context window
block_size = 256

# --- Model ---
# Dimensionality of the token embeddings
n_embed = 384
# Number of attention heads in the multi-head attention mechanism
num_attention_heads = 6
# Number of transformer blocks in the model
num_transformer_blocks = 6
# Dropout rate
dropout = 0.2

# --- Training ---
# Learning rate handed to the optimizer
learning_rate = 3e-4
# Number of epochs to train the model for
epochs = 500
# Number of iterations used to evaluate the model
eval_iters = 1

### Initializations

In [None]:
# Seed PyTorch's global random number generator so that runs are repeatable.
# The trailing semicolon keeps the returned Generator object out of the cell output.
torch.manual_seed(seed=1337);

<torch._C.Generator at 0x11daefbb0>

### Data loading

In [5]:
# Load the text file
# NOTE(review): the code below iterates `text` character by character, so it
# assumes load_txt_file returns the file contents as a single str; confirm in lib.utils.
text = load_txt_file(dataset_path)

# Extract the unique characters (vocabulary); sorting gives every character a
# stable position, so the integer ids are the same on every run
vocab = sorted(set(text))
vocab_size = len(vocab)

# Two-way lookup tables between characters and their integer ids
char_to_int = {c: i for i, c in enumerate(vocab)}
int_to_char = dict(enumerate(vocab))


def encode(text):
    """Convert a string into a list of integer token ids, one per character.

    Args:
        text: The string to encode.

    Returns:
        A list with the id of each character of `text`, in order.

    Raises:
        KeyError: If `text` contains a character that is not in the vocabulary.
    """
    return [char_to_int[c] for c in text]


def decode(tokens):
    """Convert an iterable of integer token ids back into a string.

    Args:
        tokens: Iterable of integer ids, each a key of `int_to_char`.

    Returns:
        The string obtained by concatenating the character of each id.

    Raises:
        KeyError: If an id is not present in `int_to_char`.
    """
    return ''.join(int_to_char[t] for t in tokens)


# Convert the whole encoded corpus to a 1-D tensor of integer (long) ids
data = torch.tensor(encode(text), dtype=torch.long)

In [6]:
# Build the data handler from the encoded corpus and the train/validation split ratio
data_handler = DataLoader(data=data, train_val_split=train_val_split)

### Building the model

In [7]:
# Build the Transformer language model, configured entirely from the
# vocabulary size and the hyperparameters defined earlier in the notebook
language_model = Transformer(
    vocab_size=vocab_size,
    n_embed=n_embed,
    n_heads=num_attention_heads,
    block_size=block_size,
    n_transformer_blocks=num_transformer_blocks,
    dropout=dropout,
)

Model moved to device: mps


### Training the model

In [8]:
# Run the training loop, drawing batches from the data handler
language_model.train_model(
    data_loader=data_handler,
    epochs=epochs,
    lr=learning_rate,
    batch_size=batch_size,
    eval_iters=eval_iters,
)

Epoch 1/500 - Train Loss: 4.2895, Val Loss: 4.2849
Epoch 2/500 - Train Loss: 3.5412, Val Loss: 3.5796
Epoch 3/500 - Train Loss: 3.3211, Val Loss: 3.3152
Epoch 4/500 - Train Loss: 3.1753, Val Loss: 3.2280
Epoch 5/500 - Train Loss: 3.1116, Val Loss: 3.1755
Epoch 6/500 - Train Loss: 3.1316, Val Loss: 3.1379
Epoch 7/500 - Train Loss: 3.0509, Val Loss: 3.0978
Epoch 8/500 - Train Loss: 3.0011, Val Loss: 3.0701
Epoch 9/500 - Train Loss: 2.9825, Val Loss: 2.9918
Epoch 10/500 - Train Loss: 2.9705, Val Loss: 2.9956
Epoch 11/500 - Train Loss: 2.8752, Val Loss: 2.9527
Epoch 12/500 - Train Loss: 2.8687, Val Loss: 2.9161
Epoch 13/500 - Train Loss: 2.8459, Val Loss: 2.8675
Epoch 14/500 - Train Loss: 2.8429, Val Loss: 2.8522
Epoch 15/500 - Train Loss: 2.8013, Val Loss: 2.8086
Epoch 16/500 - Train Loss: 2.8151, Val Loss: 2.8516
Epoch 17/500 - Train Loss: 2.7592, Val Loss: 2.7936
Epoch 18/500 - Train Loss: 2.7705, Val Loss: 2.7897
Epoch 19/500 - Train Loss: 2.7719, Val Loss: 2.7690
Epoch 20/500 - Train 

### Inference

In [9]:
# Sample text from the trained model.
# The starting context is a (1, 1) tensor holding the single token id 0.
context = torch.zeros(1, 1, dtype=torch.long, device=device)

# Generate 100 new tokens, drop the size-1 dimensions, and decode the ids back to text
print(
    decode(
        language_model.generate(context, max_new_tokens=100)
        .squeeze()
        .tolist()
    )
)


Evircoliants, in it lighilan suft
ond uspolaly plet liverse ancur heit's souch,
for my dow; a whondi
