In [1]:
import os
import sys
import numpy as np

# Add the path to the custom library to the system path
sys.path.append('..')

# Import custom modules
from src import Tensor
from src.core.utils import context_manager
from src.architectures.transformer import Tokenizer, Transformer, DataLoader

### Constants and hyperparameters

In [2]:
# Constants
# Both files are looked up relative to the current working directory
working_dir = os.getcwd()
dataset_path = os.path.join(working_dir, 'dataset', 'divina_commedia.txt')
tokenizer_path = os.path.join(working_dir, 'checkpoints', 'tokenizer.json')

In [3]:
# Hyperparameters

# Data
train_val_split = 0.9 # Fraction of the data used for training (90%); the remaining 10% is used for validation
sequence_length = 256 # The length of each sequence (the context window)
batch_size = 32 # The number of samples in each batch

# Model architecture
n_embed = 384 # The dimensionality of the token embeddings
n_attention_heads = 6 # The number of heads in the multi-head attention mechanism
n_decoder_blocks = 6 # The number of transformer decoder blocks in the model
dropout = 0.2 # The dropout rate

# Training
learning_rate = 1e-3 # The learning rate for the optimizer
epochs = 50 # The number of epochs to train the model for
eval_iters = 10 # The number of iterations used to evaluate the model

### Initializations

In [None]:
# Fix the seed of NumPy's global random generator so that runs are reproducible
seed = 42
np.random.seed(seed)

### Data loading

In [5]:
def load_txt_file(path: str) -> str:
    """
    Read a UTF-8 encoded text file and return everything it contains.
    
    Parameters:
    - path (str): Location of the text file to read.
    
    Returns:
    - str: The full contents of the file.
    
    Raises:
    - FileNotFoundError: If nothing exists at the given path.
    """
    
    # Guard clause: stop early when the path does not exist
    file_exists = os.path.exists(path)
    if file_exists is False:
        raise FileNotFoundError(f'The file "{path}" does not exist.')
    
    # Open the file as UTF-8 text and read it in one go
    with open(path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    
    return contents

In [6]:
# Instantiate the tokenizer
tokenizer = Tokenizer()

# Restore the tokenizer state saved at `tokenizer_path`
tokenizer.load(tokenizer_path)

# Read the vocabulary size from the loaded tokenizer;
# it is passed to the Transformer as `vocab_size` when the model is created
vocab_size = tokenizer.get_vocab_size()

In [7]:
# Read the raw corpus from disk
text = load_txt_file(dataset_path)

# Run the whole text through the tokenizer
encoded_text = tokenizer.encode(text)

# Wrap the encoded text in a NumPy array, then in an int32 tensor
encoded_array = np.array(encoded_text)
data = Tensor(encoded_array, dtype=np.int32)

In [8]:
# Build the data loader over the encoded corpus;
# `train_val_split` sets the fraction of the data reserved for training
data_loader = DataLoader(data=data, train_val_split=train_val_split)

### Building the model

In [9]:
# Build the transformer-based language model from the hyperparameters defined above
language_model = Transformer(
    name="Language Model",
    # Vocabulary and context window
    vocab_size=vocab_size,
    sequence_length=sequence_length,
    # Architecture
    n_embed=n_embed,
    n_attention_heads=n_attention_heads,
    n_decoder_blocks=n_decoder_blocks,
    # Regularization
    dropout=dropout,
)

### Initializing the model

In [10]:
# Run a single forward pass on a first batch so that the model initializes its weights.
# This step is not strictly necessary, but it lets the model know its input size

# Put the model in evaluation mode
# NOTE(review): the model is left in evaluation mode after this cell;
# confirm that fit() switches it back to training mode
language_model.eval()

# Gradients are not needed for this forward pass
with context_manager.no_grad():
    # Draw one batch; the second returned value is not needed here
    x, _ = data_loader.get_batch(batch_size=batch_size, sequence_length=sequence_length)

    # Feed the batch through the model to initialize it
    language_model(x)

In [11]:
# Display the model summary in tree format.
# The model is composed of nested submodules, so `recursive=True` is passed
# to have the summary descend into each of them
language_model.summary(recursive=True)

Language Model (Transformer) [output_shape=(32, 1024), params=11526400]
└── language_model.decoder (Decoder) [output_shape=(32, 1024), params=11526400]
    ├── decoder.embedding (Embedding) [output_shape=(32, 384), params=393216]
    ├── decoder.positional_embedding (Embedding) [output_shape=(384), params=98304]
    ├── decoder.decoder_blocks[0].decoder_block (DecoderBlock) [output_shape=(32, 256, 384), params=1773312]
    │   ├── decoder_block.layer_norm_1 (LayerNormalization) [output_shape=(32, 256, 384), params=768]
    │   ├── decoder_block.mlp (MLP) [output_shape=(32, 256, 384), params=1181568]
    │   │   ├── mlp.dropout (Dropout) [output_shape=(32, 256, 384), params=0]
    │   │   ├── decoder_block.mlp.input_dense (Dense) [output_shape=(32, 256, 1536), params=591360]
    │   │   └── decoder_block.mlp.output_dense (Dense) [output_shape=(32, 256, 384), params=590208]
    │   ├── decoder_block.layer_norm_2 (LayerNormalization) [output_shape=(32, 256, 384), params=768]
    │   └── d

### Training the model

In [12]:
# Fit the language model on the batches served by the data loader
language_model.fit(
    data_loader=data_loader,
    # Optimization settings
    lr=learning_rate,
    epochs=epochs,
    batch_size=batch_size,
    # Evaluation settings
    eval_iters=eval_iters,
)

Epoch 1/50 - Train Loss: 7.0785
Epoch 2/50 - Train Loss: 6.4140
Epoch 3/50 - Train Loss: 5.4272
Epoch 4/50 - Train Loss: 5.0996
Epoch 5/50 - Train Loss: 4.7485
Epoch 6/50 - Train Loss: 4.5417
Epoch 7/50 - Train Loss: 4.4484
Epoch 8/50 - Train Loss: 4.4168
Epoch 9/50 - Train Loss: 4.3065
Epoch 10/50 - Train Loss: 4.2797
Epoch 11/50 - Train Loss: 4.2845
Epoch 12/50 - Train Loss: 4.2444
Epoch 13/50 - Train Loss: 4.2433
Epoch 14/50 - Train Loss: 4.2342
Epoch 15/50 - Train Loss: 4.1823
Epoch 16/50 - Train Loss: 4.1533
Epoch 17/50 - Train Loss: 4.1858
Epoch 18/50 - Train Loss: 4.0968
Epoch 19/50 - Train Loss: 4.0679
Epoch 20/50 - Train Loss: 4.0860
Epoch 21/50 - Train Loss: 4.0566
Epoch 22/50 - Train Loss: 4.0444
Epoch 23/50 - Train Loss: 4.0568
Epoch 24/50 - Train Loss: 3.9951
Epoch 25/50 - Train Loss: 4.0132
Epoch 26/50 - Train Loss: 4.0193
Epoch 27/50 - Train Loss: 4.0037
Epoch 28/50 - Train Loss: 3.9421
Epoch 29/50 - Train Loss: 4.0004
Epoch 30/50 - Train Loss: 3.9378
Epoch 31/50 - Train

### Inference

In [13]:
# Build the initial context: a single sequence holding only the token with id 0
context = Tensor(np.zeros((1, 1), dtype=np.int32))

# Switch the model to evaluation mode for inference, as is done for the
# forward pass in the model-initialization cell (the model is configured
# with a non-zero dropout rate, which should not be active while sampling)
language_model.eval()

# Gradients are not needed while generating text
with context_manager.no_grad():
    # Iterate over the tokens streamed by the transformer
    for token in language_model.generate(context, max_new_tokens=200, stream=True):
        # Decode the newly generated token back to text
        decoded_token = tokenizer.decode([token.data.squeeze().tolist()])

        # Print the decoded token immediately, without adding a newline
        print(decoded_token, end='', flush=True)

o uo a stell che ma di peaio al mo quanaco di cia poggitorssogiuaticote alca grie e
nbmar le rucua.
nder llicor.
ombto stpiattcorin che ciascunr qucall o motro om fuo e in un bete mi mimi tra e le la la e lisca che de che stgue manorarmansi suanzlore vi, 'a e e diar,
mo tutte  e e 'agngria erali fo nella che chgia mi'risar