# **Transformer Language Model**

## Imports

#### Set Root Dir

In [1]:
import sys
import os

# Put the project root on the import path so the `src.*` imports below resolve.
# The root is taken to be two directory levels above the notebook's working
# directory; adjust the number of '..' segments if the notebook is moved.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '../..'))
# Guard the append so re-running this cell does not pile up duplicate entries.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

#### Import Modules

In [2]:
"""
Training script for the transformer language model.
"""
import argparse
import torch
import pytorch_lightning as pl
import pickle
import yaml
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import Trainer

from src.data.tokenizer import SimpleTokenizer, load_text_corpus, create_sample_corpus
from src.data.dataset import TextDataModule
from src.model.lightning_module import TransformerLightningModule

  from .autonotebook import tqdm as notebook_tqdm


## Load Params

#### Load yaml config

In [3]:
# Read the experiment configuration from config.yaml in the working directory.
# The encoding is pinned to UTF-8 because open() otherwise falls back to the
# platform locale encoding (e.g. cp1252 on Windows), which can mis-decode
# non-ASCII characters in the YAML file.
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

#### Set hyperparams

In [None]:
hparams = config["hparams"]

# Bind every hyperparameter by its key rather than by its position in the YAML
# file: positional unpacking of hparams.values() silently mis-assigns values if
# keys are reordered, and fails with an opaque "too many/not enough values"
# error if a key is added or removed. A missing key now raises a KeyError that
# names the offending entry.

# Model architecture
vocab_size = hparams["vocab_size"]
d_model = hparams["d_model"]
num_heads = hparams["num_heads"]
num_layers = hparams["num_layers"]
d_ff = hparams["d_ff"]
dropout = hparams["dropout"]

# Data
sequence_length = hparams["sequence_length"]
batch_size = hparams["batch_size"]
train_split = hparams["train_split"]
val_split = hparams["val_split"]
num_workers = hparams["num_workers"]

# Optimisation
learning_rate = hparams["learning_rate"]
max_epochs = hparams["max_epochs"]
warmup_steps = hparams["warmup_steps"]
weight_decay = hparams["weight_decay"]
gradient_clip_val = hparams["gradient_clip_val"]
accumulate_grad_batches = hparams["accumulate_grad_batches"]

# Early stopping / checkpointing
patience = hparams["patience"]
min_delta = hparams["min_delta"]
save_top_k = hparams["save_top_k"]
monitor = hparams["monitor"]
mode = hparams["mode"]

# Trainer runtime
accelerator = hparams["accelerator"]
devices = hparams["devices"]
precision = hparams["precision"]
log_every_n_steps = hparams["log_every_n_steps"]
val_check_interval = hparams["val_check_interval"]

In [5]:
# Echo the loaded hyperparameters so the values used for this run are recorded in the notebook output.
hparams

{'vocab_size': 3000,
 'd_model': 32,
 'num_heads': 1,
 'num_layers': 2,
 'd_ff': 1024,
 'sequence_length': 32,
 'batch_size': 64,
 'learning_rate': 0.0001,
 'max_epochs': 100,
 'patience': 10,
 'min_delta': 0.001,
 'warmup_steps': 1000,
 'weight_decay': 0.01,
 'dropout': 0.1,
 'train_split': 0.7,
 'val_split': 0.2,
 'num_workers': 0,
 'accelerator': 'auto',
 'devices': 1,
 'precision': '32',
 'gradient_clip_val': 1.0,
 'accumulate_grad_batches': 1,
 'log_every_n_steps': 50,
 'val_check_interval': 1.0,
 'save_top_k': 1,
 'monitor': 'val/loss',
 'mode': 'min'}

#### Set paths and general config

In [6]:
# Config sections for filesystem locations and run-level settings.
paths = config["paths"]
general = config["general"]

# Filesystem locations: input corpus, checkpoint directory, log directory.
corpus_path = paths["corpus_path"]
save_dir = paths["save_dir"]
log_dir = paths["log_dir"]

# Run-level settings.
experiment_name = general["experiment_name"]
create_sample = general["create_sample"]

## Data preprocessing

#### Load text

In [7]:
# Load the corpus text via load_text_corpus(corpus_path).
# Despite the progress message, no tokenization happens in this cell;
# the tokenizer is created and applied in the cells that follow.
print("Loading and tokenizing text...")
text = load_text_corpus(corpus_path)

Loading and tokenizing text...


#### Tokenize

In [8]:
# Create tokenizer and build vocabulary
# NOTE(review): vocab_size presumably acts as an upper bound on the vocabulary
# (the recorded run built 2059 tokens with vocab_size=3000) — confirm in SimpleTokenizer.
tokenizer = SimpleTokenizer(vocab_size=hparams["vocab_size"])
# Build the vocabulary from the loaded corpus text.
tokenizer.build_vocab(text)

Vocabulary built with 2059 tokens


#### Save vocab

In [None]:
# Save vocabulary.
# For the demo, we overwrite the vocab file on every run; you can adjust the vocab file name as you like.
vocab_path = os.path.join(save_dir, "vocab.pkl")
# Make sure the target directory exists before the tokenizer writes into it.
os.makedirs(save_dir, exist_ok=True)
tokenizer.save_vocab(vocab_path)

Vocabulary saved to ./checkpoints\vocab.pkl


#### Read vocab

In [None]:
# Read the vocabulary back from disk.
# This step is only necessary if loading the vocab from a file instead of creating one.
# NOTE: pickle.load can execute arbitrary code; only load vocab files you created yourself
# (here it is the file written by the previous cell).
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)

# The loaded object is indexed with the 'word_to_idx' key; its length is reported as the token count.
print("The number of tokens in the vocab is:", len(vocab['word_to_idx']))


2059

#### Get token ids

In [11]:
# Encode text
# Run the whole corpus through the tokenizer; the result is passed to the data module as `token_ids`.
token_ids = tokenizer.encode(text)
# Report how many tokens the encoded corpus contains.
print(f"Encoded text length: {len(token_ids)} tokens")

Encoded text length: 21110 tokens


## Model Training

#### Create data module

In [12]:
# Create data module
# Hands the encoded token stream and the batching/splitting settings to TextDataModule.
# NOTE(review): train_split + val_split is 0.9 with the recorded config; presumably the
# remaining share becomes a test split inside TextDataModule — confirm.
data_module = TextDataModule(
    token_ids=token_ids,
    sequence_length=sequence_length,
    batch_size=batch_size,
    train_split=train_split,
    val_split=val_split,
    num_workers=num_workers
)

#### Create model

In [13]:
# Seed the Python, NumPy and PyTorch RNGs *before* the model is built, so that
# any random weight initialisation, data shuffling and dropout are repeatable
# across runs. `deterministic=True` on the Trainer does not seed anything by
# itself. The seed can be overridden with an optional `seed` entry under
# `general` in config.yaml; 42 is used when it is absent.
seed = general.get("seed", 42)
pl.seed_everything(seed, workers=True)

# Create model
# The vocabulary size comes from the tokenizer (the number of tokens actually
# built), not from the configured `vocab_size` hyperparameter.
model = TransformerLightningModule(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    d_ff=d_ff,
    dropout=dropout,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps
)

#### Set callbacks

In [14]:
# Early stopping: halt training once the monitored metric stops improving
# by at least `min_delta` for `patience` consecutive checks.
early_stopping = EarlyStopping(
    monitor=monitor,
    mode=mode,
    patience=patience,
    min_delta=min_delta,
    verbose=True,
)

# Model checkpointing: keep the best `save_top_k` checkpoints plus the latest one.
# The doubled braces survive the f-string, leaving `{epoch:02d}` and
# `{val/loss:.3f}` placeholders for the checkpoint callback to fill in.
checkpoint_callback = ModelCheckpoint(
    dirpath=save_dir,
    filename=f"{experiment_name}-epoch={{epoch:02d}}-val_loss={{val/loss:.3f}}",
    monitor=monitor,
    mode=mode,
    save_top_k=save_top_k,
    save_last=True,
    auto_insert_metric_name=False,  # Prevents the name 'val/loss=' from being prepended
    verbose=True,
)

# Learning rate monitoring, logged at every optimisation step.
lr_monitor = LearningRateMonitor(logging_interval='step')

# Callbacks handed to the Trainer, in registration order.
callbacks = [early_stopping, checkpoint_callback, lr_monitor]

#### Create trainer and logger

In [15]:
# TensorBoard logger; version=None leaves the run version to the logger.
logger = TensorBoardLogger(save_dir=log_dir, name=experiment_name, version=None)

# Trainer, with its arguments grouped by concern.
trainer = Trainer(
    # hardware
    accelerator=accelerator,
    devices=devices,
    precision=precision,
    deterministic=True,
    # schedule
    max_epochs=max_epochs,
    val_check_interval=val_check_interval,
    # optimisation
    gradient_clip_val=gradient_clip_val,
    accumulate_grad_batches=accumulate_grad_batches,
    # logging / reporting
    logger=logger,
    callbacks=callbacks,
    log_every_n_steps=log_every_n_steps,
    enable_progress_bar=True,
    enable_model_summary=True,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


#### Summarize prepared data

In [None]:
# --- Model size -----------------------------------------------------------
all_params = list(model.parameters())
n_params = sum(p.numel() for p in all_params)
n_trainable = sum(p.numel() for p in all_params if p.requires_grad)

print("\nModel Summary:")
print(f"Vocabulary size: {tokenizer.get_vocab_size()}")
print(f"Model parameters: {n_params:,}")
print(f"Trainable parameters: {n_trainable:,}")

# --- Data split sizes -----------------------------------------------------
# Split boundaries are recomputed here from the split fractions, treating the
# token stream as consecutive train / val / test segments.
# NOTE(review): this mirrors how TextDataModule is assumed to split — confirm.
total_len = len(token_ids)
train_end = int(total_len * train_split)
val_end = int(total_len * (train_split + val_split))
val_tokens = val_end - train_end

print("Data sizes:")
print(f"Total tokens: {total_len}")
print(f"Sequence length: {sequence_length}")
print(f"Train split: {train_split}")
print(f"Val split: {val_split}")

print(f"Train tokens: {train_end}")
print(f"Val tokens: {val_tokens}")
print(f"Test tokens: {total_len - val_end}")

# Warn when the validation segment is shorter than a single sequence.
if val_tokens < sequence_length:
    print(f"WARNING: Validation data has only {val_tokens} tokens, less than sequence length {sequence_length}")
    print("This will cause validation to fail. Consider using a larger corpus or adjusting splits.")


Model Summary:
Vocabulary size: 2059
Model parameters: 275,723
Trainable parameters: 275,723


#### Train the transformer neural network

In [None]:
# Announce the run settings, one line per entry; the trailing empty entry
# produces the blank separator line before Lightning's own output.
banner_lines = [
    "\nStarting training...",
    "With the updated parameters:",
    f"- Sequence length: {sequence_length}",
    f"- Train split: {train_split}",
    f"- Val split: {val_split}",
    f"- Monitor: {monitor}",
    "",
]
for banner_line in banner_lines:
    print(banner_line)

# Run the training loop with the configured trainer, model and data module.
trainer.fit(model, data_module)


Starting training...
With the updated parameters:
- Sequence length: 32
- Train split: 0.7
- Val split: 0.2
- Monitor: val/loss



c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:751: Checkpoint directory C:\code\data_science\demo_llm\checkpoints exists and is not empty.

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | model     | TransformerLM    | 275 K  | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
275 K     Trainable params
0         Non-trainable params
275 K     Total params
1.103     Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


                                                                           

c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0:   1%|▏         | 3/231 [00:00<00:22, 10.03it/s, v_num=29, train/loss_step=7.790, train/perplexity_step=2.43e+3]

Epoch 0: 100%|██████████| 231/231 [00:21<00:00, 10.66it/s, v_num=29, train/loss_step=7.610, train/perplexity_step=2.02e+3, val/loss=7.550, val/perplexity=1.91e+3, train/loss_epoch=7.710, train/perplexity_epoch=2.23e+3]

Metric val/loss improved. New best score: 7.553
Epoch 0, global step 231: 'val/loss' reached 7.55281 (best 7.55281), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=00-val_loss=7.553.ckpt' as top 1


Epoch 1: 100%|██████████| 231/231 [00:21<00:00, 10.67it/s, v_num=29, train/loss_step=7.230, train/perplexity_step=1.38e+3, val/loss=7.180, val/perplexity=1.32e+3, train/loss_epoch=7.390, train/perplexity_epoch=1.63e+3]

Metric val/loss improved by 0.368 >= min_delta = 0.001. New best score: 7.184
Epoch 1, global step 462: 'val/loss' reached 7.18438 (best 7.18438), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=01-val_loss=7.184.ckpt' as top 1


Epoch 2: 100%|██████████| 231/231 [00:21<00:00, 10.71it/s, v_num=29, train/loss_step=6.760, train/perplexity_step=859.0, val/loss=6.750, val/perplexity=859.0, train/loss_epoch=7.000, train/perplexity_epoch=1.11e+3]    

Metric val/loss improved by 0.436 >= min_delta = 0.001. New best score: 6.748
Epoch 2, global step 693: 'val/loss' reached 6.74838 (best 6.74838), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=02-val_loss=6.748.ckpt' as top 1


Epoch 3: 100%|██████████| 231/231 [00:21<00:00, 10.56it/s, v_num=29, train/loss_step=6.380, train/perplexity_step=592.0, val/loss=6.330, val/perplexity=571.0, train/loss_epoch=6.570, train/perplexity_epoch=717.0]  

Metric val/loss improved by 0.416 >= min_delta = 0.001. New best score: 6.332
Epoch 3, global step 924: 'val/loss' reached 6.33216 (best 6.33216), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=03-val_loss=6.332.ckpt' as top 1


Epoch 4: 100%|██████████| 231/231 [00:21<00:00, 10.66it/s, v_num=29, train/loss_step=6.010, train/perplexity_step=406.0, val/loss=6.060, val/perplexity=438.0, train/loss_epoch=6.210, train/perplexity_epoch=500.0]

Metric val/loss improved by 0.275 >= min_delta = 0.001. New best score: 6.057
Epoch 4, global step 1155: 'val/loss' reached 6.05727 (best 6.05727), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=04-val_loss=6.057.ckpt' as top 1


Epoch 5: 100%|██████████| 231/231 [00:21<00:00, 10.67it/s, v_num=29, train/loss_step=5.990, train/perplexity_step=398.0, val/loss=5.900, val/perplexity=376.0, train/loss_epoch=5.980, train/perplexity_epoch=397.0]

Metric val/loss improved by 0.159 >= min_delta = 0.001. New best score: 5.898
Epoch 5, global step 1386: 'val/loss' reached 5.89824 (best 5.89824), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=05-val_loss=5.898.ckpt' as top 1


Epoch 6: 100%|██████████| 231/231 [00:22<00:00, 10.37it/s, v_num=29, train/loss_step=5.850, train/perplexity_step=347.0, val/loss=5.810, val/perplexity=347.0, train/loss_epoch=5.850, train/perplexity_epoch=347.0]

Metric val/loss improved by 0.086 >= min_delta = 0.001. New best score: 5.812
Epoch 6, global step 1617: 'val/loss' reached 5.81225 (best 5.81225), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=06-val_loss=5.812.ckpt' as top 1


Epoch 7: 100%|██████████| 231/231 [00:23<00:00,  9.90it/s, v_num=29, train/loss_step=5.650, train/perplexity_step=283.0, val/loss=5.780, val/perplexity=337.0, train/loss_epoch=5.780, train/perplexity_epoch=325.0]

Metric val/loss improved by 0.029 >= min_delta = 0.001. New best score: 5.783
Epoch 7, global step 1848: 'val/loss' reached 5.78303 (best 5.78303), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=07-val_loss=5.783.ckpt' as top 1


Epoch 8: 100%|██████████| 231/231 [00:22<00:00, 10.12it/s, v_num=29, train/loss_step=5.770, train/perplexity_step=320.0, val/loss=5.780, val/perplexity=336.0, train/loss_epoch=5.770, train/perplexity_epoch=320.0]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 5.781
Epoch 8, global step 2079: 'val/loss' reached 5.78053 (best 5.78053), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=08-val_loss=5.781.ckpt' as top 1


Epoch 9: 100%|██████████| 231/231 [00:23<00:00,  9.99it/s, v_num=29, train/loss_step=5.770, train/perplexity_step=321.0, val/loss=5.760, val/perplexity=331.0, train/loss_epoch=5.760, train/perplexity_epoch=318.0]

Metric val/loss improved by 0.016 >= min_delta = 0.001. New best score: 5.765
Epoch 9, global step 2310: 'val/loss' reached 5.76462 (best 5.76462), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=09-val_loss=5.765.ckpt' as top 1


Epoch 10: 100%|██████████| 231/231 [00:24<00:00,  9.62it/s, v_num=29, train/loss_step=5.710, train/perplexity_step=302.0, val/loss=5.700, val/perplexity=311.0, train/loss_epoch=5.720, train/perplexity_epoch=305.0]

Metric val/loss improved by 0.064 >= min_delta = 0.001. New best score: 5.700
Epoch 10, global step 2541: 'val/loss' reached 5.70016 (best 5.70016), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=10-val_loss=5.700.ckpt' as top 1


Epoch 11: 100%|██████████| 231/231 [00:23<00:00,  9.89it/s, v_num=29, train/loss_step=5.610, train/perplexity_step=274.0, val/loss=5.580, val/perplexity=276.0, train/loss_epoch=5.610, train/perplexity_epoch=275.0]

Metric val/loss improved by 0.121 >= min_delta = 0.001. New best score: 5.579
Epoch 11, global step 2772: 'val/loss' reached 5.57901 (best 5.57901), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=11-val_loss=5.579.ckpt' as top 1


Epoch 12: 100%|██████████| 231/231 [00:23<00:00,  9.77it/s, v_num=29, train/loss_step=5.480, train/perplexity_step=239.0, val/loss=5.420, val/perplexity=237.0, train/loss_epoch=5.460, train/perplexity_epoch=236.0]

Metric val/loss improved by 0.157 >= min_delta = 0.001. New best score: 5.422
Epoch 12, global step 3003: 'val/loss' reached 5.42237 (best 5.42237), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=12-val_loss=5.422.ckpt' as top 1


Epoch 13: 100%|██████████| 231/231 [00:23<00:00,  9.84it/s, v_num=29, train/loss_step=5.200, train/perplexity_step=182.0, val/loss=5.280, val/perplexity=205.0, train/loss_epoch=5.290, train/perplexity_epoch=198.0]

Metric val/loss improved by 0.146 >= min_delta = 0.001. New best score: 5.276
Epoch 13, global step 3234: 'val/loss' reached 5.27596 (best 5.27596), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=13-val_loss=5.276.ckpt' as top 1


Epoch 14: 100%|██████████| 231/231 [00:23<00:00,  9.79it/s, v_num=29, train/loss_step=5.070, train/perplexity_step=159.0, val/loss=5.170, val/perplexity=185.0, train/loss_epoch=5.140, train/perplexity_epoch=172.0]

Metric val/loss improved by 0.104 >= min_delta = 0.001. New best score: 5.172
Epoch 14, global step 3465: 'val/loss' reached 5.17244 (best 5.17244), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=14-val_loss=5.172.ckpt' as top 1


Epoch 15: 100%|██████████| 231/231 [00:23<00:00,  9.80it/s, v_num=29, train/loss_step=4.820, train/perplexity_step=124.0, val/loss=5.120, val/perplexity=175.0, train/loss_epoch=5.050, train/perplexity_epoch=157.0]

Metric val/loss improved by 0.052 >= min_delta = 0.001. New best score: 5.121
Epoch 15, global step 3696: 'val/loss' reached 5.12085 (best 5.12085), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=15-val_loss=5.121.ckpt' as top 1


Epoch 16: 100%|██████████| 231/231 [00:24<00:00,  9.61it/s, v_num=29, train/loss_step=5.100, train/perplexity_step=163.0, val/loss=5.110, val/perplexity=173.0, train/loss_epoch=5.020, train/perplexity_epoch=151.0]

Metric val/loss improved by 0.013 >= min_delta = 0.001. New best score: 5.107
Epoch 16, global step 3927: 'val/loss' reached 5.10749 (best 5.10749), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=16-val_loss=5.107.ckpt' as top 1


Epoch 17: 100%|██████████| 231/231 [00:23<00:00,  9.88it/s, v_num=29, train/loss_step=4.910, train/perplexity_step=136.0, val/loss=5.110, val/perplexity=173.0, train/loss_epoch=5.010, train/perplexity_epoch=150.0]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 5.105
Epoch 17, global step 4158: 'val/loss' reached 5.10539 (best 5.10539), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=17-val_loss=5.105.ckpt' as top 1


Epoch 18: 100%|██████████| 231/231 [00:23<00:00,  9.88it/s, v_num=29, train/loss_step=4.840, train/perplexity_step=127.0, val/loss=5.080, val/perplexity=169.0, train/loss_epoch=5.000, train/perplexity_epoch=149.0]

Metric val/loss improved by 0.023 >= min_delta = 0.001. New best score: 5.083
Epoch 18, global step 4389: 'val/loss' reached 5.08266 (best 5.08266), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=18-val_loss=5.083.ckpt' as top 1


Epoch 19: 100%|██████████| 231/231 [00:23<00:00, 10.01it/s, v_num=29, train/loss_step=4.920, train/perplexity_step=137.0, val/loss=5.020, val/perplexity=160.0, train/loss_epoch=4.960, train/perplexity_epoch=142.0]

Metric val/loss improved by 0.059 >= min_delta = 0.001. New best score: 5.024
Epoch 19, global step 4620: 'val/loss' reached 5.02382 (best 5.02382), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=19-val_loss=5.024.ckpt' as top 1


Epoch 20: 100%|██████████| 231/231 [00:24<00:00,  9.61it/s, v_num=29, train/loss_step=4.600, train/perplexity_step=99.30, val/loss=4.930, val/perplexity=145.0, train/loss_epoch=4.860, train/perplexity_epoch=130.0]

Metric val/loss improved by 0.098 >= min_delta = 0.001. New best score: 4.926
Epoch 20, global step 4851: 'val/loss' reached 4.92613 (best 4.92613), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=20-val_loss=4.926.ckpt' as top 1


Epoch 21: 100%|██████████| 231/231 [00:23<00:00,  9.64it/s, v_num=29, train/loss_step=4.810, train/perplexity_step=123.0, val/loss=4.820, val/perplexity=130.0, train/loss_epoch=4.740, train/perplexity_epoch=115.0]

Metric val/loss improved by 0.108 >= min_delta = 0.001. New best score: 4.818
Epoch 21, global step 5082: 'val/loss' reached 4.81781 (best 4.81781), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=21-val_loss=4.818.ckpt' as top 1


Epoch 22: 100%|██████████| 231/231 [00:24<00:00,  9.31it/s, v_num=29, train/loss_step=4.610, train/perplexity_step=101.0, val/loss=4.720, val/perplexity=119.0, train/loss_epoch=4.620, train/perplexity_epoch=102.0]

Metric val/loss improved by 0.095 >= min_delta = 0.001. New best score: 4.723
Epoch 22, global step 5313: 'val/loss' reached 4.72301 (best 4.72301), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=22-val_loss=4.723.ckpt' as top 1


Epoch 23: 100%|██████████| 231/231 [00:24<00:00,  9.58it/s, v_num=29, train/loss_step=4.590, train/perplexity_step=98.80, val/loss=4.660, val/perplexity=111.0, train/loss_epoch=4.530, train/perplexity_epoch=93.10]

Metric val/loss improved by 0.063 >= min_delta = 0.001. New best score: 4.660
Epoch 23, global step 5544: 'val/loss' reached 4.66025 (best 4.66025), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=23-val_loss=4.660.ckpt' as top 1


Epoch 24: 100%|██████████| 231/231 [00:24<00:00,  9.49it/s, v_num=29, train/loss_step=4.480, train/perplexity_step=87.90, val/loss=4.630, val/perplexity=108.0, train/loss_epoch=4.480, train/perplexity_epoch=88.40]

Metric val/loss improved by 0.029 >= min_delta = 0.001. New best score: 4.631
Epoch 24, global step 5775: 'val/loss' reached 4.63098 (best 4.63098), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=24-val_loss=4.631.ckpt' as top 1


Epoch 25: 100%|██████████| 231/231 [00:23<00:00,  9.71it/s, v_num=29, train/loss_step=4.560, train/perplexity_step=96.10, val/loss=4.630, val/perplexity=108.0, train/loss_epoch=4.460, train/perplexity_epoch=86.80]

Metric val/loss improved by 0.004 >= min_delta = 0.001. New best score: 4.627
Epoch 25, global step 6006: 'val/loss' reached 4.62712 (best 4.62712), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=25-val_loss=4.627.ckpt' as top 1


Epoch 26: 100%|██████████| 231/231 [00:23<00:00,  9.83it/s, v_num=29, train/loss_step=4.430, train/perplexity_step=83.60, val/loss=4.620, val/perplexity=107.0, train/loss_epoch=4.460, train/perplexity_epoch=86.60]

Metric val/loss improved by 0.004 >= min_delta = 0.001. New best score: 4.623
Epoch 26, global step 6237: 'val/loss' reached 4.62298 (best 4.62298), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=26-val_loss=4.623.ckpt' as top 1


Epoch 27: 100%|██████████| 231/231 [00:23<00:00,  9.76it/s, v_num=29, train/loss_step=4.390, train/perplexity_step=80.50, val/loss=4.600, val/perplexity=105.0, train/loss_epoch=4.440, train/perplexity_epoch=85.20]

Metric val/loss improved by 0.026 >= min_delta = 0.001. New best score: 4.597
Epoch 27, global step 6468: 'val/loss' reached 4.59729 (best 4.59729), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=27-val_loss=4.597.ckpt' as top 1


Epoch 28: 100%|██████████| 231/231 [00:23<00:00, 10.01it/s, v_num=29, train/loss_step=4.490, train/perplexity_step=88.70, val/loss=4.540, val/perplexity=99.00, train/loss_epoch=4.400, train/perplexity_epoch=81.70]

Metric val/loss improved by 0.057 >= min_delta = 0.001. New best score: 4.541
Epoch 28, global step 6699: 'val/loss' reached 4.54076 (best 4.54076), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=28-val_loss=4.541.ckpt' as top 1


Epoch 29: 100%|██████████| 231/231 [00:24<00:00,  9.61it/s, v_num=29, train/loss_step=4.410, train/perplexity_step=82.20, val/loss=4.460, val/perplexity=91.90, train/loss_epoch=4.320, train/perplexity_epoch=75.60]

Metric val/loss improved by 0.076 >= min_delta = 0.001. New best score: 4.465
Epoch 29, global step 6930: 'val/loss' reached 4.46455 (best 4.46455), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=29-val_loss=4.465.ckpt' as top 1


Epoch 30: 100%|██████████| 231/231 [00:24<00:00,  9.61it/s, v_num=29, train/loss_step=4.260, train/perplexity_step=70.70, val/loss=4.380, val/perplexity=84.60, train/loss_epoch=4.230, train/perplexity_epoch=68.90]

Metric val/loss improved by 0.083 >= min_delta = 0.001. New best score: 4.382
Epoch 30, global step 7161: 'val/loss' reached 4.38196 (best 4.38196), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=30-val_loss=4.382.ckpt' as top 1


Epoch 31: 100%|██████████| 231/231 [00:23<00:00,  9.75it/s, v_num=29, train/loss_step=4.240, train/perplexity_step=69.60, val/loss=4.320, val/perplexity=79.30, train/loss_epoch=4.140, train/perplexity_epoch=63.20]

Metric val/loss improved by 0.067 >= min_delta = 0.001. New best score: 4.315
Epoch 31, global step 7392: 'val/loss' reached 4.31522 (best 4.31522), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=31-val_loss=4.315.ckpt' as top 1


Epoch 32: 100%|██████████| 231/231 [00:23<00:00,  9.85it/s, v_num=29, train/loss_step=4.050, train/perplexity_step=57.40, val/loss=4.270, val/perplexity=76.20, train/loss_epoch=4.080, train/perplexity_epoch=59.60]

Metric val/loss improved by 0.040 >= min_delta = 0.001. New best score: 4.275
Epoch 32, global step 7623: 'val/loss' reached 4.27484 (best 4.27484), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=32-val_loss=4.275.ckpt' as top 1


Epoch 33: 100%|██████████| 231/231 [00:23<00:00, 10.03it/s, v_num=29, train/loss_step=4.340, train/perplexity_step=76.50, val/loss=4.260, val/perplexity=75.20, train/loss_epoch=4.050, train/perplexity_epoch=57.80]

Metric val/loss improved by 0.013 >= min_delta = 0.001. New best score: 4.262
Epoch 33, global step 7854: 'val/loss' reached 4.26185 (best 4.26185), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=33-val_loss=4.262.ckpt' as top 1


Epoch 34: 100%|██████████| 231/231 [00:23<00:00,  9.78it/s, v_num=29, train/loss_step=3.770, train/perplexity_step=43.50, val/loss=4.260, val/perplexity=75.10, train/loss_epoch=4.050, train/perplexity_epoch=57.40]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 4.260
Epoch 34, global step 8085: 'val/loss' reached 4.26035 (best 4.26035), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=34-val_loss=4.260.ckpt' as top 1


Epoch 35: 100%|██████████| 231/231 [00:23<00:00,  9.65it/s, v_num=29, train/loss_step=3.810, train/perplexity_step=45.30, val/loss=4.250, val/perplexity=74.60, train/loss_epoch=4.040, train/perplexity_epoch=57.10]

Metric val/loss improved by 0.007 >= min_delta = 0.001. New best score: 4.253
Epoch 35, global step 8316: 'val/loss' reached 4.25294 (best 4.25294), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=35-val_loss=4.253.ckpt' as top 1


Epoch 36: 100%|██████████| 231/231 [00:23<00:00, 10.04it/s, v_num=29, train/loss_step=4.120, train/perplexity_step=61.60, val/loss=4.230, val/perplexity=72.60, train/loss_epoch=4.020, train/perplexity_epoch=56.10]

Metric val/loss improved by 0.027 >= min_delta = 0.001. New best score: 4.226
Epoch 36, global step 8547: 'val/loss' reached 4.22573 (best 4.22573), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=36-val_loss=4.226.ckpt' as top 1


Epoch 37: 100%|██████████| 231/231 [00:23<00:00,  9.86it/s, v_num=29, train/loss_step=3.980, train/perplexity_step=53.60, val/loss=4.170, val/perplexity=68.90, train/loss_epoch=3.980, train/perplexity_epoch=53.90]

Metric val/loss improved by 0.052 >= min_delta = 0.001. New best score: 4.174
Epoch 37, global step 8778: 'val/loss' reached 4.17367 (best 4.17367), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=37-val_loss=4.174.ckpt' as top 1


Epoch 38: 100%|██████████| 231/231 [00:23<00:00,  9.82it/s, v_num=29, train/loss_step=3.950, train/perplexity_step=52.20, val/loss=4.110, val/perplexity=64.80, train/loss_epoch=3.920, train/perplexity_epoch=50.50]

Metric val/loss improved by 0.063 >= min_delta = 0.001. New best score: 4.111
Epoch 38, global step 9009: 'val/loss' reached 4.11093 (best 4.11093), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=38-val_loss=4.111.ckpt' as top 1


Epoch 39: 100%|██████████| 231/231 [00:23<00:00,  9.78it/s, v_num=29, train/loss_step=3.580, train/perplexity_step=35.80, val/loss=4.050, val/perplexity=60.70, train/loss_epoch=3.850, train/perplexity_epoch=47.10]

Metric val/loss improved by 0.065 >= min_delta = 0.001. New best score: 4.045
Epoch 39, global step 9240: 'val/loss' reached 4.04546 (best 4.04546), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=39-val_loss=4.045.ckpt' as top 1


Epoch 40: 100%|██████████| 231/231 [00:22<00:00, 10.42it/s, v_num=29, train/loss_step=3.920, train/perplexity_step=50.20, val/loss=4.000, val/perplexity=58.20, train/loss_epoch=3.790, train/perplexity_epoch=44.30]

Metric val/loss improved by 0.044 >= min_delta = 0.001. New best score: 4.001
Epoch 40, global step 9471: 'val/loss' reached 4.00099 (best 4.00099), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=40-val_loss=4.001.ckpt' as top 1


Epoch 41: 100%|██████████| 231/231 [00:23<00:00,  9.91it/s, v_num=29, train/loss_step=3.780, train/perplexity_step=43.70, val/loss=3.970, val/perplexity=56.60, train/loss_epoch=3.750, train/perplexity_epoch=42.50]

Metric val/loss improved by 0.026 >= min_delta = 0.001. New best score: 3.975
Epoch 41, global step 9702: 'val/loss' reached 3.97452 (best 3.97452), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=41-val_loss=3.975.ckpt' as top 1


Epoch 42: 100%|██████████| 231/231 [00:23<00:00,  9.77it/s, v_num=29, train/loss_step=3.560, train/perplexity_step=35.30, val/loss=3.970, val/perplexity=56.30, train/loss_epoch=3.730, train/perplexity_epoch=41.80]

Metric val/loss improved by 0.005 >= min_delta = 0.001. New best score: 3.970
Epoch 42, global step 9933: 'val/loss' reached 3.96956 (best 3.96956), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=42-val_loss=3.970.ckpt' as top 1


Epoch 43: 100%|██████████| 231/231 [00:23<00:00,  9.84it/s, v_num=29, train/loss_step=3.900, train/perplexity_step=49.20, val/loss=3.970, val/perplexity=56.30, train/loss_epoch=3.730, train/perplexity_epoch=41.80]

Metric val/loss improved by 0.001 >= min_delta = 0.001. New best score: 3.969
Epoch 43, global step 10164: 'val/loss' reached 3.96854 (best 3.96854), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=43-val_loss=3.969.ckpt' as top 1


Epoch 44: 100%|██████████| 231/231 [00:23<00:00,  9.68it/s, v_num=29, train/loss_step=3.870, train/perplexity_step=47.70, val/loss=3.960, val/perplexity=55.60, train/loss_epoch=3.720, train/perplexity_epoch=41.60]

Metric val/loss improved by 0.013 >= min_delta = 0.001. New best score: 3.956
Epoch 44, global step 10395: 'val/loss' reached 3.95567 (best 3.95567), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=44-val_loss=3.956.ckpt' as top 1


Epoch 45: 100%|██████████| 231/231 [00:24<00:00,  9.47it/s, v_num=29, train/loss_step=3.680, train/perplexity_step=39.80, val/loss=3.930, val/perplexity=54.10, train/loss_epoch=3.700, train/perplexity_epoch=40.80]

Metric val/loss improved by 0.026 >= min_delta = 0.001. New best score: 3.930
Epoch 45, global step 10626: 'val/loss' reached 3.92970 (best 3.92970), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=45-val_loss=3.930.ckpt' as top 1


Epoch 46: 100%|██████████| 231/231 [00:23<00:00,  9.73it/s, v_num=29, train/loss_step=3.770, train/perplexity_step=43.40, val/loss=3.880, val/perplexity=51.70, train/loss_epoch=3.660, train/perplexity_epoch=39.20]

Metric val/loss improved by 0.046 >= min_delta = 0.001. New best score: 3.884
Epoch 46, global step 10857: 'val/loss' reached 3.88404 (best 3.88404), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=46-val_loss=3.884.ckpt' as top 1


Epoch 47: 100%|██████████| 231/231 [00:24<00:00,  9.59it/s, v_num=29, train/loss_step=3.580, train/perplexity_step=36.00, val/loss=3.830, val/perplexity=48.90, train/loss_epoch=3.610, train/perplexity_epoch=37.20]

Metric val/loss improved by 0.057 >= min_delta = 0.001. New best score: 3.827
Epoch 47, global step 11088: 'val/loss' reached 3.82738 (best 3.82738), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=47-val_loss=3.827.ckpt' as top 1


Epoch 48: 100%|██████████| 231/231 [00:24<00:00,  9.31it/s, v_num=29, train/loss_step=3.440, train/perplexity_step=31.10, val/loss=3.780, val/perplexity=46.60, train/loss_epoch=3.560, train/perplexity_epoch=35.30]

Metric val/loss improved by 0.049 >= min_delta = 0.001. New best score: 3.778
Epoch 48, global step 11319: 'val/loss' reached 3.77803 (best 3.77803), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=48-val_loss=3.778.ckpt' as top 1


Epoch 49: 100%|██████████| 231/231 [00:23<00:00,  9.86it/s, v_num=29, train/loss_step=3.540, train/perplexity_step=34.40, val/loss=3.740, val/perplexity=45.00, train/loss_epoch=3.520, train/perplexity_epoch=33.80]

Metric val/loss improved by 0.034 >= min_delta = 0.001. New best score: 3.744
Epoch 49, global step 11550: 'val/loss' reached 3.74386 (best 3.74386), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=49-val_loss=3.744.ckpt' as top 1


Epoch 50: 100%|██████████| 231/231 [00:24<00:00,  9.42it/s, v_num=29, train/loss_step=3.370, train/perplexity_step=28.90, val/loss=3.730, val/perplexity=44.40, train/loss_epoch=3.490, train/perplexity_epoch=32.90]

Metric val/loss improved by 0.013 >= min_delta = 0.001. New best score: 3.731
Epoch 50, global step 11781: 'val/loss' reached 3.73067 (best 3.73067), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=50-val_loss=3.731.ckpt' as top 1


Epoch 51: 100%|██████████| 231/231 [00:23<00:00,  9.79it/s, v_num=29, train/loss_step=3.700, train/perplexity_step=40.60, val/loss=3.730, val/perplexity=44.30, train/loss_epoch=3.480, train/perplexity_epoch=32.70]

Metric val/loss improved by 0.003 >= min_delta = 0.001. New best score: 3.728
Epoch 51, global step 12012: 'val/loss' reached 3.72776 (best 3.72776), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=51-val_loss=3.728.ckpt' as top 1


Epoch 52: 100%|██████████| 231/231 [00:23<00:00,  9.99it/s, v_num=29, train/loss_step=3.760, train/perplexity_step=42.90, val/loss=3.730, val/perplexity=44.20, train/loss_epoch=3.480, train/perplexity_epoch=32.60]

Metric val/loss improved by 0.001 >= min_delta = 0.001. New best score: 3.726
Epoch 52, global step 12243: 'val/loss' reached 3.72641 (best 3.72641), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=52-val_loss=3.726.ckpt' as top 1


Epoch 53: 100%|██████████| 231/231 [00:23<00:00,  9.80it/s, v_num=29, train/loss_step=3.540, train/perplexity_step=34.50, val/loss=3.710, val/perplexity=43.60, train/loss_epoch=3.480, train/perplexity_epoch=32.50]

Metric val/loss improved by 0.014 >= min_delta = 0.001. New best score: 3.712
Epoch 53, global step 12474: 'val/loss' reached 3.71234 (best 3.71234), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=53-val_loss=3.712.ckpt' as top 1


Epoch 54: 100%|██████████| 231/231 [00:23<00:00,  9.67it/s, v_num=29, train/loss_step=3.490, train/perplexity_step=32.90, val/loss=3.690, val/perplexity=42.40, train/loss_epoch=3.450, train/perplexity_epoch=31.70]

Metric val/loss improved by 0.027 >= min_delta = 0.001. New best score: 3.685
Epoch 54, global step 12705: 'val/loss' reached 3.68521 (best 3.68521), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=54-val_loss=3.685.ckpt' as top 1


Epoch 55: 100%|██████████| 231/231 [00:23<00:00,  9.63it/s, v_num=29, train/loss_step=3.290, train/perplexity_step=26.70, val/loss=3.640, val/perplexity=40.50, train/loss_epoch=3.420, train/perplexity_epoch=30.80]

Metric val/loss improved by 0.045 >= min_delta = 0.001. New best score: 3.640
Epoch 55, global step 12936: 'val/loss' reached 3.64021 (best 3.64021), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=55-val_loss=3.640.ckpt' as top 1


Epoch 56: 100%|██████████| 231/231 [00:23<00:00,  9.75it/s, v_num=29, train/loss_step=3.320, train/perplexity_step=27.80, val/loss=3.590, val/perplexity=38.60, train/loss_epoch=3.380, train/perplexity_epoch=29.40]

Metric val/loss improved by 0.047 >= min_delta = 0.001. New best score: 3.593
Epoch 56, global step 13167: 'val/loss' reached 3.59294 (best 3.59294), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=56-val_loss=3.593.ckpt' as top 1


Epoch 57: 100%|██████████| 231/231 [00:23<00:00,  9.72it/s, v_num=29, train/loss_step=3.240, train/perplexity_step=25.60, val/loss=3.560, val/perplexity=37.20, train/loss_epoch=3.340, train/perplexity_epoch=28.30]

Metric val/loss improved by 0.038 >= min_delta = 0.001. New best score: 3.555
Epoch 57, global step 13398: 'val/loss' reached 3.55540 (best 3.55540), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=57-val_loss=3.555.ckpt' as top 1


Epoch 58: 100%|██████████| 231/231 [00:24<00:00,  9.58it/s, v_num=29, train/loss_step=3.220, train/perplexity_step=25.00, val/loss=3.530, val/perplexity=36.40, train/loss_epoch=3.310, train/perplexity_epoch=27.40]

Metric val/loss improved by 0.022 >= min_delta = 0.001. New best score: 3.533
Epoch 58, global step 13629: 'val/loss' reached 3.53296 (best 3.53296), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=58-val_loss=3.533.ckpt' as top 1


Epoch 59: 100%|██████████| 231/231 [00:23<00:00,  9.86it/s, v_num=29, train/loss_step=3.140, train/perplexity_step=23.20, val/loss=3.520, val/perplexity=36.10, train/loss_epoch=3.290, train/perplexity_epoch=27.00]

Metric val/loss improved by 0.008 >= min_delta = 0.001. New best score: 3.525
Epoch 59, global step 13860: 'val/loss' reached 3.52454 (best 3.52454), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=59-val_loss=3.525.ckpt' as top 1


Epoch 60:  26%|██▌       | 59/231 [00:05<00:17,  9.91it/s, v_num=29, train/loss_step=3.280, train/perplexity_step=26.50, val/loss=3.520, val/perplexity=36.10, train/loss_epoch=3.290, train/perplexity_epoch=27.00] 

#### Test and save the model

In [None]:
# Evaluate on the test split, then export one versioned checkpoint whose path
# is left in `final_model_path` for later cells.
#
# NOTE(review): `trainer.test` below evaluates whatever weights `model` holds
# right now, which may not be the weights of the best checkpoint exported
# afterwards -- consider passing `ckpt_path=checkpoint_callback.best_model_path`
# if the reported test metrics are meant to describe the exported file.
import shutil  # kept in this cell so it runs on its own; ideally lives in the imports cell

print("\nTesting model...")
trainer.test(model, data_module)

# Build the version tag once instead of once per branch. Integer versions are
# zero-padded (e.g. "07"); any other value (a logger may report a string
# version) is used verbatim, because the `:02d` format spec raises ValueError
# on non-integers.
version = trainer.logger.version
version_tag = f"{version:02d}" if isinstance(version, int) else str(version)

# Prefer the best checkpoint tracked by the checkpoint callback; fall back to
# saving the current trainer state when no best checkpoint was recorded.
best_model_path = checkpoint_callback.best_model_path
if best_model_path:
    final_model_path = os.path.join(save_dir, f"{experiment_name}-best-v{version_tag}.ckpt")
    shutil.copy2(best_model_path, final_model_path)
    print(f"Best model copied to {final_model_path}")
else:
    print("No best model found, saving current model as final")
    final_model_path = os.path.join(save_dir, f"{experiment_name}-final-v{version_tag}.ckpt")
    trainer.save_checkpoint(final_model_path)
    print(f"Final model saved to {final_model_path}")


Testing model...
Testing DataLoader 0: 100%|██████████| 19/19 [00:00<00:00, 53.89it/s]


Final model saved to ./checkpoints\transformer_lm-final.ckpt


In [None]:
# Final run summary: list where each artifact was written, then show the
# command that launches the Gradio demo with the exported checkpoint.
divider = "="*50
launch_command = f"python -m src.app.gradio_app --model_path {final_model_path} --vocab_path {vocab_path}"

report_lines = [
    "\nTraining completed!",
    f"Checkpoints saved in: {save_dir}",
    f"Logs saved in: {log_dir}",
    f"Vocabulary saved in: {vocab_path}",
    f"Best model: {checkpoint_callback.best_model_path}",
    f"Best score: {checkpoint_callback.best_model_score}",
    f"Final model saved as: {final_model_path}",
    # Instructions for running the app, framed by divider rules.
    "\n" + divider,
    "To run the Gradio app with your trained model:",
    launch_command,
    divider,
]

# One print per entry keeps the emitted text identical to separate print calls.
for report_line in report_lines:
    print(report_line)


Training completed!
Checkpoints saved in: ./checkpoints
Logs saved in: ./logs
Vocabulary saved in: ./checkpoints\vocab.pkl
Best model: C:\code\data_science\demo_llm\checkpoints\transformer_lm-epoch=40-val_loss=5.686.ckpt
Best score: 5.686227798461914

To run the Gradio app with your trained model:
python -m src.app.gradio_app --model_path C:\code\data_science\demo_llm\checkpoints\transformer_lm-epoch=40-val_loss=5.686.ckpt --vocab_path ./checkpoints\vocab.pkl
