# IMPORTS

#### Set Root Dir

In [1]:
import sys
import os

# Use the current working directory as root (or go up if needed)
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '../..'))  # Adjust '..' as needed
sys.path.append(ROOT_DIR)

#### import modules

In [2]:
"""
Training script for the transformer language model.
"""
import argparse
import torch
import pytorch_lightning as pl
import pickle
import yaml
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import Trainer

from src.data.tokenizer import SimpleTokenizer, load_text_corpus, create_sample_corpus
from src.data.dataset import TextDataModule
from src.model.lightning_module import TransformerLightningModule

  from .autonotebook import tqdm as notebook_tqdm


# Params

In [3]:
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

In [4]:
hparams = config["hparams"]

vocab_size, d_model, num_heads, num_layers, d_ff, \
sequence_length, batch_size, learning_rate, max_epochs, \
patience, min_delta, warmup_steps, weight_decay, dropout, \
train_split, val_split, num_workers, accelerator, devices, precision, \
gradient_clip_val, accumulate_grad_batches, log_every_n_steps, \
val_check_interval, save_top_k, monitor, mode = hparams.values()

In [5]:
hparams

{'vocab_size': 3000,
 'd_model': 32,
 'num_heads': 1,
 'num_layers': 2,
 'd_ff': 1024,
 'sequence_length': 64,
 'batch_size': 64,
 'learning_rate': 0.0001,
 'max_epochs': 100,
 'patience': 10,
 'min_delta': 0.001,
 'warmup_steps': 1000,
 'weight_decay': 0.01,
 'dropout': 0.1,
 'train_split': 0.7,
 'val_split': 0.2,
 'num_workers': 0,
 'accelerator': 'auto',
 'devices': 1,
 'precision': '32',
 'gradient_clip_val': 1.0,
 'accumulate_grad_batches': 1,
 'log_every_n_steps': 50,
 'val_check_interval': 1.0,
 'save_top_k': 3,
 'monitor': 'val/loss',
 'mode': 'min'}

In [6]:
paths = config["paths"]
general = config["general"]

corpus_path=paths["corpus_path"]
save_dir=paths["save_dir"]
log_dir=paths["log_dir"]
experiment_name=general["experiment_name"]
create_sample=general["create_sample"]

In [7]:
# Load and tokenize text
print("Loading and tokenizing text...")
text = load_text_corpus(corpus_path)

Loading and tokenizing text...


In [8]:
# Create tokenizer and build vocabulary
tokenizer = SimpleTokenizer(vocab_size=hparams["vocab_size"])
tokenizer.build_vocab(text)

Vocabulary built with 1836 tokens


In [9]:
# Save vocabulary
vocab_path = os.path.join(save_dir, "vocab.pkl")
os.makedirs(save_dir, exist_ok=True)
tokenizer.save_vocab(vocab_path)

Vocabulary saved to ./checkpoints\vocab.pkl


In [10]:
# read vocab
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)

len(vocab['word_to_idx'])


1836

In [11]:
# Encode text
token_ids = tokenizer.encode(text)
print(f"Encoded text length: {len(token_ids)} tokens")

Encoded text length: 9705 tokens


In [12]:
# Create data module
data_module = TextDataModule(
    token_ids=token_ids,
    sequence_length=sequence_length,
    batch_size=batch_size,
    train_split=train_split,
    val_split=val_split,
    num_workers=num_workers
)

In [13]:
# Create model
model = TransformerLightningModule(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    d_ff=d_ff,
    dropout=dropout,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps
)



In [14]:
# Create callbacks
callbacks = []

# Early stopping
early_stopping = EarlyStopping(
    monitor=monitor,
    patience=patience,
    min_delta=min_delta,
    mode=mode,
    verbose=True
)
callbacks.append(early_stopping)

# Model checkpointing
checkpoint_callback = ModelCheckpoint(
    dirpath=save_dir,
    filename=f"{experiment_name}-{{epoch:02d}}-{{val_loss:.4f}}",
    monitor=monitor,
    mode=mode,
    save_top_k=save_top_k,
    save_last=True,
    verbose=True
)
callbacks.append(checkpoint_callback)

# Learning rate monitoring
lr_monitor = LearningRateMonitor(logging_interval='step')
callbacks.append(lr_monitor)

In [15]:
# Create logger
logger = TensorBoardLogger(
    save_dir=log_dir,
    name=experiment_name,
    version=None
)

# Create trainer
trainer = Trainer(
    accelerator=accelerator,
    devices=devices,
    precision=precision,
    max_epochs=max_epochs,
    gradient_clip_val=gradient_clip_val,
    accumulate_grad_batches=accumulate_grad_batches,
    log_every_n_steps=log_every_n_steps,
    val_check_interval=val_check_interval,
    callbacks=callbacks,
    logger=logger,
    deterministic=True,
    enable_progress_bar=True,
    enable_model_summary=True
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


In [16]:
# Print model summary
print("\nModel Summary:")
print(f"Vocabulary size: {tokenizer.get_vocab_size()}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")


Model Summary:
Vocabulary size: 1836
Model parameters: 261,228
Trainable parameters: 261,228


In [17]:
# Check data sizes
print("Data sizes:")
print(f"Total tokens: {len(token_ids)}")
print(f"Sequence length: {sequence_length}")
print(f"Train split: {train_split}")
print(f"Val split: {val_split}")

# Calculate split sizes
total_len = len(token_ids)
train_end = int(total_len * train_split)
val_end = int(total_len * (train_split + val_split))

print(f"Train tokens: {train_end}")
print(f"Val tokens: {val_end - train_end}")
print(f"Test tokens: {total_len - val_end}")

# Check if validation data is sufficient
val_tokens = val_end - train_end
if val_tokens < sequence_length:
    print(f"WARNING: Validation data has only {val_tokens} tokens, less than sequence length {sequence_length}")
    print("This will cause validation to fail. Consider using a larger corpus or adjusting splits.")


Data sizes:
Total tokens: 9705
Sequence length: 64
Train split: 0.7
Val split: 0.2
Train tokens: 6793
Val tokens: 1941
Test tokens: 971


In [None]:
# Train model
print("\nStarting training...")
print("With the updated parameters:")
print(f"- Sequence length: {sequence_length}")
print(f"- Train split: {train_split}")
print(f"- Val split: {val_split}")
print(f"- Monitor: {monitor}")
print()

trainer.fit(model, data_module)


Starting training...
With the updated parameters:
- Sequence length: 64
- Train split: 0.7
- Val split: 0.2
- Monitor: val/loss



c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:751: Checkpoint directory C:\code\data_science\demo_llm\checkpoints exists and is not empty.

  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | model            | TransformerLM    | 261 K  | train
1 | criterion        | CrossEntropyLoss | 0      | train
2 | train_perplexity | _Perplexity      | 0      | train
3 | val_perplexity   | _Perplexity      | 0      | train
--------------------------------------------------------------
261 K     Trainable params
0         Non-trainable params
261 K     Total params
1.045     Total estimated model params size (MB)
37        Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


                                                                           

c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 106/106 [00:19<00:00,  5.44it/s, v_num=19, train/loss_step=7.620, train/perplexity_step=2.03e+3, val/loss=7.630, val/perplexity=2.07e+3, train/loss_epoch=7.670, train/perplexity_epoch=2.15e+3]

Metric val/loss improved. New best score: 7.632
Epoch 0, global step 106: 'val/loss' reached 7.63215 (best 7.63215), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=00-val_loss=0.0000.ckpt' as top 3


Epoch 1: 100%|██████████| 106/106 [00:19<00:00,  5.42it/s, v_num=19, train/loss_step=7.500, train/perplexity_step=1.8e+3, val/loss=7.450, val/perplexity=1.73e+3, train/loss_epoch=7.560, train/perplexity_epoch=1.92e+3] 

Metric val/loss improved by 0.178 >= min_delta = 0.001. New best score: 7.454
Epoch 1, global step 212: 'val/loss' reached 7.45436 (best 7.45436), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=01-val_loss=0.0000.ckpt' as top 3


Epoch 2: 100%|██████████| 106/106 [00:19<00:00,  5.51it/s, v_num=19, train/loss_step=7.190, train/perplexity_step=1.32e+3, val/loss=7.260, val/perplexity=1.43e+3, train/loss_epoch=7.370, train/perplexity_epoch=1.59e+3]

Metric val/loss improved by 0.193 >= min_delta = 0.001. New best score: 7.261
Epoch 2, global step 318: 'val/loss' reached 7.26140 (best 7.26140), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=02-val_loss=0.0000-v1.ckpt' as top 3


Epoch 3: 100%|██████████| 106/106 [00:19<00:00,  5.48it/s, v_num=19, train/loss_step=7.200, train/perplexity_step=1.33e+3, val/loss=7.090, val/perplexity=1.2e+3, train/loss_epoch=7.200, train/perplexity_epoch=1.34e+3] 

Metric val/loss improved by 0.175 >= min_delta = 0.001. New best score: 7.086
Epoch 3, global step 424: 'val/loss' reached 7.08648 (best 7.08648), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=03-val_loss=0.0000-v2.ckpt' as top 3


Epoch 4: 100%|██████████| 106/106 [00:19<00:00,  5.40it/s, v_num=19, train/loss_step=6.920, train/perplexity_step=1.02e+3, val/loss=6.910, val/perplexity=1.01e+3, train/loss_epoch=7.020, train/perplexity_epoch=1.13e+3]

Metric val/loss improved by 0.181 >= min_delta = 0.001. New best score: 6.906
Epoch 4, global step 530: 'val/loss' reached 6.90562 (best 6.90562), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=04-val_loss=0.0000-v3.ckpt' as top 3


Epoch 5: 100%|██████████| 106/106 [00:19<00:00,  5.45it/s, v_num=19, train/loss_step=6.770, train/perplexity_step=868.0, val/loss=6.710, val/perplexity=830.0, train/loss_epoch=6.840, train/perplexity_epoch=933.0]      

Metric val/loss improved by 0.196 >= min_delta = 0.001. New best score: 6.710
Epoch 5, global step 636: 'val/loss' reached 6.70963 (best 6.70963), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=05-val_loss=0.0000-v2.ckpt' as top 3


Epoch 6: 100%|██████████| 106/106 [00:19<00:00,  5.47it/s, v_num=19, train/loss_step=6.500, train/perplexity_step=667.0, val/loss=6.510, val/perplexity=684.0, train/loss_epoch=6.640, train/perplexity_epoch=763.0]

Metric val/loss improved by 0.199 >= min_delta = 0.001. New best score: 6.511
Epoch 6, global step 742: 'val/loss' reached 6.51055 (best 6.51055), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=06-val_loss=0.0000-v1.ckpt' as top 3


Epoch 7: 100%|██████████| 106/106 [00:19<00:00,  5.44it/s, v_num=19, train/loss_step=6.380, train/perplexity_step=590.0, val/loss=6.330, val/perplexity=574.0, train/loss_epoch=6.430, train/perplexity_epoch=624.0]

Metric val/loss improved by 0.183 >= min_delta = 0.001. New best score: 6.327
Epoch 7, global step 848: 'val/loss' reached 6.32714 (best 6.32714), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=07-val_loss=0.0000.ckpt' as top 3


Epoch 8: 100%|██████████| 106/106 [00:19<00:00,  5.46it/s, v_num=19, train/loss_step=6.100, train/perplexity_step=448.0, val/loss=6.170, val/perplexity=499.0, train/loss_epoch=6.250, train/perplexity_epoch=519.0]

Metric val/loss improved by 0.153 >= min_delta = 0.001. New best score: 6.175
Epoch 8, global step 954: 'val/loss' reached 6.17456 (best 6.17456), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=08-val_loss=0.0000.ckpt' as top 3


Epoch 9: 100%|██████████| 106/106 [00:19<00:00,  5.45it/s, v_num=19, train/loss_step=6.150, train/perplexity_step=469.0, val/loss=6.050, val/perplexity=447.0, train/loss_epoch=6.090, train/perplexity_epoch=441.0]

Metric val/loss improved by 0.127 >= min_delta = 0.001. New best score: 6.048
Epoch 9, global step 1060: 'val/loss' reached 6.04783 (best 6.04783), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=09-val_loss=0.0000.ckpt' as top 3


Epoch 10: 100%|██████████| 106/106 [00:19<00:00,  5.41it/s, v_num=19, train/loss_step=5.800, train/perplexity_step=329.0, val/loss=5.950, val/perplexity=413.0, train/loss_epoch=5.940, train/perplexity_epoch=382.0]

Metric val/loss improved by 0.099 >= min_delta = 0.001. New best score: 5.948
Epoch 10, global step 1166: 'val/loss' reached 5.94847 (best 5.94847), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=10-val_loss=0.0000-v2.ckpt' as top 3


Epoch 11: 100%|██████████| 106/106 [00:19<00:00,  5.54it/s, v_num=19, train/loss_step=5.790, train/perplexity_step=328.0, val/loss=5.870, val/perplexity=388.0, train/loss_epoch=5.820, train/perplexity_epoch=338.0]

Metric val/loss improved by 0.083 >= min_delta = 0.001. New best score: 5.865
Epoch 11, global step 1272: 'val/loss' reached 5.86537 (best 5.86537), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=11-val_loss=0.0000-v3.ckpt' as top 3


Epoch 12: 100%|██████████| 106/106 [00:20<00:00,  5.23it/s, v_num=19, train/loss_step=5.730, train/perplexity_step=307.0, val/loss=5.790, val/perplexity=367.0, train/loss_epoch=5.720, train/perplexity_epoch=304.0]

Metric val/loss improved by 0.075 >= min_delta = 0.001. New best score: 5.790
Epoch 12, global step 1378: 'val/loss' reached 5.78992 (best 5.78992), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=12-val_loss=0.0000-v3.ckpt' as top 3


Epoch 13: 100%|██████████| 106/106 [00:19<00:00,  5.43it/s, v_num=19, train/loss_step=5.740, train/perplexity_step=311.0, val/loss=5.730, val/perplexity=353.0, train/loss_epoch=5.620, train/perplexity_epoch=278.0]

Metric val/loss improved by 0.058 >= min_delta = 0.001. New best score: 5.732
Epoch 13, global step 1484: 'val/loss' reached 5.73223 (best 5.73223), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=13-val_loss=0.0000-v1.ckpt' as top 3


Epoch 14: 100%|██████████| 106/106 [00:20<00:00,  5.21it/s, v_num=19, train/loss_step=5.490, train/perplexity_step=242.0, val/loss=5.690, val/perplexity=344.0, train/loss_epoch=5.560, train/perplexity_epoch=259.0]

Metric val/loss improved by 0.042 >= min_delta = 0.001. New best score: 5.691
Epoch 14, global step 1590: 'val/loss' reached 5.69059 (best 5.69059), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=14-val_loss=0.0000.ckpt' as top 3


Epoch 15: 100%|██████████| 106/106 [00:19<00:00,  5.42it/s, v_num=19, train/loss_step=5.670, train/perplexity_step=291.0, val/loss=5.670, val/perplexity=339.0, train/loss_epoch=5.510, train/perplexity_epoch=247.0]

Metric val/loss improved by 0.025 >= min_delta = 0.001. New best score: 5.666
Epoch 15, global step 1696: 'val/loss' reached 5.66576 (best 5.66576), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=15-val_loss=0.0000-v1.ckpt' as top 3


Epoch 16: 100%|██████████| 106/106 [00:19<00:00,  5.45it/s, v_num=19, train/loss_step=5.440, train/perplexity_step=231.0, val/loss=5.650, val/perplexity=336.0, train/loss_epoch=5.480, train/perplexity_epoch=240.0]

Metric val/loss improved by 0.013 >= min_delta = 0.001. New best score: 5.653
Epoch 16, global step 1802: 'val/loss' reached 5.65297 (best 5.65297), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=16-val_loss=0.0000-v2.ckpt' as top 3


Epoch 17: 100%|██████████| 106/106 [00:19<00:00,  5.48it/s, v_num=19, train/loss_step=5.560, train/perplexity_step=260.0, val/loss=5.650, val/perplexity=335.0, train/loss_epoch=5.470, train/perplexity_epoch=237.0]

Metric val/loss improved by 0.004 >= min_delta = 0.001. New best score: 5.649
Epoch 17, global step 1908: 'val/loss' reached 5.64876 (best 5.64876), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=17-val_loss=0.0000-v2.ckpt' as top 3


Epoch 18: 100%|██████████| 106/106 [00:19<00:00,  5.42it/s, v_num=19, train/loss_step=5.360, train/perplexity_step=213.0, val/loss=5.650, val/perplexity=335.0, train/loss_epoch=5.470, train/perplexity_epoch=237.0]

Epoch 18, global step 2014: 'val/loss' reached 5.64830 (best 5.64830), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=18-val_loss=0.0000-v1.ckpt' as top 3


Epoch 19: 100%|██████████| 106/106 [00:19<00:00,  5.37it/s, v_num=19, train/loss_step=5.480, train/perplexity_step=240.0, val/loss=5.650, val/perplexity=335.0, train/loss_epoch=5.470, train/perplexity_epoch=237.0]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 5.647
Epoch 19, global step 2120: 'val/loss' reached 5.64708 (best 5.64708), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=19-val_loss=0.0000.ckpt' as top 3


Epoch 20: 100%|██████████| 106/106 [00:19<00:00,  5.39it/s, v_num=19, train/loss_step=5.330, train/perplexity_step=206.0, val/loss=5.640, val/perplexity=334.0, train/loss_epoch=5.460, train/perplexity_epoch=235.0]

Metric val/loss improved by 0.006 >= min_delta = 0.001. New best score: 5.641
Epoch 20, global step 2226: 'val/loss' reached 5.64138 (best 5.64138), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=20-val_loss=0.0000.ckpt' as top 3


Epoch 21: 100%|██████████| 106/106 [00:19<00:00,  5.46it/s, v_num=19, train/loss_step=5.350, train/perplexity_step=211.0, val/loss=5.630, val/perplexity=331.0, train/loss_epoch=5.450, train/perplexity_epoch=232.0]

Metric val/loss improved by 0.014 >= min_delta = 0.001. New best score: 5.628
Epoch 21, global step 2332: 'val/loss' reached 5.62785 (best 5.62785), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=21-val_loss=0.0000.ckpt' as top 3


Epoch 22: 100%|██████████| 106/106 [00:19<00:00,  5.46it/s, v_num=19, train/loss_step=5.520, train/perplexity_step=251.0, val/loss=5.600, val/perplexity=327.0, train/loss_epoch=5.420, train/perplexity_epoch=226.0]

Metric val/loss improved by 0.025 >= min_delta = 0.001. New best score: 5.603
Epoch 22, global step 2438: 'val/loss' reached 5.60286 (best 5.60286), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=22-val_loss=0.0000.ckpt' as top 3


Epoch 23: 100%|██████████| 106/106 [00:19<00:00,  5.45it/s, v_num=19, train/loss_step=5.460, train/perplexity_step=235.0, val/loss=5.570, val/perplexity=321.0, train/loss_epoch=5.370, train/perplexity_epoch=215.0]

Metric val/loss improved by 0.034 >= min_delta = 0.001. New best score: 5.569
Epoch 23, global step 2544: 'val/loss' reached 5.56868 (best 5.56868), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=23-val_loss=0.0000.ckpt' as top 3


Epoch 24: 100%|██████████| 106/106 [00:19<00:00,  5.46it/s, v_num=19, train/loss_step=5.310, train/perplexity_step=202.0, val/loss=5.520, val/perplexity=314.0, train/loss_epoch=5.310, train/perplexity_epoch=202.0]

Metric val/loss improved by 0.045 >= min_delta = 0.001. New best score: 5.524
Epoch 24, global step 2650: 'val/loss' reached 5.52417 (best 5.52417), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=24-val_loss=0.0000.ckpt' as top 3


Epoch 25: 100%|██████████| 106/106 [00:19<00:00,  5.43it/s, v_num=19, train/loss_step=5.100, train/perplexity_step=164.0, val/loss=5.480, val/perplexity=308.0, train/loss_epoch=5.220, train/perplexity_epoch=186.0]

Metric val/loss improved by 0.048 >= min_delta = 0.001. New best score: 5.476
Epoch 25, global step 2756: 'val/loss' reached 5.47580 (best 5.47580), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=25-val_loss=0.0000-v1.ckpt' as top 3


Epoch 26: 100%|██████████| 106/106 [00:20<00:00,  5.26it/s, v_num=19, train/loss_step=5.280, train/perplexity_step=196.0, val/loss=5.420, val/perplexity=301.0, train/loss_epoch=5.130, train/perplexity_epoch=169.0]

Metric val/loss improved by 0.056 >= min_delta = 0.001. New best score: 5.420
Epoch 26, global step 2862: 'val/loss' reached 5.41998 (best 5.41998), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=26-val_loss=0.0000.ckpt' as top 3


Epoch 27: 100%|██████████| 106/106 [00:20<00:00,  5.14it/s, v_num=19, train/loss_step=5.000, train/perplexity_step=149.0, val/loss=5.360, val/perplexity=295.0, train/loss_epoch=5.030, train/perplexity_epoch=152.0]

Metric val/loss improved by 0.057 >= min_delta = 0.001. New best score: 5.363
Epoch 27, global step 2968: 'val/loss' reached 5.36258 (best 5.36258), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=27-val_loss=0.0000-v1.ckpt' as top 3


Epoch 28: 100%|██████████| 106/106 [00:19<00:00,  5.37it/s, v_num=19, train/loss_step=5.050, train/perplexity_step=155.0, val/loss=5.310, val/perplexity=291.0, train/loss_epoch=4.920, train/perplexity_epoch=137.0]

Metric val/loss improved by 0.054 >= min_delta = 0.001. New best score: 5.308
Epoch 28, global step 3074: 'val/loss' reached 5.30839 (best 5.30839), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=28-val_loss=0.0000.ckpt' as top 3


Epoch 29: 100%|██████████| 106/106 [00:19<00:00,  5.36it/s, v_num=19, train/loss_step=4.720, train/perplexity_step=112.0, val/loss=5.260, val/perplexity=288.0, train/loss_epoch=4.810, train/perplexity_epoch=124.0]

Metric val/loss improved by 0.046 >= min_delta = 0.001. New best score: 5.263
Epoch 29, global step 3180: 'val/loss' reached 5.26256 (best 5.26256), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=29-val_loss=0.0000-v1.ckpt' as top 3


Epoch 30: 100%|██████████| 106/106 [00:19<00:00,  5.37it/s, v_num=19, train/loss_step=4.700, train/perplexity_step=110.0, val/loss=5.220, val/perplexity=287.0, train/loss_epoch=4.720, train/perplexity_epoch=112.0]

Metric val/loss improved by 0.040 >= min_delta = 0.001. New best score: 5.223
Epoch 30, global step 3286: 'val/loss' reached 5.22269 (best 5.22269), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=30-val_loss=0.0000-v2.ckpt' as top 3


Epoch 31: 100%|██████████| 106/106 [00:19<00:00,  5.34it/s, v_num=19, train/loss_step=4.470, train/perplexity_step=87.30, val/loss=5.190, val/perplexity=286.0, train/loss_epoch=4.640, train/perplexity_epoch=104.0]

Metric val/loss improved by 0.031 >= min_delta = 0.001. New best score: 5.191
Epoch 31, global step 3392: 'val/loss' reached 5.19121 (best 5.19121), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=31-val_loss=0.0000-v2.ckpt' as top 3


Epoch 32: 100%|██████████| 106/106 [00:20<00:00,  5.28it/s, v_num=19, train/loss_step=4.540, train/perplexity_step=93.70, val/loss=5.160, val/perplexity=285.0, train/loss_epoch=4.570, train/perplexity_epoch=97.00]

Metric val/loss improved by 0.028 >= min_delta = 0.001. New best score: 5.163
Epoch 32, global step 3498: 'val/loss' reached 5.16298 (best 5.16298), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=32-val_loss=0.0000-v1.ckpt' as top 3


Epoch 33: 100%|██████████| 106/106 [00:20<00:00,  5.23it/s, v_num=19, train/loss_step=4.660, train/perplexity_step=105.0, val/loss=5.150, val/perplexity=284.0, train/loss_epoch=4.530, train/perplexity_epoch=92.40]

Metric val/loss improved by 0.016 >= min_delta = 0.001. New best score: 5.147
Epoch 33, global step 3604: 'val/loss' reached 5.14657 (best 5.14657), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=33-val_loss=0.0000-v1.ckpt' as top 3


Epoch 34: 100%|██████████| 106/106 [00:19<00:00,  5.34it/s, v_num=19, train/loss_step=4.340, train/perplexity_step=76.90, val/loss=5.140, val/perplexity=285.0, train/loss_epoch=4.490, train/perplexity_epoch=89.40]

Metric val/loss improved by 0.010 >= min_delta = 0.001. New best score: 5.136
Epoch 34, global step 3710: 'val/loss' reached 5.13649 (best 5.13649), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=34-val_loss=0.0000.ckpt' as top 3


Epoch 35: 100%|██████████| 106/106 [00:19<00:00,  5.43it/s, v_num=19, train/loss_step=4.490, train/perplexity_step=88.80, val/loss=5.130, val/perplexity=285.0, train/loss_epoch=4.470, train/perplexity_epoch=87.80]

Metric val/loss improved by 0.005 >= min_delta = 0.001. New best score: 5.132
Epoch 35, global step 3816: 'val/loss' reached 5.13156 (best 5.13156), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=35-val_loss=0.0000.ckpt' as top 3


Epoch 36: 100%|██████████| 106/106 [00:19<00:00,  5.44it/s, v_num=19, train/loss_step=4.660, train/perplexity_step=105.0, val/loss=5.130, val/perplexity=285.0, train/loss_epoch=4.460, train/perplexity_epoch=87.00]

Metric val/loss improved by 0.001 >= min_delta = 0.001. New best score: 5.130
Epoch 36, global step 3922: 'val/loss' reached 5.13026 (best 5.13026), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=36-val_loss=0.0000.ckpt' as top 3


Epoch 37: 100%|██████████| 106/106 [00:20<00:00,  5.20it/s, v_num=19, train/loss_step=4.630, train/perplexity_step=103.0, val/loss=5.130, val/perplexity=285.0, train/loss_epoch=4.460, train/perplexity_epoch=86.80]

Epoch 37, global step 4028: 'val/loss' reached 5.13016 (best 5.13016), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=37-val_loss=0.0000.ckpt' as top 3


Epoch 38: 100%|██████████| 106/106 [00:20<00:00,  5.16it/s, v_num=19, train/loss_step=4.450, train/perplexity_step=85.30, val/loss=5.130, val/perplexity=285.0, train/loss_epoch=4.460, train/perplexity_epoch=86.80]

Epoch 38, global step 4134: 'val/loss' reached 5.12937 (best 5.12937), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=38-val_loss=0.0000.ckpt' as top 3


Epoch 39: 100%|██████████| 106/106 [00:19<00:00,  5.31it/s, v_num=19, train/loss_step=4.560, train/perplexity_step=95.30, val/loss=5.130, val/perplexity=286.0, train/loss_epoch=4.460, train/perplexity_epoch=86.50]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 5.128
Epoch 39, global step 4240: 'val/loss' reached 5.12805 (best 5.12805), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=39-val_loss=0.0000.ckpt' as top 3


Epoch 40: 100%|██████████| 106/106 [00:20<00:00,  5.30it/s, v_num=19, train/loss_step=4.460, train/perplexity_step=86.80, val/loss=5.120, val/perplexity=286.0, train/loss_epoch=4.450, train/perplexity_epoch=85.50]

Metric val/loss improved by 0.007 >= min_delta = 0.001. New best score: 5.121
Epoch 40, global step 4346: 'val/loss' reached 5.12105 (best 5.12105), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=40-val_loss=0.0000.ckpt' as top 3


Epoch 41: 100%|██████████| 106/106 [00:20<00:00,  5.26it/s, v_num=19, train/loss_step=4.390, train/perplexity_step=80.30, val/loss=5.110, val/perplexity=288.0, train/loss_epoch=4.420, train/perplexity_epoch=83.50]

Metric val/loss improved by 0.008 >= min_delta = 0.001. New best score: 5.113
Epoch 41, global step 4452: 'val/loss' reached 5.11328 (best 5.11328), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=41-val_loss=0.0000.ckpt' as top 3


Epoch 42: 100%|██████████| 106/106 [00:21<00:00,  5.01it/s, v_num=19, train/loss_step=4.640, train/perplexity_step=103.0, val/loss=5.100, val/perplexity=289.0, train/loss_epoch=4.390, train/perplexity_epoch=80.60]

Metric val/loss improved by 0.017 >= min_delta = 0.001. New best score: 5.096
Epoch 42, global step 4558: 'val/loss' reached 5.09597 (best 5.09597), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=42-val_loss=0.0000.ckpt' as top 3


Epoch 43: 100%|██████████| 106/106 [00:19<00:00,  5.30it/s, v_num=19, train/loss_step=4.290, train/perplexity_step=72.80, val/loss=5.080, val/perplexity=291.0, train/loss_epoch=4.340, train/perplexity_epoch=76.80]

Metric val/loss improved by 0.017 >= min_delta = 0.001. New best score: 5.078
Epoch 43, global step 4664: 'val/loss' reached 5.07849 (best 5.07849), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=43-val_loss=0.0000.ckpt' as top 3


Epoch 44: 100%|██████████| 106/106 [00:21<00:00,  4.84it/s, v_num=19, train/loss_step=4.230, train/perplexity_step=68.60, val/loss=5.060, val/perplexity=296.0, train/loss_epoch=4.280, train/perplexity_epoch=72.30]

Metric val/loss improved by 0.017 >= min_delta = 0.001. New best score: 5.061
Epoch 44, global step 4770: 'val/loss' reached 5.06135 (best 5.06135), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=44-val_loss=0.0000.ckpt' as top 3


Epoch 45: 100%|██████████| 106/106 [00:20<00:00,  5.06it/s, v_num=19, train/loss_step=4.180, train/perplexity_step=65.20, val/loss=5.040, val/perplexity=300.0, train/loss_epoch=4.210, train/perplexity_epoch=67.50]

Metric val/loss improved by 0.022 >= min_delta = 0.001. New best score: 5.039
Epoch 45, global step 4876: 'val/loss' reached 5.03922 (best 5.03922), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=45-val_loss=0.0000.ckpt' as top 3


Epoch 46: 100%|██████████| 106/106 [00:20<00:00,  5.16it/s, v_num=19, train/loss_step=3.880, train/perplexity_step=48.60, val/loss=5.020, val/perplexity=306.0, train/loss_epoch=4.140, train/perplexity_epoch=62.70]

Metric val/loss improved by 0.017 >= min_delta = 0.001. New best score: 5.022
Epoch 46, global step 4982: 'val/loss' reached 5.02177 (best 5.02177), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=46-val_loss=0.0000.ckpt' as top 3


Epoch 47: 100%|██████████| 106/106 [00:20<00:00,  5.25it/s, v_num=19, train/loss_step=3.940, train/perplexity_step=51.60, val/loss=5.010, val/perplexity=313.0, train/loss_epoch=4.060, train/perplexity_epoch=58.20]

Metric val/loss improved by 0.016 >= min_delta = 0.001. New best score: 5.006
Epoch 47, global step 5088: 'val/loss' reached 5.00618 (best 5.00618), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=47-val_loss=0.0000.ckpt' as top 3


Epoch 48: 100%|██████████| 106/106 [00:20<00:00,  5.27it/s, v_num=19, train/loss_step=4.170, train/perplexity_step=64.60, val/loss=4.990, val/perplexity=319.0, train/loss_epoch=3.990, train/perplexity_epoch=54.20]

Metric val/loss improved by 0.013 >= min_delta = 0.001. New best score: 4.993
Epoch 48, global step 5194: 'val/loss' reached 4.99296 (best 4.99296), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=48-val_loss=0.0000.ckpt' as top 3


Epoch 49: 100%|██████████| 106/106 [00:20<00:00,  5.11it/s, v_num=19, train/loss_step=3.710, train/perplexity_step=40.90, val/loss=4.980, val/perplexity=326.0, train/loss_epoch=3.920, train/perplexity_epoch=50.70]

Metric val/loss improved by 0.010 >= min_delta = 0.001. New best score: 4.983
Epoch 49, global step 5300: 'val/loss' reached 4.98280 (best 4.98280), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=49-val_loss=0.0000.ckpt' as top 3


Epoch 50: 100%|██████████| 106/106 [00:20<00:00,  5.26it/s, v_num=19, train/loss_step=3.860, train/perplexity_step=47.40, val/loss=4.980, val/perplexity=333.0, train/loss_epoch=3.870, train/perplexity_epoch=48.10]

Metric val/loss improved by 0.006 >= min_delta = 0.001. New best score: 4.977
Epoch 50, global step 5406: 'val/loss' reached 4.97715 (best 4.97715), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=50-val_loss=0.0000.ckpt' as top 3


Epoch 51: 100%|██████████| 106/106 [00:20<00:00,  5.18it/s, v_num=19, train/loss_step=4.100, train/perplexity_step=60.20, val/loss=4.970, val/perplexity=338.0, train/loss_epoch=3.830, train/perplexity_epoch=46.00]

Metric val/loss improved by 0.005 >= min_delta = 0.001. New best score: 4.972
Epoch 51, global step 5512: 'val/loss' reached 4.97250 (best 4.97250), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=51-val_loss=0.0000.ckpt' as top 3


Epoch 52: 100%|██████████| 106/106 [00:20<00:00,  5.22it/s, v_num=19, train/loss_step=3.760, train/perplexity_step=42.80, val/loss=4.970, val/perplexity=342.0, train/loss_epoch=3.800, train/perplexity_epoch=44.60]

Metric val/loss improved by 0.003 >= min_delta = 0.001. New best score: 4.970
Epoch 52, global step 5618: 'val/loss' reached 4.96961 (best 4.96961), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=52-val_loss=0.0000.ckpt' as top 3


Epoch 53: 100%|██████████| 106/106 [00:20<00:00,  5.12it/s, v_num=19, train/loss_step=3.960, train/perplexity_step=52.20, val/loss=4.970, val/perplexity=344.0, train/loss_epoch=3.780, train/perplexity_epoch=43.70]

Metric val/loss improved by 0.001 >= min_delta = 0.001. New best score: 4.968
Epoch 53, global step 5724: 'val/loss' reached 4.96835 (best 4.96835), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=53-val_loss=0.0000.ckpt' as top 3


Epoch 54: 100%|██████████| 106/106 [00:21<00:00,  4.85it/s, v_num=19, train/loss_step=4.030, train/perplexity_step=56.20, val/loss=4.970, val/perplexity=346.0, train/loss_epoch=3.760, train/perplexity_epoch=43.20]

Epoch 54, global step 5830: 'val/loss' reached 4.96801 (best 4.96801), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=54-val_loss=0.0000.ckpt' as top 3


Epoch 55: 100%|██████████| 106/106 [00:20<00:00,  5.22it/s, v_num=19, train/loss_step=3.780, train/perplexity_step=43.80, val/loss=4.970, val/perplexity=346.0, train/loss_epoch=3.760, train/perplexity_epoch=43.00]

Epoch 55, global step 5936: 'val/loss' reached 4.96783 (best 4.96783), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=55-val_loss=0.0000.ckpt' as top 3


Epoch 56: 100%|██████████| 106/106 [00:19<00:00,  5.31it/s, v_num=19, train/loss_step=3.640, train/perplexity_step=37.90, val/loss=4.970, val/perplexity=346.0, train/loss_epoch=3.760, train/perplexity_epoch=42.90]

Epoch 56, global step 6042: 'val/loss' reached 4.96778 (best 4.96778), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=56-val_loss=0.0000.ckpt' as top 3


Epoch 57: 100%|██████████| 106/106 [00:20<00:00,  5.23it/s, v_num=19, train/loss_step=3.650, train/perplexity_step=38.30, val/loss=4.970, val/perplexity=347.0, train/loss_epoch=3.760, train/perplexity_epoch=42.80]

Epoch 57, global step 6148: 'val/loss' reached 4.96754 (best 4.96754), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=57-val_loss=0.0000.ckpt' as top 3


Epoch 58: 100%|██████████| 106/106 [00:19<00:00,  5.33it/s, v_num=19, train/loss_step=3.920, train/perplexity_step=50.30, val/loss=4.970, val/perplexity=347.0, train/loss_epoch=3.750, train/perplexity_epoch=42.70]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 4.967
Epoch 58, global step 6254: 'val/loss' reached 4.96684 (best 4.96684), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=58-val_loss=0.0000.ckpt' as top 3


Epoch 59: 100%|██████████| 106/106 [00:20<00:00,  5.16it/s, v_num=19, train/loss_step=3.900, train/perplexity_step=49.40, val/loss=4.970, val/perplexity=350.0, train/loss_epoch=3.740, train/perplexity_epoch=42.30]

Epoch 59, global step 6360: 'val/loss' reached 4.96747 (best 4.96684), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=59-val_loss=0.0000.ckpt' as top 3


Epoch 60: 100%|██████████| 106/106 [00:20<00:00,  5.26it/s, v_num=19, train/loss_step=3.940, train/perplexity_step=51.60, val/loss=4.970, val/perplexity=355.0, train/loss_epoch=3.730, train/perplexity_epoch=41.70]

Epoch 60, global step 6466: 'val/loss' was not in top 3


Epoch 61: 100%|██████████| 106/106 [00:19<00:00,  5.33it/s, v_num=19, train/loss_step=3.700, train/perplexity_step=40.50, val/loss=4.970, val/perplexity=361.0, train/loss_epoch=3.700, train/perplexity_epoch=40.70]

Epoch 61, global step 6572: 'val/loss' reached 4.96672 (best 4.96672), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=61-val_loss=0.0000.ckpt' as top 3


Epoch 62: 100%|██████████| 106/106 [00:19<00:00,  5.50it/s, v_num=19, train/loss_step=3.550, train/perplexity_step=34.70, val/loss=4.970, val/perplexity=370.0, train/loss_epoch=3.670, train/perplexity_epoch=39.30]

Metric val/loss improved by 0.001 >= min_delta = 0.001. New best score: 4.966
Epoch 62, global step 6678: 'val/loss' reached 4.96580 (best 4.96580), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=62-val_loss=0.0000.ckpt' as top 3


Epoch 63: 100%|██████████| 106/106 [00:19<00:00,  5.47it/s, v_num=19, train/loss_step=3.560, train/perplexity_step=35.30, val/loss=4.970, val/perplexity=381.0, train/loss_epoch=3.630, train/perplexity_epoch=37.60]

Epoch 63, global step 6784: 'val/loss' was not in top 3


Epoch 64:  59%|█████▉    | 63/106 [00:11<00:07,  5.44it/s, v_num=19, train/loss_step=3.710, train/perplexity_step=40.70, val/loss=4.970, val/perplexity=381.0, train/loss_epoch=3.630, train/perplexity_epoch=37.60] 

In [None]:
# Test model
print("\nTesting model...")
trainer.test(model, data_module)

# Save final model
final_model_path = os.path.join(save_dir, f"{experiment_name}-final.ckpt")
trainer.save_checkpoint(final_model_path)
print(f"Final model saved to {final_model_path}")


Testing model...
Testing DataLoader 0: 100%|██████████| 15/15 [00:00<00:00, 53.33it/s]


Final model saved to ./checkpoints\transformer_lm-final.ckpt


In [None]:
print("\nTraining completed!")
print(f"Checkpoints saved in: {save_dir}")
print(f"Logs saved in: {log_dir}")
print(f"Vocabulary saved in: {vocab_path}")
print(f"Best model: {checkpoint_callback.best_model_path}")
print(f"Best score: {checkpoint_callback.best_model_score}")

# Print instructions for running the app
print("\n" + "="*50)
print("To run the Gradio app with your trained model:")
print(f"python -m src.app.gradio_app --model_path {checkpoint_callback.best_model_path} --vocab_path {vocab_path}")
print("="*50)


Training completed!
Checkpoints saved in: ./checkpoints
Logs saved in: ./logs
Vocabulary saved in: ./checkpoints\vocab.pkl
Best model: C:\code\data_science\demo_llm\checkpoints\transformer_lm-epoch=67-val_loss=0.0000.ckpt
Best score: 5.026226043701172

To run the Gradio app with your trained model:
python -m src.app.gradio_app --model_path C:\code\data_science\demo_llm\checkpoints\transformer_lm-epoch=67-val_loss=0.0000.ckpt --vocab_path ./checkpoints\vocab.pkl
