# IMPORTS

#### Set Root Dir

In [1]:
import sys
import os

# Use the current working directory as root (or go up if needed)
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '../..'))  # Adjust '..' as needed
sys.path.append(ROOT_DIR)

#### import modules

In [2]:
"""
Training script for the transformer language model.
"""
import argparse
import torch
import pytorch_lightning as pl
import pickle
import yaml
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import Trainer

from src.data.tokenizer import SimpleTokenizer, load_text_corpus, create_sample_corpus
from src.data.dataset import TextDataModule
from src.model.lightning_module import TransformerLightningModule

  from .autonotebook import tqdm as notebook_tqdm


# Params

In [3]:
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

In [4]:
hparams = config["hparams"]

vocab_size, d_model, num_heads, num_layers, d_ff, \
sequence_length, batch_size, learning_rate, max_epochs, \
patience, min_delta, warmup_steps, weight_decay, dropout, \
train_split, val_split, num_workers, accelerator, devices, precision, \
gradient_clip_val, accumulate_grad_batches, log_every_n_steps, \
val_check_interval, save_top_k, monitor, mode = hparams.values()

In [5]:
hparams

{'vocab_size': 3000,
 'd_model': 32,
 'num_heads': 1,
 'num_layers': 2,
 'd_ff': 1024,
 'sequence_length': 32,
 'batch_size': 64,
 'learning_rate': 0.0001,
 'max_epochs': 100,
 'patience': 10,
 'min_delta': 0.001,
 'warmup_steps': 1000,
 'weight_decay': 0.01,
 'dropout': 0.1,
 'train_split': 0.7,
 'val_split': 0.2,
 'num_workers': 0,
 'accelerator': 'auto',
 'devices': 1,
 'precision': '32',
 'gradient_clip_val': 1.0,
 'accumulate_grad_batches': 1,
 'log_every_n_steps': 50,
 'val_check_interval': 1.0,
 'save_top_k': 1,
 'monitor': 'val/loss',
 'mode': 'min'}

In [6]:
paths = config["paths"]
general = config["general"]

corpus_path=paths["corpus_path"]
save_dir=paths["save_dir"]
log_dir=paths["log_dir"]
experiment_name=general["experiment_name"]
create_sample=general["create_sample"]

In [7]:
# Load and tokenize text
print("Loading and tokenizing text...")
text = load_text_corpus(corpus_path)

Loading and tokenizing text...


In [8]:
# Create tokenizer and build vocabulary
tokenizer = SimpleTokenizer(vocab_size=hparams["vocab_size"])
tokenizer.build_vocab(text)

Vocabulary built with 2059 tokens


In [9]:
# Save vocabulary
vocab_path = os.path.join(save_dir, "vocab.pkl")
os.makedirs(save_dir, exist_ok=True)
tokenizer.save_vocab(vocab_path)

Vocabulary saved to ./checkpoints\vocab.pkl


In [10]:
# read vocab
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)

len(vocab['word_to_idx'])


2059

In [11]:
# Encode text
token_ids = tokenizer.encode(text)
print(f"Encoded text length: {len(token_ids)} tokens")

Encoded text length: 12344 tokens


In [12]:
# Create data module
data_module = TextDataModule(
    token_ids=token_ids,
    sequence_length=sequence_length,
    batch_size=batch_size,
    train_split=train_split,
    val_split=val_split,
    num_workers=num_workers
)

In [13]:
# Create model
model = TransformerLightningModule(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    d_ff=d_ff,
    dropout=dropout,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps
)

In [14]:
# Create callbacks
callbacks = []

# Early stopping
early_stopping = EarlyStopping(
    monitor=monitor,
    patience=patience,
    min_delta=min_delta,
    mode=mode,
    verbose=True
)
callbacks.append(early_stopping)


# Model checkpointing
checkpoint_callback = ModelCheckpoint(
    dirpath=save_dir,
    filename=f"{experiment_name}-epoch={{epoch:02d}}-val_loss={{val/loss:.3f}}", #-v{trainer.logger.version:02d}",
    monitor=monitor,
    mode=mode,
    auto_insert_metric_name=False, # Prevents the name 'val/loss=' from being prepended
    save_top_k=save_top_k,
    save_last=True,
    verbose=True
)
callbacks.append(checkpoint_callback)

# Learning rate monitoring
lr_monitor = LearningRateMonitor(logging_interval='step')
callbacks.append(lr_monitor)

In [15]:
# Create logger
logger = TensorBoardLogger(
    save_dir=log_dir,
    name=experiment_name,
    version=None
)

# Create trainer
trainer = Trainer(
    accelerator=accelerator,
    devices=devices,
    precision=precision,
    max_epochs=max_epochs,
    gradient_clip_val=gradient_clip_val,
    accumulate_grad_batches=accumulate_grad_batches,
    log_every_n_steps=log_every_n_steps,
    val_check_interval=val_check_interval,
    callbacks=callbacks,
    logger=logger,
    deterministic=True,
    enable_progress_bar=True,
    enable_model_summary=True
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


In [16]:
# Print model summary
print("\nModel Summary:")
print(f"Vocabulary size: {tokenizer.get_vocab_size()}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")


Model Summary:
Vocabulary size: 2059
Model parameters: 275,723
Trainable parameters: 275,723


In [17]:
# Check data sizes
print("Data sizes:")
print(f"Total tokens: {len(token_ids)}")
print(f"Sequence length: {sequence_length}")
print(f"Train split: {train_split}")
print(f"Val split: {val_split}")

# Calculate split sizes
total_len = len(token_ids)
train_end = int(total_len * train_split)
val_end = int(total_len * (train_split + val_split))

print(f"Train tokens: {train_end}")
print(f"Val tokens: {val_end - train_end}")
print(f"Test tokens: {total_len - val_end}")

# Check if validation data is sufficient
val_tokens = val_end - train_end
if val_tokens < sequence_length:
    print(f"WARNING: Validation data has only {val_tokens} tokens, less than sequence length {sequence_length}")
    print("This will cause validation to fail. Consider using a larger corpus or adjusting splits.")


Data sizes:
Total tokens: 12344
Sequence length: 32
Train split: 0.7
Val split: 0.2
Train tokens: 8640
Val tokens: 2469
Test tokens: 1235


In [18]:
# Train model
print("\nStarting training...")
print("With the updated parameters:")
print(f"- Sequence length: {sequence_length}")
print(f"- Train split: {train_split}")
print(f"- Val split: {val_split}")
print(f"- Monitor: {monitor}")
print()

trainer.fit(model, data_module)


Starting training...
With the updated parameters:
- Sequence length: 32
- Train split: 0.7
- Val split: 0.2
- Monitor: val/loss



c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:751: Checkpoint directory C:\code\data_science\demo_llm\checkpoints exists and is not empty.

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | model     | TransformerLM    | 275 K  | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
275 K     Trainable params
0         Non-trainable params
275 K     Total params
1.103     Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode


                                                                           

c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 135/135 [00:13<00:00, 10.30it/s, v_num=28, train/loss_step=7.720, train/perplexity_step=2.26e+3, val/loss=7.710, val/perplexity=2.24e+3, train/loss_epoch=7.770, train/perplexity_epoch=2.38e+3]

Metric val/loss improved. New best score: 7.711
Epoch 0, global step 135: 'val/loss' reached 7.71142 (best 7.71142), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=00-val_loss=7.711.ckpt' as top 1


Epoch 1: 100%|██████████| 135/135 [00:12<00:00, 10.43it/s, v_num=28, train/loss_step=7.510, train/perplexity_step=1.83e+3, val/loss=7.510, val/perplexity=1.84e+3, train/loss_epoch=7.620, train/perplexity_epoch=2.04e+3]

Metric val/loss improved by 0.202 >= min_delta = 0.001. New best score: 7.510
Epoch 1, global step 270: 'val/loss' reached 7.50958 (best 7.50958), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=01-val_loss=7.510.ckpt' as top 1


Epoch 2: 100%|██████████| 135/135 [00:12<00:00, 11.01it/s, v_num=28, train/loss_step=7.330, train/perplexity_step=1.52e+3, val/loss=7.310, val/perplexity=1.51e+3, train/loss_epoch=7.410, train/perplexity_epoch=1.66e+3]

Metric val/loss improved by 0.201 >= min_delta = 0.001. New best score: 7.309
Epoch 2, global step 405: 'val/loss' reached 7.30889 (best 7.30889), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=02-val_loss=7.309.ckpt' as top 1


Epoch 3: 100%|██████████| 135/135 [00:12<00:00, 10.59it/s, v_num=28, train/loss_step=7.070, train/perplexity_step=1.18e+3, val/loss=7.090, val/perplexity=1.22e+3, train/loss_epoch=7.200, train/perplexity_epoch=1.34e+3]

Metric val/loss improved by 0.216 >= min_delta = 0.001. New best score: 7.093
Epoch 3, global step 540: 'val/loss' reached 7.09303 (best 7.09303), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=03-val_loss=7.093.ckpt' as top 1


Epoch 4: 100%|██████████| 135/135 [00:12<00:00, 10.90it/s, v_num=28, train/loss_step=6.830, train/perplexity_step=926.0, val/loss=6.850, val/perplexity=967.0, train/loss_epoch=6.950, train/perplexity_epoch=1.05e+3]    

Metric val/loss improved by 0.239 >= min_delta = 0.001. New best score: 6.854
Epoch 4, global step 675: 'val/loss' reached 6.85384 (best 6.85384), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=04-val_loss=6.854.ckpt' as top 1


Epoch 5: 100%|██████████| 135/135 [00:12<00:00, 10.96it/s, v_num=28, train/loss_step=6.560, train/perplexity_step=708.0, val/loss=6.620, val/perplexity=773.0, train/loss_epoch=6.690, train/perplexity_epoch=804.0]  

Metric val/loss improved by 0.235 >= min_delta = 0.001. New best score: 6.619
Epoch 5, global step 810: 'val/loss' reached 6.61902 (best 6.61902), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=05-val_loss=6.619.ckpt' as top 1


Epoch 6: 100%|██████████| 135/135 [00:12<00:00, 10.95it/s, v_num=28, train/loss_step=6.290, train/perplexity_step=539.0, val/loss=6.430, val/perplexity=649.0, train/loss_epoch=6.430, train/perplexity_epoch=623.0]

Metric val/loss improved by 0.191 >= min_delta = 0.001. New best score: 6.428
Epoch 6, global step 945: 'val/loss' reached 6.42835 (best 6.42835), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=06-val_loss=6.428.ckpt' as top 1


Epoch 7: 100%|██████████| 135/135 [00:12<00:00, 10.74it/s, v_num=28, train/loss_step=6.180, train/perplexity_step=482.0, val/loss=6.290, val/perplexity=577.0, train/loss_epoch=6.220, train/perplexity_epoch=504.0]

Metric val/loss improved by 0.136 >= min_delta = 0.001. New best score: 6.293
Epoch 7, global step 1080: 'val/loss' reached 6.29259 (best 6.29259), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=07-val_loss=6.293.ckpt' as top 1


Epoch 8: 100%|██████████| 135/135 [00:12<00:00, 10.77it/s, v_num=28, train/loss_step=5.960, train/perplexity_step=387.0, val/loss=6.200, val/perplexity=536.0, train/loss_epoch=6.060, train/perplexity_epoch=429.0]

Metric val/loss improved by 0.092 >= min_delta = 0.001. New best score: 6.200
Epoch 8, global step 1215: 'val/loss' reached 6.20028 (best 6.20028), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=08-val_loss=6.200.ckpt' as top 1


Epoch 9: 100%|██████████| 135/135 [00:12<00:00, 10.88it/s, v_num=28, train/loss_step=6.020, train/perplexity_step=412.0, val/loss=6.140, val/perplexity=510.0, train/loss_epoch=5.940, train/perplexity_epoch=379.0]

Metric val/loss improved by 0.065 >= min_delta = 0.001. New best score: 6.135
Epoch 9, global step 1350: 'val/loss' reached 6.13544 (best 6.13544), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=09-val_loss=6.135.ckpt' as top 1


Epoch 10: 100%|██████████| 135/135 [00:12<00:00, 10.42it/s, v_num=28, train/loss_step=5.780, train/perplexity_step=322.0, val/loss=6.090, val/perplexity=492.0, train/loss_epoch=5.840, train/perplexity_epoch=344.0]

Metric val/loss improved by 0.047 >= min_delta = 0.001. New best score: 6.088
Epoch 10, global step 1485: 'val/loss' reached 6.08841 (best 6.08841), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=10-val_loss=6.088.ckpt' as top 1


Epoch 11: 100%|██████████| 135/135 [00:12<00:00, 10.46it/s, v_num=28, train/loss_step=5.780, train/perplexity_step=323.0, val/loss=6.060, val/perplexity=481.0, train/loss_epoch=5.770, train/perplexity_epoch=321.0]

Metric val/loss improved by 0.029 >= min_delta = 0.001. New best score: 6.060
Epoch 11, global step 1620: 'val/loss' reached 6.05958 (best 6.05958), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=11-val_loss=6.060.ckpt' as top 1


Epoch 12: 100%|██████████| 135/135 [00:14<00:00,  9.40it/s, v_num=28, train/loss_step=5.650, train/perplexity_step=284.0, val/loss=6.040, val/perplexity=477.0, train/loss_epoch=5.730, train/perplexity_epoch=308.0]

Metric val/loss improved by 0.015 >= min_delta = 0.001. New best score: 6.044
Epoch 12, global step 1755: 'val/loss' reached 6.04428 (best 6.04428), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=12-val_loss=6.044.ckpt' as top 1


Epoch 13: 100%|██████████| 135/135 [00:13<00:00, 10.28it/s, v_num=28, train/loss_step=5.740, train/perplexity_step=311.0, val/loss=6.040, val/perplexity=475.0, train/loss_epoch=5.710, train/perplexity_epoch=302.0]

Metric val/loss improved by 0.005 >= min_delta = 0.001. New best score: 6.039
Epoch 13, global step 1890: 'val/loss' reached 6.03927 (best 6.03927), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=13-val_loss=6.039.ckpt' as top 1


Epoch 14: 100%|██████████| 135/135 [00:13<00:00,  9.93it/s, v_num=28, train/loss_step=5.570, train/perplexity_step=263.0, val/loss=6.040, val/perplexity=475.0, train/loss_epoch=5.710, train/perplexity_epoch=301.0]

Epoch 14, global step 2025: 'val/loss' reached 6.03874 (best 6.03874), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=14-val_loss=6.039.ckpt' as top 1


Epoch 15: 100%|██████████| 135/135 [00:13<00:00,  9.99it/s, v_num=28, train/loss_step=5.600, train/perplexity_step=271.0, val/loss=6.040, val/perplexity=474.0, train/loss_epoch=5.710, train/perplexity_epoch=301.0]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 6.037
Epoch 15, global step 2160: 'val/loss' reached 6.03711 (best 6.03711), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=15-val_loss=6.037.ckpt' as top 1


Epoch 16: 100%|██████████| 135/135 [00:12<00:00, 10.51it/s, v_num=28, train/loss_step=5.590, train/perplexity_step=268.0, val/loss=6.030, val/perplexity=472.0, train/loss_epoch=5.700, train/perplexity_epoch=299.0]

Metric val/loss improved by 0.007 >= min_delta = 0.001. New best score: 6.030
Epoch 16, global step 2295: 'val/loss' reached 6.03022 (best 6.03022), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=16-val_loss=6.030.ckpt' as top 1


Epoch 17: 100%|██████████| 135/135 [00:12<00:00, 10.54it/s, v_num=28, train/loss_step=5.700, train/perplexity_step=299.0, val/loss=6.010, val/perplexity=467.0, train/loss_epoch=5.670, train/perplexity_epoch=292.0]

Metric val/loss improved by 0.017 >= min_delta = 0.001. New best score: 6.013
Epoch 17, global step 2430: 'val/loss' reached 6.01329 (best 6.01329), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=17-val_loss=6.013.ckpt' as top 1


Epoch 18: 100%|██████████| 135/135 [00:12<00:00, 10.47it/s, v_num=28, train/loss_step=5.630, train/perplexity_step=279.0, val/loss=5.990, val/perplexity=461.0, train/loss_epoch=5.630, train/perplexity_epoch=279.0]

Metric val/loss improved by 0.026 >= min_delta = 0.001. New best score: 5.988
Epoch 18, global step 2565: 'val/loss' reached 5.98775 (best 5.98775), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=18-val_loss=5.988.ckpt' as top 1


Epoch 19: 100%|██████████| 135/135 [00:12<00:00, 10.68it/s, v_num=28, train/loss_step=5.500, train/perplexity_step=245.0, val/loss=5.950, val/perplexity=453.0, train/loss_epoch=5.560, train/perplexity_epoch=261.0]

Metric val/loss improved by 0.035 >= min_delta = 0.001. New best score: 5.952
Epoch 19, global step 2700: 'val/loss' reached 5.95236 (best 5.95236), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=19-val_loss=5.952.ckpt' as top 1


Epoch 20: 100%|██████████| 135/135 [00:12<00:00, 10.40it/s, v_num=28, train/loss_step=5.470, train/perplexity_step=238.0, val/loss=5.920, val/perplexity=445.0, train/loss_epoch=5.470, train/perplexity_epoch=239.0]

Metric val/loss improved by 0.037 >= min_delta = 0.001. New best score: 5.915
Epoch 20, global step 2835: 'val/loss' reached 5.91535 (best 5.91535), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=20-val_loss=5.915.ckpt' as top 1


Epoch 21: 100%|██████████| 135/135 [00:12<00:00, 10.73it/s, v_num=28, train/loss_step=5.270, train/perplexity_step=195.0, val/loss=5.870, val/perplexity=438.0, train/loss_epoch=5.370, train/perplexity_epoch=216.0]

Metric val/loss improved by 0.043 >= min_delta = 0.001. New best score: 5.873
Epoch 21, global step 2970: 'val/loss' reached 5.87253 (best 5.87253), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=21-val_loss=5.873.ckpt' as top 1


Epoch 22: 100%|██████████| 135/135 [00:13<00:00, 10.31it/s, v_num=28, train/loss_step=5.300, train/perplexity_step=200.0, val/loss=5.840, val/perplexity=433.0, train/loss_epoch=5.270, train/perplexity_epoch=195.0]

Metric val/loss improved by 0.036 >= min_delta = 0.001. New best score: 5.836
Epoch 22, global step 3105: 'val/loss' reached 5.83605 (best 5.83605), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=22-val_loss=5.836.ckpt' as top 1


Epoch 23: 100%|██████████| 135/135 [00:13<00:00, 10.38it/s, v_num=28, train/loss_step=5.140, train/perplexity_step=170.0, val/loss=5.800, val/perplexity=429.0, train/loss_epoch=5.170, train/perplexity_epoch=176.0]

Metric val/loss improved by 0.034 >= min_delta = 0.001. New best score: 5.802
Epoch 23, global step 3240: 'val/loss' reached 5.80157 (best 5.80157), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=23-val_loss=5.802.ckpt' as top 1


Epoch 24: 100%|██████████| 135/135 [00:13<00:00, 10.33it/s, v_num=28, train/loss_step=5.030, train/perplexity_step=153.0, val/loss=5.780, val/perplexity=431.0, train/loss_epoch=5.080, train/perplexity_epoch=161.0]

Metric val/loss improved by 0.019 >= min_delta = 0.001. New best score: 5.783
Epoch 24, global step 3375: 'val/loss' reached 5.78255 (best 5.78255), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=24-val_loss=5.783.ckpt' as top 1


Epoch 25: 100%|██████████| 135/135 [00:13<00:00, 10.34it/s, v_num=28, train/loss_step=4.890, train/perplexity_step=133.0, val/loss=5.760, val/perplexity=429.0, train/loss_epoch=5.010, train/perplexity_epoch=151.0]

Metric val/loss improved by 0.018 >= min_delta = 0.001. New best score: 5.764
Epoch 25, global step 3510: 'val/loss' reached 5.76441 (best 5.76441), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=25-val_loss=5.764.ckpt' as top 1


Epoch 26: 100%|██████████| 135/135 [00:13<00:00,  9.65it/s, v_num=28, train/loss_step=5.000, train/perplexity_step=148.0, val/loss=5.760, val/perplexity=432.0, train/loss_epoch=4.970, train/perplexity_epoch=144.0]

Metric val/loss improved by 0.009 >= min_delta = 0.001. New best score: 5.755
Epoch 26, global step 3645: 'val/loss' reached 5.75550 (best 5.75550), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=26-val_loss=5.755.ckpt' as top 1


Epoch 27: 100%|██████████| 135/135 [00:14<00:00,  9.40it/s, v_num=28, train/loss_step=4.920, train/perplexity_step=137.0, val/loss=5.750, val/perplexity=431.0, train/loss_epoch=4.940, train/perplexity_epoch=140.0]

Metric val/loss improved by 0.004 >= min_delta = 0.001. New best score: 5.752
Epoch 27, global step 3780: 'val/loss' reached 5.75176 (best 5.75176), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=27-val_loss=5.752.ckpt' as top 1


Epoch 28: 100%|██████████| 135/135 [00:14<00:00,  9.55it/s, v_num=28, train/loss_step=4.900, train/perplexity_step=134.0, val/loss=5.750, val/perplexity=431.0, train/loss_epoch=4.930, train/perplexity_epoch=138.0]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 5.750
Epoch 28, global step 3915: 'val/loss' reached 5.74995 (best 5.74995), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=28-val_loss=5.750.ckpt' as top 1


Epoch 29: 100%|██████████| 135/135 [00:13<00:00,  9.65it/s, v_num=28, train/loss_step=5.000, train/perplexity_step=149.0, val/loss=5.750, val/perplexity=431.0, train/loss_epoch=4.920, train/perplexity_epoch=138.0]

Epoch 29, global step 4050: 'val/loss' reached 5.74988 (best 5.74988), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=29-val_loss=5.750.ckpt' as top 1


Epoch 30: 100%|██████████| 135/135 [00:13<00:00,  9.69it/s, v_num=28, train/loss_step=4.900, train/perplexity_step=134.0, val/loss=5.750, val/perplexity=431.0, train/loss_epoch=4.920, train/perplexity_epoch=138.0]

Metric val/loss improved by 0.001 >= min_delta = 0.001. New best score: 5.749
Epoch 30, global step 4185: 'val/loss' reached 5.74878 (best 5.74878), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=30-val_loss=5.749.ckpt' as top 1


Epoch 31: 100%|██████████| 135/135 [00:14<00:00,  9.57it/s, v_num=28, train/loss_step=4.930, train/perplexity_step=139.0, val/loss=5.750, val/perplexity=431.0, train/loss_epoch=4.920, train/perplexity_epoch=137.0]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 5.746
Epoch 31, global step 4320: 'val/loss' reached 5.74629 (best 5.74629), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=31-val_loss=5.746.ckpt' as top 1


Epoch 32: 100%|██████████| 135/135 [00:14<00:00,  9.24it/s, v_num=28, train/loss_step=4.820, train/perplexity_step=124.0, val/loss=5.740, val/perplexity=435.0, train/loss_epoch=4.900, train/perplexity_epoch=134.0]

Metric val/loss improved by 0.003 >= min_delta = 0.001. New best score: 5.743
Epoch 32, global step 4455: 'val/loss' reached 5.74295 (best 5.74295), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=32-val_loss=5.743.ckpt' as top 1


Epoch 33: 100%|██████████| 135/135 [00:14<00:00,  9.43it/s, v_num=28, train/loss_step=4.790, train/perplexity_step=121.0, val/loss=5.730, val/perplexity=439.0, train/loss_epoch=4.860, train/perplexity_epoch=129.0]

Metric val/loss improved by 0.009 >= min_delta = 0.001. New best score: 5.734
Epoch 33, global step 4590: 'val/loss' reached 5.73440 (best 5.73440), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=33-val_loss=5.734.ckpt' as top 1


Epoch 34: 100%|██████████| 135/135 [00:15<00:00,  8.74it/s, v_num=28, train/loss_step=4.800, train/perplexity_step=121.0, val/loss=5.730, val/perplexity=445.0, train/loss_epoch=4.810, train/perplexity_epoch=123.0]

Metric val/loss improved by 0.009 >= min_delta = 0.001. New best score: 5.725
Epoch 34, global step 4725: 'val/loss' reached 5.72540 (best 5.72540), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=34-val_loss=5.725.ckpt' as top 1


Epoch 35: 100%|██████████| 135/135 [00:14<00:00,  9.06it/s, v_num=28, train/loss_step=4.700, train/perplexity_step=110.0, val/loss=5.720, val/perplexity=452.0, train/loss_epoch=4.740, train/perplexity_epoch=115.0]

Metric val/loss improved by 0.010 >= min_delta = 0.001. New best score: 5.716
Epoch 35, global step 4860: 'val/loss' reached 5.71556 (best 5.71556), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=35-val_loss=5.716.ckpt' as top 1


Epoch 36: 100%|██████████| 135/135 [00:14<00:00,  9.63it/s, v_num=28, train/loss_step=4.660, train/perplexity_step=106.0, val/loss=5.710, val/perplexity=466.0, train/loss_epoch=4.670, train/perplexity_epoch=107.0]

Metric val/loss improved by 0.007 >= min_delta = 0.001. New best score: 5.709
Epoch 36, global step 4995: 'val/loss' reached 5.70870 (best 5.70870), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=36-val_loss=5.709.ckpt' as top 1


Epoch 37: 100%|██████████| 135/135 [00:14<00:00,  9.63it/s, v_num=28, train/loss_step=4.740, train/perplexity_step=115.0, val/loss=5.700, val/perplexity=467.0, train/loss_epoch=4.600, train/perplexity_epoch=99.20]

Metric val/loss improved by 0.011 >= min_delta = 0.001. New best score: 5.698
Epoch 37, global step 5130: 'val/loss' reached 5.69783 (best 5.69783), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=37-val_loss=5.698.ckpt' as top 1


Epoch 38: 100%|██████████| 135/135 [00:13<00:00, 10.11it/s, v_num=28, train/loss_step=4.530, train/perplexity_step=92.80, val/loss=5.690, val/perplexity=487.0, train/loss_epoch=4.520, train/perplexity_epoch=92.20]

Metric val/loss improved by 0.005 >= min_delta = 0.001. New best score: 5.693
Epoch 38, global step 5265: 'val/loss' reached 5.69304 (best 5.69304), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=38-val_loss=5.693.ckpt' as top 1


Epoch 39: 100%|██████████| 135/135 [00:12<00:00, 10.52it/s, v_num=28, train/loss_step=4.570, train/perplexity_step=96.50, val/loss=5.690, val/perplexity=497.0, train/loss_epoch=4.460, train/perplexity_epoch=86.70]

Metric val/loss improved by 0.003 >= min_delta = 0.001. New best score: 5.690
Epoch 39, global step 5400: 'val/loss' reached 5.69006 (best 5.69006), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=39-val_loss=5.690.ckpt' as top 1


Epoch 40: 100%|██████████| 135/135 [00:12<00:00, 10.70it/s, v_num=28, train/loss_step=4.490, train/perplexity_step=89.20, val/loss=5.690, val/perplexity=502.0, train/loss_epoch=4.410, train/perplexity_epoch=82.60]

Metric val/loss improved by 0.004 >= min_delta = 0.001. New best score: 5.686
Epoch 40, global step 5535: 'val/loss' reached 5.68623 (best 5.68623), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=40-val_loss=5.686.ckpt' as top 1


Epoch 41: 100%|██████████| 135/135 [00:13<00:00, 10.29it/s, v_num=28, train/loss_step=4.520, train/perplexity_step=91.60, val/loss=5.690, val/perplexity=510.0, train/loss_epoch=4.380, train/perplexity_epoch=80.00]

Epoch 41, global step 5670: 'val/loss' was not in top 1


Epoch 42: 100%|██████████| 135/135 [00:13<00:00, 10.20it/s, v_num=28, train/loss_step=4.370, train/perplexity_step=79.40, val/loss=5.690, val/perplexity=512.0, train/loss_epoch=4.360, train/perplexity_epoch=78.60]

Epoch 42, global step 5805: 'val/loss' was not in top 1


Epoch 43: 100%|██████████| 135/135 [00:13<00:00,  9.83it/s, v_num=28, train/loss_step=4.370, train/perplexity_step=79.00, val/loss=5.690, val/perplexity=513.0, train/loss_epoch=4.360, train/perplexity_epoch=78.10]

Epoch 43, global step 5940: 'val/loss' was not in top 1


Epoch 44: 100%|██████████| 135/135 [00:13<00:00,  9.79it/s, v_num=28, train/loss_step=4.300, train/perplexity_step=73.50, val/loss=5.690, val/perplexity=514.0, train/loss_epoch=4.350, train/perplexity_epoch=77.90]

Epoch 44, global step 6075: 'val/loss' was not in top 1


Epoch 45: 100%|██████████| 135/135 [00:13<00:00,  9.91it/s, v_num=28, train/loss_step=4.350, train/perplexity_step=77.50, val/loss=5.690, val/perplexity=515.0, train/loss_epoch=4.350, train/perplexity_epoch=78.00]

Epoch 45, global step 6210: 'val/loss' was not in top 1


Epoch 46: 100%|██████████| 135/135 [00:14<00:00,  9.43it/s, v_num=28, train/loss_step=4.390, train/perplexity_step=80.50, val/loss=5.690, val/perplexity=518.0, train/loss_epoch=4.350, train/perplexity_epoch=77.50]

Epoch 46, global step 6345: 'val/loss' was not in top 1


Epoch 47: 100%|██████████| 135/135 [00:14<00:00,  9.11it/s, v_num=28, train/loss_step=4.280, train/perplexity_step=72.10, val/loss=5.690, val/perplexity=523.0, train/loss_epoch=4.330, train/perplexity_epoch=76.00]

Epoch 47, global step 6480: 'val/loss' was not in top 1


Epoch 48: 100%|██████████| 135/135 [00:12<00:00, 10.41it/s, v_num=28, train/loss_step=4.380, train/perplexity_step=79.90, val/loss=5.690, val/perplexity=538.0, train/loss_epoch=4.300, train/perplexity_epoch=74.00]

Epoch 48, global step 6615: 'val/loss' was not in top 1


Epoch 49: 100%|██████████| 135/135 [00:12<00:00, 10.73it/s, v_num=28, train/loss_step=4.410, train/perplexity_step=82.20, val/loss=5.690, val/perplexity=546.0, train/loss_epoch=4.260, train/perplexity_epoch=71.00]

Epoch 49, global step 6750: 'val/loss' was not in top 1


Epoch 50: 100%|██████████| 135/135 [00:12<00:00, 10.68it/s, v_num=28, train/loss_step=4.060, train/perplexity_step=58.10, val/loss=5.700, val/perplexity=564.0, train/loss_epoch=4.210, train/perplexity_epoch=67.60]

Monitored metric val/loss did not improve in the last 10 records. Best score: 5.686. Signaling Trainer to stop.
Epoch 50, global step 6885: 'val/loss' was not in top 1


Epoch 50: 100%|██████████| 135/135 [00:12<00:00, 10.65it/s, v_num=28, train/loss_step=4.060, train/perplexity_step=58.10, val/loss=5.700, val/perplexity=564.0, train/loss_epoch=4.210, train/perplexity_epoch=67.60]


In [21]:
# Test model
print("\nTesting model...")
trainer.test(model, data_module)

# Save final model
final_model_path = os.path.join(save_dir, f"{experiment_name}-final.ckpt")
trainer.save_checkpoint(final_model_path)
print(f"Final model saved to {final_model_path}")


Testing model...
Testing DataLoader 0: 100%|██████████| 19/19 [00:00<00:00, 53.89it/s]


Final model saved to ./checkpoints\transformer_lm-final.ckpt


In [22]:
print("\nTraining completed!")
print(f"Checkpoints saved in: {save_dir}")
print(f"Logs saved in: {log_dir}")
print(f"Vocabulary saved in: {vocab_path}")
print(f"Best model: {checkpoint_callback.best_model_path}")
print(f"Best score: {checkpoint_callback.best_model_score}")

# Print instructions for running the app
print("\n" + "="*50)
print("To run the Gradio app with your trained model:")
print(f"python -m src.app.gradio_app --model_path {checkpoint_callback.best_model_path} --vocab_path {vocab_path}")
print("="*50)


Training completed!
Checkpoints saved in: ./checkpoints
Logs saved in: ./logs
Vocabulary saved in: ./checkpoints\vocab.pkl
Best model: C:\code\data_science\demo_llm\checkpoints\transformer_lm-epoch=40-val_loss=5.686.ckpt
Best score: 5.686227798461914

To run the Gradio app with your trained model:
python -m src.app.gradio_app --model_path C:\code\data_science\demo_llm\checkpoints\transformer_lm-epoch=40-val_loss=5.686.ckpt --vocab_path ./checkpoints\vocab.pkl
