# IMPORTS

#### Set Root Dir

In [1]:
import sys
import os

# Use the current working directory as root (or go up if needed)
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '../..'))  # Adjust '..' as needed
sys.path.append(ROOT_DIR)

#### import modules

In [None]:
"""
Training script for the transformer language model.
"""
import argparse
import torch
import pytorch_lightning as pl
import pickle
import yaml
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import Trainer

from src.data.tokenizer import SimpleTokenizer, load_text_corpus, create_sample_corpus
from src.data.dataset import TextDataModule
from src.model.lightning_module import TransformerLightningModule

# Params

In [None]:
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

In [None]:
hparams = config["hparams"]

vocab_size, d_model, num_heads, num_layers, d_ff, \
sequence_length, batch_size, learning_rate, max_epochs, \
patience, min_delta, warmup_steps, weight_decay, dropout, \
train_split, val_split, num_workers, accelerator, devices, precision, \
gradient_clip_val, accumulate_grad_batches, log_every_n_steps, \
val_check_interval, save_top_k, monitor, mode = hparams.values()

In [None]:
hparams

{'vocab_size': 3000,
 'd_model': 32,
 'num_heads': 1,
 'num_layers': 2,
 'd_ff': 1024,
 'sequence_length': 32,
 'batch_size': 64,
 'learning_rate': 0.0001,
 'max_epochs': 100,
 'patience': 10,
 'min_delta': 0.001,
 'warmup_steps': 1000,
 'weight_decay': 0.01,
 'dropout': 0.1,
 'train_split': 0.7,
 'val_split': 0.2,
 'num_workers': 0,
 'accelerator': 'auto',
 'devices': 1,
 'precision': '32',
 'gradient_clip_val': 1.0,
 'accumulate_grad_batches': 1,
 'log_every_n_steps': 50,
 'val_check_interval': 1.0,
 'save_top_k': 1,
 'monitor': 'val/loss',
 'mode': 'min'}

In [None]:
paths = config["paths"]
general = config["general"]

corpus_path=paths["corpus_path"]
save_dir=paths["save_dir"]
log_dir=paths["log_dir"]
experiment_name=general["experiment_name"]
create_sample=general["create_sample"]

In [None]:
# Load and tokenize text
print("Loading and tokenizing text...")
text = load_text_corpus(corpus_path)

Loading and tokenizing text...


In [None]:
# Create tokenizer and build vocabulary
tokenizer = SimpleTokenizer(vocab_size=hparams["vocab_size"])
tokenizer.build_vocab(text)

Vocabulary built with 1972 tokens


In [None]:
# Save vocabulary
vocab_path = os.path.join(save_dir, "vocab.pkl")
os.makedirs(save_dir, exist_ok=True)
tokenizer.save_vocab(vocab_path)

Vocabulary saved to ./checkpoints\vocab.pkl


In [None]:
# read vocab
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)

len(vocab['word_to_idx'])


1972

In [None]:
# Encode text
token_ids = tokenizer.encode(text)
print(f"Encoded text length: {len(token_ids)} tokens")

Encoded text length: 13731 tokens


In [None]:
# Create data module
data_module = TextDataModule(
    token_ids=token_ids,
    sequence_length=sequence_length,
    batch_size=batch_size,
    train_split=train_split,
    val_split=val_split,
    num_workers=num_workers
)

In [None]:
# Create model
model = TransformerLightningModule(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    d_ff=d_ff,
    dropout=dropout,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps
)



In [None]:
# Create callbacks
callbacks = []

# Early stopping
early_stopping = EarlyStopping(
    monitor=monitor,
    patience=patience,
    min_delta=min_delta,
    mode=mode,
    verbose=True
)
callbacks.append(early_stopping)


# Model checkpointing
checkpoint_callback = ModelCheckpoint(
    dirpath=save_dir,
    filename=f"{experiment_name}-epoch={{epoch:02d}}-val_loss={{val/loss:.3f}}", #-v{trainer.logger.version:02d}",
    monitor=monitor,
    mode=mode,
    auto_insert_metric_name=False, # Prevents the name 'val/loss=' from being prepended
    save_top_k=save_top_k,
    save_last=True,
    verbose=True
)
callbacks.append(checkpoint_callback)

# Learning rate monitoring
lr_monitor = LearningRateMonitor(logging_interval='step')
callbacks.append(lr_monitor)

In [None]:
# Create logger
logger = TensorBoardLogger(
    save_dir=log_dir,
    name=experiment_name,
    version=None
)

# Create trainer
trainer = Trainer(
    accelerator=accelerator,
    devices=devices,
    precision=precision,
    max_epochs=max_epochs,
    gradient_clip_val=gradient_clip_val,
    accumulate_grad_batches=accumulate_grad_batches,
    log_every_n_steps=log_every_n_steps,
    val_check_interval=val_check_interval,
    callbacks=callbacks,
    logger=logger,
    deterministic=True,
    enable_progress_bar=True,
    enable_model_summary=True
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


In [None]:
# Print model summary
print("\nModel Summary:")
print(f"Vocabulary size: {tokenizer.get_vocab_size()}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")


Model Summary:
Vocabulary size: 1972
Model parameters: 270,068
Trainable parameters: 270,068


In [None]:
# Check data sizes
print("Data sizes:")
print(f"Total tokens: {len(token_ids)}")
print(f"Sequence length: {sequence_length}")
print(f"Train split: {train_split}")
print(f"Val split: {val_split}")

# Calculate split sizes
total_len = len(token_ids)
train_end = int(total_len * train_split)
val_end = int(total_len * (train_split + val_split))

print(f"Train tokens: {train_end}")
print(f"Val tokens: {val_end - train_end}")
print(f"Test tokens: {total_len - val_end}")

# Check if validation data is sufficient
val_tokens = val_end - train_end
if val_tokens < sequence_length:
    print(f"WARNING: Validation data has only {val_tokens} tokens, less than sequence length {sequence_length}")
    print("This will cause validation to fail. Consider using a larger corpus or adjusting splits.")


Data sizes:
Total tokens: 13731
Sequence length: 32
Train split: 0.7
Val split: 0.2
Train tokens: 9611
Val tokens: 2746
Test tokens: 1374


In [None]:
# Train model
print("\nStarting training...")
print("With the updated parameters:")
print(f"- Sequence length: {sequence_length}")
print(f"- Train split: {train_split}")
print(f"- Val split: {val_split}")
print(f"- Monitor: {monitor}")
print()

trainer.fit(model, data_module)


Starting training...
With the updated parameters:
- Sequence length: 32
- Train split: 0.7
- Val split: 0.2
- Monitor: val/loss



c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:751: Checkpoint directory C:\code\data_science\demo_llm\checkpoints exists and is not empty.

  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | model            | TransformerLM    | 270 K  | train
1 | criterion        | CrossEntropyLoss | 0      | train
2 | train_perplexity | _Perplexity      | 0      | train
3 | val_perplexity   | _Perplexity      | 0      | train
--------------------------------------------------------------
270 K     Trainable params
0         Non-trainable params
270 K     Total params
1.080     Total estimated model params size (MB)
37        Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


                                                                           

c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 150/150 [00:14<00:00, 10.20it/s, v_num=24, train/loss_step=7.650, train/perplexity_step=2.09e+3, val/loss=7.610, val/perplexity=2.03e+3, train/loss_epoch=7.730, train/perplexity_epoch=2.27e+3]

Metric val/loss improved. New best score: 7.614
Epoch 0, global step 150: 'val/loss' reached 7.61417 (best 7.61417), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=00-val_loss=7.614.ckpt' as top 1


Epoch 1: 100%|██████████| 150/150 [00:14<00:00, 10.23it/s, v_num=24, train/loss_step=7.390, train/perplexity_step=1.62e+3, val/loss=7.340, val/perplexity=1.54e+3, train/loss_epoch=7.510, train/perplexity_epoch=1.82e+3]

Metric val/loss improved by 0.276 >= min_delta = 0.001. New best score: 7.338
Epoch 1, global step 300: 'val/loss' reached 7.33847 (best 7.33847), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=01-val_loss=7.338.ckpt' as top 1


Epoch 2: 100%|██████████| 150/150 [00:14<00:00, 10.32it/s, v_num=24, train/loss_step=7.140, train/perplexity_step=1.27e+3, val/loss=7.080, val/perplexity=1.2e+3, train/loss_epoch=7.250, train/perplexity_epoch=1.41e+3] 

Metric val/loss improved by 0.255 >= min_delta = 0.001. New best score: 7.083
Epoch 2, global step 450: 'val/loss' reached 7.08333 (best 7.08333), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=02-val_loss=7.083.ckpt' as top 1


Epoch 3: 100%|██████████| 150/150 [00:14<00:00, 10.31it/s, v_num=24, train/loss_step=6.870, train/perplexity_step=963.0, val/loss=6.810, val/perplexity=910.0, train/loss_epoch=7.000, train/perplexity_epoch=1.1e+3]    

Metric val/loss improved by 0.278 >= min_delta = 0.001. New best score: 6.806
Epoch 3, global step 600: 'val/loss' reached 6.80569 (best 6.80569), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=03-val_loss=6.806.ckpt' as top 1


Epoch 4: 100%|██████████| 150/150 [00:15<00:00, 10.00it/s, v_num=24, train/loss_step=6.600, train/perplexity_step=734.0, val/loss=6.510, val/perplexity=682.0, train/loss_epoch=6.730, train/perplexity_epoch=840.0] 

Metric val/loss improved by 0.295 >= min_delta = 0.001. New best score: 6.511
Epoch 4, global step 750: 'val/loss' reached 6.51052 (best 6.51052), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=04-val_loss=6.511.ckpt' as top 1


Epoch 5: 100%|██████████| 150/150 [00:15<00:00,  9.70it/s, v_num=24, train/loss_step=6.320, train/perplexity_step=553.0, val/loss=6.240, val/perplexity=525.0, train/loss_epoch=6.460, train/perplexity_epoch=639.0]

Metric val/loss improved by 0.275 >= min_delta = 0.001. New best score: 6.235
Epoch 5, global step 900: 'val/loss' reached 6.23533 (best 6.23533), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=05-val_loss=6.235.ckpt' as top 1


Epoch 6: 100%|██████████| 150/150 [00:14<00:00, 10.08it/s, v_num=24, train/loss_step=6.130, train/perplexity_step=460.0, val/loss=6.020, val/perplexity=429.0, train/loss_epoch=6.220, train/perplexity_epoch=504.0]

Metric val/loss improved by 0.219 >= min_delta = 0.001. New best score: 6.016
Epoch 6, global step 1050: 'val/loss' reached 6.01625 (best 6.01625), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=06-val_loss=6.016.ckpt' as top 1


Epoch 7: 100%|██████████| 150/150 [00:14<00:00, 10.51it/s, v_num=24, train/loss_step=5.970, train/perplexity_step=392.0, val/loss=5.860, val/perplexity=372.0, train/loss_epoch=6.040, train/perplexity_epoch=422.0]

Metric val/loss improved by 0.161 >= min_delta = 0.001. New best score: 5.855
Epoch 7, global step 1200: 'val/loss' reached 5.85503 (best 5.85503), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=07-val_loss=5.855.ckpt' as top 1


Epoch 8: 100%|██████████| 150/150 [00:14<00:00, 10.22it/s, v_num=24, train/loss_step=5.860, train/perplexity_step=350.0, val/loss=5.730, val/perplexity=337.0, train/loss_epoch=5.910, train/perplexity_epoch=371.0]

Metric val/loss improved by 0.123 >= min_delta = 0.001. New best score: 5.732
Epoch 8, global step 1350: 'val/loss' reached 5.73156 (best 5.73156), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=08-val_loss=5.732.ckpt' as top 1


Epoch 9: 100%|██████████| 150/150 [00:15<00:00, 10.00it/s, v_num=24, train/loss_step=5.700, train/perplexity_step=300.0, val/loss=5.640, val/perplexity=315.0, train/loss_epoch=5.820, train/perplexity_epoch=336.0]

Metric val/loss improved by 0.091 >= min_delta = 0.001. New best score: 5.640
Epoch 9, global step 1500: 'val/loss' reached 5.64030 (best 5.64030), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=09-val_loss=5.640.ckpt' as top 1


Epoch 10: 100%|██████████| 150/150 [00:14<00:00, 10.39it/s, v_num=24, train/loss_step=5.740, train/perplexity_step=312.0, val/loss=5.580, val/perplexity=301.0, train/loss_epoch=5.750, train/perplexity_epoch=314.0]

Metric val/loss improved by 0.060 >= min_delta = 0.001. New best score: 5.580
Epoch 10, global step 1650: 'val/loss' reached 5.58046 (best 5.58046), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=10-val_loss=5.580.ckpt' as top 1


Epoch 11: 100%|██████████| 150/150 [00:14<00:00, 10.36it/s, v_num=24, train/loss_step=5.720, train/perplexity_step=303.0, val/loss=5.550, val/perplexity=296.0, train/loss_epoch=5.710, train/perplexity_epoch=302.0]

Metric val/loss improved by 0.027 >= min_delta = 0.001. New best score: 5.554
Epoch 11, global step 1800: 'val/loss' reached 5.55362 (best 5.55362), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=11-val_loss=5.554.ckpt' as top 1


Epoch 12: 100%|██████████| 150/150 [00:14<00:00, 10.43it/s, v_num=24, train/loss_step=5.670, train/perplexity_step=291.0, val/loss=5.550, val/perplexity=294.0, train/loss_epoch=5.690, train/perplexity_epoch=298.0]

Metric val/loss improved by 0.007 >= min_delta = 0.001. New best score: 5.547
Epoch 12, global step 1950: 'val/loss' reached 5.54707 (best 5.54707), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=12-val_loss=5.547.ckpt' as top 1


Epoch 13: 100%|██████████| 150/150 [00:14<00:00, 10.41it/s, v_num=24, train/loss_step=5.600, train/perplexity_step=270.0, val/loss=5.550, val/perplexity=294.0, train/loss_epoch=5.690, train/perplexity_epoch=297.0]

Epoch 13, global step 2100: 'val/loss' reached 5.54629 (best 5.54629), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=13-val_loss=5.546.ckpt' as top 1


Epoch 14: 100%|██████████| 150/150 [00:14<00:00, 10.46it/s, v_num=24, train/loss_step=5.710, train/perplexity_step=301.0, val/loss=5.540, val/perplexity=292.0, train/loss_epoch=5.690, train/perplexity_epoch=296.0]

Metric val/loss improved by 0.011 >= min_delta = 0.001. New best score: 5.536
Epoch 14, global step 2250: 'val/loss' reached 5.53563 (best 5.53563), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=14-val_loss=5.536.ckpt' as top 1


Epoch 15: 100%|██████████| 150/150 [00:14<00:00, 10.39it/s, v_num=24, train/loss_step=5.660, train/perplexity_step=287.0, val/loss=5.500, val/perplexity=285.0, train/loss_epoch=5.670, train/perplexity_epoch=290.0]

Metric val/loss improved by 0.035 >= min_delta = 0.001. New best score: 5.501
Epoch 15, global step 2400: 'val/loss' reached 5.50070 (best 5.50070), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=15-val_loss=5.501.ckpt' as top 1


Epoch 16: 100%|██████████| 150/150 [00:14<00:00, 10.16it/s, v_num=24, train/loss_step=5.660, train/perplexity_step=287.0, val/loss=5.430, val/perplexity=273.0, train/loss_epoch=5.620, train/perplexity_epoch=278.0]

Metric val/loss improved by 0.067 >= min_delta = 0.001. New best score: 5.434
Epoch 16, global step 2550: 'val/loss' reached 5.43399 (best 5.43399), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=16-val_loss=5.434.ckpt' as top 1


Epoch 17: 100%|██████████| 150/150 [00:14<00:00, 10.08it/s, v_num=24, train/loss_step=5.530, train/perplexity_step=253.0, val/loss=5.340, val/perplexity=257.0, train/loss_epoch=5.550, train/perplexity_epoch=258.0]

Metric val/loss improved by 0.097 >= min_delta = 0.001. New best score: 5.337
Epoch 17, global step 2700: 'val/loss' reached 5.33689 (best 5.33689), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=17-val_loss=5.337.ckpt' as top 1


Epoch 18: 100%|██████████| 150/150 [00:14<00:00, 10.10it/s, v_num=24, train/loss_step=5.380, train/perplexity_step=218.0, val/loss=5.210, val/perplexity=239.0, train/loss_epoch=5.450, train/perplexity_epoch=233.0]

Metric val/loss improved by 0.123 >= min_delta = 0.001. New best score: 5.214
Epoch 18, global step 2850: 'val/loss' reached 5.21418 (best 5.21418), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=18-val_loss=5.214.ckpt' as top 1


Epoch 19: 100%|██████████| 150/150 [00:14<00:00, 10.28it/s, v_num=24, train/loss_step=5.240, train/perplexity_step=189.0, val/loss=5.080, val/perplexity=225.0, train/loss_epoch=5.330, train/perplexity_epoch=207.0]

Metric val/loss improved by 0.129 >= min_delta = 0.001. New best score: 5.085
Epoch 19, global step 3000: 'val/loss' reached 5.08476 (best 5.08476), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=19-val_loss=5.085.ckpt' as top 1


Epoch 20: 100%|██████████| 150/150 [00:15<00:00,  9.85it/s, v_num=24, train/loss_step=5.190, train/perplexity_step=180.0, val/loss=4.960, val/perplexity=212.0, train/loss_epoch=5.210, train/perplexity_epoch=183.0]

Metric val/loss improved by 0.124 >= min_delta = 0.001. New best score: 4.961
Epoch 20, global step 3150: 'val/loss' reached 4.96105 (best 4.96105), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=20-val_loss=4.961.ckpt' as top 1


Epoch 21: 100%|██████████| 150/150 [00:14<00:00, 10.41it/s, v_num=24, train/loss_step=5.080, train/perplexity_step=161.0, val/loss=4.860, val/perplexity=204.0, train/loss_epoch=5.090, train/perplexity_epoch=163.0]

Metric val/loss improved by 0.102 >= min_delta = 0.001. New best score: 4.859
Epoch 21, global step 3300: 'val/loss' reached 4.85901 (best 4.85901), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=21-val_loss=4.859.ckpt' as top 1


Epoch 22: 100%|██████████| 150/150 [00:14<00:00, 10.10it/s, v_num=24, train/loss_step=4.930, train/perplexity_step=139.0, val/loss=4.780, val/perplexity=198.0, train/loss_epoch=5.000, train/perplexity_epoch=149.0]

Metric val/loss improved by 0.078 >= min_delta = 0.001. New best score: 4.781
Epoch 22, global step 3450: 'val/loss' reached 4.78112 (best 4.78112), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=22-val_loss=4.781.ckpt' as top 1


Epoch 23: 100%|██████████| 150/150 [00:14<00:00, 10.45it/s, v_num=24, train/loss_step=5.050, train/perplexity_step=156.0, val/loss=4.730, val/perplexity=196.0, train/loss_epoch=4.940, train/perplexity_epoch=140.0]

Metric val/loss improved by 0.047 >= min_delta = 0.001. New best score: 4.734
Epoch 23, global step 3600: 'val/loss' reached 4.73407 (best 4.73407), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=23-val_loss=4.734.ckpt' as top 1


Epoch 24: 100%|██████████| 150/150 [00:14<00:00, 10.31it/s, v_num=24, train/loss_step=4.890, train/perplexity_step=132.0, val/loss=4.710, val/perplexity=194.0, train/loss_epoch=4.900, train/perplexity_epoch=134.0]

Metric val/loss improved by 0.026 >= min_delta = 0.001. New best score: 4.709
Epoch 24, global step 3750: 'val/loss' reached 4.70851 (best 4.70851), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=24-val_loss=4.709.ckpt' as top 1


Epoch 25: 100%|██████████| 150/150 [00:14<00:00, 10.46it/s, v_num=24, train/loss_step=4.790, train/perplexity_step=120.0, val/loss=4.700, val/perplexity=194.0, train/loss_epoch=4.880, train/perplexity_epoch=132.0]

Metric val/loss improved by 0.008 >= min_delta = 0.001. New best score: 4.700
Epoch 25, global step 3900: 'val/loss' reached 4.70048 (best 4.70048), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=25-val_loss=4.700.ckpt' as top 1


Epoch 26: 100%|██████████| 150/150 [00:15<00:00,  9.96it/s, v_num=24, train/loss_step=4.840, train/perplexity_step=127.0, val/loss=4.700, val/perplexity=194.0, train/loss_epoch=4.870, train/perplexity_epoch=131.0]

Epoch 26, global step 4050: 'val/loss' reached 4.69969 (best 4.69969), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=26-val_loss=4.700.ckpt' as top 1


Epoch 27: 100%|██████████| 150/150 [00:15<00:00,  9.91it/s, v_num=24, train/loss_step=4.890, train/perplexity_step=133.0, val/loss=4.700, val/perplexity=194.0, train/loss_epoch=4.870, train/perplexity_epoch=131.0]

Metric val/loss improved by 0.004 >= min_delta = 0.001. New best score: 4.696
Epoch 27, global step 4200: 'val/loss' reached 4.69609 (best 4.69609), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=27-val_loss=4.696.ckpt' as top 1


Epoch 28: 100%|██████████| 150/150 [00:15<00:00,  9.71it/s, v_num=24, train/loss_step=4.980, train/perplexity_step=146.0, val/loss=4.680, val/perplexity=193.0, train/loss_epoch=4.870, train/perplexity_epoch=130.0]

Metric val/loss improved by 0.016 >= min_delta = 0.001. New best score: 4.681
Epoch 28, global step 4350: 'val/loss' reached 4.68057 (best 4.68057), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=28-val_loss=4.681.ckpt' as top 1


Epoch 29: 100%|██████████| 150/150 [00:14<00:00, 10.13it/s, v_num=24, train/loss_step=4.960, train/perplexity_step=143.0, val/loss=4.650, val/perplexity=192.0, train/loss_epoch=4.840, train/perplexity_epoch=127.0]

Metric val/loss improved by 0.032 >= min_delta = 0.001. New best score: 4.649
Epoch 29, global step 4500: 'val/loss' reached 4.64891 (best 4.64891), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=29-val_loss=4.649.ckpt' as top 1


Epoch 30: 100%|██████████| 150/150 [00:14<00:00, 10.29it/s, v_num=24, train/loss_step=4.880, train/perplexity_step=132.0, val/loss=4.590, val/perplexity=190.0, train/loss_epoch=4.790, train/perplexity_epoch=121.0]

Metric val/loss improved by 0.056 >= min_delta = 0.001. New best score: 4.593
Epoch 30, global step 4650: 'val/loss' reached 4.59314 (best 4.59314), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=30-val_loss=4.593.ckpt' as top 1


Epoch 31: 100%|██████████| 150/150 [00:14<00:00, 10.39it/s, v_num=24, train/loss_step=4.740, train/perplexity_step=115.0, val/loss=4.520, val/perplexity=189.0, train/loss_epoch=4.720, train/perplexity_epoch=113.0]

Metric val/loss improved by 0.070 >= min_delta = 0.001. New best score: 4.523
Epoch 31, global step 4800: 'val/loss' reached 4.52314 (best 4.52314), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=31-val_loss=4.523.ckpt' as top 1


Epoch 32: 100%|██████████| 150/150 [00:14<00:00, 10.49it/s, v_num=24, train/loss_step=4.560, train/perplexity_step=95.70, val/loss=4.450, val/perplexity=188.0, train/loss_epoch=4.640, train/perplexity_epoch=104.0]

Metric val/loss improved by 0.071 >= min_delta = 0.001. New best score: 4.452
Epoch 32, global step 4950: 'val/loss' reached 4.45226 (best 4.45226), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=32-val_loss=4.452.ckpt' as top 1


Epoch 33: 100%|██████████| 150/150 [00:14<00:00, 10.50it/s, v_num=24, train/loss_step=4.600, train/perplexity_step=99.40, val/loss=4.380, val/perplexity=189.0, train/loss_epoch=4.550, train/perplexity_epoch=95.40]

Metric val/loss improved by 0.068 >= min_delta = 0.001. New best score: 4.384
Epoch 33, global step 5100: 'val/loss' reached 4.38394 (best 4.38394), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=33-val_loss=4.384.ckpt' as top 1


Epoch 34: 100%|██████████| 150/150 [00:14<00:00, 10.50it/s, v_num=24, train/loss_step=4.250, train/perplexity_step=70.40, val/loss=4.320, val/perplexity=189.0, train/loss_epoch=4.470, train/perplexity_epoch=88.00]

Metric val/loss improved by 0.065 >= min_delta = 0.001. New best score: 4.319
Epoch 34, global step 5250: 'val/loss' reached 4.31903 (best 4.31903), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=34-val_loss=4.319.ckpt' as top 1


Epoch 35: 100%|██████████| 150/150 [00:14<00:00, 10.49it/s, v_num=24, train/loss_step=4.380, train/perplexity_step=79.60, val/loss=4.270, val/perplexity=190.0, train/loss_epoch=4.400, train/perplexity_epoch=82.10]

Metric val/loss improved by 0.049 >= min_delta = 0.001. New best score: 4.270
Epoch 35, global step 5400: 'val/loss' reached 4.27049 (best 4.27049), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=35-val_loss=4.270.ckpt' as top 1


Epoch 36: 100%|██████████| 150/150 [00:14<00:00, 10.30it/s, v_num=24, train/loss_step=4.290, train/perplexity_step=72.90, val/loss=4.240, val/perplexity=191.0, train/loss_epoch=4.350, train/perplexity_epoch=77.80]

Metric val/loss improved by 0.032 >= min_delta = 0.001. New best score: 4.238
Epoch 36, global step 5550: 'val/loss' reached 4.23846 (best 4.23846), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=36-val_loss=4.238.ckpt' as top 1


Epoch 37: 100%|██████████| 150/150 [00:14<00:00, 10.39it/s, v_num=24, train/loss_step=4.380, train/perplexity_step=79.60, val/loss=4.220, val/perplexity=192.0, train/loss_epoch=4.320, train/perplexity_epoch=75.20]

Metric val/loss improved by 0.019 >= min_delta = 0.001. New best score: 4.219
Epoch 37, global step 5700: 'val/loss' reached 4.21936 (best 4.21936), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=37-val_loss=4.219.ckpt' as top 1


Epoch 38: 100%|██████████| 150/150 [00:14<00:00, 10.16it/s, v_num=24, train/loss_step=4.250, train/perplexity_step=70.00, val/loss=4.210, val/perplexity=192.0, train/loss_epoch=4.300, train/perplexity_epoch=73.90]

Metric val/loss improved by 0.008 >= min_delta = 0.001. New best score: 4.211
Epoch 38, global step 5850: 'val/loss' reached 4.21093 (best 4.21093), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=38-val_loss=4.211.ckpt' as top 1


Epoch 39: 100%|██████████| 150/150 [00:14<00:00, 10.40it/s, v_num=24, train/loss_step=4.330, train/perplexity_step=76.20, val/loss=4.210, val/perplexity=192.0, train/loss_epoch=4.290, train/perplexity_epoch=73.50]

Epoch 39, global step 6000: 'val/loss' reached 4.20995 (best 4.20995), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=39-val_loss=4.210.ckpt' as top 1


Epoch 40: 100%|██████████| 150/150 [00:14<00:00, 10.18it/s, v_num=24, train/loss_step=4.170, train/perplexity_step=64.60, val/loss=4.210, val/perplexity=192.0, train/loss_epoch=4.290, train/perplexity_epoch=73.30]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 4.209
Epoch 40, global step 6150: 'val/loss' reached 4.20890 (best 4.20890), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=40-val_loss=4.209.ckpt' as top 1


Epoch 41: 100%|██████████| 150/150 [00:15<00:00,  9.94it/s, v_num=24, train/loss_step=4.210, train/perplexity_step=67.10, val/loss=4.200, val/perplexity=192.0, train/loss_epoch=4.290, train/perplexity_epoch=73.10]

Metric val/loss improved by 0.006 >= min_delta = 0.001. New best score: 4.203
Epoch 41, global step 6300: 'val/loss' reached 4.20264 (best 4.20264), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=41-val_loss=4.203.ckpt' as top 1


Epoch 42: 100%|██████████| 150/150 [00:15<00:00,  9.93it/s, v_num=24, train/loss_step=4.320, train/perplexity_step=75.40, val/loss=4.190, val/perplexity=193.0, train/loss_epoch=4.270, train/perplexity_epoch=72.10]

Metric val/loss improved by 0.015 >= min_delta = 0.001. New best score: 4.187
Epoch 42, global step 6450: 'val/loss' reached 4.18739 (best 4.18739), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=42-val_loss=4.187.ckpt' as top 1


Epoch 43: 100%|██████████| 150/150 [00:14<00:00, 10.41it/s, v_num=24, train/loss_step=4.240, train/perplexity_step=69.40, val/loss=4.160, val/perplexity=196.0, train/loss_epoch=4.250, train/perplexity_epoch=70.10]

Metric val/loss improved by 0.028 >= min_delta = 0.001. New best score: 4.160
Epoch 43, global step 6600: 'val/loss' reached 4.15976 (best 4.15976), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=43-val_loss=4.160.ckpt' as top 1


Epoch 44: 100%|██████████| 150/150 [00:14<00:00, 10.06it/s, v_num=24, train/loss_step=4.360, train/perplexity_step=78.50, val/loss=4.130, val/perplexity=199.0, train/loss_epoch=4.200, train/perplexity_epoch=67.10]

Metric val/loss improved by 0.034 >= min_delta = 0.001. New best score: 4.125
Epoch 44, global step 6750: 'val/loss' reached 4.12548 (best 4.12548), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=44-val_loss=4.125.ckpt' as top 1


Epoch 45: 100%|██████████| 150/150 [00:15<00:00,  9.94it/s, v_num=24, train/loss_step=3.940, train/perplexity_step=51.20, val/loss=4.090, val/perplexity=203.0, train/loss_epoch=4.150, train/perplexity_epoch=63.50]

Metric val/loss improved by 0.039 >= min_delta = 0.001. New best score: 4.087
Epoch 45, global step 6900: 'val/loss' reached 4.08666 (best 4.08666), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=45-val_loss=4.087.ckpt' as top 1


Epoch 46: 100%|██████████| 150/150 [00:15<00:00,  9.55it/s, v_num=24, train/loss_step=4.030, train/perplexity_step=56.20, val/loss=4.040, val/perplexity=209.0, train/loss_epoch=4.080, train/perplexity_epoch=59.60]

Metric val/loss improved by 0.044 >= min_delta = 0.001. New best score: 4.043
Epoch 46, global step 7050: 'val/loss' reached 4.04309 (best 4.04309), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=46-val_loss=4.043.ckpt' as top 1


Epoch 47: 100%|██████████| 150/150 [00:14<00:00, 10.34it/s, v_num=24, train/loss_step=4.030, train/perplexity_step=56.40, val/loss=4.010, val/perplexity=214.0, train/loss_epoch=4.020, train/perplexity_epoch=55.90]

Metric val/loss improved by 0.036 >= min_delta = 0.001. New best score: 4.008
Epoch 47, global step 7200: 'val/loss' reached 4.00753 (best 4.00753), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=47-val_loss=4.008.ckpt' as top 1


Epoch 48: 100%|██████████| 150/150 [00:14<00:00, 10.46it/s, v_num=24, train/loss_step=3.710, train/perplexity_step=41.10, val/loss=3.980, val/perplexity=218.0, train/loss_epoch=3.970, train/perplexity_epoch=53.00]

Metric val/loss improved by 0.028 >= min_delta = 0.001. New best score: 3.980
Epoch 48, global step 7350: 'val/loss' reached 3.97955 (best 3.97955), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=48-val_loss=3.980.ckpt' as top 1


Epoch 49: 100%|██████████| 150/150 [00:14<00:00, 10.27it/s, v_num=24, train/loss_step=4.020, train/perplexity_step=55.50, val/loss=3.950, val/perplexity=220.0, train/loss_epoch=3.920, train/perplexity_epoch=50.70]

Metric val/loss improved by 0.025 >= min_delta = 0.001. New best score: 3.955
Epoch 49, global step 7500: 'val/loss' reached 3.95490 (best 3.95490), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=49-val_loss=3.955.ckpt' as top 1


Epoch 50: 100%|██████████| 150/150 [00:14<00:00, 10.06it/s, v_num=24, train/loss_step=3.970, train/perplexity_step=52.80, val/loss=3.950, val/perplexity=226.0, train/loss_epoch=3.890, train/perplexity_epoch=49.20]

Metric val/loss improved by 0.009 >= min_delta = 0.001. New best score: 3.946
Epoch 50, global step 7650: 'val/loss' reached 3.94576 (best 3.94576), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=50-val_loss=3.946.ckpt' as top 1


Epoch 51: 100%|██████████| 150/150 [00:14<00:00, 10.35it/s, v_num=24, train/loss_step=3.820, train/perplexity_step=45.80, val/loss=3.940, val/perplexity=226.0, train/loss_epoch=3.870, train/perplexity_epoch=48.30]

Metric val/loss improved by 0.006 >= min_delta = 0.001. New best score: 3.939
Epoch 51, global step 7800: 'val/loss' reached 3.93935 (best 3.93935), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=51-val_loss=3.939.ckpt' as top 1


Epoch 52: 100%|██████████| 150/150 [00:14<00:00, 10.41it/s, v_num=24, train/loss_step=3.780, train/perplexity_step=43.90, val/loss=3.940, val/perplexity=226.0, train/loss_epoch=3.870, train/perplexity_epoch=47.90]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 3.938
Epoch 52, global step 7950: 'val/loss' reached 3.93766 (best 3.93766), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=52-val_loss=3.938.ckpt' as top 1


Epoch 53: 100%|██████████| 150/150 [00:14<00:00, 10.44it/s, v_num=24, train/loss_step=3.680, train/perplexity_step=39.70, val/loss=3.940, val/perplexity=226.0, train/loss_epoch=3.860, train/perplexity_epoch=47.80]

Epoch 53, global step 8100: 'val/loss' reached 3.93750 (best 3.93750), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=53-val_loss=3.937.ckpt' as top 1


Epoch 54: 100%|██████████| 150/150 [00:14<00:00, 10.49it/s, v_num=24, train/loss_step=3.820, train/perplexity_step=45.60, val/loss=3.940, val/perplexity=227.0, train/loss_epoch=3.860, train/perplexity_epoch=47.80]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 3.935
Epoch 54, global step 8250: 'val/loss' reached 3.93528 (best 3.93528), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=54-val_loss=3.935.ckpt' as top 1


Epoch 55: 100%|██████████| 150/150 [00:14<00:00, 10.32it/s, v_num=24, train/loss_step=3.870, train/perplexity_step=47.70, val/loss=3.930, val/perplexity=228.0, train/loss_epoch=3.860, train/perplexity_epoch=47.50]

Metric val/loss improved by 0.007 >= min_delta = 0.001. New best score: 3.928
Epoch 55, global step 8400: 'val/loss' reached 3.92818 (best 3.92818), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=55-val_loss=3.928.ckpt' as top 1


Epoch 56: 100%|██████████| 150/150 [00:14<00:00, 10.29it/s, v_num=24, train/loss_step=3.830, train/perplexity_step=45.90, val/loss=3.920, val/perplexity=233.0, train/loss_epoch=3.840, train/perplexity_epoch=46.70]

Metric val/loss improved by 0.009 >= min_delta = 0.001. New best score: 3.919
Epoch 56, global step 8550: 'val/loss' reached 3.91868 (best 3.91868), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=56-val_loss=3.919.ckpt' as top 1


Epoch 57: 100%|██████████| 150/150 [00:14<00:00, 10.37it/s, v_num=24, train/loss_step=3.580, train/perplexity_step=36.00, val/loss=3.900, val/perplexity=237.0, train/loss_epoch=3.810, train/perplexity_epoch=45.40]

Metric val/loss improved by 0.017 >= min_delta = 0.001. New best score: 3.902
Epoch 57, global step 8700: 'val/loss' reached 3.90190 (best 3.90190), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=57-val_loss=3.902.ckpt' as top 1


Epoch 58: 100%|██████████| 150/150 [00:15<00:00,  9.98it/s, v_num=24, train/loss_step=3.550, train/perplexity_step=34.90, val/loss=3.880, val/perplexity=247.0, train/loss_epoch=3.770, train/perplexity_epoch=43.70]

Metric val/loss improved by 0.019 >= min_delta = 0.001. New best score: 3.882
Epoch 58, global step 8850: 'val/loss' reached 3.88240 (best 3.88240), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=58-val_loss=3.882.ckpt' as top 1


Epoch 59: 100%|██████████| 150/150 [00:15<00:00,  9.46it/s, v_num=24, train/loss_step=3.700, train/perplexity_step=40.50, val/loss=3.860, val/perplexity=254.0, train/loss_epoch=3.730, train/perplexity_epoch=41.70]

Metric val/loss improved by 0.019 >= min_delta = 0.001. New best score: 3.863
Epoch 59, global step 9000: 'val/loss' reached 3.86310 (best 3.86310), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=59-val_loss=3.863.ckpt' as top 1


Epoch 60: 100%|██████████| 150/150 [00:16<00:00,  9.29it/s, v_num=24, train/loss_step=3.610, train/perplexity_step=37.00, val/loss=3.850, val/perplexity=266.0, train/loss_epoch=3.680, train/perplexity_epoch=39.80]

Metric val/loss improved by 0.018 >= min_delta = 0.001. New best score: 3.845
Epoch 60, global step 9150: 'val/loss' reached 3.84513 (best 3.84513), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=60-val_loss=3.845.ckpt' as top 1


Epoch 61: 100%|██████████| 150/150 [00:14<00:00, 10.03it/s, v_num=24, train/loss_step=3.560, train/perplexity_step=35.10, val/loss=3.830, val/perplexity=274.0, train/loss_epoch=3.640, train/perplexity_epoch=38.10]

Metric val/loss improved by 0.019 >= min_delta = 0.001. New best score: 3.826
Epoch 61, global step 9300: 'val/loss' reached 3.82648 (best 3.82648), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=61-val_loss=3.826.ckpt' as top 1


Epoch 62: 100%|██████████| 150/150 [00:14<00:00, 10.03it/s, v_num=24, train/loss_step=3.540, train/perplexity_step=34.40, val/loss=3.810, val/perplexity=280.0, train/loss_epoch=3.600, train/perplexity_epoch=36.70]

Metric val/loss improved by 0.014 >= min_delta = 0.001. New best score: 3.813
Epoch 62, global step 9450: 'val/loss' reached 3.81282 (best 3.81282), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=62-val_loss=3.813.ckpt' as top 1


Epoch 63: 100%|██████████| 150/150 [00:14<00:00, 10.01it/s, v_num=24, train/loss_step=3.470, train/perplexity_step=32.20, val/loss=3.810, val/perplexity=288.0, train/loss_epoch=3.570, train/perplexity_epoch=35.80]

Metric val/loss improved by 0.003 >= min_delta = 0.001. New best score: 3.810
Epoch 63, global step 9600: 'val/loss' reached 3.81027 (best 3.81027), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=63-val_loss=3.810.ckpt' as top 1


Epoch 64: 100%|██████████| 150/150 [00:14<00:00, 10.02it/s, v_num=24, train/loss_step=3.580, train/perplexity_step=36.00, val/loss=3.800, val/perplexity=290.0, train/loss_epoch=3.560, train/perplexity_epoch=35.30]

Metric val/loss improved by 0.006 >= min_delta = 0.001. New best score: 3.804
Epoch 64, global step 9750: 'val/loss' reached 3.80410 (best 3.80410), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=64-val_loss=3.804.ckpt' as top 1


Epoch 65: 100%|██████████| 150/150 [00:15<00:00,  9.66it/s, v_num=24, train/loss_step=3.440, train/perplexity_step=31.10, val/loss=3.800, val/perplexity=291.0, train/loss_epoch=3.550, train/perplexity_epoch=34.80]

Epoch 65, global step 9900: 'val/loss' reached 3.80310 (best 3.80310), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=65-val_loss=3.803.ckpt' as top 1


Epoch 66: 100%|██████████| 150/150 [00:15<00:00,  9.49it/s, v_num=24, train/loss_step=3.630, train/perplexity_step=37.60, val/loss=3.800, val/perplexity=291.0, train/loss_epoch=3.550, train/perplexity_epoch=34.80]

Metric val/loss improved by 0.001 >= min_delta = 0.001. New best score: 3.803
Epoch 66, global step 10050: 'val/loss' reached 3.80279 (best 3.80279), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=66-val_loss=3.803.ckpt' as top 1


Epoch 67: 100%|██████████| 150/150 [00:15<00:00,  9.96it/s, v_num=24, train/loss_step=3.430, train/perplexity_step=31.00, val/loss=3.800, val/perplexity=291.0, train/loss_epoch=3.540, train/perplexity_epoch=34.80]

Epoch 67, global step 10200: 'val/loss' reached 3.80248 (best 3.80248), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=67-val_loss=3.802.ckpt' as top 1


Epoch 68: 100%|██████████| 150/150 [00:14<00:00, 10.14it/s, v_num=24, train/loss_step=3.690, train/perplexity_step=40.10, val/loss=3.800, val/perplexity=293.0, train/loss_epoch=3.540, train/perplexity_epoch=34.70]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 3.801
Epoch 68, global step 10350: 'val/loss' reached 3.80112 (best 3.80112), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=68-val_loss=3.801.ckpt' as top 1


Epoch 69: 100%|██████████| 150/150 [00:15<00:00,  9.92it/s, v_num=24, train/loss_step=3.540, train/perplexity_step=34.30, val/loss=3.800, val/perplexity=297.0, train/loss_epoch=3.530, train/perplexity_epoch=34.30]

Metric val/loss improved by 0.005 >= min_delta = 0.001. New best score: 3.796
Epoch 69, global step 10500: 'val/loss' reached 3.79585 (best 3.79585), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=69-val_loss=3.796.ckpt' as top 1


Epoch 70: 100%|██████████| 150/150 [00:14<00:00, 10.05it/s, v_num=24, train/loss_step=3.600, train/perplexity_step=36.70, val/loss=3.790, val/perplexity=306.0, train/loss_epoch=3.520, train/perplexity_epoch=33.70]

Metric val/loss improved by 0.007 >= min_delta = 0.001. New best score: 3.788
Epoch 70, global step 10650: 'val/loss' reached 3.78843 (best 3.78843), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=70-val_loss=3.788.ckpt' as top 1


Epoch 71: 100%|██████████| 150/150 [00:15<00:00,  9.97it/s, v_num=24, train/loss_step=3.270, train/perplexity_step=26.20, val/loss=3.780, val/perplexity=317.0, train/loss_epoch=3.490, train/perplexity_epoch=32.80]

Metric val/loss improved by 0.008 >= min_delta = 0.001. New best score: 3.780
Epoch 71, global step 10800: 'val/loss' reached 3.78037 (best 3.78037), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=71-val_loss=3.780.ckpt' as top 1


Epoch 72: 100%|██████████| 150/150 [00:14<00:00, 10.09it/s, v_num=24, train/loss_step=3.490, train/perplexity_step=32.90, val/loss=3.770, val/perplexity=332.0, train/loss_epoch=3.460, train/perplexity_epoch=31.80]

Metric val/loss improved by 0.005 >= min_delta = 0.001. New best score: 3.775
Epoch 72, global step 10950: 'val/loss' reached 3.77493 (best 3.77493), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=72-val_loss=3.775.ckpt' as top 1


Epoch 73: 100%|██████████| 150/150 [00:15<00:00,  9.96it/s, v_num=24, train/loss_step=3.460, train/perplexity_step=31.90, val/loss=3.760, val/perplexity=342.0, train/loss_epoch=3.420, train/perplexity_epoch=30.60]

Metric val/loss improved by 0.012 >= min_delta = 0.001. New best score: 3.763
Epoch 73, global step 11100: 'val/loss' reached 3.76262 (best 3.76262), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=73-val_loss=3.763.ckpt' as top 1


Epoch 74: 100%|██████████| 150/150 [00:15<00:00,  9.97it/s, v_num=24, train/loss_step=3.450, train/perplexity_step=31.40, val/loss=3.760, val/perplexity=361.0, train/loss_epoch=3.380, train/perplexity_epoch=29.60]

Metric val/loss improved by 0.005 >= min_delta = 0.001. New best score: 3.758
Epoch 74, global step 11250: 'val/loss' reached 3.75781 (best 3.75781), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=74-val_loss=3.758.ckpt' as top 1


Epoch 75: 100%|██████████| 150/150 [00:15<00:00,  9.94it/s, v_num=24, train/loss_step=3.290, train/perplexity_step=26.80, val/loss=3.760, val/perplexity=378.0, train/loss_epoch=3.350, train/perplexity_epoch=28.70]

Metric val/loss improved by 0.001 >= min_delta = 0.001. New best score: 3.757
Epoch 75, global step 11400: 'val/loss' reached 3.75672 (best 3.75672), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=75-val_loss=3.757.ckpt' as top 1


Epoch 76: 100%|██████████| 150/150 [00:15<00:00,  9.77it/s, v_num=24, train/loss_step=3.240, train/perplexity_step=25.60, val/loss=3.750, val/perplexity=384.0, train/loss_epoch=3.330, train/perplexity_epoch=28.00]

Metric val/loss improved by 0.006 >= min_delta = 0.001. New best score: 3.751
Epoch 76, global step 11550: 'val/loss' reached 3.75078 (best 3.75078), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=76-val_loss=3.751.ckpt' as top 1


Epoch 77: 100%|██████████| 150/150 [00:15<00:00,  9.78it/s, v_num=24, train/loss_step=3.370, train/perplexity_step=29.10, val/loss=3.750, val/perplexity=389.0, train/loss_epoch=3.310, train/perplexity_epoch=27.60]

Metric val/loss improved by 0.003 >= min_delta = 0.001. New best score: 3.748
Epoch 77, global step 11700: 'val/loss' reached 3.74800 (best 3.74800), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=77-val_loss=3.748.ckpt' as top 1


Epoch 78: 100%|██████████| 150/150 [00:15<00:00,  9.84it/s, v_num=24, train/loss_step=3.360, train/perplexity_step=28.70, val/loss=3.750, val/perplexity=390.0, train/loss_epoch=3.310, train/perplexity_epoch=27.30]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 3.746
Epoch 78, global step 11850: 'val/loss' reached 3.74613 (best 3.74613), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=78-val_loss=3.746.ckpt' as top 1


Epoch 79: 100%|██████████| 150/150 [00:15<00:00,  9.60it/s, v_num=24, train/loss_step=3.300, train/perplexity_step=27.10, val/loss=3.750, val/perplexity=391.0, train/loss_epoch=3.300, train/perplexity_epoch=27.30]

Epoch 79, global step 12000: 'val/loss' was not in top 1


Epoch 80: 100%|██████████| 150/150 [00:16<00:00,  9.34it/s, v_num=24, train/loss_step=3.210, train/perplexity_step=24.90, val/loss=3.750, val/perplexity=392.0, train/loss_epoch=3.300, train/perplexity_epoch=27.30]

Epoch 80, global step 12150: 'val/loss' was not in top 1


Epoch 81: 100%|██████████| 150/150 [00:15<00:00,  9.77it/s, v_num=24, train/loss_step=3.270, train/perplexity_step=26.30, val/loss=3.750, val/perplexity=393.0, train/loss_epoch=3.300, train/perplexity_epoch=27.20]

Epoch 81, global step 12300: 'val/loss' reached 3.74524 (best 3.74524), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=81-val_loss=3.745.ckpt' as top 1


Epoch 82: 100%|██████████| 150/150 [00:15<00:00,  9.70it/s, v_num=24, train/loss_step=3.350, train/perplexity_step=28.60, val/loss=3.750, val/perplexity=398.0, train/loss_epoch=3.300, train/perplexity_epoch=27.10]

Epoch 82, global step 12450: 'val/loss' reached 3.74513 (best 3.74513), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=82-val_loss=3.745.ckpt' as top 1


Epoch 83: 100%|██████████| 150/150 [00:15<00:00,  9.82it/s, v_num=24, train/loss_step=3.080, train/perplexity_step=21.70, val/loss=3.750, val/perplexity=411.0, train/loss_epoch=3.280, train/perplexity_epoch=26.70]

Epoch 83, global step 12600: 'val/loss' was not in top 1


Epoch 84: 100%|██████████| 150/150 [00:14<00:00, 10.21it/s, v_num=24, train/loss_step=3.360, train/perplexity_step=28.90, val/loss=3.750, val/perplexity=431.0, train/loss_epoch=3.270, train/perplexity_epoch=26.30]

Epoch 84, global step 12750: 'val/loss' was not in top 1


Epoch 85: 100%|██████████| 150/150 [00:14<00:00, 10.12it/s, v_num=24, train/loss_step=3.080, train/perplexity_step=21.70, val/loss=3.740, val/perplexity=449.0, train/loss_epoch=3.240, train/perplexity_epoch=25.60]

Metric val/loss improved by 0.005 >= min_delta = 0.001. New best score: 3.741
Epoch 85, global step 12900: 'val/loss' reached 3.74126 (best 3.74126), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=85-val_loss=3.741.ckpt' as top 1


Epoch 86: 100%|██████████| 150/150 [00:16<00:00,  9.23it/s, v_num=24, train/loss_step=3.010, train/perplexity_step=20.20, val/loss=3.740, val/perplexity=467.0, train/loss_epoch=3.210, train/perplexity_epoch=25.00]

Metric val/loss improved by 0.003 >= min_delta = 0.001. New best score: 3.739
Epoch 86, global step 13050: 'val/loss' reached 3.73852 (best 3.73852), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=86-val_loss=3.739.ckpt' as top 1


Epoch 87: 100%|██████████| 150/150 [00:15<00:00,  9.95it/s, v_num=24, train/loss_step=3.280, train/perplexity_step=26.40, val/loss=3.740, val/perplexity=496.0, train/loss_epoch=3.190, train/perplexity_epoch=24.30]

Epoch 87, global step 13200: 'val/loss' reached 3.73801 (best 3.73801), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=87-val_loss=3.738.ckpt' as top 1


Epoch 88: 100%|██████████| 150/150 [00:15<00:00,  9.99it/s, v_num=24, train/loss_step=3.340, train/perplexity_step=28.30, val/loss=3.740, val/perplexity=510.0, train/loss_epoch=3.160, train/perplexity_epoch=23.60]

Metric val/loss improved by 0.003 >= min_delta = 0.001. New best score: 3.735
Epoch 88, global step 13350: 'val/loss' reached 3.73532 (best 3.73532), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=88-val_loss=3.735.ckpt' as top 1


Epoch 89: 100%|██████████| 150/150 [00:15<00:00,  9.72it/s, v_num=24, train/loss_step=3.210, train/perplexity_step=24.70, val/loss=3.740, val/perplexity=528.0, train/loss_epoch=3.140, train/perplexity_epoch=23.20]

Epoch 89, global step 13500: 'val/loss' reached 3.73507 (best 3.73507), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=89-val_loss=3.735.ckpt' as top 1


Epoch 90: 100%|██████████| 150/150 [00:15<00:00,  9.97it/s, v_num=24, train/loss_step=3.180, train/perplexity_step=24.10, val/loss=3.730, val/perplexity=541.0, train/loss_epoch=3.120, train/perplexity_epoch=22.80]

Epoch 90, global step 13650: 'val/loss' reached 3.73485 (best 3.73485), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=90-val_loss=3.735.ckpt' as top 1


Epoch 91: 100%|██████████| 150/150 [00:14<00:00, 10.06it/s, v_num=24, train/loss_step=3.160, train/perplexity_step=23.70, val/loss=3.740, val/perplexity=543.0, train/loss_epoch=3.120, train/perplexity_epoch=22.60]

Epoch 91, global step 13800: 'val/loss' was not in top 1


Epoch 92: 100%|██████████| 150/150 [00:14<00:00, 10.10it/s, v_num=24, train/loss_step=3.020, train/perplexity_step=20.50, val/loss=3.730, val/perplexity=543.0, train/loss_epoch=3.110, train/perplexity_epoch=22.50]

Epoch 92, global step 13950: 'val/loss' reached 3.73469 (best 3.73469), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=92-val_loss=3.735.ckpt' as top 1


Epoch 93: 100%|██████████| 150/150 [00:14<00:00, 10.25it/s, v_num=24, train/loss_step=3.100, train/perplexity_step=22.20, val/loss=3.730, val/perplexity=544.0, train/loss_epoch=3.110, train/perplexity_epoch=22.50]

Epoch 93, global step 14100: 'val/loss' reached 3.73468 (best 3.73468), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=93-val_loss=3.735.ckpt' as top 1


Epoch 94: 100%|██████████| 150/150 [00:15<00:00,  9.96it/s, v_num=24, train/loss_step=3.000, train/perplexity_step=20.10, val/loss=3.740, val/perplexity=549.0, train/loss_epoch=3.110, train/perplexity_epoch=22.50]

Epoch 94, global step 14250: 'val/loss' was not in top 1


Epoch 95: 100%|██████████| 150/150 [00:14<00:00, 10.09it/s, v_num=24, train/loss_step=3.060, train/perplexity_step=21.40, val/loss=3.740, val/perplexity=558.0, train/loss_epoch=3.110, train/perplexity_epoch=22.40]

Epoch 95, global step 14400: 'val/loss' was not in top 1


Epoch 96: 100%|██████████| 150/150 [00:15<00:00,  9.95it/s, v_num=24, train/loss_step=3.180, train/perplexity_step=24.10, val/loss=3.740, val/perplexity=569.0, train/loss_epoch=3.100, train/perplexity_epoch=22.30]

Epoch 96, global step 14550: 'val/loss' was not in top 1


Epoch 97: 100%|██████████| 150/150 [00:14<00:00, 10.12it/s, v_num=24, train/loss_step=2.970, train/perplexity_step=19.50, val/loss=3.740, val/perplexity=593.0, train/loss_epoch=3.090, train/perplexity_epoch=22.00]

Epoch 97, global step 14700: 'val/loss' was not in top 1


Epoch 98: 100%|██████████| 150/150 [00:14<00:00, 10.01it/s, v_num=24, train/loss_step=3.110, train/perplexity_step=22.50, val/loss=3.740, val/perplexity=614.0, train/loss_epoch=3.070, train/perplexity_epoch=21.70]

Monitored metric val/loss did not improve in the last 10 records. Best score: 3.735. Signaling Trainer to stop.
Epoch 98, global step 14850: 'val/loss' was not in top 1


Epoch 98: 100%|██████████| 150/150 [00:15<00:00,  9.98it/s, v_num=24, train/loss_step=3.110, train/perplexity_step=22.50, val/loss=3.740, val/perplexity=614.0, train/loss_epoch=3.070, train/perplexity_epoch=21.70]


In [None]:
# Test model
print("\nTesting model...")
trainer.test(model, data_module)

# Save final model
final_model_path = os.path.join(save_dir, f"{experiment_name}-final.ckpt")
trainer.save_checkpoint(final_model_path)
print(f"Final model saved to {final_model_path}")


Testing model...


c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing DataLoader 0:   0%|          | 0/21 [00:00<?, ?it/s]

Testing DataLoader 0: 100%|██████████| 21/21 [00:00<00:00, 86.65it/s]


Final model saved to ./checkpoints\transformer_lm-final.ckpt


In [None]:
print("\nTraining completed!")
print(f"Checkpoints saved in: {save_dir}")
print(f"Logs saved in: {log_dir}")
print(f"Vocabulary saved in: {vocab_path}")
print(f"Best model: {checkpoint_callback.best_model_path}")
print(f"Best score: {checkpoint_callback.best_model_score}")

# Print instructions for running the app
print("\n" + "="*50)
print("To run the Gradio app with your trained model:")
print(f"python -m src.app.gradio_app --model_path {checkpoint_callback.best_model_path} --vocab_path {vocab_path}")
print("="*50)


Training completed!
Checkpoints saved in: ./checkpoints
Logs saved in: ./logs
Vocabulary saved in: ./checkpoints\vocab.pkl
Best model: C:\code\data_science\demo_llm\checkpoints\transformer_lm-epoch=93-val_loss=3.735.ckpt
Best score: 3.734684705734253

To run the Gradio app with your trained model:
python -m src.app.gradio_app --model_path C:\code\data_science\demo_llm\checkpoints\transformer_lm-epoch=93-val_loss=3.735.ckpt --vocab_path ./checkpoints\vocab.pkl
