# **Transformer Language Model**

## Imports

In [29]:
import torch

# Quick environment check: report whether CUDA is usable, how many devices
# are visible, and which one is currently selected.
cuda_ok = torch.cuda.is_available()
active_device = torch.cuda.current_device() if cuda_ok else 'N/A'

print(f"CUDA available: {cuda_ok}")
print(f"Device count: {torch.cuda.device_count()}")
print(f"Current device: {active_device}")

CUDA available: True
Device count: 1
Current device: 0


#### Set Root Dir

In [30]:
import sys
import os

# The project root is two directory levels above the notebook's working
# directory (adjust the number of `os.pardir` components if the notebook moves).
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Make the project's packages importable. The membership check keeps the cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

#### Import Modules

In [31]:
"""
Training script for the transformer language model.
"""
import argparse
import torch
import pytorch_lightning as pl
import pickle
import yaml
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import Trainer

from src.data.tokenizer import SimpleTokenizer, load_text_corpus, create_sample_corpus
from src.data.tokenizer_factory import TokenizerFactory
from src.data.dataset import TextDataModule
from src.model.lightning_module import TransformerLightningModule

## Load Params

#### Load yaml config

In [32]:
# Read the run configuration from config.yaml in the working directory.
# The encoding is given explicitly so the file is decoded as UTF-8 regardless
# of the platform's locale default (e.g. a legacy code page on Windows).
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

#### Set hyperparams

In [33]:
hparams = config["hparams"]

# Read every hyperparameter by name rather than unpacking `hparams.values()`
# positionally: positional unpacking depends on the key order in the YAML file,
# so reordering the config would silently assign values to the wrong variables.
# A missing key now fails loudly with a KeyError naming that key.

# Model architecture
vocab_size = hparams["vocab_size"]
d_model = hparams["d_model"]
num_heads = hparams["num_heads"]
num_layers = hparams["num_layers"]
d_ff = hparams["d_ff"]

# Batching and optimisation
sequence_length = hparams["sequence_length"]
batch_size = hparams["batch_size"]
learning_rate = hparams["learning_rate"]
max_epochs = hparams["max_epochs"]
patience = hparams["patience"]
min_delta = hparams["min_delta"]
warmup_steps = hparams["warmup_steps"]
weight_decay = hparams["weight_decay"]
dropout = hparams["dropout"]

# Data splitting and loading
train_split = hparams["train_split"]
val_split = hparams["val_split"]
num_workers = hparams["num_workers"]

# Trainer / hardware settings
accelerator = hparams["accelerator"]
devices = hparams["devices"]
precision = hparams["precision"]
gradient_clip_val = hparams["gradient_clip_val"]
accumulate_grad_batches = hparams["accumulate_grad_batches"]
log_every_n_steps = hparams["log_every_n_steps"]
val_check_interval = hparams["val_check_interval"]

# Checkpointing / early-stopping metric
save_top_k = hparams["save_top_k"]
monitor = hparams["monitor"]
mode = hparams["mode"]

In [34]:
# Display the loaded hyperparameter mapping for a quick visual check.
hparams

{'vocab_size': 8000,
 'd_model': 8,
 'num_heads': 1,
 'num_layers': 2,
 'd_ff': 32,
 'sequence_length': 64,
 'batch_size': 32,
 'learning_rate': 0.0005,
 'max_epochs': 100,
 'patience': 10,
 'min_delta': 0.001,
 'warmup_steps': 1000,
 'weight_decay': 0.01,
 'dropout': 0.2,
 'train_split': 0.7,
 'val_split': 0.2,
 'num_workers': 0,
 'accelerator': 'gpu',
 'devices': 1,
 'precision': '32',
 'gradient_clip_val': 1.0,
 'accumulate_grad_batches': 1,
 'log_every_n_steps': 50,
 'val_check_interval': 1.0,
 'save_top_k': 1,
 'monitor': 'val/loss',
 'mode': 'min'}

#### Set paths and general config

In [35]:
# Pull the path settings and general run options out of the loaded config.
paths, general = config["paths"], config["general"]

# Filesystem locations: input corpus, checkpoint directory, log directory.
corpus_path, save_dir, log_dir = (
    paths["corpus_path"],
    paths["save_dir"],
    paths["log_dir"],
)

# General options: experiment name and the sample-corpus flag.
experiment_name, create_sample = (
    general["experiment_name"],
    general["create_sample"],
)

## Data preprocessing

#### Load text

In [36]:
# Load the raw corpus text from `corpus_path`.
# Only loading happens here; the vocabulary is built and the text is encoded
# in the following cells.
print("Loading and tokenizing text...")
text = load_text_corpus(corpus_path)

Loading and tokenizing text...


#### Tokenize

In [37]:
    # Create the tokenizer selected by the optional `tokenizer` section of the
    # config (the vocabulary itself is built in the next cell).
    # `or {}` also covers a section that is present but left empty in the YAML,
    # which yaml.safe_load returns as None rather than as a missing key.
    tokenizer_config = config.get("tokenizer") or {}
    tokenizer_type = tokenizer_config.get("type", "word")

    if tokenizer_type == "word":
        tokenizer = TokenizerFactory.create_tokenizer("word", vocab_size=vocab_size)
    elif tokenizer_type == "bpe":
        # Extra keyword arguments forwarded to the BPE tokenizer; an empty or
        # absent `bpe_options` entry means "no extra options".
        bpe_options = tokenizer_config.get("bpe_options") or {}
        tokenizer = TokenizerFactory.create_tokenizer("bpe", vocab_size=vocab_size, **bpe_options)
    else:
        raise ValueError(f"Unknown tokenizer type: {tokenizer_type}")

In [38]:
# Build the vocabulary from the loaded corpus text, using the tokenizer
# created in the previous cell.
# Earlier alternative, kept for reference:
# tokenizer = SimpleTokenizer(vocab_size=hparams["vocab_size"])
tokenizer.build_vocab(text)

Vocabulary built with 6501 tokens


#### Save vocab

In [39]:
# Persist the vocabulary inside the checkpoint directory.
# For the demo the same file is overwritten on every run; change the file name
# below if you want to keep several vocabularies side by side.
os.makedirs(save_dir, exist_ok=True)
vocab_path = os.path.join(save_dir, "vocab.pkl")

tokenizer.save_vocab(vocab_path)
print(f"Vocabulary saved to {vocab_path}")

Vocabulary saved to ./checkpoints\vocab.pkl
Vocabulary saved to ./checkpoints\vocab.pkl


#### Read vocab

In [40]:
# Reload the vocabulary from disk. This is only needed when the vocabulary is
# taken from an existing file instead of being built in this session.
# NOTE: pickle.load can execute arbitrary code, so only open vocabulary files
# that were produced by a trusted run of this notebook.
with open(vocab_path, mode='rb') as vocab_file:
    vocab = pickle.load(vocab_file)


#### Get token ids

In [41]:
# Encode the whole corpus into a sequence of token ids.
token_ids = tokenizer.encode(text)
print(f"Encoded text length: {len(token_ids)} tokens")
# This counts the distinct ids that actually occur in the encoded corpus. It can
# be smaller than the vocabulary size, so the label says "encoded text" rather
# than "vocab".
print("The number of unique tokens in the encoded text is:", len(set(token_ids)))

Encoded text length: 82212 tokens
The number of unique tokens in the vocab is: 6497


In [42]:
# Round-trip check: decode the ids back to text and show the first 100
# characters of the result.
ids_to_words = tokenizer.decode(token_ids)
decoded_preview = ids_to_words[:100]
print(f"Decoded text: {decoded_preview}")


Decoded text: hello, it's me
i was wondering if after all these years, you'd like to meet
to go over everything
th


## Model Training

#### Create data module

In [43]:
# Create data module
data_module = TextDataModule(
    token_ids=token_ids,
    sequence_length=sequence_length,
    batch_size=batch_size,
    train_split=train_split,
    val_split=val_split,
    num_workers=int(os.cpu_count()*0.8) #use half of the cores   #num_workers
)

#### Create model

In [44]:
# Create model
model = TransformerLightningModule(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    d_ff=d_ff,
    dropout=dropout,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps
)

#### Set callbacks

In [45]:
# Create callbacks
callbacks = []

# Early stopping
early_stopping = EarlyStopping(
    monitor=monitor,
    patience=patience,
    min_delta=min_delta,
    mode=mode,
    verbose=True
)
callbacks.append(early_stopping)


# Model checkpointing
checkpoint_callback = ModelCheckpoint(
    dirpath=save_dir,
    filename=f"{experiment_name}-epoch={{epoch:02d}}-val_loss={{val/loss:.3f}}", #-v{trainer.logger.version:02d}",
    monitor=monitor,
    mode=mode,
    auto_insert_metric_name=False, # Prevents the name 'val/loss=' from being prepended
    save_top_k=save_top_k,
    save_last=True,
    verbose=True
)
callbacks.append(checkpoint_callback)

# Learning rate monitoring
lr_monitor = LearningRateMonitor(logging_interval='step')
callbacks.append(lr_monitor)

#### Create trainer and logger

In [46]:
# Create logger
logger = TensorBoardLogger(
    save_dir=log_dir,
    name=experiment_name,
    version=None
)

# Create trainer
trainer = Trainer(
    accelerator=accelerator,
    devices=devices,
    precision=precision,
    max_epochs=max_epochs,
    gradient_clip_val=gradient_clip_val,
    accumulate_grad_batches=accumulate_grad_batches,
    log_every_n_steps=log_every_n_steps,
    val_check_interval=val_check_interval,
    callbacks=callbacks,
    logger=logger,
    deterministic=True,
    enable_progress_bar=True,
    enable_model_summary=True
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


#### Summarize prepared data

In [47]:
# Print model summary
print("\nModel Summary:")
print(f"Vocabulary size: {tokenizer.get_vocab_size()}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

# Check data sizes
print("Data sizes:")
print(f"Total tokens: {len(token_ids)}")
print(f"Sequence length: {sequence_length}")
print(f"Train split: {train_split}")
print(f"Val split: {val_split}")

# Calculate split sizes
total_len = len(token_ids)
train_end = int(total_len * train_split)
val_end = int(total_len * (train_split + val_split))

print(f"Train tokens: {train_end}")
print(f"Val tokens: {val_end - train_end}")
print(f"Test tokens: {total_len - val_end}")

# Check if validation data is sufficient
val_tokens = val_end - train_end
if val_tokens < sequence_length:
    print(f"WARNING: Validation data has only {val_tokens} tokens, less than sequence length {sequence_length}")
    print("This will cause validation to fail. Consider using a larger corpus or adjusting splits.")


Model Summary:
Vocabulary size: 6501
Model parameters: 112,261
Trainable parameters: 112,261
Data sizes:
Total tokens: 82212
Sequence length: 64
Train split: 0.7
Val split: 0.2
Train tokens: 57548
Val tokens: 16442
Test tokens: 8222


#### Train the neural network transformer

In [20]:
# Echo the key settings for this run, followed by a blank line, then start
# fitting the model on the data module.
run_banner = [
    "\nStarting training...",
    "With the updated parameters:",
    f"- Sequence length: {sequence_length}",
    f"- Train split: {train_split}",
    f"- Val split: {val_split}",
    f"- Monitor: {monitor}",
    "",
]
print(*run_banner, sep="\n")

trainer.fit(model, data_module)


Starting training...
With the updated parameters:
- Sequence length: 64
- Train split: 0.7
- Val split: 0.2
- Monitor: val/loss



c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:751: Checkpoint directory C:\code\data_science\demo_llm\checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | model     | TransformerLM    | 88.5 K | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
88.5 K    Trainable params
0         Non-trainable params
88.5 K    Total params
0.354     Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:428: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:428: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Epoch 0: 100%|██████████| 2441/2441 [01:34<00:00, 25.78it/s, v_num=64, train/loss_step=5.850, train/perplexity_step=348.0, val/loss=5.950, val/perplexity=408.0, train/loss_epoch=6.630, train/perplexity_epoch=1.46e+3]

Metric val/loss improved. New best score: 5.954
Epoch 0, global step 2441: 'val/loss' reached 5.95422 (best 5.95422), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=00-val_loss=5.954.ckpt' as top 1


Epoch 1: 100%|██████████| 2441/2441 [02:32<00:00, 15.98it/s, v_num=64, train/loss_step=5.630, train/perplexity_step=280.0, val/loss=5.670, val/perplexity=314.0, train/loss_epoch=5.750, train/perplexity_epoch=315.0]  

Metric val/loss improved by 0.281 >= min_delta = 0.001. New best score: 5.673
Epoch 1, global step 4882: 'val/loss' reached 5.67282 (best 5.67282), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=01-val_loss=5.673.ckpt' as top 1


Epoch 2: 100%|██████████| 2441/2441 [02:33<00:00, 15.94it/s, v_num=64, train/loss_step=5.400, train/perplexity_step=221.0, val/loss=5.430, val/perplexity=248.0, train/loss_epoch=5.500, train/perplexity_epoch=246.0]

Metric val/loss improved by 0.239 >= min_delta = 0.001. New best score: 5.433
Epoch 2, global step 7323: 'val/loss' reached 5.43339 (best 5.43339), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=02-val_loss=5.433.ckpt' as top 1


Epoch 3: 100%|██████████| 2441/2441 [02:33<00:00, 15.92it/s, v_num=64, train/loss_step=5.190, train/perplexity_step=179.0, val/loss=5.270, val/perplexity=211.0, train/loss_epoch=5.310, train/perplexity_epoch=203.0]

Metric val/loss improved by 0.159 >= min_delta = 0.001. New best score: 5.274
Epoch 3, global step 9764: 'val/loss' reached 5.27397 (best 5.27397), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=03-val_loss=5.274.ckpt' as top 1


Epoch 4: 100%|██████████| 2441/2441 [02:32<00:00, 15.98it/s, v_num=64, train/loss_step=4.930, train/perplexity_step=139.0, val/loss=5.170, val/perplexity=188.0, train/loss_epoch=5.160, train/perplexity_epoch=176.0]

Metric val/loss improved by 0.108 >= min_delta = 0.001. New best score: 5.166
Epoch 4, global step 12205: 'val/loss' reached 5.16629 (best 5.16629), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=04-val_loss=5.166.ckpt' as top 1


Epoch 5: 100%|██████████| 2441/2441 [02:32<00:00, 16.05it/s, v_num=64, train/loss_step=4.760, train/perplexity_step=117.0, val/loss=5.070, val/perplexity=170.0, train/loss_epoch=5.050, train/perplexity_epoch=157.0]

Metric val/loss improved by 0.099 >= min_delta = 0.001. New best score: 5.068
Epoch 5, global step 14646: 'val/loss' reached 5.06767 (best 5.06767), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=05-val_loss=5.068.ckpt' as top 1


Epoch 6: 100%|██████████| 2441/2441 [02:32<00:00, 15.96it/s, v_num=64, train/loss_step=4.730, train/perplexity_step=114.0, val/loss=4.970, val/perplexity=154.0, train/loss_epoch=4.960, train/perplexity_epoch=144.0]

Metric val/loss improved by 0.096 >= min_delta = 0.001. New best score: 4.972
Epoch 6, global step 17087: 'val/loss' reached 4.97162 (best 4.97162), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=06-val_loss=4.972.ckpt' as top 1


Epoch 7: 100%|██████████| 2441/2441 [02:32<00:00, 15.98it/s, v_num=64, train/loss_step=4.850, train/perplexity_step=127.0, val/loss=4.890, val/perplexity=142.0, train/loss_epoch=4.890, train/perplexity_epoch=134.0]

Metric val/loss improved by 0.077 >= min_delta = 0.001. New best score: 4.894
Epoch 7, global step 19528: 'val/loss' reached 4.89427 (best 4.89427), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=07-val_loss=4.894.ckpt' as top 1


Epoch 8: 100%|██████████| 2441/2441 [02:32<00:00, 15.97it/s, v_num=64, train/loss_step=4.550, train/perplexity_step=94.80, val/loss=4.850, val/perplexity=135.0, train/loss_epoch=4.830, train/perplexity_epoch=126.0]

Metric val/loss improved by 0.049 >= min_delta = 0.001. New best score: 4.845
Epoch 8, global step 21969: 'val/loss' reached 4.84520 (best 4.84520), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=08-val_loss=4.845.ckpt' as top 1


Epoch 9: 100%|██████████| 2441/2441 [02:32<00:00, 16.03it/s, v_num=64, train/loss_step=4.710, train/perplexity_step=112.0, val/loss=4.800, val/perplexity=129.0, train/loss_epoch=4.790, train/perplexity_epoch=120.0]

Metric val/loss improved by 0.043 >= min_delta = 0.001. New best score: 4.802
Epoch 9, global step 24410: 'val/loss' reached 4.80180 (best 4.80180), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=09-val_loss=4.802.ckpt' as top 1


Epoch 10: 100%|██████████| 2441/2441 [02:32<00:00, 16.00it/s, v_num=64, train/loss_step=4.860, train/perplexity_step=129.0, val/loss=4.760, val/perplexity=123.0, train/loss_epoch=4.750, train/perplexity_epoch=115.0]

Metric val/loss improved by 0.044 >= min_delta = 0.001. New best score: 4.758
Epoch 10, global step 26851: 'val/loss' reached 4.75775 (best 4.75775), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=10-val_loss=4.758.ckpt' as top 1


Epoch 11: 100%|██████████| 2441/2441 [02:33<00:00, 15.93it/s, v_num=64, train/loss_step=4.570, train/perplexity_step=96.20, val/loss=4.720, val/perplexity=118.0, train/loss_epoch=4.710, train/perplexity_epoch=111.0]

Metric val/loss improved by 0.042 >= min_delta = 0.001. New best score: 4.716
Epoch 11, global step 29292: 'val/loss' reached 4.71574 (best 4.71574), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=11-val_loss=4.716.ckpt' as top 1


Epoch 12: 100%|██████████| 2441/2441 [02:32<00:00, 16.06it/s, v_num=64, train/loss_step=4.560, train/perplexity_step=95.30, val/loss=4.680, val/perplexity=114.0, train/loss_epoch=4.680, train/perplexity_epoch=108.0]

Metric val/loss improved by 0.038 >= min_delta = 0.001. New best score: 4.678
Epoch 12, global step 31733: 'val/loss' reached 4.67792 (best 4.67792), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=12-val_loss=4.678.ckpt' as top 1


Epoch 13: 100%|██████████| 2441/2441 [02:32<00:00, 16.02it/s, v_num=64, train/loss_step=4.680, train/perplexity_step=108.0, val/loss=4.650, val/perplexity=111.0, train/loss_epoch=4.650, train/perplexity_epoch=105.0]

Metric val/loss improved by 0.024 >= min_delta = 0.001. New best score: 4.654
Epoch 13, global step 34174: 'val/loss' reached 4.65440 (best 4.65440), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=13-val_loss=4.654.ckpt' as top 1


Epoch 14: 100%|██████████| 2441/2441 [02:32<00:00, 16.01it/s, v_num=64, train/loss_step=4.620, train/perplexity_step=102.0, val/loss=4.630, val/perplexity=108.0, train/loss_epoch=4.630, train/perplexity_epoch=103.0]

Metric val/loss improved by 0.022 >= min_delta = 0.001. New best score: 4.633
Epoch 14, global step 36615: 'val/loss' reached 4.63257 (best 4.63257), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=14-val_loss=4.633.ckpt' as top 1


Epoch 15: 100%|██████████| 2441/2441 [02:33<00:00, 15.91it/s, v_num=64, train/loss_step=4.620, train/perplexity_step=101.0, val/loss=4.610, val/perplexity=105.0, train/loss_epoch=4.610, train/perplexity_epoch=100.0]

Metric val/loss improved by 0.027 >= min_delta = 0.001. New best score: 4.605
Epoch 15, global step 39056: 'val/loss' reached 4.60522 (best 4.60522), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=15-val_loss=4.605.ckpt' as top 1


Epoch 16: 100%|██████████| 2441/2441 [02:32<00:00, 15.99it/s, v_num=64, train/loss_step=4.500, train/perplexity_step=90.10, val/loss=4.580, val/perplexity=103.0, train/loss_epoch=4.590, train/perplexity_epoch=98.40]

Metric val/loss improved by 0.023 >= min_delta = 0.001. New best score: 4.582
Epoch 16, global step 41497: 'val/loss' reached 4.58179 (best 4.58179), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=16-val_loss=4.582.ckpt' as top 1


Epoch 17: 100%|██████████| 2441/2441 [02:32<00:00, 16.01it/s, v_num=64, train/loss_step=4.480, train/perplexity_step=88.30, val/loss=4.560, val/perplexity=101.0, train/loss_epoch=4.570, train/perplexity_epoch=96.80]

Metric val/loss improved by 0.018 >= min_delta = 0.001. New best score: 4.564
Epoch 17, global step 43938: 'val/loss' reached 4.56417 (best 4.56417), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=17-val_loss=4.564.ckpt' as top 1


Epoch 18: 100%|██████████| 2441/2441 [02:32<00:00, 15.98it/s, v_num=64, train/loss_step=4.640, train/perplexity_step=104.0, val/loss=4.550, val/perplexity=99.60, train/loss_epoch=4.560, train/perplexity_epoch=95.50]

Metric val/loss improved by 0.013 >= min_delta = 0.001. New best score: 4.551
Epoch 18, global step 46379: 'val/loss' reached 4.55068 (best 4.55068), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=18-val_loss=4.551.ckpt' as top 1


Epoch 19: 100%|██████████| 2441/2441 [02:33<00:00, 15.89it/s, v_num=64, train/loss_step=4.630, train/perplexity_step=102.0, val/loss=4.530, val/perplexity=98.00, train/loss_epoch=4.540, train/perplexity_epoch=94.20]

Metric val/loss improved by 0.016 >= min_delta = 0.001. New best score: 4.535
Epoch 19, global step 48820: 'val/loss' reached 4.53475 (best 4.53475), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=19-val_loss=4.535.ckpt' as top 1


Epoch 20: 100%|██████████| 2441/2441 [02:32<00:00, 15.99it/s, v_num=64, train/loss_step=4.530, train/perplexity_step=92.60, val/loss=4.520, val/perplexity=96.20, train/loss_epoch=4.530, train/perplexity_epoch=93.20]

Metric val/loss improved by 0.017 >= min_delta = 0.001. New best score: 4.517
Epoch 20, global step 51261: 'val/loss' reached 4.51726 (best 4.51726), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=20-val_loss=4.517.ckpt' as top 1


Epoch 21: 100%|██████████| 2441/2441 [02:32<00:00, 16.01it/s, v_num=64, train/loss_step=4.790, train/perplexity_step=121.0, val/loss=4.510, val/perplexity=95.00, train/loss_epoch=4.520, train/perplexity_epoch=92.10]

Metric val/loss improved by 0.012 >= min_delta = 0.001. New best score: 4.505
Epoch 21, global step 53702: 'val/loss' reached 4.50510 (best 4.50510), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=21-val_loss=4.505.ckpt' as top 1


Epoch 22: 100%|██████████| 2441/2441 [02:33<00:00, 15.95it/s, v_num=64, train/loss_step=4.670, train/perplexity_step=107.0, val/loss=4.490, val/perplexity=94.00, train/loss_epoch=4.510, train/perplexity_epoch=91.20]

Metric val/loss improved by 0.010 >= min_delta = 0.001. New best score: 4.495
Epoch 22, global step 56143: 'val/loss' reached 4.49464 (best 4.49464), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=22-val_loss=4.495.ckpt' as top 1


Epoch 23: 100%|██████████| 2441/2441 [02:33<00:00, 15.87it/s, v_num=64, train/loss_step=4.410, train/perplexity_step=81.90, val/loss=4.490, val/perplexity=93.10, train/loss_epoch=4.500, train/perplexity_epoch=90.40]

Metric val/loss improved by 0.008 >= min_delta = 0.001. New best score: 4.487
Epoch 23, global step 58584: 'val/loss' reached 4.48665 (best 4.48665), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=23-val_loss=4.487.ckpt' as top 1


Epoch 24: 100%|██████████| 2441/2441 [02:40<00:00, 15.18it/s, v_num=64, train/loss_step=4.310, train/perplexity_step=74.40, val/loss=4.480, val/perplexity=92.30, train/loss_epoch=4.490, train/perplexity_epoch=89.70]

Metric val/loss improved by 0.008 >= min_delta = 0.001. New best score: 4.479
Epoch 24, global step 61025: 'val/loss' reached 4.47875 (best 4.47875), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=24-val_loss=4.479.ckpt' as top 1


Epoch 25: 100%|██████████| 2441/2441 [02:33<00:00, 15.95it/s, v_num=64, train/loss_step=4.520, train/perplexity_step=91.60, val/loss=4.460, val/perplexity=91.00, train/loss_epoch=4.490, train/perplexity_epoch=89.10]

Metric val/loss improved by 0.014 >= min_delta = 0.001. New best score: 4.465
Epoch 25, global step 63466: 'val/loss' reached 4.46451 (best 4.46451), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=25-val_loss=4.465.ckpt' as top 1


Epoch 26: 100%|██████████| 2441/2441 [02:32<00:00, 16.02it/s, v_num=64, train/loss_step=4.490, train/perplexity_step=89.00, val/loss=4.460, val/perplexity=90.10, train/loss_epoch=4.480, train/perplexity_epoch=88.40]

Metric val/loss improved by 0.008 >= min_delta = 0.001. New best score: 4.456
Epoch 26, global step 65907: 'val/loss' reached 4.45603 (best 4.45603), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=26-val_loss=4.456.ckpt' as top 1


Epoch 27: 100%|██████████| 2441/2441 [02:32<00:00, 16.00it/s, v_num=64, train/loss_step=4.430, train/perplexity_step=83.90, val/loss=4.450, val/perplexity=89.20, train/loss_epoch=4.470, train/perplexity_epoch=87.90]

Metric val/loss improved by 0.010 >= min_delta = 0.001. New best score: 4.446
Epoch 27, global step 68348: 'val/loss' reached 4.44554 (best 4.44554), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=27-val_loss=4.446.ckpt' as top 1


Epoch 28: 100%|██████████| 2441/2441 [02:31<00:00, 16.09it/s, v_num=64, train/loss_step=4.390, train/perplexity_step=80.70, val/loss=4.440, val/perplexity=89.10, train/loss_epoch=4.470, train/perplexity_epoch=87.40]

Metric val/loss improved by 0.001 >= min_delta = 0.001. New best score: 4.444
Epoch 28, global step 70789: 'val/loss' reached 4.44428 (best 4.44428), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=28-val_loss=4.444.ckpt' as top 1


Epoch 29: 100%|██████████| 2441/2441 [02:31<00:00, 16.14it/s, v_num=64, train/loss_step=4.610, train/perplexity_step=100.0, val/loss=4.430, val/perplexity=88.20, train/loss_epoch=4.460, train/perplexity_epoch=86.90]

Metric val/loss improved by 0.010 >= min_delta = 0.001. New best score: 4.434
Epoch 29, global step 73230: 'val/loss' reached 4.43401 (best 4.43401), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=29-val_loss=4.434.ckpt' as top 1


Epoch 30: 100%|██████████| 2441/2441 [02:30<00:00, 16.18it/s, v_num=64, train/loss_step=4.360, train/perplexity_step=77.90, val/loss=4.430, val/perplexity=87.60, train/loss_epoch=4.460, train/perplexity_epoch=86.50]

Metric val/loss improved by 0.006 >= min_delta = 0.001. New best score: 4.428
Epoch 30, global step 75671: 'val/loss' reached 4.42755 (best 4.42755), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=30-val_loss=4.428.ckpt' as top 1


Epoch 31: 100%|██████████| 2441/2441 [02:31<00:00, 16.11it/s, v_num=64, train/loss_step=4.460, train/perplexity_step=86.80, val/loss=4.420, val/perplexity=87.00, train/loss_epoch=4.450, train/perplexity_epoch=85.90]

Metric val/loss improved by 0.006 >= min_delta = 0.001. New best score: 4.421
Epoch 31, global step 78112: 'val/loss' reached 4.42148 (best 4.42148), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=31-val_loss=4.421.ckpt' as top 1


Epoch 32: 100%|██████████| 2441/2441 [02:30<00:00, 16.27it/s, v_num=64, train/loss_step=4.680, train/perplexity_step=108.0, val/loss=4.420, val/perplexity=86.50, train/loss_epoch=4.450, train/perplexity_epoch=85.60]

Metric val/loss improved by 0.006 >= min_delta = 0.001. New best score: 4.415
Epoch 32, global step 80553: 'val/loss' reached 4.41511 (best 4.41511), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=32-val_loss=4.415.ckpt' as top 1


Epoch 33: 100%|██████████| 2441/2441 [02:30<00:00, 16.23it/s, v_num=64, train/loss_step=4.390, train/perplexity_step=80.80, val/loss=4.410, val/perplexity=86.10, train/loss_epoch=4.440, train/perplexity_epoch=85.30]

Metric val/loss improved by 0.004 >= min_delta = 0.001. New best score: 4.411
Epoch 33, global step 82994: 'val/loss' reached 4.41141 (best 4.41141), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=33-val_loss=4.411.ckpt' as top 1


Epoch 34: 100%|██████████| 2441/2441 [02:30<00:00, 16.25it/s, v_num=64, train/loss_step=4.510, train/perplexity_step=91.20, val/loss=4.400, val/perplexity=85.60, train/loss_epoch=4.440, train/perplexity_epoch=85.00]

Metric val/loss improved by 0.007 >= min_delta = 0.001. New best score: 4.405
Epoch 34, global step 85435: 'val/loss' reached 4.40477 (best 4.40477), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=34-val_loss=4.405.ckpt' as top 1


Epoch 35: 100%|██████████| 2441/2441 [02:30<00:00, 16.18it/s, v_num=64, train/loss_step=4.450, train/perplexity_step=85.70, val/loss=4.400, val/perplexity=85.30, train/loss_epoch=4.440, train/perplexity_epoch=84.60]

Metric val/loss improved by 0.004 >= min_delta = 0.001. New best score: 4.401
Epoch 35, global step 87876: 'val/loss' reached 4.40100 (best 4.40100), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=35-val_loss=4.401.ckpt' as top 1


Epoch 36: 100%|██████████| 2441/2441 [02:30<00:00, 16.21it/s, v_num=64, train/loss_step=4.480, train/perplexity_step=87.90, val/loss=4.400, val/perplexity=84.70, train/loss_epoch=4.430, train/perplexity_epoch=84.30]

Metric val/loss improved by 0.006 >= min_delta = 0.001. New best score: 4.395
Epoch 36, global step 90317: 'val/loss' reached 4.39513 (best 4.39513), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=36-val_loss=4.395.ckpt' as top 1


Epoch 37: 100%|██████████| 2441/2441 [02:30<00:00, 16.23it/s, v_num=64, train/loss_step=4.350, train/perplexity_step=77.80, val/loss=4.390, val/perplexity=84.60, train/loss_epoch=4.430, train/perplexity_epoch=83.90]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 4.393
Epoch 37, global step 92758: 'val/loss' reached 4.39325 (best 4.39325), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=37-val_loss=4.393.ckpt' as top 1


Epoch 38: 100%|██████████| 2441/2441 [02:30<00:00, 16.19it/s, v_num=64, train/loss_step=4.420, train/perplexity_step=82.90, val/loss=4.390, val/perplexity=84.70, train/loss_epoch=4.430, train/perplexity_epoch=83.80]

Epoch 38, global step 95199: 'val/loss' was not in top 1


Epoch 39: 100%|██████████| 2441/2441 [02:31<00:00, 16.14it/s, v_num=64, train/loss_step=4.380, train/perplexity_step=79.60, val/loss=4.390, val/perplexity=83.90, train/loss_epoch=4.420, train/perplexity_epoch=83.50]

Metric val/loss improved by 0.007 >= min_delta = 0.001. New best score: 4.386
Epoch 39, global step 97640: 'val/loss' reached 4.38634 (best 4.38634), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=39-val_loss=4.386.ckpt' as top 1


Epoch 40: 100%|██████████| 2441/2441 [02:30<00:00, 16.18it/s, v_num=64, train/loss_step=4.310, train/perplexity_step=74.80, val/loss=4.380, val/perplexity=83.60, train/loss_epoch=4.420, train/perplexity_epoch=83.10]

Metric val/loss improved by 0.005 >= min_delta = 0.001. New best score: 4.381
Epoch 40, global step 100081: 'val/loss' reached 4.38139 (best 4.38139), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=40-val_loss=4.381.ckpt' as top 1


Epoch 41: 100%|██████████| 2441/2441 [02:31<00:00, 16.16it/s, v_num=64, train/loss_step=4.520, train/perplexity_step=91.80, val/loss=4.380, val/perplexity=83.20, train/loss_epoch=4.420, train/perplexity_epoch=83.00]

Metric val/loss improved by 0.003 >= min_delta = 0.001. New best score: 4.378
Epoch 41, global step 102522: 'val/loss' reached 4.37803 (best 4.37803), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=41-val_loss=4.378.ckpt' as top 1


Epoch 42: 100%|██████████| 2441/2441 [02:30<00:00, 16.19it/s, v_num=64, train/loss_step=4.280, train/perplexity_step=72.20, val/loss=4.370, val/perplexity=82.90, train/loss_epoch=4.410, train/perplexity_epoch=82.80]

Metric val/loss improved by 0.005 >= min_delta = 0.001. New best score: 4.374
Epoch 42, global step 104963: 'val/loss' reached 4.37351 (best 4.37351), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=42-val_loss=4.374.ckpt' as top 1


Epoch 43: 100%|██████████| 2441/2441 [02:29<00:00, 16.35it/s, v_num=64, train/loss_step=4.340, train/perplexity_step=76.80, val/loss=4.380, val/perplexity=83.30, train/loss_epoch=4.410, train/perplexity_epoch=82.60]

Epoch 43, global step 107404: 'val/loss' was not in top 1


Epoch 44: 100%|██████████| 2441/2441 [02:28<00:00, 16.39it/s, v_num=64, train/loss_step=4.220, train/perplexity_step=68.20, val/loss=4.370, val/perplexity=82.30, train/loss_epoch=4.410, train/perplexity_epoch=82.40]

Metric val/loss improved by 0.007 >= min_delta = 0.001. New best score: 4.367
Epoch 44, global step 109845: 'val/loss' reached 4.36658 (best 4.36658), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=44-val_loss=4.367.ckpt' as top 1


Epoch 45: 100%|██████████| 2441/2441 [02:29<00:00, 16.37it/s, v_num=64, train/loss_step=4.300, train/perplexity_step=73.50, val/loss=4.360, val/perplexity=82.10, train/loss_epoch=4.410, train/perplexity_epoch=82.10]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 4.364
Epoch 45, global step 112286: 'val/loss' reached 4.36415 (best 4.36415), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=45-val_loss=4.364.ckpt' as top 1


Epoch 46: 100%|██████████| 2441/2441 [02:29<00:00, 16.28it/s, v_num=64, train/loss_step=4.600, train/perplexity_step=99.40, val/loss=4.370, val/perplexity=82.20, train/loss_epoch=4.400, train/perplexity_epoch=81.90]

Epoch 46, global step 114727: 'val/loss' was not in top 1


Epoch 47: 100%|██████████| 2441/2441 [02:30<00:00, 16.27it/s, v_num=64, train/loss_step=4.220, train/perplexity_step=68.00, val/loss=4.370, val/perplexity=82.30, train/loss_epoch=4.400, train/perplexity_epoch=81.80]

Epoch 47, global step 117168: 'val/loss' was not in top 1


Epoch 48: 100%|██████████| 2441/2441 [02:29<00:00, 16.33it/s, v_num=64, train/loss_step=4.640, train/perplexity_step=104.0, val/loss=4.360, val/perplexity=81.60, train/loss_epoch=4.400, train/perplexity_epoch=81.70]

Metric val/loss improved by 0.006 >= min_delta = 0.001. New best score: 4.358
Epoch 48, global step 119609: 'val/loss' reached 4.35775 (best 4.35775), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=48-val_loss=4.358.ckpt' as top 1


Epoch 49: 100%|██████████| 2441/2441 [02:31<00:00, 16.08it/s, v_num=64, train/loss_step=4.450, train/perplexity_step=85.70, val/loss=4.350, val/perplexity=81.30, train/loss_epoch=4.400, train/perplexity_epoch=81.40]

Metric val/loss improved by 0.004 >= min_delta = 0.001. New best score: 4.354
Epoch 49, global step 122050: 'val/loss' reached 4.35384 (best 4.35384), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=49-val_loss=4.354.ckpt' as top 1


Epoch 50: 100%|██████████| 2441/2441 [02:29<00:00, 16.27it/s, v_num=64, train/loss_step=4.390, train/perplexity_step=81.00, val/loss=4.350, val/perplexity=81.00, train/loss_epoch=4.390, train/perplexity_epoch=81.20]

Metric val/loss improved by 0.004 >= min_delta = 0.001. New best score: 4.350
Epoch 50, global step 124491: 'val/loss' reached 4.34974 (best 4.34974), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=50-val_loss=4.350.ckpt' as top 1


Epoch 51: 100%|██████████| 2441/2441 [02:30<00:00, 16.20it/s, v_num=64, train/loss_step=4.510, train/perplexity_step=91.00, val/loss=4.350, val/perplexity=81.30, train/loss_epoch=4.390, train/perplexity_epoch=81.10]

Epoch 51, global step 126932: 'val/loss' was not in top 1


Epoch 52: 100%|██████████| 2441/2441 [02:29<00:00, 16.29it/s, v_num=64, train/loss_step=4.410, train/perplexity_step=82.10, val/loss=4.350, val/perplexity=81.10, train/loss_epoch=4.390, train/perplexity_epoch=81.00]

Epoch 52, global step 129373: 'val/loss' was not in top 1


Epoch 53: 100%|██████████| 2441/2441 [02:30<00:00, 16.23it/s, v_num=64, train/loss_step=4.380, train/perplexity_step=79.50, val/loss=4.340, val/perplexity=80.40, train/loss_epoch=4.390, train/perplexity_epoch=80.80]

Metric val/loss improved by 0.007 >= min_delta = 0.001. New best score: 4.342
Epoch 53, global step 131814: 'val/loss' reached 4.34241 (best 4.34241), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=53-val_loss=4.342.ckpt' as top 1


Epoch 54: 100%|██████████| 2441/2441 [02:30<00:00, 16.23it/s, v_num=64, train/loss_step=4.460, train/perplexity_step=86.70, val/loss=4.340, val/perplexity=80.60, train/loss_epoch=4.390, train/perplexity_epoch=80.60]

Epoch 54, global step 134255: 'val/loss' was not in top 1


Epoch 55: 100%|██████████| 2441/2441 [02:30<00:00, 16.18it/s, v_num=64, train/loss_step=4.360, train/perplexity_step=78.10, val/loss=4.340, val/perplexity=80.60, train/loss_epoch=4.390, train/perplexity_epoch=80.50]

Epoch 55, global step 136696: 'val/loss' was not in top 1


Epoch 56: 100%|██████████| 2441/2441 [02:29<00:00, 16.28it/s, v_num=64, train/loss_step=4.310, train/perplexity_step=74.20, val/loss=4.340, val/perplexity=80.50, train/loss_epoch=4.380, train/perplexity_epoch=80.40]

Epoch 56, global step 139137: 'val/loss' was not in top 1


Epoch 57: 100%|██████████| 2441/2441 [02:30<00:00, 16.21it/s, v_num=64, train/loss_step=4.470, train/perplexity_step=87.00, val/loss=4.340, val/perplexity=80.10, train/loss_epoch=4.380, train/perplexity_epoch=80.30]

Metric val/loss improved by 0.003 >= min_delta = 0.001. New best score: 4.339
Epoch 57, global step 141578: 'val/loss' reached 4.33923 (best 4.33923), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=57-val_loss=4.339.ckpt' as top 1


Epoch 58: 100%|██████████| 2441/2441 [02:30<00:00, 16.24it/s, v_num=64, train/loss_step=4.340, train/perplexity_step=76.70, val/loss=4.340, val/perplexity=80.00, train/loss_epoch=4.380, train/perplexity_epoch=80.00]

Metric val/loss improved by 0.004 >= min_delta = 0.001. New best score: 4.335
Epoch 58, global step 144019: 'val/loss' reached 4.33516 (best 4.33516), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=58-val_loss=4.335.ckpt' as top 1


Epoch 59: 100%|██████████| 2441/2441 [02:30<00:00, 16.17it/s, v_num=64, train/loss_step=4.370, train/perplexity_step=79.00, val/loss=4.330, val/perplexity=79.70, train/loss_epoch=4.380, train/perplexity_epoch=80.00]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 4.333
Epoch 59, global step 146460: 'val/loss' reached 4.33287 (best 4.33287), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=59-val_loss=4.333.ckpt' as top 1


Epoch 60: 100%|██████████| 2441/2441 [02:29<00:00, 16.29it/s, v_num=64, train/loss_step=4.420, train/perplexity_step=83.20, val/loss=4.330, val/perplexity=79.80, train/loss_epoch=4.380, train/perplexity_epoch=79.90]

Epoch 60, global step 148901: 'val/loss' was not in top 1


Epoch 61: 100%|██████████| 2441/2441 [02:30<00:00, 16.19it/s, v_num=64, train/loss_step=4.360, train/perplexity_step=78.40, val/loss=4.330, val/perplexity=79.90, train/loss_epoch=4.380, train/perplexity_epoch=79.70]

Epoch 61, global step 151342: 'val/loss' was not in top 1


Epoch 62: 100%|██████████| 2441/2441 [02:29<00:00, 16.28it/s, v_num=64, train/loss_step=4.460, train/perplexity_step=86.10, val/loss=4.330, val/perplexity=79.60, train/loss_epoch=4.380, train/perplexity_epoch=79.70]

Epoch 62, global step 153783: 'val/loss' reached 4.33212 (best 4.33212), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=62-val_loss=4.332.ckpt' as top 1


Epoch 63: 100%|██████████| 2441/2441 [02:31<00:00, 16.15it/s, v_num=64, train/loss_step=4.530, train/perplexity_step=92.70, val/loss=4.330, val/perplexity=79.20, train/loss_epoch=4.370, train/perplexity_epoch=79.40]

Metric val/loss improved by 0.007 >= min_delta = 0.001. New best score: 4.326
Epoch 63, global step 156224: 'val/loss' reached 4.32589 (best 4.32589), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=63-val_loss=4.326.ckpt' as top 1


Epoch 64: 100%|██████████| 2441/2441 [02:30<00:00, 16.27it/s, v_num=64, train/loss_step=4.520, train/perplexity_step=91.70, val/loss=4.330, val/perplexity=79.10, train/loss_epoch=4.370, train/perplexity_epoch=79.40]

Epoch 64, global step 158665: 'val/loss' reached 4.32507 (best 4.32507), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=64-val_loss=4.325.ckpt' as top 1


Epoch 65: 100%|██████████| 2441/2441 [02:30<00:00, 16.21it/s, v_num=64, train/loss_step=4.560, train/perplexity_step=95.70, val/loss=4.330, val/perplexity=79.30, train/loss_epoch=4.370, train/perplexity_epoch=79.30]

Epoch 65, global step 161106: 'val/loss' was not in top 1


Epoch 66: 100%|██████████| 2441/2441 [02:30<00:00, 16.27it/s, v_num=64, train/loss_step=4.390, train/perplexity_step=80.90, val/loss=4.330, val/perplexity=78.90, train/loss_epoch=4.370, train/perplexity_epoch=79.20]

Epoch 66, global step 163547: 'val/loss' was not in top 1


Epoch 67: 100%|██████████| 2441/2441 [02:31<00:00, 16.12it/s, v_num=64, train/loss_step=4.640, train/perplexity_step=104.0, val/loss=4.320, val/perplexity=78.80, train/loss_epoch=4.370, train/perplexity_epoch=79.00]

Metric val/loss improved by 0.005 >= min_delta = 0.001. New best score: 4.321
Epoch 67, global step 165988: 'val/loss' reached 4.32107 (best 4.32107), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=67-val_loss=4.321.ckpt' as top 1


Epoch 68: 100%|██████████| 2441/2441 [02:30<00:00, 16.22it/s, v_num=64, train/loss_step=4.290, train/perplexity_step=72.90, val/loss=4.320, val/perplexity=78.60, train/loss_epoch=4.370, train/perplexity_epoch=78.90]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 4.319
Epoch 68, global step 168429: 'val/loss' reached 4.31931 (best 4.31931), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=68-val_loss=4.319.ckpt' as top 1


Epoch 69: 100%|██████████| 2441/2441 [02:30<00:00, 16.20it/s, v_num=64, train/loss_step=4.450, train/perplexity_step=85.80, val/loss=4.320, val/perplexity=78.50, train/loss_epoch=4.370, train/perplexity_epoch=78.80]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 4.317
Epoch 69, global step 170870: 'val/loss' reached 4.31708 (best 4.31708), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=69-val_loss=4.317.ckpt' as top 1


Epoch 70: 100%|██████████| 2441/2441 [02:30<00:00, 16.22it/s, v_num=64, train/loss_step=4.560, train/perplexity_step=95.40, val/loss=4.320, val/perplexity=78.70, train/loss_epoch=4.360, train/perplexity_epoch=78.80]

Epoch 70, global step 173311: 'val/loss' was not in top 1


Epoch 71: 100%|██████████| 2441/2441 [02:31<00:00, 16.09it/s, v_num=64, train/loss_step=4.280, train/perplexity_step=72.60, val/loss=4.310, val/perplexity=78.10, train/loss_epoch=4.360, train/perplexity_epoch=78.60]

Metric val/loss improved by 0.004 >= min_delta = 0.001. New best score: 4.313
Epoch 71, global step 175752: 'val/loss' reached 4.31296 (best 4.31296), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=71-val_loss=4.313.ckpt' as top 1


Epoch 72: 100%|██████████| 2441/2441 [02:30<00:00, 16.25it/s, v_num=64, train/loss_step=4.170, train/perplexity_step=64.40, val/loss=4.310, val/perplexity=78.30, train/loss_epoch=4.360, train/perplexity_epoch=78.50]

Epoch 72, global step 178193: 'val/loss' was not in top 1


Epoch 73: 100%|██████████| 2441/2441 [02:31<00:00, 16.14it/s, v_num=64, train/loss_step=4.210, train/perplexity_step=67.60, val/loss=4.310, val/perplexity=78.00, train/loss_epoch=4.360, train/perplexity_epoch=78.40]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 4.311
Epoch 73, global step 180634: 'val/loss' reached 4.31144 (best 4.31144), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=73-val_loss=4.311.ckpt' as top 1


Epoch 74: 100%|██████████| 2441/2441 [02:29<00:00, 16.31it/s, v_num=64, train/loss_step=4.380, train/perplexity_step=79.50, val/loss=4.310, val/perplexity=77.60, train/loss_epoch=4.360, train/perplexity_epoch=78.30]

Metric val/loss improved by 0.004 >= min_delta = 0.001. New best score: 4.308
Epoch 74, global step 183075: 'val/loss' reached 4.30753 (best 4.30753), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=74-val_loss=4.308.ckpt' as top 1


Epoch 75: 100%|██████████| 2441/2441 [02:31<00:00, 16.12it/s, v_num=64, train/loss_step=4.400, train/perplexity_step=81.30, val/loss=4.310, val/perplexity=77.80, train/loss_epoch=4.360, train/perplexity_epoch=78.20]

Epoch 75, global step 185516: 'val/loss' was not in top 1


Epoch 76: 100%|██████████| 2441/2441 [02:30<00:00, 16.25it/s, v_num=64, train/loss_step=4.620, train/perplexity_step=102.0, val/loss=4.310, val/perplexity=77.70, train/loss_epoch=4.360, train/perplexity_epoch=78.10]

Epoch 76, global step 187957: 'val/loss' reached 4.30701 (best 4.30701), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=76-val_loss=4.307.ckpt' as top 1


Epoch 77: 100%|██████████| 2441/2441 [02:31<00:00, 16.10it/s, v_num=64, train/loss_step=4.340, train/perplexity_step=77.10, val/loss=4.300, val/perplexity=77.50, train/loss_epoch=4.350, train/perplexity_epoch=77.90]

Metric val/loss improved by 0.003 >= min_delta = 0.001. New best score: 4.304
Epoch 77, global step 190398: 'val/loss' reached 4.30448 (best 4.30448), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=77-val_loss=4.304.ckpt' as top 1


Epoch 78: 100%|██████████| 2441/2441 [02:37<00:00, 15.50it/s, v_num=64, train/loss_step=4.530, train/perplexity_step=92.80, val/loss=4.310, val/perplexity=77.60, train/loss_epoch=4.350, train/perplexity_epoch=77.90]

Epoch 78, global step 192839: 'val/loss' was not in top 1


Epoch 79: 100%|██████████| 2441/2441 [02:36<00:00, 15.55it/s, v_num=64, train/loss_step=4.520, train/perplexity_step=91.60, val/loss=4.310, val/perplexity=77.90, train/loss_epoch=4.350, train/perplexity_epoch=77.80]

Epoch 79, global step 195280: 'val/loss' was not in top 1


Epoch 80: 100%|██████████| 2441/2441 [02:35<00:00, 15.70it/s, v_num=64, train/loss_step=4.440, train/perplexity_step=84.50, val/loss=4.310, val/perplexity=77.50, train/loss_epoch=4.350, train/perplexity_epoch=77.70]

Epoch 80, global step 197721: 'val/loss' was not in top 1


Epoch 81: 100%|██████████| 2441/2441 [02:35<00:00, 15.68it/s, v_num=64, train/loss_step=4.440, train/perplexity_step=84.50, val/loss=4.300, val/perplexity=77.10, train/loss_epoch=4.350, train/perplexity_epoch=77.50]

Metric val/loss improved by 0.005 >= min_delta = 0.001. New best score: 4.299
Epoch 81, global step 200162: 'val/loss' reached 4.29947 (best 4.29947), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=81-val_loss=4.299.ckpt' as top 1


Epoch 82: 100%|██████████| 2441/2441 [02:35<00:00, 15.65it/s, v_num=64, train/loss_step=4.480, train/perplexity_step=87.80, val/loss=4.300, val/perplexity=77.10, train/loss_epoch=4.350, train/perplexity_epoch=77.50]

Epoch 82, global step 202603: 'val/loss' reached 4.29940 (best 4.29940), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=82-val_loss=4.299.ckpt' as top 1


Epoch 83: 100%|██████████| 2441/2441 [02:35<00:00, 15.70it/s, v_num=64, train/loss_step=4.320, train/perplexity_step=75.30, val/loss=4.300, val/perplexity=76.90, train/loss_epoch=4.350, train/perplexity_epoch=77.40]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 4.297
Epoch 83, global step 205044: 'val/loss' reached 4.29736 (best 4.29736), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=83-val_loss=4.297.ckpt' as top 1


Epoch 84: 100%|██████████| 2441/2441 [02:35<00:00, 15.70it/s, v_num=64, train/loss_step=4.140, train/perplexity_step=62.90, val/loss=4.300, val/perplexity=77.10, train/loss_epoch=4.350, train/perplexity_epoch=77.30]

Epoch 84, global step 207485: 'val/loss' was not in top 1


Epoch 85: 100%|██████████| 2441/2441 [02:34<00:00, 15.77it/s, v_num=64, train/loss_step=4.470, train/perplexity_step=87.60, val/loss=4.290, val/perplexity=76.50, train/loss_epoch=4.340, train/perplexity_epoch=77.20]

Metric val/loss improved by 0.005 >= min_delta = 0.001. New best score: 4.293
Epoch 85, global step 209926: 'val/loss' reached 4.29263 (best 4.29263), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=85-val_loss=4.293.ckpt' as top 1


Epoch 86: 100%|██████████| 2441/2441 [02:36<00:00, 15.64it/s, v_num=64, train/loss_step=4.200, train/perplexity_step=66.50, val/loss=4.290, val/perplexity=76.50, train/loss_epoch=4.340, train/perplexity_epoch=77.10]

Epoch 86, global step 212367: 'val/loss' was not in top 1


Epoch 87: 100%|██████████| 2441/2441 [02:34<00:00, 15.75it/s, v_num=64, train/loss_step=4.100, train/perplexity_step=60.20, val/loss=4.290, val/perplexity=76.40, train/loss_epoch=4.340, train/perplexity_epoch=77.00]

Metric val/loss improved by 0.001 >= min_delta = 0.001. New best score: 4.291
Epoch 87, global step 214808: 'val/loss' reached 4.29119 (best 4.29119), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=87-val_loss=4.291.ckpt' as top 1


Epoch 88: 100%|██████████| 2441/2441 [02:34<00:00, 15.77it/s, v_num=64, train/loss_step=4.570, train/perplexity_step=96.30, val/loss=4.290, val/perplexity=76.50, train/loss_epoch=4.340, train/perplexity_epoch=77.00]

Epoch 88, global step 217249: 'val/loss' was not in top 1


Epoch 89: 100%|██████████| 2441/2441 [02:35<00:00, 15.71it/s, v_num=64, train/loss_step=4.420, train/perplexity_step=82.80, val/loss=4.290, val/perplexity=76.70, train/loss_epoch=4.340, train/perplexity_epoch=76.90]

Epoch 89, global step 219690: 'val/loss' was not in top 1


Epoch 90: 100%|██████████| 2441/2441 [02:36<00:00, 15.57it/s, v_num=64, train/loss_step=4.500, train/perplexity_step=89.90, val/loss=4.290, val/perplexity=76.30, train/loss_epoch=4.340, train/perplexity_epoch=76.80]

Metric val/loss improved by 0.001 >= min_delta = 0.001. New best score: 4.290
Epoch 90, global step 222131: 'val/loss' reached 4.28989 (best 4.28989), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=90-val_loss=4.290.ckpt' as top 1


Epoch 91: 100%|██████████| 2441/2441 [02:34<00:00, 15.80it/s, v_num=64, train/loss_step=4.260, train/perplexity_step=70.80, val/loss=4.290, val/perplexity=76.20, train/loss_epoch=4.340, train/perplexity_epoch=76.60]

Metric val/loss improved by 0.002 >= min_delta = 0.001. New best score: 4.288
Epoch 91, global step 224572: 'val/loss' reached 4.28772 (best 4.28772), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=91-val_loss=4.288.ckpt' as top 1


Epoch 92: 100%|██████████| 2441/2441 [02:35<00:00, 15.70it/s, v_num=64, train/loss_step=4.420, train/perplexity_step=83.30, val/loss=4.290, val/perplexity=76.20, train/loss_epoch=4.340, train/perplexity_epoch=76.60]

Epoch 92, global step 227013: 'val/loss' reached 4.28771 (best 4.28771), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=92-val_loss=4.288.ckpt' as top 1


Epoch 93: 100%|██████████| 2441/2441 [02:35<00:00, 15.75it/s, v_num=64, train/loss_step=4.140, train/perplexity_step=62.90, val/loss=4.290, val/perplexity=76.00, train/loss_epoch=4.340, train/perplexity_epoch=76.60]

Epoch 93, global step 229454: 'val/loss' reached 4.28682 (best 4.28682), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=93-val_loss=4.287.ckpt' as top 1


Epoch 94: 100%|██████████| 2441/2441 [02:35<00:00, 15.68it/s, v_num=64, train/loss_step=4.540, train/perplexity_step=93.80, val/loss=4.280, val/perplexity=75.80, train/loss_epoch=4.330, train/perplexity_epoch=76.40]

Metric val/loss improved by 0.004 >= min_delta = 0.001. New best score: 4.284
Epoch 94, global step 231895: 'val/loss' reached 4.28390 (best 4.28390), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=94-val_loss=4.284.ckpt' as top 1


Epoch 95: 100%|██████████| 2441/2441 [02:35<00:00, 15.73it/s, v_num=64, train/loss_step=4.320, train/perplexity_step=74.90, val/loss=4.290, val/perplexity=76.00, train/loss_epoch=4.330, train/perplexity_epoch=76.30]

Epoch 95, global step 234336: 'val/loss' was not in top 1


Epoch 96: 100%|██████████| 2441/2441 [02:35<00:00, 15.70it/s, v_num=64, train/loss_step=4.350, train/perplexity_step=77.60, val/loss=4.280, val/perplexity=75.80, train/loss_epoch=4.330, train/perplexity_epoch=76.20]

Epoch 96, global step 236777: 'val/loss' reached 4.28292 (best 4.28292), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=96-val_loss=4.283.ckpt' as top 1


Epoch 97: 100%|██████████| 2441/2441 [02:35<00:00, 15.68it/s, v_num=64, train/loss_step=4.450, train/perplexity_step=85.50, val/loss=4.280, val/perplexity=75.80, train/loss_epoch=4.330, train/perplexity_epoch=76.20]

Epoch 97, global step 239218: 'val/loss' was not in top 1


Epoch 98: 100%|██████████| 2441/2441 [02:36<00:00, 15.62it/s, v_num=64, train/loss_step=4.420, train/perplexity_step=83.00, val/loss=4.290, val/perplexity=76.20, train/loss_epoch=4.330, train/perplexity_epoch=76.10]

Epoch 98, global step 241659: 'val/loss' was not in top 1


Epoch 99: 100%|██████████| 2441/2441 [02:34<00:00, 15.79it/s, v_num=64, train/loss_step=4.140, train/perplexity_step=62.80, val/loss=4.280, val/perplexity=75.70, train/loss_epoch=4.330, train/perplexity_epoch=76.00]

Metric val/loss improved by 0.003 >= min_delta = 0.001. New best score: 4.281
Epoch 99, global step 244100: 'val/loss' reached 4.28105 (best 4.28105), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=99-val_loss=4.281.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: 100%|██████████| 2441/2441 [02:34<00:00, 15.78it/s, v_num=64, train/loss_step=4.140, train/perplexity_step=62.80, val/loss=4.280, val/perplexity=75.70, train/loss_epoch=4.330, train/perplexity_epoch=76.00]


#### Test & Save the model

In [25]:
# Test the model, then export a versioned copy of the checkpoint to ship.
import shutil

# Evaluate the same weights that get exported below. When the checkpoint
# callback recorded a best checkpoint, test that file rather than whatever
# weights the model holds after the last epoch. An empty best_model_path
# becomes None, in which case Lightning tests the in-memory model.
best_ckpt_path = checkpoint_callback.best_model_path or None

print("\nTesting model...")
trainer.test(model, data_module, ckpt_path=best_ckpt_path)

# The logger version tags the exported file name; the vocab cell reuses it.
version = trainer.logger.version

# Save best model instead of final model
if best_ckpt_path:
    # Copy the best model to a final location with version number
    final_model_path = os.path.join(save_dir, f"{experiment_name}-best-v{version:02d}.ckpt")
    shutil.copy2(best_ckpt_path, final_model_path)
    print(f"Best model copied to {final_model_path}")
else:
    # No best checkpoint was recorded, so fall back to the current trainer state.
    print("No best model found, saving current model as final")
    final_model_path = os.path.join(save_dir, f"{experiment_name}-final-v{version:02d}.ckpt")
    trainer.save_checkpoint(final_model_path)
    print(f"Final model saved to {final_model_path}")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Testing model...
Testing DataLoader 0: 100%|██████████| 347/347 [00:01<00:00, 188.48it/s]


Best model copied to ./checkpoints\transformer_lm-best-v64.ckpt


In [26]:
# Also save the vocab with the version number, in the same directory and with
# the same version tag as the exported checkpoint.
vocab_path = os.path.join(save_dir, f"vocab-v{version:02d}.pkl")
with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)
# Report success only after the `with` block has closed the file, so the
# message is never printed for a file that is still open.
print(f"Vocab saved to {vocab_path}")

Vocab saved to ./checkpoints\vocab-v64.pkl


In [27]:
# Recap where each artifact of this run was written.
summary_lines = [
    "\nTraining completed!",
    f"Checkpoints saved in: {save_dir}",
    f"Logs saved in: {log_dir}",
    f"Vocabulary saved in: {vocab_path}",
    f"Best model: {checkpoint_callback.best_model_path}",
    f"Best score: {checkpoint_callback.best_model_score}",
    f"Final model saved as: {final_model_path}",
]
print("\n".join(summary_lines))

# Print instructions for running the app, framed by a 50-character rule.
banner = "=" * 50
launch_command = (
    f"python -m src.app.gradio_app --model_path {final_model_path} --vocab_path {vocab_path}"
)
print("\n" + banner)
print("To run the Gradio app with your trained model:")
print(launch_command)
print(banner)


Training completed!
Checkpoints saved in: ./checkpoints
Logs saved in: ./logs
Vocabulary saved in: ./checkpoints\vocab-v64.pkl
Best model: C:\code\data_science\demo_llm\checkpoints\transformer_lm-epoch=99-val_loss=4.281.ckpt
Best score: 4.2810516357421875
Final model saved as: ./checkpoints\transformer_lm-best-v64.ckpt

To run the Gradio app with your trained model:
python -m src.app.gradio_app --model_path ./checkpoints\transformer_lm-best-v64.ckpt --vocab_path ./checkpoints\vocab-v64.pkl


In [28]:
# vocab['word_to_idx']['brain,']