# **Transformer Language Model**

## Imports

In [1]:
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device count: {torch.cuda.device_count()}")
print(f"Current device: {torch.cuda.current_device() if torch.cuda.is_available() else 'N/A'}")

CUDA available: True
Device count: 1
Current device: 0


#### Set Root Dir

In [2]:
import sys
import os

# Use the current working directory as root (or go up if needed)
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '../..'))  # Adjust '..' as needed
sys.path.append(ROOT_DIR)

#### Import Modules

In [3]:
"""
Training script for the transformer language model.
"""
import argparse
import torch
import pytorch_lightning as pl
import pickle
import yaml
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import Trainer

from src.data.tokenizer import SimpleTokenizer, load_text_corpus, create_sample_corpus
from src.data.tokenizer_factory import TokenizerFactory
from src.data.dataset import TextDataModule
from src.model.lightning_module import TransformerLightningModule

  from .autonotebook import tqdm as notebook_tqdm


## Load Params

#### Load yaml config

In [4]:
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

#### Set hyperparams

In [5]:
hparams = config["hparams"]

# this is not the best way to do unpack the hyperparams, but it's just for the demo
vocab_size, d_model, num_heads, num_layers, d_ff, \
sequence_length, stride, batch_size, learning_rate, max_epochs, \
patience, min_delta, warmup_steps, weight_decay, dropout, \
train_split, val_split, num_workers, accelerator, devices, precision, \
gradient_clip_val, accumulate_grad_batches, log_every_n_steps, \
val_check_interval, save_top_k, monitor, mode = hparams.values()

In [6]:
hparams

{'vocab_size': 8000,
 'd_model': 8,
 'num_heads': 1,
 'num_layers': 2,
 'd_ff': 32,
 'sequence_length': 64,
 'stride': 1,
 'batch_size': 32,
 'learning_rate': 0.0005,
 'max_epochs': 100,
 'patience': 10,
 'min_delta': 0.001,
 'warmup_steps': 1000,
 'weight_decay': 0.01,
 'dropout': 0.2,
 'train_split': 0.7,
 'val_split': 0.2,
 'num_workers': 0,
 'accelerator': 'auto',
 'devices': 1,
 'precision': '32',
 'gradient_clip_val': 1.0,
 'accumulate_grad_batches': 1,
 'log_every_n_steps': 50,
 'val_check_interval': 1.0,
 'save_top_k': 1,
 'monitor': 'val/loss',
 'mode': 'min'}

#### Set paths and general config

In [7]:
paths = config["paths"]
general = config["general"]

corpus_path=paths["corpus_path"]
save_dir=paths["save_dir"]
log_dir=paths["log_dir"]
experiment_name=general["experiment_name"]
create_sample=general["create_sample"]

## Data preprocessing

#### Load text

In [8]:
# Load and tokenize text
print("Loading and tokenizing text...")
text = load_text_corpus(corpus_path)

Loading and tokenizing text...


#### Tokenize

In [9]:
    # Create tokenizer and build vocabulary
    tokenizer_config = config.get("tokenizer", {})
    tokenizer_type = tokenizer_config.get("type", "word")
    
    if tokenizer_type == "word":
        tokenizer = TokenizerFactory.create_tokenizer("word", vocab_size=vocab_size)
    elif tokenizer_type == "bpe":
        bpe_options = tokenizer_config.get("bpe_options", {})
        tokenizer = TokenizerFactory.create_tokenizer("bpe", vocab_size=vocab_size, **bpe_options)
    else:
        raise ValueError(f"Unknown tokenizer type: {tokenizer_type}")

In [10]:
# Create tokenizer and build vocabulary
# tokenizer = SimpleTokenizer(vocab_size=hparams["vocab_size"])
tokenizer.build_vocab(text)

Vocabulary built with 8002 tokens


#### Save vocab

In [None]:
# Save vocabulary.
# For the demo, we override the vocab file. you can adjust the vocal file name as you like.
vocab_path = os.path.join(save_dir, "vocab.pkl")
os.makedirs(save_dir, exist_ok=True)
tokenizer.save_vocab(vocab_path)
print(f"Vocabulary saved to {vocab_path}")

#### Read vocab

In [12]:
# read vocab
# this step is only necessary if loading the vocab from a file instead of creating one
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)


In [13]:
vocab['word_to_idx']

{'<PAD>': 0,
 '<UNK>': 1,
 '<SOS>': 2,
 '<EOS>': 3,
 '<LINE_BREAK>': 4,
 '<SONG_BREAK>': 5,
 'i': 6,
 'you': 7,
 'the': 8,
 'and': 9,
 'to': 10,
 'a': 11,
 'me': 12,
 'my': 13,
 'it': 14,
 'in': 15,
 "i'm": 16,
 'that': 17,
 'your': 18,
 'on': 19,
 'of': 20,
 'all': 21,
 'but': 22,
 "don't": 23,
 'we': 24,
 'like': 25,
 'know': 26,
 'be': 27,
 'for': 28,
 'when': 29,
 'is': 30,
 'so': 31,
 "it's": 32,
 'just': 33,
 'with': 34,
 'this': 35,
 'love': 36,
 'what': 37,
 "you're": 38,
 'was': 39,
 'got': 40,
 'up': 41,
 'if': 42,
 'do': 43,
 'never': 44,
 'no': 45,
 'not': 46,
 'out': 47,
 "'cause": 48,
 'can': 49,
 "can't": 50,
 'now': 51,
 'wanna': 52,
 'time': 53,
 'she': 54,
 'one': 55,
 'get': 56,
 'go': 57,
 'me,': 58,
 'are': 59,
 'at': 60,
 'see': 61,
 'want': 62,
 'oh,': 63,
 'say': 64,
 'let': 65,
 'from': 66,
 'they': 67,
 'make': 68,
 'take': 69,
 'yeah,': 70,
 'been': 71,
 'have': 72,
 'you,': 73,
 "i'll": 74,
 'yeah': 75,
 'tell': 76,
 'feel': 77,
 "i've": 78,
 'could': 79,
 '

#### Get token ids

In [13]:
# Encode text
token_ids = tokenizer.encode(text)
print(f"Encoded text length: {len(token_ids)} tokens")
print("The number of unique tokens in the vocab is:", len(set(token_ids)))

Encoded text length: 103967 tokens
The number of unique tokens in the vocab is: 3541


In [14]:
ids_to_words = tokenizer.decode(token_ids)
print(f"Decoded text: {ids_to_words[:100]}")


Decoded text: hello, it's me i was wondering if after all these years, you'd like to meet to go over everything th


## Model Training

#### Create data module

In [15]:
# Create data module
data_module = TextDataModule(
    token_ids=token_ids,
    sequence_length=sequence_length,
    stride=stride,
    batch_size=batch_size,
    train_split=train_split,
    val_split=val_split,
    num_workers=int(os.cpu_count()*0.8) #use half of the cores   #num_workers
)

#### Create model

In [16]:
# Create model
model = TransformerLightningModule(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    d_ff=d_ff,
    dropout=dropout,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps
)

#### Set callback

In [17]:
# Create callbacks
callbacks = []

# Early stopping
early_stopping = EarlyStopping(
    monitor=monitor,
    patience=patience,
    min_delta=min_delta,
    mode=mode,
    verbose=True
)
callbacks.append(early_stopping)


# Model checkpointing
checkpoint_callback = ModelCheckpoint(
    dirpath=save_dir,
    filename=f"{experiment_name}-epoch={{epoch:02d}}-val_loss={{val/loss:.3f}}", #-v{trainer.logger.version:02d}",
    monitor=monitor,
    mode=mode,
    auto_insert_metric_name=False, # Prevents the name 'val/loss=' from being prepended
    save_top_k=save_top_k,
    save_last=True,
    verbose=True
)
callbacks.append(checkpoint_callback)

# Learning rate monitoring
lr_monitor = LearningRateMonitor(logging_interval='step')
callbacks.append(lr_monitor)

#### Create trainer and logger

In [18]:
# Create logger
logger = TensorBoardLogger(
    save_dir=log_dir,
    name=experiment_name,
    version=None
)

# Create trainer
trainer = Trainer(
    accelerator=accelerator,
    devices=devices,
    precision=precision,
    max_epochs=max_epochs,
    gradient_clip_val=gradient_clip_val,
    accumulate_grad_batches=accumulate_grad_batches,
    log_every_n_steps=log_every_n_steps,
    val_check_interval=val_check_interval,
    callbacks=callbacks,
    logger=logger,
    deterministic=True,
    enable_progress_bar=True,
    enable_model_summary=True
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


#### Summarize prepared data

In [19]:
# Print model summary
print("\nModel Summary:")
print(f"Vocabulary size: {tokenizer.get_vocab_size()}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

# Check data sizes
print("Data sizes:")
print(f"Total tokens: {len(token_ids)}")
print(f"Sequence length: {sequence_length}")
print(f"Stride: {stride}")
print(f"Train split: {train_split}")
print(f"Val split: {val_split}")

# Calculate split sizes
total_len = len(token_ids)
train_end = int(total_len * train_split)
val_end = int(total_len * (train_split + val_split))

print(f"Train tokens: {train_end}")
print(f"Val tokens: {val_end - train_end}")
print(f"Test tokens: {total_len - val_end}")

# Check if validation data is sufficient
val_tokens = val_end - train_end
if val_tokens < sequence_length:
    print(f"WARNING: Validation data has only {val_tokens} tokens, less than sequence length {sequence_length}")
    print("This will cause validation to fail. Consider using a larger corpus or adjusting splits.")


Model Summary:
Vocabulary size: 4076
Model parameters: 20,768
Trainable parameters: 20,768
Data sizes:
Total tokens: 103967
Sequence length: 64
Stride: 1
Train split: 0.7
Val split: 0.2
Train tokens: 72776
Val tokens: 20794
Test tokens: 10397


#### Train the neural netork transformer

In [20]:
# Train model
print("\nStarting training...")
print("With the updated parameters:")
print(f"- Sequence length: {sequence_length}")
print(f"- Train split: {train_split}")
print(f"- Val split: {val_split}")
print(f"- Monitor: {monitor}")
print()

trainer.fit(model, data_module)


Starting training...
With the updated parameters:
- Sequence length: 64
- Train split: 0.7
- Val split: 0.2
- Monitor: val/loss



c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:751: Checkpoint directory C:\code\data_science\demo_llm\checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | model     | TransformerLM    | 20.8 K | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
20.8 K    Trainable params
0         Non-trainable params
20.8 K    Total params
0.083     Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:428: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:428: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Epoch 0: 100%|██████████| 2273/2273 [01:48<00:00, 20.94it/s, v_num=69, train/loss_step=7.010, train/perplexity_step=1.11e+3, val/loss=7.070, val/perplexity=1.19e+3, train/loss_epoch=7.810, train/perplexity_epoch=2.95e+3]

Metric val/loss improved. New best score: 7.067
Epoch 0, global step 2273: 'val/loss' reached 7.06729 (best 7.06729), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=00-val_loss=7.067.ckpt' as top 1


Epoch 1: 100%|██████████| 2273/2273 [02:43<00:00, 13.89it/s, v_num=69, train/loss_step=5.490, train/perplexity_step=242.0, val/loss=5.660, val/perplexity=308.0, train/loss_epoch=6.070, train/perplexity_epoch=492.0]      

Metric val/loss improved by 1.405 >= min_delta = 0.001. New best score: 5.662
Epoch 1, global step 4546: 'val/loss' reached 5.66237 (best 5.66237), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=01-val_loss=5.662.ckpt' as top 1


Epoch 2: 100%|██████████| 2273/2273 [02:34<00:00, 14.70it/s, v_num=69, train/loss_step=5.470, train/perplexity_step=238.0, val/loss=5.550, val/perplexity=287.0, train/loss_epoch=5.520, train/perplexity_epoch=251.0]

Metric val/loss improved by 0.108 >= min_delta = 0.001. New best score: 5.554
Epoch 2, global step 6819: 'val/loss' reached 5.55396 (best 5.55396), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=02-val_loss=5.554.ckpt' as top 1


Epoch 3: 100%|██████████| 2273/2273 [02:33<00:00, 14.84it/s, v_num=69, train/loss_step=5.380, train/perplexity_step=216.0, val/loss=5.560, val/perplexity=294.0, train/loss_epoch=5.490, train/perplexity_epoch=243.0]

Epoch 3, global step 9092: 'val/loss' was not in top 1


Epoch 4: 100%|██████████| 2273/2273 [02:33<00:00, 14.84it/s, v_num=69, train/loss_step=5.540, train/perplexity_step=255.0, val/loss=5.580, val/perplexity=303.0, train/loss_epoch=5.480, train/perplexity_epoch=241.0]

Epoch 4, global step 11365: 'val/loss' was not in top 1


Epoch 5: 100%|██████████| 2273/2273 [02:32<00:00, 14.88it/s, v_num=69, train/loss_step=5.580, train/perplexity_step=266.0, val/loss=5.600, val/perplexity=311.0, train/loss_epoch=5.480, train/perplexity_epoch=240.0]

Epoch 5, global step 13638: 'val/loss' was not in top 1


Epoch 6: 100%|██████████| 2273/2273 [02:32<00:00, 14.95it/s, v_num=69, train/loss_step=5.370, train/perplexity_step=215.0, val/loss=5.620, val/perplexity=321.0, train/loss_epoch=5.480, train/perplexity_epoch=240.0]

Epoch 6, global step 15911: 'val/loss' was not in top 1


Epoch 7: 100%|██████████| 2273/2273 [02:31<00:00, 14.98it/s, v_num=69, train/loss_step=5.240, train/perplexity_step=188.0, val/loss=5.640, val/perplexity=331.0, train/loss_epoch=5.480, train/perplexity_epoch=240.0]

Epoch 7, global step 18184: 'val/loss' was not in top 1


Epoch 8: 100%|██████████| 2273/2273 [02:48<00:00, 13.46it/s, v_num=69, train/loss_step=5.790, train/perplexity_step=328.0, val/loss=5.650, val/perplexity=342.0, train/loss_epoch=5.480, train/perplexity_epoch=240.0]

Epoch 8, global step 20457: 'val/loss' was not in top 1


Epoch 9: 100%|██████████| 2273/2273 [02:32<00:00, 14.94it/s, v_num=69, train/loss_step=5.390, train/perplexity_step=219.0, val/loss=5.670, val/perplexity=354.0, train/loss_epoch=5.480, train/perplexity_epoch=240.0]

Epoch 9, global step 22730: 'val/loss' was not in top 1


Epoch 10: 100%|██████████| 2273/2273 [02:31<00:00, 14.97it/s, v_num=69, train/loss_step=5.420, train/perplexity_step=227.0, val/loss=5.690, val/perplexity=366.0, train/loss_epoch=5.480, train/perplexity_epoch=239.0]

Epoch 10, global step 25003: 'val/loss' was not in top 1


Epoch 11: 100%|██████████| 2273/2273 [02:32<00:00, 14.90it/s, v_num=69, train/loss_step=5.270, train/perplexity_step=194.0, val/loss=5.700, val/perplexity=377.0, train/loss_epoch=5.470, train/perplexity_epoch=239.0]

Epoch 11, global step 27276: 'val/loss' was not in top 1


Epoch 12: 100%|██████████| 2273/2273 [02:33<00:00, 14.76it/s, v_num=69, train/loss_step=5.420, train/perplexity_step=225.0, val/loss=5.710, val/perplexity=386.0, train/loss_epoch=5.470, train/perplexity_epoch=239.0]

Monitored metric val/loss did not improve in the last 10 records. Best score: 5.554. Signaling Trainer to stop.
Epoch 12, global step 29549: 'val/loss' was not in top 1


Epoch 12: 100%|██████████| 2273/2273 [02:34<00:00, 14.76it/s, v_num=69, train/loss_step=5.420, train/perplexity_step=225.0, val/loss=5.710, val/perplexity=386.0, train/loss_epoch=5.470, train/perplexity_epoch=239.0]


#### Test & Save the model

In [None]:
# Test model
print("\nTesting model...")
trainer.test(model, data_module)

# Save best model instead of final model
if checkpoint_callback.best_model_path:
    # Copy the best model to a final location with version number
    import shutil
    version = trainer.logger.version
    final_model_path = os.path.join(save_dir, f"{experiment_name}-best-v{version:02d}.ckpt")
    shutil.copy2(checkpoint_callback.best_model_path, final_model_path)
    print(f"Best model copied to {final_model_path}")
    
    # Also save as the standard final model name (overwrites previous)
    standard_final_path = os.path.join(save_dir, f"{experiment_name}-final.ckpt")
    shutil.copy2(checkpoint_callback.best_model_path, standard_final_path)
    print(f"Best model also saved as {standard_final_path}")
else:
    print("No best model found, saving current model as final")
    version = trainer.logger.version
    final_model_path = os.path.join(save_dir, f"{experiment_name}-final-v{version:02d}.ckpt")
    trainer.save_checkpoint(final_model_path)
    print(f"Final model saved to {final_model_path}")
    
    # Also save as the standard final model name (overwrites previous)
    standard_final_path = os.path.join(save_dir, f"{experiment_name}-final.ckpt")
    shutil.copy2(final_model_path, standard_final_path)
    print(f"Final model also saved as {standard_final_path}")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Testing model...


c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:428: Consider setting `persistent_workers=True` in 'test_dataloader' to speed up the dataloader worker initialization.

Detected KeyboardInterrupt, attempting graceful shutdown ...


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Also save the vocab with the version number
vocab_path = os.path.join(save_dir, f"vocab-v{version:02d}.pkl")
with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)
    print(f"Vocab saved to {vocab_path}")

# Also save as the standard vocab name (overwrites previous)
standard_vocab_path = os.path.join(save_dir, "vocab.pkl")
with open(standard_vocab_path, 'wb') as f:
    pickle.dump(vocab, f)
    print(f"Vocab also saved as {standard_vocab_path}")

Vocab saved to ./checkpoints\vocab-v66.pkl
Vocab also saved as ./checkpoints\vocab.pkl


In [None]:
print("\nTraining completed!")
print(f"Checkpoints saved in: {save_dir}")
print(f"Logs saved in: {log_dir}")
print(f"Vocabulary saved in: {vocab_path}")
print(f"Best model: {checkpoint_callback.best_model_path}")
print(f"Best score: {checkpoint_callback.best_model_score}")
print(f"Final model saved as: {final_model_path}")

# Print instructions for running the app
print("\n" + "="*50)
print("To run the Gradio app with your trained model:")
print(f"python -m src.app.gradio_app --model_path {final_model_path} --vocab_path {vocab_path}")

print("="*50)


Training completed!
Checkpoints saved in: ./checkpoints
Logs saved in: ./logs
Vocabulary saved in: ./checkpoints\vocab-v66.pkl
Best model: C:\code\data_science\demo_llm\checkpoints\transformer_lm-epoch=03-val_loss=6.070.ckpt
Best score: 6.070318698883057
Final model saved as: ./checkpoints\transformer_lm-best-v66.ckpt

To run the Gradio app with your trained model:
python -m src.app.gradio_app --model_path ./checkpoints\transformer_lm-best-v66.ckpt --vocab_path ./checkpoints\vocab-v66.pkl
