# **Transformer Language Model**

## Imports

In [1]:
import torch
# Report the CUDA environment; availability is queried once and reused.
cuda_ready = torch.cuda.is_available()
# Only ask for the current device index when CUDA is actually usable.
active_device = torch.cuda.current_device() if cuda_ready else 'N/A'

print(f"CUDA available: {cuda_ready}")
print(f"Device count: {torch.cuda.device_count()}")
print(f"Current device: {active_device}")

CUDA available: True
Device count: 1
Current device: 0


#### Set Root Dir

In [2]:
import sys
import os

# Project root: two directory levels above the current working directory.
# Adjust the number of '..' segments if the notebook is moved.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '../..'))

# Make the project importable (the `src.*` imports below rely on this).
# The membership check keeps re-running this cell from adding duplicate entries.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

#### Import Modules

In [3]:
"""
Training script for the transformer language model.
"""
import argparse
import torch
import pytorch_lightning as pl
import pickle
import yaml
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning import Trainer

from src.data.tokenizer import SimpleTokenizer, load_text_corpus, create_sample_corpus
from src.data.tokenizer_factory import TokenizerFactory
from src.data.dataset import TextDataModule
from src.model.lightning_module import TransformerLightningModule

  from .autonotebook import tqdm as notebook_tqdm


## Load Params

#### Load yaml config

In [4]:
# Load the experiment configuration from config.yaml in the working directory.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

#### Set hyperparams

In [5]:
hparams = config["hparams"]

# Hyperparameters are bound by key name rather than by the position of each
# entry in config.yaml, so reordering the YAML keys (or adding new ones) can
# no longer silently assign a value to the wrong variable.
HPARAM_KEYS = (
    "vocab_size", "d_model", "num_heads", "num_layers", "d_ff",
    "sequence_length", "batch_size", "learning_rate", "max_epochs",
    "patience", "min_delta", "warmup_steps", "weight_decay", "dropout",
    "train_split", "val_split", "num_workers", "accelerator", "devices", "precision",
    "gradient_clip_val", "accumulate_grad_batches", "log_every_n_steps",
    "val_check_interval", "save_top_k", "monitor", "mode",
)

# Fail early, naming every missing key, instead of a generic unpacking error.
missing_keys = [key for key in HPARAM_KEYS if key not in hparams]
if missing_keys:
    raise KeyError(f"config.yaml is missing hyperparameters: {missing_keys}")

# The target names below are listed in exactly the same order as HPARAM_KEYS.
(
    vocab_size, d_model, num_heads, num_layers, d_ff,
    sequence_length, batch_size, learning_rate, max_epochs,
    patience, min_delta, warmup_steps, weight_decay, dropout,
    train_split, val_split, num_workers, accelerator, devices, precision,
    gradient_clip_val, accumulate_grad_batches, log_every_n_steps,
    val_check_interval, save_top_k, monitor, mode,
) = (hparams[key] for key in HPARAM_KEYS)

In [6]:
# Display the loaded hyperparameter dictionary for a quick visual check.
hparams

{'vocab_size': 8000,
 'd_model': 8,
 'num_heads': 1,
 'num_layers': 2,
 'd_ff': 32,
 'sequence_length': 64,
 'batch_size': 32,
 'learning_rate': 0.0005,
 'max_epochs': 100,
 'patience': 10,
 'min_delta': 0.001,
 'warmup_steps': 1000,
 'weight_decay': 0.01,
 'dropout': 0.2,
 'train_split': 0.7,
 'val_split': 0.2,
 'num_workers': 0,
 'accelerator': 'gpu',
 'devices': 1,
 'precision': '32',
 'gradient_clip_val': 1.0,
 'accumulate_grad_batches': 1,
 'log_every_n_steps': 50,
 'val_check_interval': 1.0,
 'save_top_k': 1,
 'monitor': 'val/loss',
 'mode': 'min'}

#### Set paths and general config

In [7]:
# Pull the filesystem locations and general run settings out of the config.
paths = config["paths"]
general = config["general"]

# Where the corpus lives, and where checkpoints and logs are written.
corpus_path, save_dir, log_dir = (
    paths[key] for key in ("corpus_path", "save_dir", "log_dir")
)

# Run name (used for checkpoint and log naming) and the sample-corpus flag.
experiment_name, create_sample = (
    general[key] for key in ("experiment_name", "create_sample")
)

## Data preprocessing

#### Load text

In [8]:
# Read the raw training corpus from `corpus_path`.
# Only loading happens in this cell; the text is encoded into token ids in a
# later cell.
# NOTE(review): `create_sample` (read from the config) and the imported
# `create_sample_corpus` are never used in this notebook — confirm whether a
# sample-corpus fallback was intended here.
print("Loading and tokenizing text...")
text = load_text_corpus(corpus_path)

Loading and tokenizing text...


#### Tokenize

In [9]:
    # Create the tokenizer selected in the optional `tokenizer` config section.
    # (The vocabulary itself is built in the next cell.)
    # NOTE(review): this cell is indented as a whole — presumably pasted from a
    # function body; it relies on the notebook tolerating the leading indent.
    #
    # `or {}` / `or "word"` also cover keys that are present but left empty in
    # the YAML (parsed as None), which `.get(key, default)` would not.
    tokenizer_config = config.get("tokenizer") or {}
    tokenizer_type = tokenizer_config.get("type") or "word"

    if tokenizer_type == "word":
        tokenizer = TokenizerFactory.create_tokenizer("word", vocab_size=vocab_size)
    elif tokenizer_type == "bpe":
        # Extra keyword arguments forwarded to the BPE tokenizer, if any.
        bpe_options = tokenizer_config.get("bpe_options") or {}
        tokenizer = TokenizerFactory.create_tokenizer("bpe", vocab_size=vocab_size, **bpe_options)
    else:
        raise ValueError(f"Unknown tokenizer type: {tokenizer_type}")

In [10]:
# Build the vocabulary from the loaded corpus text, using the tokenizer
# created in the previous cell.
tokenizer.build_vocab(text)

Vocabulary built with 6501 tokens


#### Save vocab

In [11]:
# Persist the vocabulary next to the checkpoints.
# The file name is fixed for this demo, so every run overwrites the previous
# vocab.pkl; change the file name below if you want to keep several.
os.makedirs(save_dir, exist_ok=True)  # make sure the target directory exists
vocab_path = os.path.join(save_dir, "vocab.pkl")
tokenizer.save_vocab(vocab_path)
print(f"Vocabulary saved to {vocab_path}")

Vocabulary saved to ./checkpoints\vocab.pkl
Vocabulary saved to ./checkpoints\vocab.pkl


#### Read vocab

In [12]:
# Load the pickled vocabulary back from disk.
# The resulting `vocab` object is re-saved under a versioned file name near the
# end of the notebook, so this cell has to run even when the vocabulary was
# just built above.
# pickle.load can execute arbitrary code: only load vocab files you created yourself.
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)


#### Get token ids

In [13]:
# Encode the whole corpus into a sequence of token ids.
token_ids = tokenizer.encode(text)
print(f"Encoded text length: {len(token_ids)} tokens")
# This counts the distinct ids that actually occur in the encoded corpus; it is
# not the vocabulary size (see tokenizer.get_vocab_size()), so the message says so.
print("The number of unique tokens in the encoded text is:", len(set(token_ids)))

Encoded text length: 164423 tokens
The number of unique tokens in the vocab is: 6497


In [14]:
# Round-trip sanity check: decode the token ids again and show the first 100
# elements of the result (characters, given the text shown in the output).
ids_to_words = tokenizer.decode(token_ids)
print(f"Decoded text: {ids_to_words[:100]}")


Decoded text: hello, it's me
i was wondering if after all these years, you'd like to meet
to go over everything
th


## Model Training

#### Create data module

In [15]:
# Size the DataLoader worker pool at roughly 80% of the logical CPU cores.
# Note: the `num_workers` hyperparameter read from config.yaml is not used here.
# os.cpu_count() can return None when the core count cannot be determined,
# so fall back to a single core instead of failing on `None * 0.8`.
loader_workers = int((os.cpu_count() or 1) * 0.8)

# Create data module
data_module = TextDataModule(
    token_ids=token_ids,
    sequence_length=sequence_length,
    batch_size=batch_size,
    train_split=train_split,
    val_split=val_split,
    num_workers=loader_workers,
)

#### Create model

In [16]:
# Seed the Python, NumPy and PyTorch random number generators before the model
# is built, so weight initialisation (and later shuffling) is repeatable across
# runs. The Trainer below requests deterministic=True, which does not fix any
# seed by itself. An optional `seed` entry in the `general` config section
# overrides the default of 42.
seed = general.get("seed", 42)
pl.seed_everything(seed, workers=True)

# Create model
# The vocabulary size comes from the tokenizer actually built above (which may
# be smaller than the configured upper bound), not from the config value.
model = TransformerLightningModule(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    d_ff=d_ff,
    dropout=dropout,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps
)

#### Set callback

In [17]:
# Early stopping on the monitored metric, using the configured patience and
# minimum improvement.
early_stopping = EarlyStopping(
    monitor=monitor,
    mode=mode,
    patience=patience,
    min_delta=min_delta,
    verbose=True,
)

# Checkpoint file name template. The doubled braces survive the f-string as
# literal `{epoch:02d}` / `{val/loss:.3f}` placeholders for the callback.
checkpoint_name = f"{experiment_name}-epoch={{epoch:02d}}-val_loss={{val/loss:.3f}}"

# Keep the top-k checkpoints by the monitored metric, plus the most recent one.
checkpoint_callback = ModelCheckpoint(
    dirpath=save_dir,
    filename=checkpoint_name,
    monitor=monitor,
    mode=mode,
    auto_insert_metric_name=False,  # Prevents the name 'val/loss=' from being prepended
    save_top_k=save_top_k,
    save_last=True,
    verbose=True,
)

# Log the learning rate at every optimisation step.
lr_monitor = LearningRateMonitor(logging_interval='step')

# Callbacks handed to the Trainer, in registration order.
callbacks = [early_stopping, checkpoint_callback, lr_monitor]

#### Create trainer and logger

In [18]:
# TensorBoard logger; version=None leaves the run version to the logger.
logger = TensorBoardLogger(save_dir=log_dir, name=experiment_name, version=None)

# Trainer settings that come straight from the hyperparameter config.
trainer_options = dict(
    accelerator=accelerator,
    devices=devices,
    precision=precision,
    max_epochs=max_epochs,
    gradient_clip_val=gradient_clip_val,
    accumulate_grad_batches=accumulate_grad_batches,
    log_every_n_steps=log_every_n_steps,
    val_check_interval=val_check_interval,
)

# Fixed settings for this notebook: callbacks, logging and console output.
trainer_options.update(
    callbacks=callbacks,
    logger=logger,
    deterministic=True,
    enable_progress_bar=True,
    enable_model_summary=True,
)

trainer = Trainer(**trainer_options)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


#### Summarize prepared data

In [19]:
# --- Gather the numbers first -------------------------------------------------
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

# Token counts per split, derived from the configured split fractions.
total_len = len(token_ids)
train_end = int(total_len * train_split)
val_end = int(total_len * (train_split + val_split))
val_tokens = val_end - train_end
test_tokens = total_len - val_end

# --- Report -------------------------------------------------------------------
print("\nModel Summary:")
print(f"Vocabulary size: {tokenizer.get_vocab_size()}")
print(f"Model parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

print("Data sizes:")
print(f"Total tokens: {total_len}")
print(f"Sequence length: {sequence_length}")
print(f"Train split: {train_split}")
print(f"Val split: {val_split}")

print(f"Train tokens: {train_end}")
print(f"Val tokens: {val_tokens}")
print(f"Test tokens: {test_tokens}")

# A validation split shorter than one sequence cannot form a single sample.
if val_tokens < sequence_length:
    print(f"WARNING: Validation data has only {val_tokens} tokens, less than sequence length {sequence_length}")
    print("This will cause validation to fail. Consider using a larger corpus or adjusting splits.")


Model Summary:
Vocabulary size: 6501
Model parameters: 112,261
Trainable parameters: 112,261
Data sizes:
Total tokens: 164423
Sequence length: 64
Train split: 0.7
Val split: 0.2
Train tokens: 115096
Val tokens: 32884
Test tokens: 16443


#### Train the neural network transformer

In [None]:
# Echo the key settings for this run, then start training.
print("\nStarting training...")
print("With the updated parameters:")
run_settings = (
    ("Sequence length", sequence_length),
    ("Train split", train_split),
    ("Val split", val_split),
    ("Monitor", monitor),
)
for label, value in run_settings:
    print(f"- {label}: {value}")
print()

# Fit the model on the data module.
trainer.fit(model, data_module)


Starting training...
With the updated parameters:
- Sequence length: 64
- Train split: 0.7
- Val split: 0.2
- Monitor: val/loss



c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:751: Checkpoint directory C:\code\data_science\demo_llm\checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | model     | TransformerLM    | 112 K  | train
1 | criterion | CrossEntropyLoss | 0      | train
-------------------------------------------------------
112 K     Trainable params
0         Non-trainable params
112 K     Total params
0.449     Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:428: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

c:\code\data_science\demo_llm\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:428: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Epoch 0: 100%|██████████| 3595/3595 [01:56<00:00, 30.96it/s, v_num=65, train/loss_step=5.850, train/perplexity_step=349.0, val/loss=5.820, val/perplexity=379.0, train/loss_epoch=6.460, train/perplexity_epoch=1.3e+3]

Metric val/loss improved. New best score: 5.823
Epoch 0, global step 3595: 'val/loss' reached 5.82302 (best 5.82302), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=00-val_loss=5.823.ckpt' as top 1


Epoch 1: 100%|██████████| 3595/3595 [02:59<00:00, 20.00it/s, v_num=65, train/loss_step=5.490, train/perplexity_step=241.0, val/loss=5.510, val/perplexity=278.0, train/loss_epoch=5.670, train/perplexity_epoch=292.0] 

Metric val/loss improved by 0.311 >= min_delta = 0.001. New best score: 5.512
Epoch 1, global step 7190: 'val/loss' reached 5.51241 (best 5.51241), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=01-val_loss=5.512.ckpt' as top 1


Epoch 2: 100%|██████████| 3595/3595 [02:59<00:00, 19.99it/s, v_num=65, train/loss_step=5.540, train/perplexity_step=255.0, val/loss=5.310, val/perplexity=225.0, train/loss_epoch=5.400, train/perplexity_epoch=222.0]

Metric val/loss improved by 0.200 >= min_delta = 0.001. New best score: 5.312
Epoch 2, global step 10785: 'val/loss' reached 5.31238 (best 5.31238), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=02-val_loss=5.312.ckpt' as top 1


Epoch 3: 100%|██████████| 3595/3595 [02:46<00:00, 21.53it/s, v_num=65, train/loss_step=5.230, train/perplexity_step=188.0, val/loss=5.160, val/perplexity=192.0, train/loss_epoch=5.220, train/perplexity_epoch=186.0]

Metric val/loss improved by 0.148 >= min_delta = 0.001. New best score: 5.164
Epoch 3, global step 14380: 'val/loss' reached 5.16393 (best 5.16393), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=03-val_loss=5.164.ckpt' as top 1


Epoch 4: 100%|██████████| 3595/3595 [02:54<00:00, 20.65it/s, v_num=65, train/loss_step=5.110, train/perplexity_step=165.0, val/loss=5.040, val/perplexity=168.0, train/loss_epoch=5.100, train/perplexity_epoch=164.0]

Metric val/loss improved by 0.122 >= min_delta = 0.001. New best score: 5.042
Epoch 4, global step 17975: 'val/loss' reached 5.04156 (best 5.04156), saving model to 'C:\\code\\data_science\\demo_llm\\checkpoints\\transformer_lm-epoch=04-val_loss=5.042.ckpt' as top 1


Epoch 5:  58%|█████▊    | 2088/3595 [01:29<01:04, 23.38it/s, v_num=65, train/loss_step=5.050, train/perplexity_step=156.0, val/loss=5.040, val/perplexity=168.0, train/loss_epoch=5.100, train/perplexity_epoch=164.0]

#### Test & Save the model

In [None]:
# Imported at the top of the cell so that BOTH branches below can use it
# (previously it was only imported inside the `if` branch).
import shutil

# Test model
# NOTE(review): no ckpt_path is passed, so this presumably evaluates the weights
# currently held in `model` rather than the best checkpoint — confirm intended.
print("\nTesting model...")
trainer.test(model, data_module)

# Logger run version, used to tag the exported files.
version = trainer.logger.version
best_checkpoint = checkpoint_callback.best_model_path

if best_checkpoint:
    # Export the best checkpoint under a versioned name.
    final_model_path = os.path.join(save_dir, f"{experiment_name}-best-v{version:02d}.ckpt")
    shutil.copy2(best_checkpoint, final_model_path)
    print(f"Best model copied to {final_model_path}")
    saved_label = "Best model"
else:
    # No best checkpoint was recorded: save the current trainer state instead.
    print("No best model found, saving current model as final")
    final_model_path = os.path.join(save_dir, f"{experiment_name}-final-v{version:02d}.ckpt")
    trainer.save_checkpoint(final_model_path)
    print(f"Final model saved to {final_model_path}")
    saved_label = "Final model"

# Also save as the standard final model name (overwrites previous)
standard_final_path = os.path.join(save_dir, f"{experiment_name}_final.ckpt")
shutil.copy2(final_model_path, standard_final_path)
print(f"{saved_label} also saved as {standard_final_path}")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Testing model...
Testing DataLoader 0: 100%|██████████| 347/347 [00:01<00:00, 188.48it/s]


Best model copied to ./checkpoints\transformer_lm-best-v64.ckpt


In [None]:
# Write the vocabulary twice: once tagged with the run version, and once under
# the standard name (which overwrites the previous copy).
vocab_path = os.path.join(save_dir, f"vocab-v{version:02d}.pkl")
standard_vocab_path = os.path.join(save_dir, "vocab.pkl")

vocab_targets = (
    (vocab_path, f"Vocab saved to {vocab_path}"),
    (standard_vocab_path, f"Vocab also saved as {standard_vocab_path}"),
)
for target_path, message in vocab_targets:
    with open(target_path, 'wb') as f:
        pickle.dump(vocab, f)
        print(message)

Vocab saved to ./checkpoints\vocab-v64.pkl


In [None]:
# Final run report: where everything was written and which checkpoint won.
print("\nTraining completed!")
print(
    f"Checkpoints saved in: {save_dir}",
    f"Logs saved in: {log_dir}",
    f"Vocabulary saved in: {vocab_path}",
    f"Best model: {checkpoint_callback.best_model_path}",
    f"Best score: {checkpoint_callback.best_model_score}",
    f"Final model saved as: {final_model_path}",
    sep="\n",
)

# Print instructions for running the app
banner = "=" * 50
launch_command = (
    f"python -m src.app.gradio_app --model_path {final_model_path} --vocab_path {vocab_path}"
)
print("\n" + banner)
print("To run the Gradio app with your trained model:")
print(launch_command)

print(banner)


Training completed!
Checkpoints saved in: ./checkpoints
Logs saved in: ./logs
Vocabulary saved in: ./checkpoints\vocab-v64.pkl
Best model: C:\code\data_science\demo_llm\checkpoints\transformer_lm-epoch=99-val_loss=4.281.ckpt
Best score: 4.2810516357421875
Final model saved as: ./checkpoints\transformer_lm-best-v64.ckpt

To run the Gradio app with your trained model:
python -m src.app.gradio_app --model_path ./checkpoints\transformer_lm-best-v64.ckpt --vocab_path ./checkpoints\vocab-v64.pkl


In [None]:
# vocab['word_to_idx']['brain,']