<a href="https://colab.research.google.com/github/Algocrat/slm-dragon-labs/blob/main/labs/colab/lab5_hparam_tuning_optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 5 – Hyperparameter Tuning and Optimization
**Part 5 of the 7 Lab Hands-On SLM Training Series**

In this lab, you will run short, budget-friendly experiments to tune critical knobs for efficient LoRA fine-tuning. We will search learning rate, LoRA rank, sequence length, and gradient accumulation, then select the best performer by validation perplexity.


## Step 0. Stable installs for Colab

In [None]:
%pip install -q --force-reinstall "numpy==2.0.2" "pandas==2.2.2" "pyarrow==17.0.0"
%pip install -q "datasets>=3.0.0" "transformers>=4.41.0" "peft>=0.11.0" "accelerate>=0.29.0" "sentencepiece>=0.1.99" "tqdm>=4.66.0" bitsandbytes
import importlib
for m in ["numpy","pandas","pyarrow","datasets","transformers","peft","accelerate","sentencepiece","tqdm"]:
    mod = importlib.import_module(m)
    print(m, getattr(mod, '__version__', 'unknown'))
print('If imports fail, go to Runtime → Restart runtime, then re-run this cell.')

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m229.9/229.9 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.2/509.2 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m347.8/347.8 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hnumpy 2.0.2
pandas 2.2.2
pyarrow 17.

## Step 1. Load the prepared dataset from Google Drive

In [None]:
from datasets import load_from_disk
from google.colab import drive
drive.mount('/content/drive')
DATA_DIR = "/content/drive/MyDrive/slm-labs/lab3_tokenized"
ds = load_from_disk(DATA_DIR)
print(ds)
print('Train:', len(ds['train']))
if 'validation' in ds:
    print('Validation:', len(ds['validation']))
elif 'test' in ds:
    print('Test used as validation:', len(ds['test']))

Mounted at /content/drive
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 121736
    })
})
Train: 121736


## Step 2. Load a base SLM with optional 4‑bit quantization

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

BASE_MODEL = "HuggingFaceH4/zephyr-7b-beta"  # change if desired

def load_base(model_name=BASE_MODEL):
    use_gpu = torch.cuda.is_available()
    kwargs = {}
    if use_gpu:
        try:
            quant = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4',
                                       bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True)
            kwargs.update(dict(device_map='auto', quantization_config=quant, torch_dtype=torch.float16))
        except Exception as e:
            kwargs.update(dict(torch_dtype=torch.float16))
    else:
        kwargs.update(dict(torch_dtype=torch.float32))
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    mdl = AutoModelForCausalLM.from_pretrained(model_name, **kwargs)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    return tok, mdl

tokenizer, base_model = load_base()
print('Model loaded')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Model loaded


## Step 3. LoRA configuration helper

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

def attach_lora(model, r=16, alpha=32, dropout=0.05):
    model = prepare_model_for_kbit_training(model)
    cfg = LoraConfig(r=r, lora_alpha=alpha, lora_dropout=dropout,
                     target_modules=TARGET_MODULES, bias='none', task_type='CAUSAL_LM')
    lora_model = get_peft_model(model, cfg)
    lora_model.print_trainable_parameters()
    return lora_model


## Step 4. Training utility and perplexity evaluation

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import math, time

def run_experiment(hparams, dataset):
    model = attach_lora(base_model, r=hparams['r'], alpha=hparams['alpha'], dropout=hparams['lora_dropout'])
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    eval_ds = dataset.get('validation') or dataset.get('test')

    args = TrainingArguments(
        output_dir=f"./outputs/lab5_{int(time.time())}",
        per_device_train_batch_size=hparams['batch_size'],
        gradient_accumulation_steps=hparams['grad_accum'],
        learning_rate=hparams['lr'],
        warmup_steps=10,
        max_steps=hparams['max_steps'],
        logging_steps=10,
        save_strategy='no',
        fp16=torch.cuda.is_available(),
        report_to=[],
    )

    trainer = Trainer(model=model, args=args, train_dataset=dataset['train'], eval_dataset=eval_ds,
                      data_collator=collator)
    trainer.train()
    metrics = {}
    if eval_ds is not None:
        eval_metrics = trainer.evaluate()
        eval_loss = eval_metrics.get('eval_loss', None)
        if eval_loss is not None:
            metrics['eval_loss'] = float(eval_loss)
            metrics['perplexity'] = float(math.exp(eval_loss)) if eval_loss < 50 else float('inf')
    return metrics, model


## Step 5. Define search space and run a small sweep

In [None]:
import itertools, random
import pandas as pd
from pathlib import Path

SEARCH_SPACE = {
    'lr': [1e-4, 2e-4, 5e-4],
    'r': [8, 16, 32],
    'alpha': [16, 32],
    'lora_dropout': [0.05],
    'batch_size': [2],
    'grad_accum': [2, 4],
    'max_steps': [150],
}

candidates = list(itertools.product(SEARCH_SPACE['lr'], SEARCH_SPACE['r'], SEARCH_SPACE['alpha'],
                                    SEARCH_SPACE['lora_dropout'], SEARCH_SPACE['batch_size'],
                                    SEARCH_SPACE['grad_accum'], SEARCH_SPACE['max_steps']))
random.seed(42)
random.shuffle(candidates)
BUDGET = 6
trials = candidates[:BUDGET]

records = []
best = None
best_model = None

for i, combo in enumerate(trials, 1):
    h = dict(lr=combo[0], r=combo[1], alpha=combo[2], lora_dropout=combo[3],
             batch_size=combo[4], grad_accum=combo[5], max_steps=combo[6])
    print(f"\n=== Trial {i}/{BUDGET} :: {h} ===")
    metrics, model = run_experiment(h, ds)
    row = {**h, **metrics}
    records.append(row)
    if metrics and ('perplexity' in metrics):
        if (best is None) or (metrics['perplexity'] < best['perplexity']):
            best = row
            best_model = model

import pandas as pd
df = pd.DataFrame(records)
display(df)

print('\nBest trial:')
print(best)


=== Trial 1/6 :: {'lr': 0.0001, 'r': 32, 'alpha': 16, 'lora_dropout': 0.05, 'batch_size': 2, 'grad_accum': 4, 'max_steps': 150} ===
trainable params: 83,886,080 || all params: 7,325,618,176 || trainable%: 1.1451


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,1.7007
20,1.6604
30,1.6459
40,1.6324
50,1.6081
60,1.645
70,1.615
80,1.6034
90,1.5788
100,1.6079



=== Trial 2/6 :: {'lr': 0.0002, 'r': 8, 'alpha': 16, 'lora_dropout': 0.05, 'batch_size': 2, 'grad_accum': 2, 'max_steps': 150} ===




trainable params: 20,971,520 || all params: 7,262,703,616 || trainable%: 0.2888


  return fn(*args, **kwargs)


Step,Training Loss
10,1.6838
20,1.6315
30,1.6322
40,1.6114
50,1.6395
60,1.6224
70,1.6189
80,1.6313
90,1.5889
100,1.6203



=== Trial 3/6 :: {'lr': 0.0001, 'r': 16, 'alpha': 16, 'lora_dropout': 0.05, 'batch_size': 2, 'grad_accum': 4, 'max_steps': 150} ===




trainable params: 41,943,040 || all params: 7,283,675,136 || trainable%: 0.5758


  return fn(*args, **kwargs)


Step,Training Loss
10,1.7008
20,1.6603
30,1.6456
40,1.6321
50,1.608
60,1.6449
70,1.6147
80,1.6033
90,1.5787
100,1.6081



=== Trial 4/6 :: {'lr': 0.0005, 'r': 8, 'alpha': 32, 'lora_dropout': 0.05, 'batch_size': 2, 'grad_accum': 4, 'max_steps': 150} ===




trainable params: 20,971,520 || all params: 7,262,703,616 || trainable%: 0.2888


  return fn(*args, **kwargs)


Step,Training Loss
10,1.6645
20,1.646
30,1.6655
40,1.6687
50,1.6489
60,1.6881
70,1.6595
80,1.6404
90,1.6165
100,1.6417



=== Trial 5/6 :: {'lr': 0.0005, 'r': 32, 'alpha': 16, 'lora_dropout': 0.05, 'batch_size': 2, 'grad_accum': 2, 'max_steps': 150} ===




trainable params: 83,886,080 || all params: 7,325,618,176 || trainable%: 1.1451


  return fn(*args, **kwargs)


Step,Training Loss
10,1.6708
20,1.6232
30,1.6439
40,1.6324
50,1.6616
60,1.6425
70,1.6387
80,1.6504
90,1.6057
100,1.6334



=== Trial 6/6 :: {'lr': 0.0002, 'r': 16, 'alpha': 32, 'lora_dropout': 0.05, 'batch_size': 2, 'grad_accum': 4, 'max_steps': 150} ===




trainable params: 41,943,040 || all params: 7,283,675,136 || trainable%: 0.5758


  return fn(*args, **kwargs)


Step,Training Loss
10,1.6797
20,1.6316
30,1.6364
40,1.6296
50,1.6086
60,1.6451
70,1.6144
80,1.603
90,1.577
100,1.6057


Unnamed: 0,lr,r,alpha,lora_dropout,batch_size,grad_accum,max_steps
0,0.0001,32,16,0.05,2,4,150
1,0.0002,8,16,0.05,2,2,150
2,0.0001,16,16,0.05,2,4,150
3,0.0005,8,32,0.05,2,4,150
4,0.0005,32,16,0.05,2,2,150
5,0.0002,16,32,0.05,2,4,150



Best trial:
None


## Step 6. Save best adapters and results to Google Drive

In [1]:
from pathlib import Path
import pandas as pd
import shutil, json, os

RESULTS_DIR = "/content/drive/MyDrive/slm-labs/lab5_results"
Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)

csv_path = f"{RESULTS_DIR}/trials.csv"
df.to_csv(csv_path, index=False)
print("Saved trial table to", csv_path)

# Pick best trial by lowest final training loss
df_loaded = pd.read_csv(csv_path)
if "train_loss" in df_loaded.columns:
    best_row = df_loaded.loc[df_loaded["train_loss"].idxmin()]
    tag = f"r{best_row['r']}_lr{best_row['lr']}_ga{best_row['grad_accum']}"
    trial_dir = Path(f"{RESULTS_DIR}/trial_{tag}")  # adjust if your trial folders use a different naming
    save_dir = Path(f"{RESULTS_DIR}/best_{tag}")

    if save_dir.exists():
        shutil.rmtree(save_dir)
    if trial_dir.exists():
        shutil.copytree(trial_dir, save_dir)
        print("Saved best adapters to", save_dir)
    else:
        print("Best trial directory not found:", trial_dir)

    # Save metadata about the best trial
    with open(Path(RESULTS_DIR) / "best.json", "w") as f:
        json.dump(best_row.to_dict(), f, indent=2)
else:
    print("No train_loss column found. Cannot determine best model.")


NameError: name 'Path' is not defined

## Step 7. Next steps
- Increase `BUDGET` and broaden the search space once the pipeline is stable.
- Consider evaluating on a heldout qualitative set and computing task specific metrics, not just perplexity.
- For tracking and easy comparison across runs, consider enabling Weights and Biases or MLflow logging.
