<a href="https://colab.research.google.com/github/DataSavvyYT/experiments/blob/main/1_llm_finetune/1_tune_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install deps (Colab)
%pip -q install -U transformers accelerate datasets peft bitsandbytes trl

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os, torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig



In [None]:
# ----------------------------
# Config
# ----------------------------
BASE_MODEL = "google/gemma-2-2b-it"  # or gemma-2-2b, choose -it for instruction-tuned base
OUTPUT_DIR = "gemma-promo-qlora"
MAX_SEQ_LEN = 512
BATCH_SIZE = 4
GRAD_ACCUM = 4
EPOCHS = 3
LR = 2e-4
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
DATASET_PATH = "json"  # "json" + data_files below, or point to your HF dataset repo

In [6]:
# ----------------------------
# Load dataset
# Expect JSONL with fields: instruction, input, output
# ----------------------------
dataset = load_dataset(
    DATASET_PATH,
    data_files={
        "train": "train.jsonl",
        "validation": "validation.jsonl"
    }
)


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [7]:

# Formatting function to create supervised prompts
SYSTEM_PREFIX = "You are an analyst that predicts promotion effectiveness based on campaign details."
INSTR_TEMPLATE = """<system>
{system}
</system>
<instruction>
{instruction}
</instruction>
<input>
{inp}
</input>
<output>
{out}
</output>"""



In [8]:
def format_example(example):
    instruction = example.get("instruction", "Predict promotion effectiveness.")
    inp = example.get("input", "")
    out = example.get("output", "")
    # SFTTrainer learns to map input -> output; include output as labels portion
    return INSTR_TEMPLATE.format(system=SYSTEM_PREFIX, instruction=instruction, inp=inp, out=out)

def map_fn(batch):
    texts = [format_example(ex) for ex in batch]
    return {"text": texts}

train_ds = dataset["train"].map(lambda ex: {"text": format_example(ex)})
eval_ds = dataset["validation"].map(lambda ex: {"text": format_example(ex)})



Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [9]:
# ----------------------------
# Tokenizer & 4-bit model load (QLoRA)
# ----------------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)



In [None]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16
)



tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# ----------------------------
# LoRA config (PEFT)
# ----------------------------
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)



In [None]:
# ----------------------------
# Trainer
# ----------------------------
sft_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=20,
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    bf16=True,
    max_seq_length=MAX_SEQ_LEN,
    packing=True
)



In [None]:
trainer = SFTTrainer(
    model=model,
    args=sft_args,
    peft_config=peft_config,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    dataset_text_field="text"
)

trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)



In [None]:
# ----------------------------
# Optional: Merge LoRA into base weights (for single file deployment)
# Note: requires reloading base model in full precision or 8-bit for merge
# ----------------------------
# from peft import PeftModel
# from transformers import AutoModelForCausalLM
# base_fp = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto")
# peft_model = PeftModel.from_pretrained(base_fp, OUTPUT_DIR)
# merged = peft_model.merge_and_unload()
# merged.save_pretrained("gemma-promo-merged", safe_serialization=True, max_shard_size="2GB")
# tokenizer.save_pretrained("gemma-promo-merged")

# ----------------------------
# Quick eval helper: generate prediction for a sample
# ----------------------------
def predict_effectiveness(description: str) -> str:
    prompt = f"""<system>
{SYSTEM_PREFIX}
</system>
<instruction>
Predict promotion effectiveness as one of: "effective", "not effective", or a probability between 0 and 1.
</instruction>
<input>
{description}
</input>
<output>
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=20, do_sample=False)
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    # naive parse: take last line after <output>
    return text.split("<output>")[-1].strip()

print(predict_effectiveness("Campaign: Diwali Sale; Channel: Email; Budget: 5 Lakh INR; Audience: Returning; Discount: 10%; Duration: 5 days; Past CTR: 2.8%"))
