<a href="https://colab.research.google.com/github/DataSavvyYT/experiments/blob/main/1_llm_finetune/1_finetuned_gemma_unsloth_w_o.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from datasets import load_dataset
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import sys
import os

# Update 'python3.x' to match the current Colab version (usually python3.11 or python3.12 in late 2025)
env_path = "/content/drive/MyDrive/unsloth_env/lib/python3.12/site-packages"

if env_path not in sys.path:
    sys.path.append(env_path)

In [7]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

In [8]:
from unsloth import FastLanguageModel
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [9]:
# 1. Load the model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-2b", # You can swap this for 'unsloth/mistral-7b'
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.12.8: Fast Gemma2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.22G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [10]:
# 2. Add LoRA adapters (The "Sticky Notes")
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 8,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.12.8 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


In [11]:
DATASET_PATH = "json"
# ----------------------------
# Load dataset
# Expect JSONL with fields: instruction, input, output
# ----------------------------
dataset = load_dataset(
    DATASET_PATH,
    data_files={
        "train": "/content/drive/MyDrive/data/promotion/train.jsonl",
        "validation": "/content/drive/MyDrive/data/promotion/validation.jsonl"
    }
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [13]:
dataset['train'].to_pandas().head()

Unnamed: 0,instruction,input,output
0,Predict promotion effectiveness.,Campaign: Student Special\nChannel: Facebook A...,effective
1,Predict promotion effectiveness.,Campaign: Monsoon Clearance\nChannel: Instagra...,not effective
2,Predict promotion effectiveness.,Campaign: Weekend Bonanza\nChannel: Email\nBud...,effective
3,Predict promotion effectiveness.,Campaign: Student Special\nChannel: SMS\nBudge...,not effective
4,Predict promotion effectiveness.,Campaign: Clearance Blowout\nChannel: WhatsApp...,not effective



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [14]:
# Formatting function to create supervised prompts
SYSTEM_PREFIX = "You are an analyst that predicts promotion effectiveness based on campaign details."
INSTR_TEMPLATE = """<system>
{system}
</system>
<instruction>
{instruction}
</instruction>
<input>
{inp}
</input>
<output>
{out}
</output>"""

In [15]:
def format_example(example):
    instruction = example.get("instruction", "Predict promotion effectiveness.")
    inp = example.get("input", "")
    out = example.get("output", "")
    # SFTTrainer learns to map input -> output; include output as labels portion
    return INSTR_TEMPLATE.format(system=SYSTEM_PREFIX, instruction=instruction, inp=inp, out=out)

def map_fn(batch):
    texts = [format_example(ex) for ex in batch]
    return {"text": texts}

train_ds = dataset["train"].map(lambda ex: {"text": format_example(ex)})
eval_ds = dataset["validation"].map(lambda ex: {"text": format_example(ex)})

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [16]:
train_ds.to_pandas().head()

Unnamed: 0,instruction,input,output,text
0,Predict promotion effectiveness.,Campaign: Student Special\nChannel: Facebook A...,effective,<system>\nYou are an analyst that predicts pro...
1,Predict promotion effectiveness.,Campaign: Monsoon Clearance\nChannel: Instagra...,not effective,<system>\nYou are an analyst that predicts pro...
2,Predict promotion effectiveness.,Campaign: Weekend Bonanza\nChannel: Email\nBud...,effective,<system>\nYou are an analyst that predicts pro...
3,Predict promotion effectiveness.,Campaign: Student Special\nChannel: SMS\nBudge...,not effective,<system>\nYou are an analyst that predicts pro...
4,Predict promotion effectiveness.,Campaign: Clearance Blowout\nChannel: WhatsApp...,not effective,<system>\nYou are an analyst that predicts pro...


In [17]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_ds,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60, # Set this to 60 for a quick demo, or 1000 for real training
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

trainer.train()


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/400 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 400 | Num Epochs = 2 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 10,383,360 of 2,624,725,248 (0.40% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
wandb: (1) Create a W&B account
wandb: (2) Use an existing W&B account
wandb: (3) Don't visualize my results
wandb: Enter your choice:

 3


wandb: You chose "Don't visualize my results"


wandb: Detected [huggingface_hub.inference, openai] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss
1,2.6135
2,2.6164
3,2.5182
4,2.4014
5,2.1559
6,1.8209
7,1.6583
8,1.3212
9,1.0986
10,0.9318




0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇██
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,▂▂▂▂▁▂█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▁▂▄▅▇████▇▇▇▇▇▇▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▁
train/loss,██▇▇▆▄▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
total_flos,701772323595264.0
train/epoch,1.2
train/global_step,60.0
train/grad_norm,0.42907
train/learning_rate,0.0
train/loss,0.2338
train_loss,0.56835
train_runtime,189.4117
train_samples_per_second,2.534
train_steps_per_second,0.317


TrainOutput(global_step=60, training_loss=0.5683532054225604, metrics={'train_runtime': 189.4117, 'train_samples_per_second': 2.534, 'train_steps_per_second': 0.317, 'total_flos': 701772323595264.0, 'train_loss': 0.5683532054225604, 'epoch': 1.2})

In [18]:
def predict_effectiveness(description: str) -> str:
    prompt = f"""<system>
{SYSTEM_PREFIX}
</system>
<instruction>
Predict promotion effectiveness as one of: "effective", "not effective", or a probability between 0 and 1.
</instruction>
<input>
{description}
</input>
<output>
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=20, do_sample=False)
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    # naive parse: take last line after <output>
    return text.split("<output>")[-1].strip()



In [19]:
print(predict_effectiveness("Campaign: Diwali Sale; Channel: Email; Budget: 5 Lakh INR; Audience: Returning; Discount: 10%; Duration: 5 days; Past CTR: 2.8%"))


effective
</output>
<input>
Campaign: Cyber Monday; Channel: Push Notification;


In [21]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

true_labels = []
predictions = []

print("Starting evaluation on validation dataset...")
for i, example in enumerate(eval_ds):
    if i % 10 == 0:
        print(f"Processing example {i}/{len(eval_ds)}")

    campaign_description = example["input"]
    true_label = example["output"].strip().lower()

    # Get prediction from the model
    predicted_raw = predict_effectiveness(campaign_description)
    predicted_text = predicted_raw.strip().lower()

    predicted_label = ""
    try:
        # Check if the output is a probability
        predicted_prob = float(predicted_text)
        # Assuming a threshold of 0.5 for classification if a probability is returned
        predicted_label = "effective" if predicted_prob > 0.5 else "not effective"
    except ValueError:
        # If not a probability, assume it's directly the label string
        predicted_label = predicted_text

    true_labels.append(true_label)
    predictions.append(predicted_label)

print("\nEvaluation complete.")

# Ensure we have labels for classification_report to avoid errors
unique_labels = sorted(list(set(true_labels + predictions)))

if not true_labels:
    print("No true labels found in the validation dataset. Cannot compute metrics.")
elif not unique_labels:
    print("No labels (true or predicted) found. Cannot compute metrics.")
elif len(unique_labels) == 1:
    # If only one class is present in true or predicted labels, accuracy_score works,
    # but classification_report might struggle. Handle gracefully.
    print(f"Only one label ('{unique_labels[0]}') present in true or predicted labels.")
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Overall Accuracy: {accuracy:.4f}")
    # No classification report if only one label is present
else:
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Overall Accuracy: {accuracy:.4f}")

    print("\nClassification Report:")
    # Use zero_division=0 to prevent warnings/errors if a class has no samples/predictions
    print(classification_report(true_labels, predictions, labels=unique_labels, zero_division=0))

Starting evaluation on validation dataset...
Processing example 0/100
Processing example 10/100
Processing example 20/100
Processing example 30/100
Processing example 40/100
Processing example 50/100
Processing example 60/100
Processing example 70/100
Processing example 80/100
Processing example 90/100

Evaluation complete.
Overall Accuracy: 0.0000

Classification Report:
                                                                                    precision    recall  f1-score   support

                                                                         effective       0.00      0.00      0.00      59.0
          effective
</output>
<input>
campaign: cyber monday
channel: email
budget       0.00      0.00      0.00       0.0
          effective
</output>
<input>
campaign: cyber monday
channel: facebook ads       0.00      0.00      0.00       0.0
         effective
</output>
<input>
campaign: cyber monday
channel: google search       0.00      0.00      0.00       0.0
    