In [None]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --quiet






In [None]:
!pip install trl peft accelerate bitsandbytes datasets --quiet

In [None]:
!pip install datasets --quiet  # Hugging Face datasets for loading math data

In [None]:
# Verify all packages are available before proceeding.
# `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded, so import it explicitly before calling find_spec().
import importlib.util

for pkg in ["torch", "trl", "peft", "unsloth", "datasets", "accelerate", "bitsandbytes"]:
    # find_spec() returns None when a top-level package cannot be located.
    found = importlib.util.find_spec(pkg) is not None
    print(f"{'✓' if found else '✗'} {pkg}")

✓ torch
✓ trl
✓ peft
✓ unsloth
✓ datasets
✓ accelerate
✓ bitsandbytes


In [None]:
import re                             # regex – to parse reasoning/code blocks
import time                           # to measure reasoning time budget

import torch                          # PyTorch – tensor ops, GPU management

# Unsloth is imported before transformers/trl so that its patches are applied
# to those libraries when they load.
from unsloth import FastLanguageModel, is_bfloat16_supported  # optimised loader + bf16 capability check

from datasets import load_dataset     # load math datasets from HF hub
from transformers import TrainingArguments  # standard HF training config
from trl import SFTConfig, SFTTrainer  # Supervised Fine-Tuning config + trainer

In [None]:
# --- Model ---------------------------------------------------------------
MODEL_NAME = "unsloth/Phi-4-mini-instruct-bnb-4bit"  # checkpoint id; loaded with load_in_4bit=True
MAX_SEQ_LEN = 2048  # token limit per training sequence
OUTPUT_DIR = "./phi4-math-lora"  # training output / checkpoint directory

# --- LoRA adapter --------------------------------------------------------
LORA_RANK = 16  # adapter rank; larger rank = more trainable parameters and VRAM
LORA_ALPHA = 32  # adapter scaling factor (here 2 x LORA_RANK)
# NOTE: the recorded load output warns that a non-zero dropout makes Unsloth
# skip fast patching of the LoRA matrices, at a performance cost.
LORA_DROPOUT = 0.05

# --- Optimisation --------------------------------------------------------
BATCH_SIZE = 2  # samples per device per micro-step
GRAD_ACCUM = 8  # micro-steps per optimiser step (effective batch = 2 x 8 = 16)
LR = 2e-4  # peak learning rate
EPOCHS = 3  # full passes over the training set
MAX_STEPS = -1  # -1 defers to EPOCHS; a positive value caps the step count (handy for smoke tests)

# --- Inference -----------------------------------------------------------
TIME_BUDGET_S = 30  # seconds of "thinking" allowed before forcing code execution

In [None]:
# Fetch the base checkpoint and its tokenizer through Unsloth's loader.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LEN,
    load_in_4bit=True,
    dtype=None,  # presumably lets Unsloth pick the compute dtype for the GPU — confirm
)

# Wrap the model with LoRA adapters on the attention and MLP projections.
# This happens here so the trainer built later receives the adapted model.
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

# Sanity check: print trainable vs. total parameter counts.
# The recorded run reports roughly 0.23% of parameters as trainable.
model.print_trainable_parameters()

==((====))==  Unsloth 2026.2.1: Fast Phi3 patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.89G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Unsloth: Making `model.base_model.model.model` require gradients
trainable params: 8,912,896 || all params: 3,844,934,656 || trainable%: 0.2318


In [None]:
# System message placed first in every conversation built by format_prompt().
# It fixes the expected response layout: a short <think> plan, a <python>
# block that performs the computation, then a plain-text final answer.
# The text is sent to the model verbatim, so any edit changes the prompt the
# model is trained and queried with.
SYSTEM_PROMPT = """You are a precise mathematical reasoning assistant.

When solving problems:
1. Use <think> tags to reason about the problem BRIEFLY – identify the approach,
   define variables, and plan the solution. Do NOT compute by hand here.
2. Write a complete Python solution inside <python> tags.
   Python is your PRIMARY computation tool. Compute EVERYTHING in Python.
3. After the Python block, state the final answer clearly.

Example format:
<think>
This is a quadratic equation. I'll use the quadratic formula via Python.
Variables: a=1, b=-5, c=6
</think>
<python>
import math
a, b, c = 1, -5, 6
discriminant = b**2 - 4*a*c
x1 = (-b + math.sqrt(discriminant)) / (2*a)
x2 = (-b - math.sqrt(discriminant)) / (2*a)
print(f"Solutions: x={x1}, x={x2}")
</python>
The solutions are x=3 and x=2."""


def format_prompt(question: str, answer: str = "") -> str:
    """
    Render a question (and optionally its answer) with the tokenizer's chat template.

    Args:
        question: The problem statement, sent as the user turn.
        answer:   The reference completion. When non-empty (training) it is
                  appended as the assistant turn; when empty (inference) the
                  template's generation prompt is added instead so the model
                  produces the assistant turn itself.

    Returns:
        The conversation as one untokenised string. The role markers and
        special tokens are whatever the module-level tokenizer's chat
        template defines.
    """
    has_answer = bool(answer)

    # System instructions first, then the user's question.
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": question},
    ]
    if has_answer:
        # Training sample: include the target completion.
        conversation.append({"role": "assistant", "content": answer})

    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,                        # return text; tokenisation happens downstream
        add_generation_prompt=not has_answer,  # open an assistant turn only at inference
    )

In [None]:
# Text that is safe to emit verbatim as a Python numeric literal
# (optional sign, no leading zeros, optional fractional part).
_NUMERIC_LITERAL_RE = re.compile(r"-?(?:(?:0|[1-9]\d*)(?:\.\d*)?|\.\d+)")


def _extract_boxed(text: str):
    r"""Return the contents of the last ``\boxed{...}`` in *text*, or None if absent/unbalanced."""
    marker = "\\boxed{"
    start = text.rfind(marker)
    if start == -1:
        return None
    content_start = start + len(marker)
    depth = 1  # brace nesting depth; the marker itself opened one brace
    for idx in range(content_start, len(text)):
        ch = text[idx]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[content_start:idx].strip()
    return None


def _as_python_literal(value: str) -> str:
    """Return *value* as Python source: verbatim if numeric, otherwise a quoted string literal."""
    if _NUMERIC_LITERAL_RE.fullmatch(value):
        return value
    return repr(value)


def build_python_solution(question: str, raw_answer: str) -> str:
    r"""
    Convert a raw dataset answer into the structured <think>/<python> format.

    The final answer is taken from a GSM8K-style ``#### <number>`` marker when
    present; otherwise from the last ``\boxed{...}`` expression (the convention
    used in MATH solutions); otherwise it is the string ``unknown``.

    The generated <python> block is always syntactically valid:
      * the reasoning excerpt is collapsed onto a single comment line, so
        multi-line reasoning cannot spill out of the comment;
      * numeric answers are emitted as numeric literals, anything else
        (LaTeX, ``unknown``) as a quoted string literal.

    This is still a heuristic conversion; for a production pipeline the
    Python solutions should be generated by a stronger teacher model.

    Args:
        question:   The problem statement (currently unused; kept for interface stability).
        raw_answer: The dataset's reference answer/solution text.

    Returns:
        The structured target completion as a single string.
    """
    raw_answer = raw_answer or ""

    # 1) GSM8K convention: "#### <number>" (thousands separators removed).
    final_num = ""
    gsm_match = re.search(r"####\s*([\d,\.\-]+)", raw_answer)
    if gsm_match:
        final_num = gsm_match.group(1).replace(",", "")
    # 2) MATH convention: last \boxed{...}; 3) give up with a sentinel.
    if not final_num:
        final_num = _extract_boxed(raw_answer) or "unknown"

    # Everything before the "####" marker is the worked reasoning.
    reasoning_text = raw_answer.split("####")[0].strip() if "####" in raw_answer else raw_answer

    # Collapse all whitespace (including newlines) so the excerpt stays on one comment line.
    comment_excerpt = " ".join(reasoning_text[:200].split())

    python_block = "\n".join([
        "# Solve step by step",
        f"# Raw reasoning: {comment_excerpt}...",
        f"answer = {_as_python_literal(final_num)}  # derived from step-by-step computation above",
        'print(f"Answer: {answer}")',
    ])

    return (
        "<think>\n"
        f"{reasoning_text[:300]}\n"
        "I'll compute this systematically using Python.\n"
        "</think>\n"
        "<python>\n"
        f"{python_block}\n"
        "</python>\n"
        f"The answer is **{final_num}**."
    )

In [None]:
# MATH benchmark (algebra, calculus, stats, number theory, geometry).
# NOTE(review): trust_remote_code=True allows code shipped with the dataset
# repository to run locally — confirm the source is trusted and that the
# installed `datasets` version still accepts this flag.
raw_dataset = load_dataset("lighteval/MATH", trust_remote_code=True)

# Split handles; per the original notes: 7,500 train / 5,000 test problems.
train_data, test_data = raw_dataset["train"], raw_dataset["test"]

# MATH records expose 'problem' / 'solution' (GSM8K uses 'question' / 'answer').
def preprocess_sample(sample):
    """
    Turn one MATH record into a single chat-formatted training string.

    Reads sample["problem"] and sample["solution"], converts the solution into
    the structured target via build_python_solution(), and renders the full
    conversation with format_prompt().

    Returns:
        A dict with one key, "text", holding the formatted conversation.
    """
    problem_text, reference_solution = sample["problem"], sample["solution"]
    target_completion = build_python_solution(problem_text, reference_solution)
    return {"text": format_prompt(problem_text, target_completion)}

# Apply the same per-sample formatting to both splits (train first, then test).
# The original columns are dropped so each resulting dataset only has "text".
train_dataset, test_dataset = [
    split.map(
        preprocess_sample,
        batched=False,
        remove_columns=split.column_names,
        desc=progress_label,
    )
    for split, progress_label in (
        (train_data, "Formatting training data"),
        (test_data, "Formatting test data"),
    )
]

# Preview the first formatted training sample (first 800 characters only).
print(train_dataset[0]["text"][:800])

In [None]:
# Decide the mixed-precision mode once, using Unsloth's own capability check.
# The precision flags must agree with the dtype Unsloth loaded the model in
# (the recorded load banner shows "Bfloat16 = FALSE" on the T4), and a raw
# torch.cuda.is_bf16_supported() call may also count emulated bf16 support.
use_bf16 = is_bfloat16_supported()

training_args = SFTConfig(
    # --- output / checkpointing ---
    output_dir                  = OUTPUT_DIR,
    save_steps                  = 200,
    save_total_limit            = 2,       # keep only the two most recent checkpoints
    # --- schedule ---
    num_train_epochs            = EPOCHS,
    max_steps                   = MAX_STEPS,   # -1 defers to num_train_epochs
    per_device_train_batch_size = BATCH_SIZE,
    gradient_accumulation_steps = GRAD_ACCUM,  # effective batch = BATCH_SIZE * GRAD_ACCUM
    # --- optimiser ---
    learning_rate               = LR,
    lr_scheduler_type           = "cosine",
    warmup_ratio                = 0.03,
    weight_decay                = 0.01,
    optim                       = "adamw_8bit",
    # --- precision: exactly one of fp16 / bf16 is enabled ---
    fp16                        = not use_bf16,
    bf16                        = use_bf16,
    # --- logging / evaluation ---
    logging_steps               = 25,
    eval_strategy               = "no",    # no evaluation during training
    report_to                   = "none",  # no external experiment tracker
    # --- data ---
    dataloader_pin_memory       = False,
    max_seq_length              = MAX_SEQ_LEN,
    packing                     = True,    # pack several short samples into one sequence
    # --- reproducibility / runtime ---
    seed                        = 42,
    torch_compile               = False,   # author's workaround for a RoPE/Dynamo crash
)

In [None]:
# Build the supervised fine-tuning trainer around the LoRA-wrapped model.
# Only the training split is passed in; no evaluation set is supplied.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    processing_class=tokenizer,
)

# Print trainable vs. total parameter counts again after the trainer is built.
model.print_trainable_parameters()

In [None]:
import os
import time

import torch._dynamo

# Switch TorchDynamo off for this process and silence its errors.
torch._dynamo.config.suppress_errors = True
torch._dynamo.config.disable = True

# Also export the matching environment switches.
# NOTE(review): these are set after torch was imported, so they presumably
# only affect code that reads them later (e.g. subprocesses) — confirm.
os.environ.update({
    "TORCHDYNAMO_DISABLE": "1",
    "TORCH_COMPILE_DISABLE": "1",
})

# Run fine-tuning and report wall-clock duration plus the loss that train() returns.
print("Starting fine-tuning...")
start_time = time.time()
trainer_stats = trainer.train()
elapsed = time.time() - start_time
print(f"Training complete in {elapsed/60:.1f} minutes")
print(f"Final loss: {trainer_stats.training_loss:.4f}")