<a href="https://colab.research.google.com/github/Aman17Javed/Supervised-Fine-Tuning-unsloth-llama-3.2-3b-bnb-4bit/blob/main/SFT_Llama_3_2_3B_bnb_4bit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nbformat
from google.colab import _message

# Ask the Colab frontend for the current notebook as a JSON-compatible dict.
_ipynb = _message.blocking_request('get_ipynb')['ipynb']

# Drop the "widgets" entry from the notebook metadata if it exists
# (pop with a default is a no-op when the key is absent).
_ipynb["metadata"].pop("widgets", None)

# Write the cleaned notebook to disk. The notebook contains non-ASCII text
# (emoji, em dashes), so the encoding is pinned to UTF-8 instead of relying
# on the platform default.
with open("SFT_Llama-3.2-3B-bnb-4bit.ipynb", "w", encoding="utf-8") as f:
    nbformat.write(nbformat.from_dict(_ipynb), f)

print("Notebook cleaned — ready to push to GitHub.")


In [None]:
!pip install unsloth

In [None]:
!pip install --upgrade transformers accelerate safetensors


In [None]:
from unsloth import FastLanguageModel

In [4]:
import torch


In [5]:
# Sequence-length limit passed to both the model loader and the SFT trainer.
max_seq_length = 2048

In [6]:
# Loader options forwarded to FastLanguageModel.from_pretrained below.
load_in_4bit = True
dtype = None

In [7]:
from huggingface_hub import login

In [8]:
from google.colab import userdata
# Read the token stored under the Colab secret 'hugging_face_key' and use it
# to authenticate this session with the Hugging Face Hub.
hf_token = userdata.get('hugging_face_key')
login(token=hf_token)

In [None]:
# Load the base checkpoint and its tokenizer through Unsloth, using the
# loader settings defined in the configuration cells above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-bnb-4bit",
    token=hf_token,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
    dtype=dtype,
)

In [10]:
# Prompt template with two positional `{}` slots: the first sits under
# "### Question:" and the second directly after the "<think>" tag under
# "### Response:".
# NOTE(review): nothing in the visible notebook references `prompt_style`
# (training uses `train_prompt_style`) — confirm whether it is still needed.
prompt_style = """Below is an instruction that describes a task, paired with a question.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a detailed answer.

### Instruction:
You are a helpful AI assistant with advanced knowledge in multiple fields.
Please answer the following question.

### Question:
{}

### Response:
<think>{}
"""


In [None]:
# Wrap the loaded model with LoRA adapters via Unsloth's PEFT helper.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    # Adapters are attached to the q/k/v/o and gate/up/down projection modules.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing="unsloth",
    use_rslora=False,
    loftq_config=None,
    random_state=9001,
)

In [12]:
EOS_TOKEN = tokenizer.eos_token  # Appended to every formatted training sample below

# Template for one training sample. It has three named placeholders —
# {instruction}, {context} and {response} — that formatting_prompts_func
# fills in for each dataset row.
train_prompt_style = """Instruction:
{instruction}

Context:
{context}

Response:
{response}
"""

def formatting_prompts_func(examples):
    """Render one batch of dataset rows into training strings.

    Args:
        examples: Mapping of column name -> list of values for the batch.
            ``"response"`` is required (it defines the batch size);
            ``"instruction"`` and ``"context"`` are optional and default to
            empty strings when the column is absent.

    Returns:
        ``{"text": [...]}`` with one string per row: ``train_prompt_style``
        filled with the stripped fields, followed by ``EOS_TOKEN``.

    Raises:
        KeyError: if the batch has no ``"response"`` column.
    """
    # The response column is mandatory; the previous "default" for it was
    # computed from examples["response"] itself and could never apply.
    responses = examples["response"]
    blanks = [""] * len(responses)
    instructions = examples.get("instruction", blanks)
    contexts = examples.get("context", blanks)

    def _clean(value):
        # A missing field shows up as None; treat it as empty text instead of
        # failing on None.strip().
        return "" if value is None else str(value).strip()

    texts = [
        train_prompt_style.format(
            instruction=_clean(instr),
            context=_clean(ctx),
            response=_clean(resp),
        ) + EOS_TOKEN
        for instr, ctx, resp in zip(instructions, contexts, responses)
    ]
    return {"text": texts}


In [None]:
from datasets import load_dataset

# Download Dolly-15k and add a "text" column holding the formatted prompt
# for every row (the formatter is applied batch-wise).
dataset = load_dataset("databricks/databricks-dolly-15k")
dataset = dataset.map(formatting_prompts_func, batched=True)

# Show the first formatted prompt. Index the row first and then the column so
# only that one row is read, rather than reading the whole "text" column
# just to take its first element.
print(dataset["train"][0]["text"])


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Train on a fixed, seeded random subset of 500 rows to keep the run short.
train_dataset = dataset["train"].shuffle(seed=42).select(range(500))

# Use bf16 when Unsloth reports support for it; otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    # Batching: 2 samples per device, accumulated over 4 steps.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Schedule: 100 optimizer steps, the first 5 of them warmup, linear schedule.
    max_steps=100,
    warmup_steps=5,
    lr_scheduler_type="linear",
    learning_rate=2e-4,
    # Optimizer.
    optim="adamw_8bit",
    weight_decay=0.01,
    # Precision.
    bf16=use_bf16,
    fp16=not use_bf16,
    # Logging.
    logging_steps=10,
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    dataset_text_field="text",  # column produced by formatting_prompts_func
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
)


In [None]:
# Run the fine-tuning loop and keep the object returned by train().
trainer_stats = trainer.train()

In [None]:
# Persist the model and then the tokenizer into the same folder
# using their Hugging Face `save_pretrained` methods.
save_dir = "outputs"

for _artifact in (model, tokenizer):
    _artifact.save_pretrained(save_dir)


In [None]:

from transformers import AutoTokenizer
from unsloth import FastLanguageModel

# Folder the fine-tuned model and tokenizer were saved to after training.
# Loading from here — instead of the base checkpoint name — keeps `model` and
# `tokenizer` bound to the fine-tuned artifacts, so this cell evaluates the
# trained model and the later save/push cells publish it rather than the
# untouched base model.
model_path = "outputs"

# Reload model and tokenizer together from the saved folder. The loader
# settings (max_seq_length / dtype / load_in_4bit) are reused from the
# configuration cells rather than reassigned here, so this cell no longer
# shrinks the global max_seq_length below the number of tokens generated
# below, and dtype is not passed as a plain string.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="auto",
)

# Enable inference optimizations
FastLanguageModel.for_inference(model)

# Question to ask the model
question = """Explain supervised fine-tuning in simple words. What is the difference between fine-tuning and training from scratch? What are the benefits of fine-tuning? How do I fine-tune a model? How do I fine-tune a model using Hugging Face Transformers?"""

# Wrap the question in the same template used for training, leaving the
# response slot empty so the prompt ends right after "Response:\n" — the
# position where the model saw answers begin during fine-tuning.
prompt = train_prompt_style.format(
    instruction=question.strip(),
    context="",
    response="",
).rstrip() + "\n"

# Generate text with sampling
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=1024,  # upper bound on generated tokens
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    eos_token_id=tokenizer.eos_token_id,
)

# Decode and print output
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
# Destination names: a local folder and the Hub repository id used by the
# push cell.
new_model_local = "SFT_Llama-3.2-3B-bnb-4bit"
new_model_online = "amanjaved421/SFT_Llama-3.2-3B-bnb-4bit"

# Write the model first, then the tokenizer, into the local folder.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(new_model_local)


In [None]:
# Upload the model and then the tokenizer to the same Hub repository.
for _artifact in (model, tokenizer):
    _artifact.push_to_hub(new_model_online)