In [None]:
import json
# load grocery data from json file
# a context manager closes the file handle as soon as parsing is done
# (the bare open(...) call used before was never closed)
with open("grocery_dataset_1000.json", "r", encoding="utf-8") as dataset_file:
    file = json.load(dataset_file)  # the ai generated grocery data, parsed into python objects
print(file[1]) # print the second item for testing

In [None]:
# install required libraries
!pip install unsloth trl peft accelerate bitsandbytes

In [None]:
# check if gpu is available
import torch
# report whether cuda can be used ...
has_cuda = torch.cuda.is_available()
print(f"cuda available: {has_cuda}")
# ... and the name of device 0 when it can ('none' otherwise)
gpu_name = torch.cuda.get_device_name(0) if has_cuda else 'none'
print(f"gpu: {gpu_name}")

In [None]:
# import model and tokenizer
from unsloth import FastLanguageModel
import torch

# sequence length passed to the loader (also reused when building the trainer);
# lower this value to reduce memory use
max_seq_length = 8192
# None = auto detection
dtype = None
# checkpoint to fine-tune
model_name = "unsloth/Qwen3-0.6B"

# gather the loader options in one place, then load model and tokenizer
load_options = dict(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

In [None]:
# pre process our data (convert it to string)
from datasets import Dataset

# format each example for training
def format_prompt(example: dict, eos_token: str = "<|endoftext|>") -> str:
    """Render one dataset example as a single training string.

    The result has the layout
    ``### input: <input>\\n### output: <output as JSON><eos_token>``.

    Args:
        example: Mapping with an ``"input"`` entry (interpolated as-is) and an
            ``"output"`` entry (serialized with ``json.dumps``).
        eos_token: Marker appended after the JSON so the text has an explicit
            end. Defaults to the literal that was previously hard-coded; pass
            ``tokenizer.eos_token`` to follow the loaded tokenizer instead.

    Returns:
        The formatted training text.

    Raises:
        KeyError: If ``example`` has no ``"input"`` or ``"output"`` entry.
        TypeError: If ``example["output"]`` is not JSON-serializable.
    """
    serialized_output = json.dumps(example["output"])
    return f"### input: {example['input']}\n### output: {serialized_output}{eos_token}"

# render every loaded example into its training string
formatted_data = list(map(format_prompt, file))
# wrap the strings in a huggingface dataset with a single "text" column
dataset = Dataset.from_dict({"text": formatted_data})

In [None]:
# add LoRA adapters to the model
# names of the modules that receive an adapter
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# all of the values below can be changed for experiments
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=64,             # LoRA rank, higher = more capacity
    lora_alpha=128,   # LoRA scaling factor
    lora_dropout=0,   # dropout, 0 is optimized
    bias="none",      # bias, none is optimized
    use_rslora=False,   # rank stabilized LoRA (off)
    loftq_config=None,  # LoftQ (not used)
    use_gradient_checkpointing="unsloth",  # unsloth's optimized version
    random_state=3407,
)

In [None]:
# set up training
from trl import SFTTrainer
from transformers import TrainingArguments

# pick the mixed-precision mode once: bf16 when supported, fp16 otherwise
use_bf16 = torch.cuda.is_bf16_supported()

# training arguments for the unsloth + qwen3 run
training_args = TrainingArguments(
    # bookkeeping
    output_dir="outputs",
    seed=3407,
    report_to="none",  # disable weights & biases logging
    logging_steps=25,
    save_strategy="epoch",
    save_total_limit=2,
    # batching
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # effective batch size = 8
    dataloader_pin_memory=False,
    # schedule
    num_train_epochs=3,
    warmup_steps=10,  # maybe increase to 50
    lr_scheduler_type="linear",
    learning_rate=2e-4,  # 1e-4 is recommended for qwen
    # optimizer and precision
    optim="adamw_8bit",
    weight_decay=0.01,
    bf16=use_bf16,
    fp16=not use_bf16,
)

# supervised fine-tuning trainer over the "text" column of the dataset
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=training_args,
)

In [None]:
# train the model
# runs the training configured on `trainer`; whatever trainer.train() returns
# is kept in `trainer_stats` so it can be inspected afterwards
trainer_stats = trainer.train()
# training will take some time

In [None]:
# test the fine-tuned model
FastLanguageModel.for_inference(model) # enable fast inference

# create a test prompt
messages = [
    {
        "role": "user",
        "content": "extract the product information:\n<div class='product'>"
        "<h2>Organic Hass Avocado (Pack of 4)</h2><span class='price'>$6.49</span>"
        "<span class='category'>fruits</span><span class='brand'>FreshHarvest</span></div>"
    }
]

# The training text is built by format_prompt as
#   "### input: <input>\n### output: <json><eos>"
# and not with the tokenizer's chat template. The test prompt therefore has to
# use that same layout; prompting through apply_chat_template would probe the
# model in a format it never saw during fine-tuning.
# No trailing space after "output:": the space before the JSON is left for the
# model to generate (NOTE(review): a dangling space is expected to tokenize
# differently from the training text - confirm with the tokenizer).
prompt = f"### input: {messages[0]['content']}\n### output:"

# turn the prompt into model input (input_ids + attention_mask)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# generate model response
outputs = model.generate(
    **inputs,
    max_new_tokens=256,  # set higher for longer answers
    use_cache=True,
    temperature=0.7,  # good for simple answers
    do_sample=True,
    top_p=0.8,        # controls randomness , # official recommendation for non thinking mode
    top_k=20,         # limits choices , # official recommendation
    min_p=0           # official recommendation
)

# decode and print the answer: drop the echoed prompt tokens and the special
# tokens so only the newly generated text is shown
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.batch_decode(
    outputs[:, prompt_length:], skip_special_tokens=True
)[0]
print(response)

In [None]:
# save model in ollama supported format and in 4bit
# arguments: target name "gguf_model" (the download cell looks for the
# resulting .gguf inside a directory of that name), the tokenizer, and the
# "q4_k_m" quantization method
model.save_pretrained_gguf("gguf_model", tokenizer, quantization_method="q4_k_m")
# you can use this file for ollama

In [None]:
# download the gguf model file from colab
from google.colab import files
import os

gguf_dir = "gguf_model"

# collect the exported .gguf files; sorted because os.listdir returns entries
# in arbitrary order, and guarded so a missing directory gives a message
# instead of a FileNotFoundError
gguf_files = sorted(
    f
    for f in (os.listdir(gguf_dir) if os.path.isdir(gguf_dir) else [])
    if f.endswith(".gguf")
)
if gguf_files:
    # NOTE(review): the directory may hold more than one .gguf (for example an
    # intermediate unquantized export - confirm against the export output);
    # prefer the q4_k_m file and fall back to the first one otherwise
    quantized_files = [f for f in gguf_files if "q4_k_m" in f.lower()]
    gguf_file = os.path.join(gguf_dir, (quantized_files or gguf_files)[0])
    print(f"downloading: {gguf_file}")
    files.download(gguf_file)
else:
    # say so explicitly instead of finishing silently with nothing downloaded
    print(f"no .gguf file found in '{gguf_dir}' - run the export cell first")
# now you have the model file on your computer