In [1]:
%%capture
# Install Unsloth and the packages it needs. The %%capture cell magic above
# captures the cell's output, so the pip logs are not shown in the notebook.
import os
# Colab is detected by checking whether any environment variable name
# contains the substring "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The --no-deps installs skip pip's dependency resolution for those
    # packages; xformers is the only package pinned to a version here.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    !pip install --no-deps cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from datasets import load_dataset
from unsloth.chat_templates import get_chat_template

# Model configuration
# These three values are forwarded to FastLanguageModel.from_pretrained;
# max_seq_length is also passed to SFTTrainer.
max_seq_length = 2048
# NOTE(review): presumably None lets the loader choose the dtype itself — confirm
# against the Unsloth documentation.
dtype = None
# Passed as load_in_4bit when the model is loaded.
load_in_4bit = True

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# Load Model

In [None]:
import os

# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of pasting it into the notebook, so the secret is never saved or
# shared together with the file. An unset or empty variable results in
# token=None, i.e. no explicit token is passed.
hf_token = os.environ.get("HF_TOKEN") or None

# Load the base model and its tokenizer using the configuration values
# defined in the "Model configuration" section.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token,
)

==((====))==  Unsloth 2025.3.9: Fast Qwen2 patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

# Using LoRA adapters

In [4]:
# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# All adapter settings gathered in one place and unpacked into the call below.
lora_settings = dict(
    r = 16,  # LoRA rank
    target_modules = lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(model, **lora_settings)


Unsloth 2025.3.9 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


# Defining the Alpaca prompt

In [None]:
# Alpaca-style prompt template. It has three positional "{}" slots that
# str.format fills in order: instruction, input, response.
# Training fills the response slot with the expected output; inference
# leaves it empty so the model generates the continuation.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token of the loaded tokenizer; appended to every training text.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Render a batch of examples into Alpaca-formatted training strings.

    Args:
        examples: A batch mapping that provides the "instruction" and
            "output" columns as parallel sequences.

    Returns:
        A dict with a single "text" key holding one formatted string per
        example. The input slot of the template is always left empty, and
        each string ends with EOS_TOKEN.
    """
    pairs = zip(examples["instruction"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(instruction, "", output) + EOS_TOKEN
            for instruction, output in pairs
        ]
    }

# Load and split the dataset

In [6]:
# Read the local JSON file, take its "train" split and hold out 10% of it
# for evaluation; the fixed seed keeps the split reproducible.
dataset = load_dataset("json", data_files="new.json")["train"].train_test_split(
    test_size=0.1,
    seed=3407,
)

# Add the formatted "text" column to both splits (train first, then test).
train_dataset, eval_dataset = (
    dataset[split_name].map(formatting_prompts_func, batched=True, num_proc=2)
    for split_name in ("train", "test")
)

Generating train split: 0 examples [00:00, ? examples/s]

Map (num_proc=2):   0%|          | 0/8755 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/973 [00:00<?, ? examples/s]

In [7]:
# Report how many examples ended up in each split, one line per split.
print(
    f"Train dataset size: {len(train_dataset)}",
    f"Validation dataset size: {len(eval_dataset)}",
    sep="\n",
)

Train dataset size: 8755
Validation dataset size: 973


# Initializing the Supervised Fine-Tuning trainer

In [None]:
# Build the supervised fine-tuning trainer over the formatted "text" column,
# with a held-out split for periodic evaluation.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        # 4 examples per step x 8 accumulation steps per optimizer update.
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 8,
        warmup_steps = 20,
        max_steps = 150,
        learning_rate = 1e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
        # `eval_strategy` is the current name of this field; the older
        # `evaluation_strategy` spelling is deprecated in transformers.
        eval_strategy = "steps",  # Enables validation during training
        eval_steps = 25,  # Evaluates every 25 steps
    ),
)



Tokenizing to ["text"] (num_proc=2):   0%|          | 0/8755 [00:00<?, ? examples/s]

Tokenizing to ["text"] (num_proc=2):   0%|          | 0/973 [00:00<?, ? examples/s]

# Start fine-tuning

In [9]:
# Run fine-tuning with the trainer built in the previous cell. As the last
# expression of the cell, the returned training summary is displayed.
trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 8,755 | Num Epochs = 1 | Total steps = 150
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 8 x 1) = 32
 "-____-"     Trainable parameters = 29,933,568/1,830,055,936 (1.64% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
25,2.2488,1.890969
50,1.512,1.519949
75,1.4648,1.442016
100,1.394,1.4037
125,1.3823,1.388529
150,1.3628,1.385592


Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


TrainOutput(global_step=150, training_loss=1.6346281305948893, metrics={'train_runtime': 1842.5735, 'train_samples_per_second': 2.605, 'train_steps_per_second': 0.081, 'total_flos': 1.4072046857945088e+16, 'train_loss': 1.6346281305948893})

In [11]:
# Export the fine-tuned model to GGUF in the "model" directory, quantized
# with the q4_k_m method.
model.save_pretrained_gguf(
    "model",
    tokenizer,
    quantization_method = "q4_k_m",
)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.4G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.6 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 36/36 [00:01<00:00, 28.41it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model/pytorch_model-00001-of-00002.bin...
Unsloth: Saving model/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting qwen2 model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at model into f16 GGUF format.
The output location will be /content/model/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00002.bin'
INFO:hf-to-gguf:token_embd.weight,         torch.float16 --> F16, shape = {2048

# Inference script

In [None]:

# Call Unsloth's for_inference on the model before generating.
FastLanguageModel.for_inference(model)

# Fill the template with the question only; the response slot stays empty so
# the model writes the answer.
prompt = alpaca_prompt.format(
    "what is HAI-LLM ?",  # instruction
    "",  # input  - leave this blank
    "",  # output - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# The decoded text contains the prompt followed by the generated tokens.
tokenizer.batch_decode(outputs)

['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nwhat is HAI-LLM ?\n\n### Input:\n\n\n### Response:\nHAI-LLM stands for High-Accuracy-Inference Large Language Model, which is a type of large language model that focuses on achieving high accuracy in inference tasks. The model was created by Deepseek AI and is designed to be highly accurate in its predictions. It can be used for various applications such as text']

# Upload the GGUF file to Google Drive

In [None]:
import os
import shutil

from google.colab import drive

# Mount Google Drive so it is reachable under /content/drive.
drive.mount('/content/drive')

model_path = '/content/model/unsloth.Q4_K_M.gguf'
drive_folder = '/content/drive/My Drive/ModelFolder/'

# shutil.move only places the file *inside* drive_folder when that directory
# already exists; create it first so the cell also works on a Drive that does
# not have the folder yet.
os.makedirs(drive_folder, exist_ok=True)

# Move the quantized GGUF file into the Drive folder; the destination path is
# returned and shown as the cell output.
shutil.move(model_path, drive_folder)


Mounted at /content/drive


'/content/drive/My Drive/ModelFolder/unsloth.Q4_K_M.gguf'