In [9]:
from dotenv import load_dotenv
import os

load_dotenv("/home/dj/Code/fine-tuning/.env")

hf_token = os.getenv("HF_TOKEN")
os.environ['HF_TOKEN'] = hf_token

In [10]:
import torch
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported, FastLanguageModel

In [11]:
MODEL = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-unsloth-bnb-4bit"
REPO_NAME = "vector124" # your HF username here
MODEL_NAME = "deepseek-r1-distill-qwen-1.5-unsloth-sft-python"

In [12]:
# Load the 4bit pre quantized model of deepseek and the tokenizer

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL,
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

==((====))==  Unsloth 2025.2.12: Fast Qwen2 patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA GeForce GTX 1660. Max memory: 5.788 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [13]:
# We add the LORA adapters to the model
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2025.2.12 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [23]:
# Prepare the dataset

prompt_style = """Below is an instruction that describes a task, paired with a question that provides further context.
Write a response that appropriately answer the question.
Before answering, think carefully but concisely about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are an expert programmer with advanced knowledge in Python. Your task is to provide concise and easy-to-understand solutions. Please answer the following python question.

### Question:
{}

### Response:
{}
"""

EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    prompts = examples["prompt"]
    completions = examples["completion"]
    texts = []
    for prompt,completion in zip(prompts, completions):
        text = prompt_style.format(prompt, completion) + EOS_TOKEN
        texts.append(text)
    return {
        "text": texts,
    }


dataset = load_dataset("vector124/my-distiset-a391eb81", split="train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [30]:
print(dataset)
print(dataset[9]["text"])

Dataset({
    features: ['prompt', 'completion', 'system_prompt', 'text'],
    num_rows: 10
})
Below is an instruction that describes a task, paired with a question that provides further context.
Write a response that appropriately answer the question.
Before answering, think carefully but concisely about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are an expert programmer with advanced knowledge in Python. Your task is to provide concise and easy-to-understand solutions. Please answer the following python question.

### Question:
</think>

Sure! Please provide the Python question you'd like me to help with, and I'll offer a clear, concise solution.

### Response:
Okay, the user wants me to provide a Python solution. They mentioned I'm an expert, so I should aim for efficient and clean code.

First, I need to understand the problem clearly. Are they asking for something specific, like a function or script? Wi

In [15]:
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    tokenizer = tokenizer,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        max_steps = 3,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        output_dir = "outputs",
        optim = "adamw_8bit",
        seed = 3407,
    ),
)

Map: 100%|██████████| 10/10 [00:00<00:00, 2001.67 examples/s]
num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.
Applying chat template to train dataset (num_proc=10): 100%|██████████| 10/10 [00:01<00:00,  5.67 examples/s]
num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.
Tokenizing train dataset (num_proc=10): 100%|██████████| 10/10 [00:01<00:00,  5.49 examples/s]
num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.
Tokenizing train dataset (num_proc=10): 100%|██████████| 10/10 [00:00<00:00, 51.53 examples/s]


In [16]:
os.environ['UNSLOTH_RETURN_LOGITS'] = '1'

with torch.cuda.device(0):  # Assuming you're using GPU 0
    trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 3
 "-____-"     Number of trainable parameters = 18,464,768


Step,Training Loss
1,0.9521
2,1.1331
3,0.9801


In [17]:
# Save to the local directory and push it to the Hub
model.save_pretrained(MODEL_NAME)
tokenizer.save_pretrained(MODEL_NAME)
model.save_pretrained_merged(MODEL_NAME, tokenizer, save_method="merged_16bit")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 3.55 out of 31.24 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 96%|█████████▋| 27/28 [00:00<00:00, 33.54it/s]
We will save to Disk and not RAM now.
100%|██████████| 28/28 [00:00<00:00, 31.19it/s]


Unsloth: Saving tokenizer... Done.
Done.


In [18]:
fine_tuned_model = f"{REPO_NAME}/{MODEL_NAME}"
model.push_to_hub(fine_tuned_model, safe_serialization=None)

100%|██████████| 1/1 [00:01<00:00,  1.43s/it]


Saved model to https://huggingface.co/vector124/deepseek-r1-distill-qwen-1.5-unsloth-sft-python


In [None]:
# TODO: there's still errors here

tokenizer.push_to_hub(fine_tuned_model, safe_serialization=None)
model.push_to_hub_merged(fine_tuned_model, tokenizer, save_method="merged_16bit") # for vLLM
model.push_to_hub_gguf(
    f"{fine_tuned_model}_q4_k_m", tokenizer, quantization_method="q4_k_m"
) # as gguf

In [28]:
# Run inference
question = "How can I get the prime numbers from 0 to 125?"

FastLanguageModel.for_inference(model)
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=2048,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs)
print(response[0].split("### Response:")[1])

ValueError: Pointer argument (at 2) cannot be accessed from Triton (cpu tensor?)