<a href="https://colab.research.google.com/github/BoJavs-svg/LLM_Lora_FineTunning/blob/main/nb/Qwen3_(14B)-Reasoning-Conversational.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installation

In [2]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth
    !pip install fastapi uvicorn pyngrok nest-asyncio

from huggingface_hub import login
from google.colab import userdata
login(userdata.get('hf'))

### Unsloth

In [6]:
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
from peft import LoraConfig

# 1. Load the base model
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-14B",
    max_seq_length = 2048,
    load_in_4bit = True,
    load_in_8bit = False,
    full_finetuning = False,
)

# 2. Reapply LoRA the same way as originally done
model = FastLanguageModel.get_peft_model(
    base_model,
    r = 32,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)
peft_config = LoraConfig(
    r = 32,
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    task_type="CAUSAL_LM",

)
model.resize_token_embeddings(len(tokenizer))
# 3. Load adapter weights directly
# model.load_adapter("BoJavs/TrainedQwen2.5", adapter_name="default")
# model.set_adapter("default")
tokenizer.add_special_tokens({
    "additional_special_tokens": ["<|im_start|>", "<|im_end|>"]
})

# 4. Continue with training or inference
print(sum(p.requires_grad for p in model.parameters()))  # Should be > 0


==((====))==  Unsloth 2025.5.9: Fast Qwen2 patching. Transformers: 4.52.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

672


In [7]:
from datasets import load_dataset
swe_bench_lite = load_dataset('BoJavs/Clean_SweBench', split='train')

README.md:   0%|          | 0.00/861 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/739k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/172k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/176 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/44 [00:00<?, ? examples/s]

Let's see the structure of both datasets:

In [8]:
swe_bench_lite

Dataset({
    features: ['repo', 'instance_id', 'base_commit', 'patch', 'test_patch', 'problem_statement', 'hints_text', 'created_at', 'version', 'FAIL_TO_PASS', 'PASS_TO_PASS', 'environment_setup_commit', 'image_name'],
    num_rows: 176
})

Next we take the non reasoning dataset and convert it to conversational format as well.

> Agregar bloque entrecomillado



We have to use Unsloth's `standardize_sharegpt` function to fix up the format of the dataset first.

In [9]:
def separate_patch(patch_text):
    old_lines = []
    new_lines = []

    for line in patch_text.splitlines():
        if line.startswith('@@') or line.startswith('diff'):
            # diff hunk header, ignore or you can parse line ranges if needed
            continue
        elif line.startswith('-'):
            # removed line: add to old code only
            old_lines.append(line[1:])
        elif line.startswith('+'):
            # added line: add to new code only
            new_lines.append(line[1:])
        else:
            # context line, add to both old and new
            old_lines.append(line)
            new_lines.append(line)

    old_code = "\n".join(old_lines)
    new_code = "\n".join(new_lines)

    return old_code, new_code


In [10]:
def generate_conversation(instance):
  for problem, patch, repo in zip(
      instance["problem_statement"],
      instance["patch"],
      instance["repo"],
  ):
    prev, new = separate_patch(patch)
    user_prompt = f"""\
  We're currently solving the following issue within our repository. Here's the issue text:
  ISSUE:
  {problem}
  Now, you're going to solve this issue on your own.
  The issue is in:
  {prev}
  YOU MUST RETURN A PATCH"""
    patch_prompt= f"""\
  DISCUSSION
  The solved code for this problem is:
  {new}
  <command>
  {patch}
  </command>"""

  return {"prompt": user_prompt, "completion": patch_prompt}


In [11]:
def tokenize_function(example):
    text = f"{example['prompt']}\n{example['completion']}"
    return tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=2048,
    )

In [25]:
from unsloth.chat_templates import standardize_sharegpt
from datasets import Dataset
from trl import apply_chat_template

inst=[]
for row in swe_bench_lite:
  inst.append(generate_conversation(row))
prompts = [inst['prompt'] for inst in inst]
completions=[inst['completion'] for inst in inst]
dataset_dict={
    "prompt": prompts,
    "completion": completions,
}
inst_dataset = Dataset.from_dict(dataset_dict)
dataset=inst_dataset.map(tokenize_function, batched=False)

print(inst_dataset)

Map:   0%|          | 0/176 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 176
})


<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`.

In [31]:
from trl import SFTTrainer, SFTConfig,DataCollatorForCompletionOnlyLM
# we need to make sure it
response_template = "\n### Response:"
response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)[2:]
training_args = SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=2e-4,
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        report_to="none",

    )
trainer = SFTTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset,
    max_seq_length=1000,
)


Let's train the model! To resume a training run, set `trainer.train(resume_from_checkpoint = True)`

In [32]:
for _ in range(3):
  trainer_stats = trainer.train()
  model.save_pretrained("lora_model")
  tokenizer.save_pretrained("lora_model")
  model.push_to_hub_merged(
      "BoJavs/TrainedQwen2.5",
      tokenizer,
      save_method="lora",
  )

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 176 | Num Epochs = 1 | Total steps = 22
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 137,625,600/14,000,000,000 (0.98% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.0198
20,0.1243




Unsloth: Saving LoRA adapters. Please wait...


README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.66G [00:00<?, ?B/s]



Saved lora model to https://huggingface.co/BoJavs/TrainedQwen2.5


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 176 | Num Epochs = 1 | Total steps = 22
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 137,625,600/14,000,000,000 (0.98% trained)


Step,Training Loss
10,0.0785
20,0.0622


Unsloth: Saving LoRA adapters. Please wait...


adapter_model.safetensors:   0%|          | 0.00/3.66G [00:00<?, ?B/s]



Saved lora model to https://huggingface.co/BoJavs/TrainedQwen2.5


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 176 | Num Epochs = 1 | Total steps = 22
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 137,625,600/14,000,000,000 (0.98% trained)


Step,Training Loss
10,0.0658
20,0.0653


Unsloth: Saving LoRA adapters. Please wait...


adapter_model.safetensors:   0%|          | 0.00/3.66G [00:00<?, ?B/s]



Saved lora model to https://huggingface.co/BoJavs/TrainedQwen2.5


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [4]:
import torch

prompt = """You are an autonomous programmer.
ISSUE:
Fix this Python bug:

def add_numbers(a, b):
    return a - b

Your output MUST always have this format:
DISCUSSION
<command>
<patch>
</command>
You MUST RETURN A PATCH.
"""

# Tokenize and move to model's device
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate output using correct variable name
output = model.generate(**inputs, max_new_tokens=256)

# Decode only the newly generated tokens
decoded_output = tokenizer.decode(
    output[0][inputs['input_ids'].shape[-1]:],
    skip_special_tokens=True
).strip()

print("Output: " + decoded_output)


Output: The solved code for this problem is:
DISCUSSION
The issue in the code is that the function is currently subtracting the second number from the first number instead of adding them. To fix this, we need to change the subtraction operator to an addition operator.
<command>
<patch>
def add_numbers(a, b):
    return a + b
</command>


In [3]:
from unsloth import FastLanguageModel  # Or your actual import
from transformers import AutoTokenizer
from fastapi import FastAPI, Request
from pydantic import BaseModel
import uvicorn
from pyngrok import ngrok
import nest_asyncio
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-14B",
    max_seq_length = 5000,
    load_in_4bit = True,
    load_in_8bit = False,
    full_finetuning = False,
)

# 2. Reapply LoRA the same way as originally done
model = FastLanguageModel.get_peft_model(
    base_model,
    r = 32,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,

)
model.resize_token_embeddings(len(tokenizer))
# 3. Load adapter weights directly
model.load_adapter("BoJavs/TrainedQwen2.5", adapter_name="default")
model.set_adapter("default")

# model.save_pretrained_gguf("gguf_model", tokenizer, quantization_method="q4_k_m")
model.push_to_hub_merged("BoJavs/Qwen2_5Q4b", tokenizer, save_method = "merged_4bit", token = "")


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.5.9: Fast Qwen2 patching. Transformers: 4.52.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/196k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth 2025.5.9 patched 48 layers with 48 QKV layers, 48 O layers and 48 MLP layers.


adapter_model.safetensors:   0%|          | 0.00/3.66G [00:00<?, ?B/s]

Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 12.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 60.4 out of 83.48 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 85%|████████▌ | 41/48 [00:01<00:00, 51.40it/s]
We will save to Disk and not RAM now.
100%|██████████| 48/48 [00:07<00:00,  6.09it/s]


Unsloth: Saving tokenizer... Done.
Done.


Unsloth: Converting qwen2 model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at BoJavs/TrainedQwen2.5-GGUF into bf16 GGUF format.
The output location will be /content/BoJavs/TrainedQwen2.5-GGUF/unsloth.BF16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: TrainedQwen2.5-GGUF
INFO:hf-to-gguf:Model architecture: Qwen2ForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'mo

unsloth.Q4_K_M.gguf:   0%|          | 0.00/8.99G [00:00<?, ?B/s]



Saved GGUF to https://huggingface.co/BoJavs/TrainedQwen2.5-GGUF


In [None]:
Puntos:

In [5]:
# Apply asyncio patch for Colab
nest_asyncio.apply()
app = FastAPI()

# Load your model (adjust this to your model's loading code)

class RequestBody(BaseModel):
    prompt: str
    max_tokens: int = 10000
    temperature: float = 0.7
    stop: str | None = None

# Updated endpoint
@app.post("/api/generate")
def generate_completion(data: RequestBody):
    # Tokenize input
    inputs = tokenizer(data.prompt, return_tensors="pt").to(model.device)
    print("Got an input")
    # Generate output
    output = model.generate(
        **inputs,
        max_new_tokens=data.max_tokens,
        temperature=data.temperature,
    )
    print("Got an output")
    # Decode generated text
    decoded_output = tokenizer.decode(
        output[0][inputs['input_ids'].shape[-1]:],
        skip_special_tokens=True
    ).strip()

    return {"response": decoded_output}

# Open the public tunnel
ngrok.set_auth_token(userdata.get('ngrok'))
public_url = ngrok.connect(8000)
print("🔗 Public URL:", public_url)

# Start the FastAPI app
uvicorn.run(app, port=8000)


ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-1' coro=<Server.serve() done, defined at /usr/local/lib/python3.11/dist-packages/uvicorn/server.py:68> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/main.py", line 580, in run
    server.run()
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/server.py", line 66, in run
    return asyncio.run(self.serve(sockets=sockets))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 92, in run_until_complete
    self._run_once()
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 133, in _run_once
    handle._run()
  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run
    se

🔗 Public URL: NgrokTunnel: "https://6060-34-124-230-44.ngrok-free.app" -> "http://localhost:8000"


INFO:     Started server process [28293]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Got an input


INFO:     Shutting down
INFO:     Waiting for background tasks to complete. (CTRL+C to force quit)
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [28293]


Got an output
