# Qwen2.5-Coder-7B LoRA Fine-Tuning with Unsloth

In [None]:
%%capture
# Install the released package first, then uninstall it and reinstall the
# current code from GitHub. The GitHub install uses --no-deps, so whatever
# dependencies are present come from the first install.
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

## Load Base Model

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading configuration. max_seq_length is reused later when the trainer is
# constructed, so keep the two in sync by editing it here only.
max_seq_length = 2048
dtype = None  # None = auto detection
load_in_4bit = False  # Forwarded as load_in_4bit; False means 4-bit loading is not requested

# Load the base model together with its tokenizer. Both names are used by the
# LoRA, dataset-preparation, training and export cells.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-Coder-7B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

## Apply LoRA

In [None]:
# LoRA hyperparameters. alpha is tied to rank, so changing `rank` changes both.
rank = 256
alpha = rank

# Rebind `model` to the PEFT-wrapped model; adapters are requested for the
# projection modules listed in target_modules.
model = FastLanguageModel.get_peft_model(
    model,
    r=rank,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=alpha,
    lora_dropout=0,  # No dropout on the adapter
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,  # Same value as the trainer seed used in the training cell
    use_rslora=False,
    loftq_config=None,
)

## Prepare Dataset

In [None]:
# Prompt template with three positional slots, filled in this order:
# user edits, user excerpt, response. The training cell builds its
# response_template from the "### Response:" header plus the blank line that
# follows it, so that part of the template must not change independently.
alpaca_prompt = """### Instruction:
You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.

### User Edits:
{}

### User Excerpt:
{}

### Response:

{}
"""

# Appended to every formatted example by formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token
# Marker that formatting_prompts_func searches for in each output string.
original_start_marker = "<|editable_region_start|>"
# NOTE(review): not referenced anywhere in the visible code.
original_end_marker = "<|editable_region_end|>"


def format_example(events, input, output):
    """Render one example into the module-level ``alpaca_prompt`` template.

    Args:
        events: Text placed in the "User Edits" slot.
        input: Text placed in the "User Excerpt" slot.
        output: Text placed in the "Response" slot.

    Returns:
        The template with its three positional slots filled, in that order.
    """
    slots = (events, input, output)
    return alpaca_prompt.format(*slots)


def formatting_prompts_func(examples):
    """Build the training text for a batch of examples.

    Each output is trimmed so that it starts at ``original_start_marker``,
    then the example is rendered with ``format_example`` and terminated with
    ``EOS_TOKEN``.

    Args:
        examples: Batch mapping with parallel "events", "input" and "output"
            sequences of strings.

    Returns:
        A dict with a single "text" key holding one formatted string per
        input row, in the same order.
    """
    texts = []
    for ev, inp, out in zip(examples["events"], examples["input"], examples["output"]):
        marker_index = out.find(original_start_marker)
        # str.find returns -1 when the marker is absent. Slicing from -1 would
        # keep only the last character of the output, so leave the output
        # untouched in that case instead of silently destroying the target.
        if marker_index != -1:
            out = out[marker_index:]
        texts.append(format_example(ev, inp, out) + EOS_TOKEN)
    return {"text": texts}


def filter_long_sequences(examples, max_tokens=1500):
    """Tell whether formatted text fits within ``max_tokens`` tokens.

    Usable as a ``Dataset.filter`` predicate in both modes: for a single
    example (``examples["text"]`` is a string) it returns one bool; for a
    batch (``examples["text"]`` is a list of strings) it returns one bool
    per row.

    Args:
        examples: A single example or a batch, containing a "text" field.
        max_tokens: Inclusive upper bound on the tokenized length.

    Returns:
        ``True`` for rows to keep, either as a bool or as a list of bools.
    """
    texts = examples["text"]
    token_ids = tokenizer(texts)["input_ids"]
    if isinstance(texts, str):
        return len(token_ids) <= max_tokens
    # Batched input: input_ids holds one id list per row, so measure each row
    # rather than the number of rows in the batch.
    return [len(ids) <= max_tokens for ids in token_ids]

In [None]:
from datasets import load_dataset

# Load the train/eval splits from local JSONL files and add a "text" column
# to both by running formatting_prompts_func over batches.
dataset = load_dataset("json", data_files={"train": "train.jsonl", "eval": "eval.jsonl"})
dataset = dataset.map(formatting_prompts_func, batched=True)

# Only the train split is length-filtered.
# NOTE(review): the eval split is left unfiltered, so it can contain examples
# longer than the training limit — confirm this is intentional.
train_dataset = dataset["train"].filter(filter_long_sequences)
eval_dataset = dataset["eval"]

# Quick sanity check: split sizes and the start of the first formatted example.
print("train len", len(train_dataset))
print("eval len", len(eval_dataset))
print(train_dataset[0]["text"][:500])

## Training

In [None]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Must stay identical to the text that precedes the response slot in
# alpaca_prompt ("### Response:" followed by a blank line).
response_template = "### Response:\n\n"
# NOTE(review): presumably restricts the loss to tokens after the response
# template — confirm against the TRL documentation for the installed version.
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",  # Column produced by formatting_prompts_func
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    data_collator=collator,
    args=TrainingArguments(
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        max_steps=60,  # Training stops after 60 steps
        learning_rate=2e-4,
        # Exactly one of fp16/bf16 is enabled: bf16 when supported, else fp16.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
        # Evaluate on eval_dataset every 5 steps.
        eval_steps=5,
        eval_strategy="steps",
        do_eval=True,
    ),
)

In [None]:
# Baseline GPU memory figures (in GB, rounded to 3 decimals), taken before
# training so the post-training report can compute the training overhead.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

In [None]:
# Run training; the returned object's .metrics dict is read by the report cell.
trainer_stats = trainer.train()

In [None]:
# Post-training report: runtime plus peak reserved GPU memory, both in total
# and relative to the baseline (start_gpu_memory) and capacity (max_memory)
# recorded before training.
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
    sep="\n",
)

## Save Model

In [None]:
# Export the trained model and tokenizer in GGUF format under "model_gguf".
model.save_pretrained_gguf(
    "model_gguf",
    tokenizer,
    quantization_method="q4_k_m",  # Good balance of quality/size. Also: q8_0, f16, q5_k_m
)