In [1]:
# train_qwen3.py
from unsloth import FastLanguageModel, FastModel
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
import torch, os

# 1. Modell laden
model, tokenizer = FastModel.from_pretrained(
    model_name="unsloth/Qwen3-1.7B-unsloth-bnb-4bit",
    max_seq_length=2048,
    load_in_4bit=True,
    load_in_8bit=False,
    full_finetuning=False,
)

# 2. LoRA-Adapter
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# 3. Dataset laden und ggf. konvertieren
dataset = load_dataset("json", data_files="train.json", split="train")

def format_alpaca(example):
    return {"text": f"{example['prompt']}\n{example['completion']}"}

dataset = dataset.map(format_alpaca)

# 4. Training
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,          # alle 5 k
    args=SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=2,         # statt max_steps
        # oder: max_steps=625*2     # 2 Epochen
        learning_rate=2e-4,
        fp16=torch.cuda.is_bf16_supported() is False,
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=10,
        output_dir="qwen3-lora-full",
        optim="adamw_8bit",
    ),
)
trainer.train()

# 5. Speichern
trainer.save_model("qwen3-lora")
print("Fertig! Adapter liegt unter ./qwen3-lora")


Unsloth: Patching Xformers to fix some performance issues.
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.9: Fast Qwen3 patching. Transformers: 4.54.0.
   \\   /|    NVIDIA GeForce RTX 2070 SUPER. Num GPUs = 1. Max memory: 7.778 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Making `model.base_model.model.model` require gradients


Generating train split: 5000 examples [00:00, 51904.82 examples/s]
Map: 100% 5000/5000 [00:00<00:00, 17962.76 examples/s]
Unsloth: Tokenizing ["text"] (num_proc=2): 100% 5000/5000 [00:03<00:00, 1505.82 examples/s]
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 5,000 | Num Epochs = 2 | Total steps = 1,250
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 17,432,576 of 1,738,007,552 (1.00% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.0517
20,1.7449
30,1.3663
40,0.8971
50,0.5113
60,0.3669
70,0.3251
80,0.3147
90,0.2803
100,0.2843


Fertig! Adapter liegt unter ./qwen3-lora


In [4]:
# 0. Korrektes Laden des Adapters
from unsloth import FastModel
import torch

model, tokenizer = FastModel.from_pretrained(
    model_name = "./qwen3-lora" # <- Lade direkt den LoRA-Adapter-Ordner
)

# 1. 16-bit-Merge & GGUF
model.save_pretrained_merged(
    "qwen3-lora-merged",
    tokenizer,
    save_method="merged_16bit"
)

from unsloth import save_to_gguf
save_to_gguf("qwen3-lora-merged", quantization_method="q4_k_m")

print("✅ Export erfolgreich!")

==((====))==  Unsloth 2025.7.9: Fast Qwen3 patching. Transformers: 4.54.0.
   \\   /|    NVIDIA GeForce RTX 2070 SUPER. Num GPUs = 1. Max memory: 7.778 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Merging weights into 16bit: 100% 1/1 [00:17<00:00, 17.26s/it]


TypeError: save_to_gguf() missing 1 required positional argument: 'model_dtype'

In [11]:
from unsloth import FastModel, save_to_gguf

# 1. Lade den fertigen Adapter
model, tokenizer = FastModel.from_pretrained(model_name = "./qwen3-lora")

# 2. Speichere das gemergte 16-bit Modell
model.save_pretrained_merged(
    "qwen3-lora-merged",
    tokenizer,
    save_method = "merged_16bit",
)

# 3. Konvertiere den Ordner zu GGUF mit dem korrekten Datentyp
save_to_gguf(
    "qwen3-lora-merged",
    "float16",  # <--- Das fehlende Argument
    quantization_method = "q4_k_m"
)

print("\n✅ Fertig! Deine GGUF-Datei liegt im aktuellen Verzeichnis.")

==((====))==  Unsloth 2025.7.9: Fast Qwen3 patching. Transformers: 4.54.0.
   \\   /|    NVIDIA GeForce RTX 2070 SUPER. Num GPUs = 1. Max memory: 7.778 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Merging weights into 16bit: 100% 1/1 [00:12<00:00, 12.42s/it]


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at unsloth_finetuned_model into f16 GGUF format.
The output location will be /workspace/unsloth_finetuned_model/unsloth.F16.gguf
This might take 3 minutes...
ERROR:hf-to-gguf:Error: unsloth_finetuned_model is not a directory


RuntimeError: Unsloth: Quantization failed for /workspace/unsloth_finetuned_model/unsloth.F16.gguf
You might have to compile llama.cpp yourself, then run this again.
You do not need to close this Python program. Run the following commands in a new terminal:
You must run this in the same folder as you're saving your model.
git clone --recursive https://github.com/ggerganov/llama.cpp
cd llama.cpp && make clean && make all -j
Once that's done, redo the quantization.

In [12]:
from unsloth import FastModel, save_to_gguf

# Der Name des Ordners, den Unsloth intern erwartet
output_folder = "unsloth_finetuned_model"

# 1. Lade den fertigen Adapter
model, tokenizer = FastModel.from_pretrained(model_name = "./qwen3-lora")

# 2. Speichere in den von Unsloth erwarteten Ordnernamen
model.save_pretrained_merged(
    output_folder,
    tokenizer,
    save_method = "merged_16bit",
)

# 3. Rufe den GGUF-Export auf, der diesen Ordner jetzt finden wird
save_to_gguf(
    output_folder,
    "float16",
    quantization_method = "q4_k_m"
)

print(f"\n✅ Fertig! Deine GGUF-Datei (z.B. {output_folder}-q4_k_m.gguf) liegt im aktuellen Verzeichnis.")

==((====))==  Unsloth 2025.7.9: Fast Qwen3 patching. Transformers: 4.54.0.
   \\   /|    NVIDIA GeForce RTX 2070 SUPER. Num GPUs = 1. Max memory: 7.778 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Merging weights into 16bit: 100% 1/1 [05:48<00:00, 348.74s/it]
Unsloth: Converting unsloth_finetuned_model model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at unsloth_finetuned_model into f16 GGUF format.
The output location will be /workspace/unsloth_finetuned_model/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: unsloth_finetuned_model
INFO:hf-to-gguf:Model architecture: Qwen3ForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:token_embd.weight,         torch.bfloat16 --> F16, shape = {2048, 151936}
INFO:hf-to-gguf:blk.0.attn_norm.weight,    