In [None]:
from unsloth import FastModel, to_sharegpt
from unsloth.chat_templates import get_chat_template, standardize_data_formats, train_on_responses_only
from datasets import load_dataset
from transformers import TextStreamer
from trl import SFTTrainer, SFTConfig
import torch

import os
import json

In [2]:
# Base checkpoint to fine-tune.
# The alternative "unsloth/gemma-3-27b-it-unsloth-bnb-4bit" will OoM with 24 GB VRAM,
# so the plain bnb-4bit build is used instead.
base_model = "unsloth/gemma-3-27b-it-bnb-4bit"
# Used as max_seq_length when loading the model and as max_new_tokens when generating.
max_tokens = 2048

# Run name: the repo id with "/" made filesystem-safe, plus a task suffix.
tuned_model = base_model.replace("/", "_") + "-qlora-social-media"

# Every artifact of this run lives under one top-level directory.
tuned_model_top_dir = "model_" + tuned_model
(
    tuned_model_save_dir,
    tuned_model_gguf_dir,
    tuned_model_checkpoints_dir,
    tuned_model_logs_dir,
) = (
    f"{tuned_model_top_dir}/{leaf}"
    for leaf in ("model_save", "model_gguf", "checkpoints", "logs")
)

# Only the top-level directory is created here.
os.makedirs(tuned_model_top_dir, exist_ok=True)

In [None]:
# Load the base checkpoint and its tokenizer in 4-bit precision.
# Full fine-tuning is disabled; the sequence length limit comes from max_tokens.
model, tokenizer = FastModel.from_pretrained(
    model_name=base_model,
    max_seq_length=max_tokens,
    load_in_4bit=True,
    load_in_8bit=False,
    full_finetuning=False,
    # use_exact_model_name=True
)

In [None]:
# Attach LoRA adapters to the loaded model.
# Language layers (attention and MLP modules) are fine-tuned; vision layers are not.
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers=False,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    r=8,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
    # NOTE(review): rank-stabilized LoRA presumably changes how lora_alpha scales
    # with r — confirm r/lora_alpha were chosen with that in mind.
    use_rslora=True,
)

In [None]:
# Index of the sample row that is printed after each preprocessing step.
n_example = 4

# Read the local CSV as the "train" split and shuffle it with a fixed seed.
dataset = load_dataset(
    "csv",
    name="csv-for-gemma3",
    split="train",
    data_files="conversations.csv",
).shuffle(seed=42)

# Show the available columns, then one raw sample row.
print(dataset.column_names)
print(json.dumps(dataset[n_example], indent=2))

In [None]:
# Attach the Gemma-3 chat template to the tokenizer.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="gemma-3",
)

# Convert the rows to ShareGPT-style conversations: the prompt is built from the
# `parent_text` column and the reply is taken from the `comment_body` column.
# NOTE(review): the [[...]] wrapper presumably marks that part of the prompt as
# optional (dropped when the column is empty) — confirm against the Unsloth docs.
dataset = to_sharegpt(
    dataset,
    merged_prompt="[[\nYour input is:\n{parent_text}]]",
    output_column_name="comment_body",
)
# Inspect the same sample row after conversion.
print(json.dumps(dataset[n_example], indent=2))

In [None]:
# Normalize the conversation records with Unsloth's helper.
# NOTE(review): the exact output schema is not visible here — the next step reads
# the "conversations" column, so that column is assumed to survive; confirm.
dataset = standardize_data_formats(dataset)
# Inspect the same sample row after standardization.
print(json.dumps(dataset[n_example], indent=2))

In [None]:
def apply_chat_template(examples):
    """Render a batch of conversations to plain training strings.

    Args:
        examples: A batch from ``dataset.map(..., batched=True)``; its
            ``"conversations"`` entry is a list of conversations.

    Returns:
        A dict with a single ``"text"`` key holding one rendered string per
        conversation, in the same order as the input batch.
    """
    texts = [
        tokenizer.apply_chat_template(
            conversation,
            # Always return text: the default for `tokenize` differs between
            # tokenizer and processor classes, so do not rely on it.
            tokenize=False,
            # Training samples already contain the model turn.
            add_generation_prompt=False,
        ).removeprefix("<bos>")
        # The leading <bos> is stripped because the trainer's own tokenization
        # is expected to add it again (this mirrors Unsloth's Gemma-3 example).
        for conversation in examples["conversations"]
    ]
    return {"text": texts}


# Apply the chat template batch-wise; the mapped function returns a "text" entry
# for every row, which becomes a new column.
dataset = dataset.map(apply_chat_template, batched=True)
# Inspect the same sample row, now including the rendered text.
print(json.dumps(dataset[n_example], indent=2))

In [None]:
# Drop the structured conversations; the trainer is configured to read the
# "text" field (dataset_text_field="text").
dataset = dataset.remove_columns("conversations")
# Inspect the same sample row after the column was removed.
print(json.dumps(dataset[n_example], indent=2))

In [None]:
# Per-device batch size and gradient-accumulation steps; their product is the
# number of samples consumed per optimizer step.
batch_size = 1
accum_steps = 2
dataset_length = len(dataset)

# Steps for one pass over the data (floor division), and a warmup of one tenth of them.
training_steps = dataset_length // (batch_size * accum_steps)
warmup_steps = training_steps // 10

print(
    f"dataset length: {dataset_length}",
    f"training steps: {training_steps}",
    f"warmup steps:   {warmup_steps}",
    sep="\n",
)

In [None]:
# Build the supervised fine-tuning trainer over the rendered "text" column.
# No evaluation set is used. Checkpoints are written to tuned_model_checkpoints_dir.
# NOTE(review): tuned_model_logs_dir is defined in the configuration cell but is not
# passed to the trainer in the visible cells — confirm whether that is intentional.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=None,
    dataset_num_proc=2,
    args=SFTConfig(
        dataset_text_field="text",
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=accum_steps,
        save_strategy="steps",
        # log every step
        logging_steps=1,
        logging_strategy="steps",
        #
        # full training run: one epoch, warmup computed from the dataset size,
        # checkpoint every 1000 steps
        #
        num_train_epochs=1,
        warmup_steps=warmup_steps,
        save_steps=1000,
        #
        # short training run: uncomment these three lines (and comment out the
        # three lines of the full run above) for a quick test
        #
        # max_steps=100,
        # warmup_steps=10,
        # save_steps=10,
        #
        # end of training run length definition
        learning_rate=1e-4,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=42,
        output_dir=tuned_model_checkpoints_dir,
    ),
)

In [12]:
# Restrict the training signal to the model's replies. The two markers are the
# Gemma-3 turn headers that open a user turn and a model turn respectively.
# NOTE(review): presumably every label outside a model turn is set to -100 — the
# label inspection cell further down treats -100 as "masked"; confirm.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<start_of_turn>user\n",
    response_part="<start_of_turn>model\n",
)

In [None]:
# Decode the token ids of the sample row back to text to check the final
# training input as the trainer sees it.
print(tokenizer.decode(trainer.train_dataset[n_example]["input_ids"]))

In [None]:
# Show which part of the sample row is actually trained on: label positions
# holding -100 are swapped for the pad token, the sequence is decoded, and the
# pad tokens are then blanked out with spaces.
sample_labels = trainer.train_dataset[n_example]["labels"]
decodable_ids = [
    tokenizer.pad_token_id if label == -100 else label
    for label in sample_labels
]
decoded_labels = tokenizer.decode(decodable_ids)
print(decoded_labels.replace(tokenizer.pad_token, " "))

In [None]:
# Resume training if the checkpoints directory already holds at least one
# "checkpoint-*" sub-directory; otherwise start a fresh run.
# The directory may not exist yet on a first run, and stray files whose names
# merely start with "checkpoint-" must not trigger a resume, so both are checked.
has_checkpoint = os.path.isdir(tuned_model_checkpoints_dir) and any(
    entry.startswith("checkpoint-")
    and os.path.isdir(os.path.join(tuned_model_checkpoints_dir, entry))
    for entry in os.listdir(tuned_model_checkpoints_dir)
)

if has_checkpoint:
    print("Resuming from checkpoint")
    # True lets the trainer pick the latest checkpoint in its output_dir.
    trainer_stats = trainer.train(resume_from_checkpoint=True)
else:
    print("Starting from scratch")
    trainer_stats = trainer.train()

In [None]:
# Smoke test: ask the fine-tuned model one short question and stream the answer.

# Re-attach the Gemma-3 chat template before formatting the prompt.
tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")

# A single user turn with one text part.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Give a simple yes or no answer: is water wet? Do not elaborate at all beyond the one-word answer.",
            }
        ],
    }
]

# Render the conversation and append the header that opens the model's turn.
text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

# Encode the prompt on the GPU and stream only the newly generated tokens.
prompt_inputs = tokenizer([text], return_tensors="pt").to("cuda")
answer_streamer = TextStreamer(tokenizer, skip_prompt=True)

_ = model.generate(
    **prompt_inputs,
    max_new_tokens=max_tokens,
    # recommended Gemma-3 settings
    temperature=1.0,
    top_p=0.95,
    min_p=0.0,
    top_k=64,
    streamer=answer_streamer,
)

In [None]:
# Save the fine-tuned adapter together with the tokenizer.
# Only the QLoRA weights are saved here, not the complete model: the base model
# shards are still required to run the fine-tuned QLoRA weights.
model.save_pretrained(tuned_model_save_dir)
tokenizer.save_pretrained(tuned_model_save_dir)

In [None]:
# Merge the QLoRA weights with the BF16 weights and save the merged model,
# together with the tokenizer, into the GGUF working directory.
model.save_pretrained_merged(tuned_model_gguf_dir, tokenizer)

In [None]:
# Save the merged weights in GGUF format.
# Exactly one export is active (BF16); the commented lines are alternative
# quantization types that can be swapped in.

# model.save_pretrained_gguf(tuned_model_gguf_dir, quantization_type="Q8_0")
# model.save_pretrained_gguf(tuned_model_gguf_dir, quantization_type="F16")
model.save_pretrained_gguf(tuned_model_gguf_dir, quantization_type="BF16")
# model.save_pretrained_gguf(tuned_model_gguf_dir, quantization_type="F32")

In [None]:
# Generate the Ollama Modelfile next to the GGUF output.
# Requires the chat-template setup to have run earlier in this session (the
# original note names tokenizer.apply_chat_template()).
# NOTE(review): `_ollama_modelfile` is a private attribute that is presumably
# attached by get_chat_template() — confirm which call actually sets it.
# The encoding is pinned so the file does not depend on the platform's locale.
with open(f"{tuned_model_gguf_dir}/Modelfile", "w", encoding="utf-8") as f:
    f.write(tokenizer._ollama_modelfile)

In [None]:
# Print the metrics of the training run (the value returned by trainer.train()) as JSON.
print(json.dumps(trainer_stats.metrics, indent=2))

# Load the fine-tuned model into Ollama

Reference guide for importing a Hugging Face model into Ollama: https://www.gpu-mart.com/blog/import-models-from-huggingface-to-ollama

This could be automated, but there are many possible variations and it only needs to be done once — after the model has been fine-tuned and tested — so I kept it as a manual step.

```
# cd into ${tuned_model_top_dir}
cd model_unsloth_gemma-3-27b-it-bnb-4bit-qlora-social-media/

# merge the shards (skip if only one shard exists)
# you only need to indicate the first shard
llama-gguf-split --merge model_gguf.BF16-00001-of-00002.gguf model_gguf.BF16.gguf

# remove the shards
rm model_gguf.BF16-*-of-*.gguf

# quantize the model
llama-quantize model_gguf.BF16.gguf model_gguf.Q4_K_M.gguf Q4_K_M

# remove the merged BF16 GGUF
rm model_gguf.BF16.gguf

# copy the Modelfile out of model_gguf
cp model_gguf/Modelfile .
```

Edit the Modelfile and replace the value (file location) in the top line with the actual file name of the quantized GGUF:

```
FROM "./model_gguf.Q4_K_M.gguf"
```

Also adjust the temperature, `num_predict` (max tokens), and/or other parameters, to match the values in the `model.generate()` step above.

Load the model into Ollama. Choose a name that reflects both the base model and the fine-tuning you did:

```
ollama create "gemma-3-27b-it-bnb-4bit-qlora-social-media" -f Modelfile
```

Now you can run the model in Ollama.