In [None]:
from unsloth import FastModel
from unsloth.chat_templates import get_chat_template
import torch

from transformers import TextStreamer

In [2]:
# Alternative checkpoint, kept disabled -- noted to OOM with 24 GB VRAM:
# base_model = "unsloth/gemma-3-27b-it-unsloth-bnb-4bit"
base_model = "unsloth/gemma-3-27b-it-bnb-4bit"

# Used both as max_seq_length when loading and as max_new_tokens when generating.
max_tokens = 2048

# Filesystem-friendly name for the tuned model: every "/" in the base model
# id is turned into "_" before the suffix is appended.
tuned_model = "_".join(base_model.split("/")) + "-qlora-social-media"

# All artifacts of the tuned model live under one top-level directory.
tuned_model_top_dir = "model_" + tuned_model
tuned_model_save_dir = tuned_model_top_dir + "/model_save"
tuned_model_gguf_dir = tuned_model_top_dir + "/model_gguf"
tuned_model_checkpoints_dir = tuned_model_top_dir + "/checkpoints"
tuned_model_logs_dir = tuned_model_top_dir + "/logs"

In [None]:
# Load the tuned model from its local save directory.
# The save directory holds only the QLoRA weights, not the full model, so the
# base model shards must already be in the HF cache -- otherwise they will be
# downloaded all over again.
# (Swap model_name for base_model to load the untuned base model instead.)
load_options = dict(
    model_name=tuned_model_save_dir,
    max_seq_length=max_tokens,
    dtype=None,
    load_in_4bit=True,
    load_in_8bit=False,
    full_finetuning=False,
    # use_exact_model_name=True
)
model, tokenizer = FastModel.from_pretrained(**load_options)

# Hand the loaded model to Unsloth's for_inference before generating.
FastModel.for_inference(model)

In [4]:
# Rebind `tokenizer` to the object get_chat_template returns for the
# "gemma-3" chat template; later cells use it to render and tokenize prompts.
tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")

In [None]:
# Single-turn conversation: one user message whose content is a list of typed
# parts (here a single text part).
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What did you study?",
            }
        ],
    }
]

# Render the conversation into a prompt *string*.
# tokenize=False is explicit: a plain tokenizer's apply_chat_template
# tokenizes by default and would return token ids, which must not be fed back
# into the tokenizer as if they were text. With it set, the result is a string
# regardless of whether `tokenizer` is a tokenizer or a processor.
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # append the start of the model's turn
    tokenize=False,
)

# Pass the prompt by keyword: on a multimodal processor the first positional
# parameter may not be `text`, while `text=` is accepted by both tokenizers
# and processors.
# NOTE(review): the rendered template may already begin with a BOS token, in
# which case tokenizing here could add a second one -- confirm, and consider
# add_special_tokens=False if so.
inputs = tokenizer(text=[text], return_tensors="pt").to("cuda")

# Stream the sampled reply to the cell output as it is generated; the return
# value is discarded because the streamer already prints the text.
# NOTE(review): max_new_tokens equals the max_seq_length used at load time, so
# prompt + generated tokens can exceed that length -- confirm this is intended.
_ = model.generate(
    **inputs,
    max_new_tokens=max_tokens,
    temperature=1.0,
    top_p=0.95,
    min_p=0.0,
    top_k=64,
    streamer=TextStreamer(tokenizer, skip_prompt=True),  # do not echo the prompt
)