In [2]:
import torch as t
from transformers import AutoProcessor, Gemma3ForConditionalGeneration
from peft import PeftModel
from personality.constants import MODEL_PATH


# --- Configuration ------------------------------------------------------------
# Base checkpoint to load and the constitution whose LoRA adapter is attached.
model_name = "gemma-3-4b-it"
constitution = "sarcasm"

# NOTE: despite its name, `model` holds the checkpoint *path* (a string), not a
# model object. The loaded networks are `base` and `lora_model` below.
model = f"{MODEL_PATH}/{model_name}"

# --- Base model and processor ---------------------------------------------------
# Keyword options for loading the base weights, gathered in one place so the
# load call itself stays short. Weights are requested in bfloat16.
base_load_options = dict(
    device_map="auto",
    torch_dtype=t.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
base = Gemma3ForConditionalGeneration.from_pretrained(model, **base_load_options)

# The processor is loaded from the same checkpoint path as the base model.
base_processor = AutoProcessor.from_pretrained(model, trust_remote_code=True)

# --- LoRA adapter ---------------------------------------------------------------
# Adapter directory is keyed by "<model_name>-<constitution>".
adapter_path = f"/workspace/gemma-gs-loras/{model_name}-{constitution}"

# Wrap `base` with the adapter stored at `adapter_path`.
# NOTE(review): PEFT is expected to inject the adapter layers into `base` itself,
# so `base` may no longer behave like the un-adapted model after this call --
# confirm before using `base` as a comparison baseline.
lora_model = PeftModel.from_pretrained(
    base, adapter_path, torch_dtype=t.bfloat16, trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [6]:
# A single user turn; the content is given as a list of typed parts (text only here).
user_turn = {
    "role": "user",
    "content": [{"type": "text", "text": "What is the capital of France?"}],
}
messages = [user_turn]

# Render the conversation through the processor's chat template, with the
# generation prompt appended, and get the result back as PyTorch tensors.
inputs = base_processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
)
# Move the batch to the device the LoRA-wrapped model reports, requesting bfloat16.
inputs = inputs.to(lora_model.device, dtype=t.bfloat16)

# Token count of the prompt. The slice below drops this many leading tokens
# so that only what follows the prompt is decoded.
input_len = inputs["input_ids"].shape[-1]

# Generate with sampling disabled, capped at 100 new tokens, without autograd tracking.
with t.inference_mode():
    output_ids = lora_model.generate(**inputs, max_new_tokens=100, do_sample=False)
    # First (only) sequence in the batch, minus the prompt prefix.
    generation = output_ids[0][input_len:]

# Convert the remaining token ids to text, omitting special tokens, and show it.
decoded = base_processor.decode(generation, skip_special_tokens=True)
print(decoded)



The capital of France is **Paris**. 

It’s a global center for art, fashion, gastronomy and culture. 😊 

Do you want to know anything more about Paris?
