In [None]:
# Install PyTorch (>= 2.4.0) together with TensorBoard and torchvision.
%pip install "torch>=2.4.0" tensorboard torchvision

# Install Hugging Face Transformers (>= 4.51.3) from the package index; it
# provides the AutoProcessor / AutoModelForImageTextToText classes used below
# to load the Gemma 3 checkpoints.
%pip install "transformers>=4.51.3"

# Install the remaining Hugging Face and support libraries, upgrading any
# versions that are already present. Most are pinned to exact versions.
# NOTE(review): torch, tensorboard, torchvision, transformers, protobuf and
# sentencepiece are NOT pinned to exact versions, so a fresh environment may
# resolve differently over time — consider pinning them as well.
%pip install  --upgrade \
  "datasets==3.3.2" \
  "accelerate==1.4.0" \
  "evaluate==0.4.3" \
  "bitsandbytes==0.45.3" \
  "trl==0.15.2" \
  "peft==0.14.0" \
  "pillow==11.1.0" \
  protobuf \
  sentencepiece

In [None]:

import torch
from transformers import AutoModelForImageTextToText, AutoProcessor, BitsAndBytesConfig
from peft import PeftModel
from datasets import load_dataset

# --- Identifiers -------------------------------------------------------------
# Weights come from the pre-trained (PT) checkpoint, while the processor comes
# from the instruction-tuned (IT) checkpoint; this pairing must match training.
base_model_id = "google/gemma-3-4b-pt"
processor_id = "google/gemma-3-4b-it"
adapter_dir = "gemma-3-cui-finetuned-sample1"

# --- Step 1: processor and its tokenizer --------------------------------------
processor = AutoProcessor.from_pretrained(processor_id)
tokenizer = processor.tokenizer

# --- Step 2: extend the vocabulary with the CUI codes -------------------------
# The codes are added as plain strings (e.g. "C0041618"), in the iteration
# order of `cui_mapping`, so that token ids line up with those seen in training.
# NOTE(review): `cui_mapping` is not defined in any visible cell — it must
# already exist in the kernel before this cell runs.
cui_tokens = [*cui_mapping.keys()]
num_added = tokenizer.add_tokens(cui_tokens)
print("Added", num_added, "CUI tokens")

# Hand the extended tokenizer back to the processor.
processor.tokenizer = tokenizer

# --- Step 3: load the base model with 4-bit NF4 quantization ------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16,
)
model_kwargs = {
    "attn_implementation": "eager",
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    "quantization_config": bnb_config,
}
model = AutoModelForImageTextToText.from_pretrained(base_model_id, **model_kwargs)

# --- Step 4: make the embedding table match the tokenizer length --------------
# Must happen before the adapter is attached.
# NOTE(review): presumably the same resize was applied during training so the
# saved adapter weights have matching shapes — confirm against the training code.
model.resize_token_embeddings(len(tokenizer))

# --- Step 5: attach the fine-tuned LoRA adapter and switch to eval mode -------
model = PeftModel.from_pretrained(model, adapter_dir)
model.eval()

In [None]:
# Pick one example to run inference on.
# NOTE(review): `dataset` is not created in any visible cell — it must already
# be loaded (e.g. with `load_dataset`) before this cell runs, and it must hold
# more than 1001 rows.
sample_generate = dataset[1001]
image = sample_generate["image"]

# Medical scans are frequently stored as grayscale or RGBA. Normalise to RGB
# before the image reaches the processor; images already in RGB are untouched.
# NOTE(review): assumes `image` is a PIL image (anything without a `mode`
# attribute is passed through as-is) and that training applied the same
# conversion — confirm against the training collate function.
if getattr(image, "mode", "RGB") != "RGB":
    image = image.convert("RGB")

In [None]:
# Prompt texts. These are reproduced verbatim; changing the wording changes
# what the model is conditioned on.
system_message = "You are a digital radiologist who can understand the medical scan of images code the concepts and provide captions"
user_prompt = "Create a description based on the provided image and return the description of the image with details of the scan as captions, the concepts and their descriptions, only the concepts that are extracted"

# System turn: text-only instructions.
system_turn = {
    "role": "system",
    "content": [{"type": "text", "text": system_message}],
}

# User turn: the text request followed by the image to describe.
user_turn = {
    "role": "user",
    "content": [
        {"type": "text", "text": user_prompt},
        {"type": "image", "image": image},
    ],
}

# Conversation in the list-of-turns format that is passed to
# processor.apply_chat_template.
messages = [system_turn, user_turn]

In [None]:
# Render the chat messages into a single prompt string (not yet tokenized).
chat_text = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,   # required for inference: prompt ends ready for the model's reply
    tokenize=False,
)

# Tokenize the prompt and preprocess the image into model inputs (batch size 1).
inputs = processor(
    text=[chat_text],
    images=[image],
    return_tensors="pt",
    padding=True,
)

# Move every input tensor to the device the model lives on.
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Sampling is stochastic; seed the torch RNGs immediately before generation so
# that re-running this cell (or Restart & Run All) on the same setup yields the
# same output.
GENERATION_SEED = 42
torch.manual_seed(GENERATION_SEED)

# Generate with nucleus sampling; gradients are not needed for inference.
with torch.no_grad():
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.9,
        top_p=0.9,
    )

# Keep only the newly generated tokens: slice off the first
# input_ids.shape[-1] positions so the prompt is not printed again.
gen_only_ids = generated_ids[:, inputs["input_ids"].shape[-1]:]
output_text = tokenizer.decode(gen_only_ids[0], skip_special_tokens=True)

print("MODEL OUTPUT:\n")
print(output_text)