# 🔍 FunctionGemma Debug Test

Shows raw model output to diagnose the issue.

In [None]:
# Colab-only setup: mount Google Drive at /content/drive, then install the
# Python packages for this notebook (-q keeps pip output quiet).
# NOTE(review): nothing visible in this notebook reads from the mounted
# Drive - confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')
# IPython shell escape: this line only works inside a notebook/IPython session.
!pip install -q transformers peft torch accelerate bitsandbytes huggingface_hub

In [None]:
# Authenticate with the Hugging Face Hub; login() is called with no arguments.
# NOTE(review): presumably required because the base model and/or adapter
# repository is access-restricted - confirm.
from huggingface_hub import login
login()

In [None]:
import torch
import torch.distributed as dist
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Best-effort: bring up a single-process torch.distributed group (gloo backend,
# rank 0 of world size 1) if no group has been initialized yet.
# NOTE(review): presumably a workaround for a library in this stack that
# expects an initialized process group - confirm it is still needed.
try:
    if not dist.is_initialized():
        dist.init_process_group(
            backend="gloo",
            init_method="file:///tmp/debug_test",
            rank=0,
            world_size=1,
        )
except Exception as exc:
    # Non-fatal by design, but report the failure rather than hiding it.
    # Catching Exception (not everything) lets KeyboardInterrupt/SystemExit
    # propagate so the cell can still be interrupted.
    print(f"Warning: torch.distributed init skipped ({type(exc).__name__}: {exc})")

BASE_MODEL_ID = "google/medgemma-4b-it"
ADAPTER_ID = "NurseCitizenDeveloper/nursing-function-gemma"

# 4-bit NF4 quantization with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# device_map={"": 0} maps the whole model onto device index 0.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)

# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Wrap the quantized base model with the adapter loaded from ADAPTER_ID.
model = PeftModel.from_pretrained(model, ADAPTER_ID)
print("✅ Model loaded!")

In [None]:
# DEBUG TEST - the prompt has to follow the training format exactly
# (per the original author's note).
user_input = "BP is 120/80, pulse 72"

# Instruction text plus the list of callable functions.
tools_prompt = """You are a clinical AI agent. Convert clinical notes into function calls.

Functions:
- record_vitals(systolic=X, diastolic=Y, heart_rate=Z, temp_c=T)
- administer_medication(drug_name='X', dose='Y', route='Z')
- search_nmc_standards(query='X')

Extract the actual values from the input and output the correct function call."""

# One user turn carrying the instructions and the clinical note, followed by
# an opened model turn for the model to complete.
prompt = (
    "<start_of_turn>user\n"
    + tools_prompt
    + "\n\nInput: "
    + user_input
    + "<end_of_turn>\n<start_of_turn>model\n"
)

# Show the exact text that will be tokenized, framed by 60-character rules.
banner = "=" * 60
print(banner, "PROMPT BEING SENT:", banner, prompt, banner, sep="\n")

In [None]:
# Generate with explicit settings
# Tokenize the prompt and move the resulting tensors to the model's device.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
prompt_shape = inputs['input_ids'].shape
print(f"Input tokens: {prompt_shape}")

# Sampling is off, at most 50 new tokens, and the tokenizer's EOS id is used
# both as the stop token and as the pad token.
generation_settings = {
    "max_new_tokens": 50,
    "do_sample": False,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.eos_token_id,
}

# No gradients are needed for generation.
with torch.no_grad():
    outputs = model.generate(**inputs, **generation_settings)

print(f"Output tokens: {outputs.shape}")
# Difference between the output length and the prompt length (dimension 1).
print(f"New tokens generated: {outputs.shape[1] - prompt_shape[1]}")

In [None]:
# RAW output: decode the first output sequence as-is, without asking the
# tokenizer to skip special tokens.
raw_output = tokenizer.decode(outputs[0])

# One print call with newline separators yields the same five output lines.
divider = "=" * 60
print(divider, "RAW OUTPUT (with special tokens):", divider, raw_output, divider, sep="\n")

In [None]:
# Decode ONLY the new tokens.
# Everything in the first output sequence from position `input_length` (the
# prompt's token count) onward is treated as newly generated.
input_length = inputs['input_ids'].shape[1]
new_tokens = outputs[0][input_length:]
new_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

print("="*60)
print("ONLY NEW GENERATED TOKENS:")
print("="*60)
# Quoted so that leading/trailing whitespace or an empty result is visible.
print(f"'{new_text}'")
print("="*60)

# Nothing but whitespace remains once special tokens are stripped.
if not new_text.strip():
    print("\n⚠️ MODEL GENERATED NOTHING NEW!")
    print("This means the model is hitting EOS immediately.")
    print("The adapter may not have trained properly for generation.")

In [None]:
# Test with sampling enabled: same prompt tensors, but with temperature 0.7
# and nucleus sampling (top_p=0.9), to compare against the non-sampling run.
print("\n🧪 Testing with do_sample=True, temperature=0.7:")
with torch.no_grad():
    outputs2 = model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
        # Passed explicitly so both generate() calls use the same pad id.
        pad_token_id=tokenizer.eos_token_id,
    )

# `input_length` is the prompt token count computed when decoding the first
# run; slice it off to keep only the sampled continuation.
new_tokens2 = outputs2[0][input_length:]
new_text2 = tokenizer.decode(new_tokens2, skip_special_tokens=True)
print(f"Generated: '{new_text2}'")