# 01 - Inference Basics: Qwen2.5-1.5B-Instruct

This notebook explores baseline inference with the off-the-shelf (not yet fine-tuned) Qwen2.5-1.5B-Instruct model.
We will measure VRAM usage under 4-bit quantization and test zero-shot tool calling
to establish a baseline before fine-tuning.

In [None]:
# Setup cell: configure the environment, extend the import path, import the
# libraries used below, and report the PyTorch / CUDA status.
#
# The two environment variables are assigned BEFORE `torch` and `transformers`
# are imported; keep this ordering when editing the cell.
# NOTE(review): presumably the CUDA allocator and the tokenizers library read these
# settings when they initialise — confirm against the PyTorch / tokenizers docs.
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import sys
# Put the parent directory first on the module search path. Later cells import
# from `src.eval_metrics`, presumably resolved through this entry — verify the repo layout.
sys.path.insert(0, "..")
import torch
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Print the installed PyTorch version and whether CUDA is available.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Only query the device name when CUDA is available (device index 0).
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Load Model in 4-bit Quantization

In [None]:
# Hub identifier of the checkpoint used by the model and tokenizer loads below.
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

# Quantization settings: 4-bit NF4 weights with double quantization enabled,
# and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# The tokenizer and the quantized model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

# Report the parameter count with thousands separators.
print(f"Model loaded. Parameters: {model.num_parameters():,}")

## VRAM Usage

In [None]:
# Project helper; the keys read below ('used_mb', 'total_mb', 'percent') are the
# only part of its return value this cell relies on.
from src.eval_metrics import measure_vram_usage

vram = measure_vram_usage()

# Build the one-line summary first, then emit it.
vram_summary = f"VRAM: {vram['used_mb']:.0f} MB / {vram['total_mb']:.0f} MB ({vram['percent']:.1f}%)"
print(vram_summary)

## Basic Inference

In [None]:
# Conversation for the smoke test: a generic system prompt and one user question.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is structural analysis?"},
]

# Render the conversation to a plain prompt string (tokenize=False) with the
# generation prompt appended, then tokenize it and move the tensors to the
# model's device.
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(model.device)

# Generate with sampling disabled, capped at 256 new tokens; gradients are not tracked.
with torch.no_grad():
    outputs = model.generate(**inputs, do_sample=False, max_new_tokens=256)

# The first sequence starts with the prompt tokens; slice them off so that only
# the newly generated tokens are decoded.
prompt_length = inputs["input_ids"].shape[1]
generated_ids = outputs[0][prompt_length:]
response = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(response)

## Zero-Shot Tool Calling Test

In [None]:
# A single tool definition passed to the chat template: one function with a
# required object-valued "Assign" parameter.
tools = [
    {
        "type": "function",
        "function": {
            "name": "POST /db/node",
            "description": "Create or add nodes to the structural model",
            "parameters": {
                "type": "object",
                "properties": {
                    "Assign": {"type": "object", "description": "Node assignments"}
                },
                "required": ["Assign"]
            }
        }
    }
]

# The user turn is Korean written with \u escapes; in English: "Add node 1 at the origin."
messages = [
    {"role": "system", "content": "You are a structural engineering assistant for GEN NX."},
    {"role": "user", "content": "\uc808\uc810 1\ubc88\uc744 \uc6d0\uc810\uc5d0 \ucd94\uac00\ud574\uc918"},
]

# Render the prompt with the tool definitions included, as a string.
text = tokenizer.apply_chat_template(
    messages,
    tools=tools,
    add_generation_prompt=True,
    tokenize=False,
)

# Show only the first 500 characters of the rendered prompt, followed by an ellipsis line.
print("=== Formatted Prompt ===")
prompt_preview = text[:500]
print(prompt_preview)
print("...")

# Tokenize, move to the model's device, and generate with sampling disabled
# (at most 512 new tokens); gradients are not tracked.
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, do_sample=False, max_new_tokens=512)

# Decode only the tokens after the prompt. Special tokens are kept in the text here
# (skip_special_tokens=False).
# NOTE(review): presumably so tool-call markup remains visible to the parser — confirm.
prompt_length = inputs["input_ids"].shape[1]
generated_ids = outputs[0][prompt_length:]
response = tokenizer.decode(generated_ids, skip_special_tokens=False)
print("\n=== Model Output ===")
print(response)

## Parse Tool Calls

In [None]:
# Project helper that extracts tool calls from the raw model output string.
from src.eval_metrics import parse_tool_calls_from_output

tool_calls = parse_tool_calls_from_output(response)

# Report how many calls were found, then pretty-print each one as JSON.
# ensure_ascii=False keeps non-ASCII characters unescaped in the printed output.
print(f"Parsed {len(tool_calls)} tool call(s):")
for tc in tool_calls:
    rendered_call = json.dumps(tc, ensure_ascii=False, indent=2)
    print(rendered_call)