# 04 - Evaluation: Base vs Fine-Tuned Model Comparison

This notebook performs a full evaluation comparison between the base model and the fine-tuned model:
- Load test data and tool schemas
- Run inference with both base and fine-tuned models
- Compute evaluation metrics (tool name accuracy, parameter accuracy, JSON validity, hallucination rate)
- Visualize results with comparison charts
- Perform sample-by-sample error analysis

In [None]:
import os

# Configure the CUDA allocator and tokenizers before torch/transformers are
# imported, so the settings are already in place when those libraries initialise.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import gc
import json
import sys

# Make the project-level `src` package importable from the notebooks directory.
sys.path.insert(0, "..")

import torch
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

from src.data_utils import load_jsonl, get_tokenizer
from src.eval_metrics import (
    parse_tool_calls_from_output, compute_full_evaluation,
    measure_latency, measure_vram_usage,
)

sns.set_theme(style="whitegrid")

## Configuration

In [None]:
# Model identifier used for the tokenizer and as the base weights of both evaluation runs.
BASE_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
# Location of the trained PEFT adapter that is loaded on top of BASE_MODEL.
ADAPTER_PATH = "../models/checkpoints/final_adapter"  # Update after training
# JSONL file containing the test conversations.
TEST_DATA = "../data/processed/test.jsonl"
# JSON file with the tool schemas; each entry exposes its name under ["function"]["name"].
TOOLS_SCHEMA = "../data/samples/gennx_tool_schemas_tier1.json"

## Load Test Data and Tools

In [None]:
# Test conversations, one JSON object per line.
test_samples = load_jsonl(TEST_DATA)

# Read the tool schemas inside a context manager so the file handle is closed
# deterministically; JSON is UTF-8, so do not depend on the platform default encoding.
with open(TOOLS_SCHEMA, encoding="utf-8") as schema_file:
    tools = json.load(schema_file)

# Names of every tool the model is allowed to call (passed to the evaluation metrics).
available_tools = [t["function"]["name"] for t in tools]

# Both models are evaluated with the base model's tokenizer.
tokenizer = get_tokenizer(BASE_MODEL)

print(f"Test samples: {len(test_samples)}")
print(f"Available tools: {len(available_tools)}")

## Helper: Run Inference on All Test Samples

In [None]:
def run_inference(model, tokenizer, test_samples, tools):
    """Greedily generate one completion per test sample and collect the gold tool calls.

    For each sample the prompt consists of the conversation up to and including
    the first ``user`` message (the whole conversation if there is none). The
    reference is the ``tool_calls`` value of the first ``assistant`` message that
    has that key, or an empty list when no such message exists.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__`` and ``decode``.
        test_samples: Iterable of dicts, each with a ``"messages"`` list.
        tools: Tool schemas forwarded to the chat template.

    Returns:
        ``(predictions, references)``: the decoded completions (special tokens
        kept) and the matching reference tool-call lists, in sample order.
    """
    generated, expected = [], []

    for sample in test_samples:
        conversation = sample["messages"]

        # Prompt: every message through the first user turn.
        first_user = next(
            (pos for pos, turn in enumerate(conversation) if turn["role"] == "user"),
            len(conversation) - 1,
        )
        prompt_messages = list(conversation[: first_user + 1])

        # Reference: tool calls of the first assistant turn that carries any.
        expected.append(
            next(
                (
                    turn["tool_calls"]
                    for turn in conversation
                    if turn["role"] == "assistant" and "tool_calls" in turn
                ),
                [],
            )
        )

        # Render the prompt and move the encoded batch to the model's device.
        prompt = tokenizer.apply_chat_template(
            prompt_messages, tools=tools, tokenize=False, add_generation_prompt=True
        )
        encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_length = encoded["input_ids"].shape[1]

        with torch.no_grad():
            sequences = model.generate(**encoded, max_new_tokens=512, do_sample=False)

        # Drop the echoed prompt tokens; only the newly generated tail is decoded.
        completion_ids = sequences[0][prompt_length:]
        generated.append(tokenizer.decode(completion_ids, skip_special_tokens=False))

    return generated, expected

## Evaluate Base Model

In [None]:
# 4-bit NF4 quantisation with double quantisation and fp16 compute; the same
# config is reused when the base weights are loaded again for the fine-tuned run.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True,
)
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, quantization_config=bnb_config, device_map="auto")

# `references` produced here is reused when scoring the fine-tuned model.
base_preds, references = run_inference(base_model, tokenizer, test_samples, tools)
base_metrics = compute_full_evaluation(base_preds, references, available_tools)
print("Base model metrics:")
for k, v in base_metrics.items():
    print(f"  {k}: {v:.4f}")

# Release the base model before the next model is loaded. `del` only drops this
# name; if the model is still held in a reference cycle, only a garbage-collection
# pass destroys it, and until then empty_cache() has nothing to hand back.
del base_model
gc.collect()
torch.cuda.empty_cache()

## Evaluate Fine-Tuned Model

In [None]:
# Load a fresh quantized copy of the base weights and wrap it with the trained
# PEFT adapter stored at ADAPTER_PATH.
ft_model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        device_map="auto",
    ),
    ADAPTER_PATH,
)

# The references were already collected during the base-model run, so the
# second return value is discarded here.
ft_preds, _ = run_inference(ft_model, tokenizer, test_samples, tools)
ft_metrics = compute_full_evaluation(ft_preds, references, available_tools)

print("Fine-tuned model metrics:")
for metric_name, metric_value in ft_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

## Metric Comparison Chart

In [None]:
# Plot the metrics in the order the base-model evaluation reported them.
metrics = list(base_metrics)
base_vals = [base_metrics[name] for name in metrics]
ft_vals = [ft_metrics[name] for name in metrics]

x = range(len(metrics))
width = 0.35
half_width = width / 2

fig, ax = plt.subplots(figsize=(10, 5))

# Two bars per metric, centred on the tick: base on the left, fine-tuned on the right.
bars1 = ax.bar(
    [pos - half_width for pos in x], base_vals, width,
    label="Base Model", color="steelblue",
)
bars2 = ax.bar(
    [pos + half_width for pos in x], ft_vals, width,
    label="Fine-Tuned", color="coral",
)

ax.set(ylabel="Score", title="Base vs Fine-Tuned Model Metrics")
ax.set_xticks(x)
# Break metric names at underscores so long labels wrap under their bars.
ax.set_xticklabels([name.replace("_", "\n") for name in metrics], fontsize=9)
ax.legend()
ax.set_ylim(0, 1.1)  # headroom above 1.0 for the value labels

# Write each bar's value just above its top edge.
for bar in (*bars1, *bars2):
    h = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, h + 0.02, f"{h:.2f}", ha="center", fontsize=8)

fig.tight_layout()
plt.show()

## Error Analysis

In [None]:
def _called_names(tool_calls):
    """Return the tool name of each parsed call, reading either a flat
    ``name`` key or a nested ``function.name``; ``"?"`` when neither exists."""
    return [tc.get("name", tc.get("function", {}).get("name", "?")) for tc in tool_calls]


print("=== Sample-by-Sample Comparison ===\n")
for sample_no, (base_out, ft_out, ref) in enumerate(
    zip(base_preds, ft_preds, references), start=1
):
    base_names = _called_names(parse_tool_calls_from_output(base_out))
    ft_names = _called_names(parse_tool_calls_from_output(ft_out))
    ref_names = [tc["function"]["name"] for tc in ref] if ref else []

    # A sample passes only when the fine-tuned model calls exactly the
    # reference tools, in the same order.
    if ft_names == ref_names:
        status = "\u2713"
    else:
        status = "\u2717"

    print(
        f"Sample {sample_no} {status}",
        f"  Reference:  {ref_names}",
        f"  Base:       {base_names}",
        f"  Fine-tuned: {ft_names}",
        "",
        sep="\n",
    )

## Summary
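- Compare improvements in tool name accuracy, parameter accuracy, and JSON validity (higher is better)
- Lower hallucination rate is better, so on the comparison chart a shorter fine-tuned bar for this metric is an improvement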
- Check individual samples to understand error patterns