# Agent Routing: Hallucinated Scores vs. True Math

This notebook demonstrates two ways to get "confidence" from an LLM regarding agent routing decisions.

1.  **The Naive Way (BAD):** Asking the model to output a number (0-100). That number is just generated text, not a measurement, so it is effectively a hallucination.
2.  **The Scientific Way (GOOD):** Using `logprobs` to measure the statistical probability of the next token.

### Setup

In [None]:
from __future__ import annotations

import math
import os
from typing import Any, Dict, List, Optional, Tuple, Literal

from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from tabulate import tabulate

# Read variables from a local .env file, then fail fast when the OpenAI key
# is absent or empty so later cells do not fail with a less obvious error.
load_dotenv()
if os.environ.get("OPENAI_API_KEY") in (None, ""):
    raise RuntimeError(
        "OPENAI_API_KEY not set. Create a .env with your key before running this notebook."
    )

# --- CONFIG: AGENT DEFINITIONS ---
# One card per candidate agent. Every card is a dict with exactly two keys,
# "name" and "description"; both are interpolated into the routing prompts.
AGENT_CARDS: List[Dict[str, str]] = [
    {"name": agent_name, "description": blurb}
    for agent_name, blurb in (
        ("TravelBuddy", "Plan quick city trips with budgets, flights, and local tips."),
        ("CodeFixer", "Debug and refactor small Python or JavaScript snippets."),
        ("HealthNote", "Summarize lifestyle and nutrition questions into simple advice."),
        ("BizPitch", "Draft short startup pitches and positioning statements."),
        ("DataScout", "Explain CSV/Excel columns and basic data cleaning steps."),
    )
]


## Part 1: The "Trap" (Naive Confidence Scores)

This is what managers usually ask for: *"Just ask the AI how sure it is!"*

**Why this is dangerous:**
When the LLM outputs "Confidence: 95%", it is **not** doing math. It is predicting the text characters "9" and "5" because they look plausible in the context of the prompt. It is a simulation of confidence, not a measurement.

In [None]:
# Structured-output schema for the "naive" approach: the model is asked to
# fill in a verdict, a self-reported confidence number and a justification.
# NOTE(review): documented with comments instead of a class docstring because
# a docstring would likely be picked up as the schema description sent to the
# model - confirm before converting.
class NaiveConfidenceOutput(BaseModel):
    # Routing verdict; restricted to the two literal strings.
    decision: Literal["YES", "NO"]
    # Number the model writes for itself. The 0-100 range is only stated in
    # the description text; nothing here validates it.
    confidence_score: int = Field(description="Confidence level from 0 to 100")
    # Free-text explanation produced by the model.
    reasoning: str

# Chat model wrapped so that its replies come back as NaiveConfidenceOutput.
naive_llm = ChatOpenAI(
    temperature=0,
    model="gpt-4o-mini",
).with_structured_output(NaiveConfidenceOutput)

def get_naive_score(query: str, card: Dict[str, str]):
    """Ask the model whether one agent fits the query and how "sure" it is.

    Args:
        query: The user request to route.
        card: Agent card; its "name" and "description" entries are read.

    Returns:
        Whatever the ``prompt | naive_llm`` chain yields. Because ``naive_llm``
        is bound to ``NaiveConfidenceOutput``, this is presumably an instance
        of that model (the demo cell reads ``decision``, ``confidence_score``
        and ``reasoning`` from it).
    """
    system_msg = (
        "system",
        "You are a router. Check if the agent matches the query. Provide a confidence score (0-100) reflecting how sure you are.",
    )
    user_msg = ("user", "Query: {query}\nAgent: {name} ({description})")

    template = ChatPromptTemplate.from_messages([system_msg, user_msg])
    pipeline = template | naive_llm

    inputs = {
        "query": query,
        "name": card["name"],
        "description": card["description"],
    }
    return pipeline.invoke(inputs)

# Demo of the naive approach: score the first agent card against one query
# and print the model's self-reported confidence.
test_query = "Plan a weekend trip to Porto."
test_agent = AGENT_CARDS[0]

result = get_naive_score(test_query, test_agent)

# A single print call with a newline separator emits the same lines, in the
# same order, as one print per line would.
print(
    f"QUERY: {test_query}",
    f"AGENT: {test_agent['name']}",
    "-" * 30,
    f"DECISION: {result.decision}",
    f"NAIVE SCORE: {result.confidence_score}%",
    f"REASON: {result.reasoning}",
    "-" * 30,
    "WARNING: This number (98/99/100) is just text generated by the model.",
    sep="\n",
)

## Part 2: The Solution (Logprobs)

Instead of asking for a number, we look at the **raw token probabilities**.

We ask the model to predict **"YES"** or **"NO"**.
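We enable `logprobs=True` so the API returns, for every generated token, its log-probability together with the log-probabilities of the most likely alternative tokens. This lets us read how likely "YES" and "NO" each were at the exact position where the model committed to its answer.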

In [None]:
# Base chat model for the logprob-based router.
llm_base = ChatOpenAI(
    model="gpt-4o-mini",
    # Cap on generated tokens; the reply only needs a YES/NO label plus one
    # short sentence of reasoning.
    max_tokens=64,
    temperature=0.3,
    # Request token log-probabilities; downstream code reads them from
    # response_metadata["logprobs"].
    logprobs=True,
    # Number of alternative tokens requested per position; downstream code
    # reads them from each step's "top_logprobs" list.
    top_logprobs=10,
)

# Closed set of labels the router may emit; reused by AgentDecision.answer.
Answer = Literal["YES", "NO"]

# Schema the router model is asked to fill in (bound via
# llm_base.with_structured_output). Documented with comments rather than a
# docstring so the schema generated from this class is left untouched.
class RouterLLMOutput(BaseModel):
    # The routing label. Declared first; extract_yes_no_enum_stats uses the
    # first token position that has a YES/NO candidate, which presumably
    # corresponds to this field - TODO confirm the model emits fields in
    # declaration order.
    answer: Answer = Field(description="Either 'YES' or 'NO'")
    # Short justification written by the model.
    reasoning: str = Field(description="One short English sentence")

# Router model with structured output. include_raw=True makes invoke() return
# a dict; the routing function reads its "raw" entry (for the logprobs in
# response_metadata) and its "parsed" entry (the RouterLLMOutput).
router_llm = llm_base.with_structured_output(RouterLLMOutput, include_raw=True)

# System instructions for the binary router. The empty first and last
# entries reproduce the leading and trailing newline of the prompt text.
SYSTEM_PROMPT = "\n".join(
    (
        "",
        "You are a neutral binary classifier for agent routing.",
        "You receive a user query and ONE agent card.",
        "Decide if this agent is suitable.",
        "Rules:",
        '- Set "answer" to "YES" or "NO" (uppercase).',
        '- "reasoning" must be one short sentence in English.',
        "",
    )
)

# Two-message chat template for the router: the fixed system instructions,
# then a user message with {query}, {name} and {description} placeholders.
prompt = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_PROMPT),
    ("user", "User Query:\n{query}\n\nAgent Card:\n{name}: {description}"),
])

### Helper Functions for Math
We need to extract the tokens, normalize them (e.g., " yes" == "YES"), and sum up their probabilities.

In [None]:
# Per-label statistics produced by extract_yes_no_enum_stats.
class YesNoStat(BaseModel):
    # Normalized label this entry describes ("YES" or "NO").
    token: str
    # Log-probability associated with the label.
    logprob: float
    # Probability of the label after renormalizing over the YES/NO labels
    # that were actually observed.
    prob: float

# One routing result: the model's parsed verdict for a single agent card
# together with the token-level probabilities behind it.
class AgentDecision(BaseModel):
    # Name of the agent card that was evaluated.
    agent_name: str
    # Label taken from the parsed structured output.
    answer: Answer
    # Reasoning sentence taken from the parsed structured output.
    reasoning: str
    # Renormalized YES / NO probabilities derived from the logprobs.
    prob_yes: Optional[float] = None
    prob_no: Optional[float] = None
    # (raw token, logprob) pairs seen at the decision position; used for the
    # "deep dive" table.
    raw_candidates: List[Tuple[str, float]] = Field(default_factory=list)

def _normalize_label_token(tok: str) -> str:
    """Canonicalize a raw token so label variants compare equal.

    Surrounding whitespace is removed first, then any leading or trailing
    single or double quote characters, and the remainder is upper-cased
    (for example ``" yes"`` and ``'"Yes"'`` both become ``"YES"``).
    """
    without_space = tok.strip()
    without_quotes = without_space.strip("\"'")
    return without_quotes.upper()

def extract_yes_no_enum_stats(logprobs: Optional[Dict[str, Any]]) -> Tuple[float, float, List[YesNoStat], List[Tuple[str, float]]]:
    """Turn token logprobs into renormalized YES / NO probabilities.

    The function walks the generated tokens in order and stops at the first
    position where the sampled token or any of its listed alternatives
    normalizes to "YES" or "NO" (see ``_normalize_label_token``). At that
    position every candidate is put into a YES bucket, a NO bucket, or
    ignored; the probability mass of each bucket is summed over its token
    variants (e.g. ``"YES"`` and ``"Yes"``) and the two buckets are then
    renormalized so they add up to 1.

    Args:
        logprobs: Mapping with a "content" list; each entry is read for its
            "token", "logprob" and "top_logprobs" (a list of mappings with
            "token" and "logprob"). ``None``, a missing/``None`` "content"
            or a missing/``None`` "top_logprobs" are tolerated.

    Returns:
        ``(p_yes, p_no, stats, candidates)`` where ``stats`` holds one
        ``YesNoStat`` per observed label (its ``logprob`` is the log of the
        bucket's summed probability) and ``candidates`` is the list of
        distinct ``(raw_token, logprob)`` pairs at the decision position.
        When no decision position exists the result is
        ``(0.0, 0.0, [], [])``.
    """
    content = (logprobs or {}).get("content") or []

    decision_candidates: Optional[List[Tuple[str, float]]] = None
    for step in content:
        # The sampled token is normally repeated inside top_logprobs. Keep a
        # single entry per raw token so no probability mass is counted twice,
        # while still including the sampled token if it is not in the list.
        by_token: Dict[str, float] = {step["token"]: step["logprob"]}
        for alt in step.get("top_logprobs") or []:
            by_token.setdefault(alt["token"], alt["logprob"])
        candidates = list(by_token.items())

        if any(_normalize_label_token(t) in ("YES", "NO") for t, _ in candidates):
            decision_candidates = candidates
            break

    if decision_candidates is None:
        return 0.0, 0.0, [], []

    # Group the candidate logprobs by normalized label.
    buckets: Dict[str, List[float]] = {"YES": [], "NO": []}
    for tok_raw, lp_raw in decision_candidates:
        label = _normalize_label_token(tok_raw)
        if label in buckets:
            buckets[label].append(lp_raw)

    def _log_mass(lps: List[float]) -> float:
        # log(sum(exp(lp))) computed relative to the peak for stability.
        peak = max(lps)
        return peak + math.log(sum(math.exp(lp - peak) for lp in lps))

    # At least one bucket is non-empty here, because the decision position
    # was selected precisely for containing a YES/NO candidate.
    bucket_lp = {label: _log_mass(lps) for label, lps in buckets.items() if lps}

    # Softmax over the observed buckets, again shifted by the maximum.
    max_lp = max(bucket_lp.values())
    weights = {label: math.exp(lp - max_lp) for label, lp in bucket_lp.items()}
    denom = sum(weights.values())

    stats = [
        YesNoStat(token=label, logprob=bucket_lp[label], prob=weights[label] / denom)
        for label in ("YES", "NO")
        if label in bucket_lp
    ]
    p_yes = weights.get("YES", 0.0) / denom
    p_no = weights.get("NO", 0.0) / denom

    return p_yes, p_no, stats, decision_candidates

In [None]:
def route_query_with_logprobs(query: str) -> None:
    """Score every agent card for ``query`` and print a ranked report.

    For each card in ``AGENT_CARDS`` the router model is invoked once; the
    parsed answer and reasoning are combined with the YES/NO probabilities
    derived from the token logprobs. Results are printed as a table sorted
    by p(YES), followed by a "deep dive" table of the raw candidate tokens
    for the top-ranked agent.

    Cards whose structured output could not be parsed are reported and
    skipped. Cards whose response carries no logprobs get probabilities of
    0.0 and an empty candidate list.

    Args:
        query: The user request to route.
    """
    print("=" * 80)
    print(f"QUERY: {query}")
    print("=" * 80)

    decisions = []

    for card in AGENT_CARDS:
        messages = prompt.format_messages(query=query, name=card["name"], description=card["description"])
        result = router_llm.invoke(messages)

        # With include_raw=True a parse failure (for example JSON truncated
        # by max_tokens) is reported as parsed=None instead of raising.
        parsed = result["parsed"]
        if parsed is None:
            print(f"[skipped] {card['name']}: could not parse model output ({result.get('parsing_error')})")
            continue

        # Logprobs may be absent from the metadata; only call the extractor
        # when there is token content to inspect.
        logprobs = result["raw"].response_metadata.get("logprobs")
        if logprobs and logprobs.get("content"):
            p_yes, p_no, _, raw_candidates = extract_yes_no_enum_stats(logprobs)
        else:
            p_yes, p_no, raw_candidates = 0.0, 0.0, []

        decisions.append(AgentDecision(
            agent_name=card["name"],
            answer=parsed.answer,
            reasoning=parsed.reasoning,
            prob_yes=p_yes, prob_no=p_no,
            raw_candidates=raw_candidates
        ))

    decisions.sort(key=lambda x: x.prob_yes or 0.0, reverse=True)

    table_data = [[d.agent_name, d.answer, f"{d.prob_yes:.4f}", f"{d.prob_no:.4f}", d.reasoning] for d in decisions]
    print(tabulate(table_data, headers=["Agent", "Ans", "p(YES)", "p(NO)", "Reason"], tablefmt="simple"))

    if not decisions:
        return

    top = decisions[0]
    print(f"\n>>> DEEP DIVE: Token probabilities for Agent '{top.agent_name}'")

    # The sampled token can appear twice among the candidates (once as the
    # sampled token, once among the alternatives). Keep one entry per raw
    # token so the visible mass is not double-counted.
    unique: Dict[str, float] = {}
    for tok, lp in top.raw_candidates:
        if tok not in unique or lp > unique[tok]:
            unique[tok] = lp

    if not unique:
        print("(no token logprobs available for this agent)")
        return

    sorted_candidates = sorted(unique.items(), key=lambda x: x[1], reverse=True)

    raw_rows = []
    total_vis = 0.0
    for tok, lp in sorted_candidates:
        prob = math.exp(lp) * 100
        total_vis += prob
        clean = _normalize_label_token(tok)
        cat = "YES bucket" if clean == "YES" else ("NO bucket" if clean == "NO" else "Ignored")
        raw_rows.append([repr(tok), f"{lp:.4f}", f"{prob:.2f}%", cat])

    print(tabulate(raw_rows, headers=["Token", "Logprob", "Prob(%)", "Bucket"], tablefmt="github"))
    print(f"... remaining {100-total_vis:.4f}% is in the tail.")

## Part 3: Running the Deep Dive
Now we see the "Science" behind the score. Notice how multiple tokens (e.g. `"YES"` vs `"Yes"`) might appear.

In [None]:
# Sample queries: one that clearly matches an agent and one that matches
# none of them well.
queries = [
    "Plan a weekend trip to Porto with a $600 budget.",
    "Is a tomato a vegetable? Just give me the fact.",
]

for q in queries:
    route_query_with_logprobs(q)
    # Blank lines between reports (two newlines plus print's own).
    print("\n\n")