
# Parametric QoS — Agents Notebook

This notebook was generated from your Mermaid specification and contains the requested sections: **[SETUP]**, **[SETUP-ENV]**, **[KERNEL]**, **[TOOLS]**, **[AGENTS]**, **[WIRES]**, **[DEMO]**.

```mermaid
flowchart LR 
  %% ===================== PARAMETRIC (QoS) =====================
  %% Latency / cost constraints and budgets per tool / per turn

  %% -------- CONFIG (INTENT / TENANT) --------
  subgraph CFG["QoS Config (per tenant, per intent)"]
    SLO_LAT[p95_latency_target_ms]
    BUD_TOK[budget_tokens_per_turn]
    BUD_USD[budget_usd_per_turn]
    CAP_TOOL["per_tool_caps\n(tokens, timeouts, retries)"]
    PRICE["pricing_model\n($/1k tokens, egress $)"]
  end

  %% -------- TURN CONTEXT --------
  subgraph TURN["Conversation Turn"]
    STEPS["planned_steps\n(tool_i ... tool_n)"]
    META[trace_id, tenant_id,\nintent_id, model_id]
  end

  %% -------- COST/LATENCY MODELS --------
  subgraph MODELS["Cost & Latency Models"]
    CTOK[Token counter\nprompt_i + completion_i]
    CLAT["Latency model\nsum(tool_i_latency) + overhead"]
    CUSD["Cost model\nΣ(tokens_i/1000 * price_i)"]
  end

  %% -------- CONSTRAINTS (PARAMETRIC) --------
  subgraph CON["Constraints"]
    C1{{Latency constraint:\n p95_latency <= SLO_LAT}}
    C2{{Token constraint:\n Σ tokens_i <= BUD_TOK}}
    C3{{Cost constraint:\n Σ cost_i <= BUD_USD}}
    C4{{Per-tool caps:\n tokens_i <= cap_i\n retries_i <= rcap_i\n latency_i <= lcap_i}}
  end

  %% -------- ENFORCEMENT / ACTIONS --------
  subgraph ACT["Enforcement & Adaptation"]
    ENF["Enforce caps\n(deny/degrade)"]
    ADAPT["Adapt plan\n(cheaper model,\ncombine steps,\nreduce context)"]
    PASS[Pass gate]
    FAIL[Fail gate / refuse]
  end

  %% ===================== WIRING =====================
  %% Config feeds models and constraints
  CFG --> MODELS
  CFG --> CON

  %% Turn feeds models
  TURN --> MODELS
  MODELS -->|tokens_i, latency_i, cost_i| CON

  %% Constraint evaluation
  C1 -->|ok| PASS
  C2 -->|ok| PASS
  C3 -->|ok| PASS
  C4 -->|ok| PASS

  C1 -->|violation| ENF
  C2 -->|violation| ENF
  C3 -->|violation| ENF
  C4 -->|violation| ENF

  ENF --> ADAPT -->|recompute| MODELS
  PASS -->|execute tools| DONE[Proceed with turn]
  ENF -->|cannot adapt| FAIL

  %% ===================== NOTES =====================
  %% Example equations (per turn):
  %% - p95_latency ≈ Σ p95(tool_i) + orchestration_overhead for sequential steps (as in the CLAT node); max(p95(tool_i)) applies only when steps run in parallel
  %% - tokens_total = Σ (prompt_i + completion_i)
  %% - cost_total = Σ ((tokens_i / 1000) * price_model_i) + egress_cost
  %% - caps: (tokens_i, retries_i, latency_i) from CAP_TOOL; deny or degrade on breach.

  %% ===================== STYLES =====================
  classDef cfg fill:#e8f0fe,stroke:#1a73e8,stroke-width:2px,color:#0b468c;
  classDef turn fill:#ecfeff,stroke:#06b6d4,stroke-width:2px,color:#134e4a;
  classDef mdl fill:#fef9c3,stroke:#f59e0b,stroke-width:2px,color:#7c2d12;
  classDef con fill:#fff7ed,stroke:#fb923c,stroke-width:2px,color:#7c2d12;
  classDef act fill:#f5f3ff,stroke:#8b5cf6,stroke-width:2px,color:#4c1d95;

  class CFG,SLO_LAT,BUD_TOK,BUD_USD,CAP_TOOL,PRICE cfg
  class TURN,STEPS,META turn
  class MODELS,CTOK,CLAT,CUSD mdl
  class CON,C1,C2,C3,C4 con
  class ACT,ENF,ADAPT,PASS,FAIL act 
```



# %% [SETUP]
Miniature, **self-contained** QoS governance layer for agentic flows:
- Tool stubs (calc/http/soap/rag)
- A lightweight kernel (simulated LLM)
- Parametric budgets for **latency**, **tokens**, and **cost**
- A demo that is **not** dependent on external network calls


In [None]:

# %% [SETUP-ENV]
import os, getpass
# Placeholder Azure OpenAI settings; setdefault() keeps any value already present.
os.environ.setdefault('AZURE_OPENAI_ENDPOINT', 'https://example-aoai-endpoint')
os.environ.setdefault('AZURE_OPENAI_DEPLOYMENT', 'gpt-simulated')
os.environ.setdefault('AZURE_OPENAI_API_VERSION', '2024-10-21')
# Optional key input; not required for this demo.
if not os.getenv('AZURE_OPENAI_API_KEY'):
    try:
        # An empty answer is stored as the placeholder too; otherwise the variable
        # would stay falsy and re-running this cell would prompt again.
        os.environ['AZURE_OPENAI_API_KEY'] = (
            getpass.getpass('Enter AZURE_OPENAI_API_KEY (hidden, optional): ').strip()
            or 'not-needed'
        )
    except Exception:
        # Prompting failed (e.g. no usable input stream): use the placeholder.
        os.environ['AZURE_OPENAI_API_KEY'] = 'not-needed'
print('Environment ready (any provided key is session-only).')


In [None]:

# %% [KERNEL]
import asyncio, time, uuid, random
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

def new_trace_id() -> str:
    """Return a freshly generated random UUID (version 4) in string form."""
    trace_uuid = uuid.uuid4()
    return f"{trace_uuid}"

@dataclass
class LLMResult:
    """Record returned by ``SimulatedLLM.run`` describing one simulated completion."""
    answer: str  # generated answer text
    citations: List[str]  # grounding references that were passed to the call
    tokens_in: int  # estimated prompt tokens
    tokens_out: int  # estimated completion tokens
    confidence: float  # simulated confidence score

class SimulatedLLM:
    """A tiny LLM shim with token & latency estimates; no external calls.

    Args:
        name: Label prefixed to every generated answer.
        tps_chars: Rough characters-per-second rate used for latency simulation.
        seed: Optional seed for the confidence score. When given, this instance
            draws from its own ``random.Random(seed)`` so results are
            reproducible; when omitted, the module-level ``random`` generator is
            used, exactly as before.

    Raises:
        ValueError: If ``tps_chars`` is not positive (it is used as a divisor).
    """

    def __init__(self, name="gpt-sim", tps_chars=1200, seed=None):
        if tps_chars <= 0:
            # Fail at construction instead of with ZeroDivisionError inside run().
            raise ValueError("tps_chars must be positive")
        self.name = name
        self.tps_chars = tps_chars  # rough chars/sec for latency simulation
        self._rng = random if seed is None else random.Random(seed)

    async def run(self, prompt: str, grounding: Optional[List[str]] = None) -> LLMResult:
        """Simulate one completion for ``prompt``.

        Sleeps for a prompt-length-dependent delay (between 0.02 s and 0.15 s),
        echoes the first 120 characters of the prompt, appends up to three
        citations, and estimates token counts at roughly four characters each.

        Args:
            prompt: Input text.
            grounding: Optional citation strings to attach to the answer.

        Returns:
            An ``LLMResult`` with the answer, the citations, the token estimates
            and a confidence value in ``[0.65, 0.95)``.
        """
        base = max(0.02, len(prompt) / self.tps_chars)
        await asyncio.sleep(min(0.15, base))
        toks_in = max(1, len(prompt)//4)
        cites = grounding or []
        text = f"[{self.name}] {prompt[:120]}"
        if cites:
            text += " | cites: " + "; ".join(cites[:3])
        # Completion estimate is clamped to the range 24..200 tokens.
        toks_out = min(200, max(24, len(text)//4))
        conf = 0.65 + 0.3*self._rng.random()
        return LLMResult(answer=text, citations=cites, tokens_in=toks_in, tokens_out=toks_out, confidence=conf)


In [None]:

# %% [TOOLS]
import time
from typing import Callable

class ToolError(Exception):
    """Error type raised for tool failures such as rejected input or an unknown tool."""

class Tool:
    """Named wrapper around a callable, carrying simulated latency and token metadata."""

    def __init__(self, name: str, fn: Callable, latency_ms: int = 80, token_cost: int = 16):
        """Store the tool's name, its callable and its metering figures.

        ``token_cost`` is a synthetic token cost used for metering.
        """
        self.name, self.fn = name, fn
        self.latency_ms, self.token_cost = latency_ms, token_cost

    def invoke(self, **kwargs):
        """Sleep for the simulated latency (at most 0.15 s), then return ``fn(**kwargs)``."""
        simulated_delay_s = min(0.15, self.latency_ms/1000)
        time.sleep(simulated_delay_s)
        return self.fn(**kwargs)

def tool_calc(expr: str):
    """Evaluate a simple arithmetic expression.

    Only digits, dots, whitespace, ``+ - * /`` and parentheses are accepted.
    The power operator ``**`` is rejected even though it is built from allowed
    characters, because a short input such as ``9**9**9**9`` would otherwise
    run for an unbounded time.

    Args:
        expr: Expression text.

    Returns:
        ``{"expr": <input>, "value": <result>}``.

    Raises:
        ToolError: If the expression contains disallowed characters or ``**``,
            or if evaluating it fails (syntax error, division by zero, ...).
    """
    import re  # kept local: this cell has no module-level `re` import

    if not isinstance(expr, str) or not re.fullmatch(r"[0-9\.\s\+\-\*\/\(\)]+", expr):
        raise ToolError("Expression not allowed")
    if "**" in expr:
        raise ToolError("Expression not allowed")
    try:
        # SECURITY NOTE(review): eval() is still used here. The whitelist above and
        # the emptied builtins confine it to numeric arithmetic, but a dedicated
        # expression parser would be safer if this ever receives untrusted input.
        value = eval(expr, {"__builtins__": {}})
    except Exception as e:
        # Chain the cause so the underlying failure stays visible in tracebacks.
        raise ToolError(str(e)) from e
    return {"expr": expr, "value": value}

def tool_http(method: str, url: str):
    """Return a canned HTTP response for ``method``/``url``; no request is sent."""
    canned = dict(status=200, bytes=512, note="simulated response")
    return {"method": method, "url": url, **canned}

def tool_soap(action: str):
    """Return a canned SOAP result for ``action``; no call is made."""
    outcome = {"action": action}
    outcome.update(status="OK", note="simulated SOAP result")
    return outcome

def tool_rag(query: str):
    """Return three simulated citation ids for ``query``.

    The ids are derived from a CRC-32 checksum of the UTF-8 encoded query, so
    the same query yields the same citations in every process. The built-in
    ``hash()`` of a string is salted per interpreter run and would change the
    ids on each kernel restart.

    Args:
        query: Query text.

    Returns:
        ``{"query": <input>, "citations": [three ids], "chunks": 3}``.
    """
    import zlib  # kept local: this cell has no module-level `zlib` import

    bucket = zlib.crc32(query.encode("utf-8")) % 997
    cites = [f"doc://rag/{i}-{bucket}" for i in range(1, 4)]
    return {"query": query, "citations": cites, "chunks": 3}

# Registry of tool stubs keyed by tool name.
# Each row: (name, callable, simulated latency in ms, synthetic token cost).
TOOLS = {
    tool_name: Tool(tool_name, tool_fn, latency_ms=latency, token_cost=tokens)
    for tool_name, tool_fn, latency, tokens in (
        ("calc", tool_calc, 20, 8),
        ("http", tool_http, 60, 24),
        ("soap", tool_soap, 90, 20),
        ("rag", tool_rag, 110, 40),
    )
}


In [None]:

# %% [AGENTS]
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional

def _default_per_tool_caps() -> Dict[str, Dict[str, float]]:
    """Build a fresh default caps table: per tool, token / latency / retry limits."""
    limits = (
        ("calc", 256, 400),
        ("http", 512, 800),
        ("soap", 512, 900),
        ("rag", 1024, 1200),
    )
    return {
        tool: {"tokens": token_cap, "latency_ms": latency_cap, "retries": 1}
        for tool, token_cap, latency_cap in limits
    }


@dataclass
class QoSConfig:
    """Per-turn QoS budgets plus per-tool caps.

    ``per_tool_caps`` is created anew for every instance, so editing one
    configuration's caps does not affect another.
    """
    p95_latency_target_ms: int = 2500      # latency target, milliseconds
    budget_tokens_per_turn: int = 4000     # token budget for one turn
    budget_usd_per_turn: float = 0.05      # cost budget for one turn
    pricing_per_1k_tokens: float = 0.002   # price applied per 1000 tokens
    per_tool_caps: Dict[str, Dict[str, float]] = field(default_factory=_default_per_tool_caps)

class PolicyEngine:
    """Permissive policy stub: every tool is allowed and every check passes."""

    def allowlist(self, intent: str) -> List[str]:
        """Return the permitted tool names; the same four for any ``intent``."""
        return "calc http soap rag".split()

    def precheck(self, prompt: str) -> bool:
        """Pre-execution check on the prompt; always passes."""
        return True

    def postcheck(self, answer: str, citations: List[str]) -> bool:
        """Post-execution check on the answer and its citations; always passes."""
        return True

class Planner:
    """Chooses the tools for an intent, filtered by the policy allowlist."""

    def __init__(self, policy: PolicyEngine):
        self.policy = policy

    def select_tools(self, intent: str) -> List[str]:
        """Return the single tool named like ``intent`` if the policy allows it.

        Intents other than "rag", "calc", "http" and "soap" select no tools.
        """
        permitted = self.policy.allowlist(intent)
        for tool_name in ("rag", "calc", "http", "soap"):
            if intent == tool_name:
                return [tool_name] if tool_name in permitted else []
        return []

class Agent:
    """An agent that plans tools for an intent, calls the LLM and applies QoS gates.

    Args:
        name: Display name of the agent.
        llm: Model shim used to produce answers.
        policy: Policy engine consulted before and after the model call.
        qos: Budgets (tokens, cost, latency) enforced per turn.
    """

    def __init__(self, name: str, llm: SimulatedLLM, policy: PolicyEngine, qos: QoSConfig):
        self.name = name
        self.llm = llm
        self.policy = policy
        self.qos = qos
        self.planner = Planner(policy)

    def available_tools(self) -> List[str]:
        """Return the names of all registered tools."""
        return list(TOOLS.keys())

    def call(self, tool_name: str, **kwargs):
        """Invoke a registered tool directly.

        Returns:
            ``{"tool": tool_name, "result": <tool output>}``.

        Raises:
            ToolError: If ``tool_name`` is not registered.
        """
        if tool_name not in TOOLS:
            raise ToolError(f"Unknown tool '{tool_name}'")
        result = TOOLS[tool_name].invoke(**kwargs)
        return {"tool": tool_name, "result": result}

    async def run(self, prompt: str, intent: Optional[str] = None) -> Dict[str, Any]:
        """Execute one turn and evaluate the token, cost and latency gates.

        Args:
            prompt: User text for this turn.
            intent: Routing intent; defaults to "general", which selects no tools.

        Returns:
            A dict with ``answer``, ``citations``, ``confidence``, ``tokens``,
            ``cost_usd``, ``latency_ms``, ``trace_id`` and a ``gate`` sub-dict.
            The same keys are present when the policy pre-check refuses the
            prompt, so callers can read them unconditionally.
        """
        trace_id = new_trace_id()
        started = time.perf_counter()
        if not self.policy.precheck(prompt):
            # Nothing was executed, so no budget was consumed; gates report as passing.
            return {
                "answer": "[refused by policy]",
                "citations": [],
                "confidence": 0.0,
                "tokens": 0,
                "cost_usd": 0.0,
                "latency_ms": 0.0,
                "trace_id": trace_id,
                "gate": {
                    "within_tokens": True,
                    "within_cost": True,
                    "within_latency": True,
                    "tools_used": [],
                },
            }

        intent = intent or "general"
        chosen = self.planner.select_tools(intent)
        grounding = []
        if "rag" in chosen:
            # Tool.invoke sleeps synchronously, so this briefly blocks the event loop.
            gr = TOOLS["rag"].invoke(query=prompt)
            grounding = gr.get("citations", [])

        res = await self.llm.run(prompt, grounding=grounding)
        # Observed latency of this turn (tool call + model call), in milliseconds.
        latency_ms = (time.perf_counter() - started) * 1000.0
        tokens = res.tokens_in + res.tokens_out + sum(TOOLS[t].token_cost for t in chosen)
        cost_usd = (tokens/1000.0) * self.qos.pricing_per_1k_tokens

        within_tokens = tokens <= self.qos.budget_tokens_per_turn
        within_cost = cost_usd <= self.qos.budget_usd_per_turn
        # A single observation is compared against the p95 target as a per-turn proxy.
        within_latency = latency_ms <= self.qos.p95_latency_target_ms
        pass_gate = within_tokens and within_cost and within_latency and self.policy.postcheck(res.answer, res.citations)

        return {
            "answer": res.answer if pass_gate else "[degraded or refused due to QoS]",
            "citations": res.citations,
            "confidence": res.confidence if pass_gate else 0.4,
            "tokens": tokens,
            "cost_usd": round(cost_usd, 6),
            "latency_ms": round(latency_ms, 1),
            "trace_id": trace_id,
            "gate": {
                "within_tokens": within_tokens,
                "within_cost": within_cost,
                "within_latency": within_latency,
                "tools_used": chosen,
            }
        }

# One shared model shim, policy engine and QoS configuration for every agent.
llm, policy, qos = SimulatedLLM(), PolicyEngine(), QoSConfig()

# One agent per intent, named "Agent-<INTENT>".
agent_rag, agent_calc, agent_http, agent_soap, agent_general = (
    Agent(f"Agent-{label}", llm, policy, qos)
    for label in ("RAG", "CALC", "HTTP", "SOAP", "GENERAL")
)


In [None]:

# %% [WIRES]
# Intent -> agent key. Every intent is routed to the agent registered under the same key.
ROUTES = {route: route for route in ("rag", "calc", "http", "soap", "general")}

# Agent key -> agent instance.
AGENT_INDEX = dict(
    rag=agent_rag,
    calc=agent_calc,
    http=agent_http,
    soap=agent_soap,
    general=agent_general,
)

def validate_wiring():
    """Check that every route resolves to an agent that exposes at least one tool.

    Returns:
        A list of problem descriptions; empty when the wiring is sound.
    """
    found = []
    for route_intent, agent_key in ROUTES.items():
        target = AGENT_INDEX.get(agent_key)
        if target:
            if not target.available_tools():
                found.append(f"{route_intent} -> {target.name} has no available tools")
        else:
            found.append(f"{route_intent} -> missing agent key '{agent_key}'")
    return found

# Summarise the wiring: route counts, intents per agent, unreferenced agents
# and the validation outcome.
total_wires = len(ROUTES)
distinct_agents = len(set(ROUTES.values()))
unreferenced_agents = sorted(set(AGENT_INDEX.keys()) - set(ROUTES.values()))
targets_by_agent = {}
for intent, key in ROUTES.items():
    targets_by_agent.setdefault(key, []).append(intent)

issues = validate_wiring()
print(f"Wires: {total_wires} (distinct agents: {distinct_agents})")
for agent_key, intents in targets_by_agent.items():
    # A route may name a key with no registered agent. Show a placeholder instead
    # of raising KeyError, so the validation report below is still printed.
    agent = AGENT_INDEX.get(agent_key)
    agent_name = agent.name if agent else f"<missing agent '{agent_key}'>"
    print(f"  - {agent_name} ← {len(intents)} intent(s): {', '.join(intents)}")
if unreferenced_agents:
    print(f"Unreferenced agents: {', '.join(unreferenced_agents)}")
# Real newlines (not a literal backslash-n) put each issue on its own line.
print("Wiring OK" if not issues else "Wiring issues:\n- " + "\n- ".join(issues))


In [None]:

# %% [DEMO]
# Notebook-safe async demo (no asyncio.run() when a loop exists).
import asyncio, time

# Demo inputs as (intent, prompt text) pairs; demo_run() iterates over them in order.
samples = [
    ("rag",   "Ground this answer with citations about hybrid search."),
    ("calc",  "Please compute the sum 2+2."),
    ("http",  "Do an HTTP call to example.org"),
    ("soap",  "Call a SOAP action for demo"),
    ("general","Just chat and cite if needed."),
]

async def demo_run():
    """Run every sample through its routed agent and collect a summary per run.

    For tool-backed intents the matching tool is first called directly with
    fixed demo arguments; then the agent's ``run`` is awaited for the prompt.

    Returns:
        ``{"elapsed_ms": <int>, "runs": [<per-sample dict>, ...]}``.
    """
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    outputs = []
    for intent, text in samples:
        agent = AGENT_INDEX[ROUTES[intent]]
        # Arguments for the direct tool call; intents without an entry make none.
        direct_call_args = {
            "calc": {"expr": "6*7"},
            "http": {"method": "GET", "url": "https://example.org"},
            "soap": {"action": "Ping"},
            "rag": {"query": text},
        }
        tool_result = None
        if intent in direct_call_args:
            tool_result = agent.call(intent, **direct_call_args[intent])

        llm_out = await agent.run(text, intent=intent)
        answer = llm_out["answer"]
        outputs.append({
            "intent": intent,
            "agent": agent.name,
            "tool_result": tool_result,
            # Answers longer than 220 characters are truncated with an ellipsis.
            "llm_result": answer[:220] + ("..." if len(answer) > 220 else ""),
            "confidence": llm_out["confidence"],
            "tokens": llm_out["tokens"],
            "cost_usd": llm_out["cost_usd"],
            "pass_tokens": llm_out["gate"]["within_tokens"],
            "pass_cost": llm_out["gate"]["within_cost"],
        })
    elapsed_ms = int((time.perf_counter() - t0)*1000)
    return {"elapsed_ms": elapsed_ms, "runs": outputs}

# Run the demo whether or not an event loop is already running in this thread.
# Only the loop probe sits inside the try, so a RuntimeError raised by the demo
# itself is not mistaken for "no running loop".
try:
    asyncio.get_running_loop()
except RuntimeError:
    # No loop is running (plain Python): asyncio.run() can create and own one.
    result = asyncio.run(demo_run())
else:
    # A loop is already running in this thread (notebook kernel), where
    # asyncio.run() cannot be used. Run the coroutine on a fresh loop in a worker
    # thread and wait for its result; this needs no third-party packages.
    import concurrent.futures
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as _pool:
        result = _pool.submit(asyncio.run, demo_run()).result()

# Print the total time, then a multi-line summary per run. The "\n" escapes are
# real newlines so each field appears on its own line.
print("Elapsed (ms):", result["elapsed_ms"])
for r in result["runs"]:
    print((
        f"\nIntent: {r['intent']} -> Agent: {r['agent']}"
        f"\nTool: {r['tool_result']}"
        f"\nLLM:  {r['llm_result']} (conf={r['confidence']:.2f}, tokens={r['tokens']}, $={r['cost_usd']})"
        f"\nGates: tokens={r['pass_tokens']} cost={r['pass_cost']}"
    ).rstrip())
