# Turn Budgets & QoS – Agent Notebook

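The Mermaid diagram below is the input you provided: it shows the per-turn topology, the latency and cost budgets, and the QoS guardrails that the code cells in this notebook simulate.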

```mermaid
flowchart TD 
  %% -------- Turn topology --------
  subgraph Turn["Turn (one user → assistant cycle)"]
    O[Orchestrator] --> P[Planner]
    P --> R[Retriever / Index]
    P --> T1["Tool A (e.g., Search)"]
    P --> T2["Tool B (e.g., DB Query)"]
    P --> L[Model Generation]
  end

  %% -------- Budgets (constraints) --------
  LB{{Latency Budget\nL_budget_ms = 2500 ms}}
  CB{{Cost Budget\nC_budget_usd = $0.050 / turn}}

  %% Latency allocations per component
  O -. "≤ 150 ms" .-> LB
  R -. "≤ 400 ms" .-> LB
  T1 -. "≤ 300 ms" .-> LB
  T2 -. "≤ 300 ms" .-> LB
  L -. "≤ 1200 ms" .-> LB
  SUM_L[(Σ component latencies\n≤ L_budget_ms)] --> LB

  %% Cost components per turn
  tokIn[(Input tokens × $/1k)] --> CB
  tokOut[(Output tokens × $/1k)] --> CB
  apiFees[(External tool API fees)] --> CB
  storageIO[(Vector/DB I/O fees)] --> CB
  SUM_C[(Σ cost components\n≤ C_budget_usd)] --> CB

  %% -------- QoS guardrails (parametric actions) --------
  G{{QoS Guardrails}}
  G -->|If Σ latency > budget| D1[Degrade: skip non-critical tool]
  G -->|If Σ cost > budget| D2[Constrain: shorten output / reduce tokens]
  G -->|If near limits| A1[Switch to cheaper/faster model]
  G -->|Cache hit| C1[Bypass retrieval/tools]

  %% -------- Telemetry feedback loop --------
  M[(Metrics: p95 latency, error rate,\ncache hit %, $/turn, tool counts)]
  M --> G

  %% Bind guardrails to constraints
  LB --> G
  CB --> G
```

In [None]:
# %% [SETUP-ENV]
import os, getpass
# Fill in the non-secret Azure OpenAI settings, leaving any value that is
# already present in the environment untouched.
os.environ.update(
    (name, fallback)
    for name, fallback in (
        ('AZURE_OPENAI_ENDPOINT', 'https://example-aoai.azure.com'),
        ('AZURE_OPENAI_DEPLOYMENT', 'gpt-35-turbo'),
        ('AZURE_OPENAI_API_VERSION', '2024-10-21'),
    )
    if name not in os.environ
)

# The API key is only requested when it is missing or empty. Any failure while
# prompting (or while storing the value) falls back to a placeholder key.
if not os.environ.get('AZURE_OPENAI_API_KEY'):
    try:
        os.environ['AZURE_OPENAI_API_KEY'] = getpass.getpass(
            'Enter AZURE_OPENAI_API_KEY (hidden): '
        ).strip()
    except Exception:
        os.environ['AZURE_OPENAI_API_KEY'] = 'stub-key'

print('Azure OpenAI env ready (key is session-only).')

In [None]:
# %% [KERNEL]
from dataclasses import dataclass, field
import time, asyncio

@dataclass
class KernelResult:
    """Outcome of one kernel completion call.

    Bundles the generated text with the token, cost, latency and confidence
    figures reported for that call.
    """
    answer: str  # generated text
    citations: list = field(default_factory=list)  # fresh empty list per instance by default
    tokens_in: int = 0  # input (prompt) token count
    tokens_out: int = 0  # output (completion) token count
    cost_usd: float = 0.0  # cost of the call in US dollars
    latency_ms: int = 0  # duration of the call in milliseconds
    confidence: float = 0.75  # confidence score; 0.75 unless the producer overrides it

class LocalKernel:
    """Offline stand-in for a chat-completion backend.

    No model is called: ``complete`` sleeps for a fixed delay, derives token
    counts from the prompt length and the requested ``max_tokens``, and prices
    them with the per-1k-token rates held on the instance.

    Args:
        name: Label for this kernel.
        usd_per_1k_in: Price in USD per 1000 input tokens.
        usd_per_1k_out: Price in USD per 1000 output tokens.
    """

    def __init__(self, name="local-sim", usd_per_1k_in=0.0005, usd_per_1k_out=0.0015):
        self.name = name
        self.usd_per_1k_in = usd_per_1k_in
        self.usd_per_1k_out = usd_per_1k_out

    async def complete(self, prompt: str, max_tokens: int = 180, cheap: bool = False) -> "KernelResult":
        """Simulate one completion and return its text and accounting figures.

        Args:
            prompt: Text to "complete"; the answer echoes its first 80 characters.
            max_tokens: Upper bound requested for output tokens. The simulated
                count is clamped to the range 16..120 before the cheap-mode
                reduction, so a value below 16 still yields 16.
            cheap: When true, use the shorter delay, cut the output tokens to
                60% and report a lower confidence.

        Returns:
            A ``KernelResult`` with the draft answer, token counts, cost rounded
            to 6 decimals, measured latency in ms and a fixed confidence.
        """
        # perf_counter is monotonic, so the measured latency cannot be skewed
        # (or go negative) when the system wall clock is adjusted mid-call.
        started = time.perf_counter()
        await asyncio.sleep(0.15 if cheap else 0.35)

        # Roughly 4 characters per token, clamped to 8..200.
        tokens_in = max(8, min(200, len(prompt) // 4))
        tokens_out = max(16, min(max_tokens, 120))
        if cheap:
            tokens_out = int(tokens_out * 0.6)

        cost = (tokens_in / 1000) * self.usd_per_1k_in + (tokens_out / 1000) * self.usd_per_1k_out

        ans = (prompt[:80] + "...") if len(prompt) > 80 else prompt
        ans = f"Draft: {ans}"
        return KernelResult(
            answer=ans,
            tokens_in=tokens_in,
            tokens_out=tokens_out,
            cost_usd=round(cost, 6),
            latency_ms=int((time.perf_counter() - started) * 1000),
            confidence=0.72 if cheap else 0.78,
        )

# Module-wide kernel instance used for completions.
KERNEL = LocalKernel()
print(f"Kernel ready: {KERNEL.name}")

In [None]:
# %% [TOOLS]
import asyncio, time

class ToolBase:
    """Simulated external tool with a fixed delay and a flat per-call fee.

    Args:
        fixed_ms: Delay, in milliseconds, that ``invoke`` sleeps for.
        fee_usd: Fee in USD reported for every call.
    """

    name = "tool"

    def __init__(self, fixed_ms=120, fee_usd=0.0):
        self.fixed_ms = fixed_ms
        self.fee_usd = fee_usd

    async def invoke(self, **kwargs):
        """Sleep for ``fixed_ms`` and report the call.

        Args:
            **kwargs: Arbitrary call arguments; echoed back under ``"args"``.

        Returns:
            dict with ``ok`` (always True), the measured ``latency_ms``, the
            configured ``fee_usd`` and the received ``args``.
        """
        # perf_counter is monotonic; time.time() can jump when the system
        # clock is adjusted, which would corrupt the measured latency.
        started = time.perf_counter()
        await asyncio.sleep(self.fixed_ms / 1000.0)
        elapsed_ms = int((time.perf_counter() - started) * 1000)
        return {
            "ok": True,
            "latency_ms": elapsed_ms,
            "fee_usd": self.fee_usd,
            "args": kwargs,
        }

class SearchTool(ToolBase):
    """Simulated search tool: 220 ms fixed delay and a $0.0008 fee per call."""

    name = "search"

    def __init__(self):
        # 0.22 s expressed in milliseconds.
        super().__init__(fixed_ms=220, fee_usd=0.0008)

class DbTool(ToolBase):
    """Simulated database tool: 180 ms fixed delay and a $0.0006 fee per call."""

    name = "db"

    def __init__(self):
        # 0.18 s expressed in milliseconds.
        super().__init__(fixed_ms=180, fee_usd=0.0006)

class CalcTool(ToolBase):
    """Tool that evaluates a small arithmetic expression at no fee."""

    name = "calc"

    async def invoke(self, expr="", **kwargs):
        """Evaluate ``expr`` and report the value or the failure.

        Args:
            expr: Expression made only of digits, ``+ - * / ( ) .`` and spaces.
            **kwargs: Accepted and ignored so the tool can be called with the
                same keyword set as any other ``ToolBase`` (for example a
                ``query=`` argument) without raising ``TypeError``.

        Returns:
            On success ``{"ok": True, "value", "latency_ms", "fee_usd"}``;
            on any failure ``{"ok": False, "error", "latency_ms", "fee_usd"}``.
            Failures (bad characters, syntax errors, division by zero, a
            non-string ``expr``) are reported in the dict, never raised.
        """
        started = time.perf_counter()  # monotonic; immune to wall-clock changes
        await asyncio.sleep(0.05)
        try:
            if not isinstance(expr, str):
                raise TypeError("expr must be a string")
            if not all(ch in "0123456789+-*/(). " for ch in expr):
                raise ValueError("disallowed characters")
            # SECURITY NOTE(review): eval() is guarded only by the character
            # whitelist above and an empty __builtins__. The whitelist still
            # admits "**", so an input such as "9**9**9**9" can consume
            # unbounded CPU/memory and stall the event loop. Replace with a
            # bounded arithmetic parser before exposing this to untrusted input.
            val = eval(expr, {"__builtins__": {}})
        except Exception as e:
            return {
                "ok": False,
                "error": str(e),
                "latency_ms": int((time.perf_counter() - started) * 1000),
                "fee_usd": 0.0,
            }
        return {
            "ok": True,
            "value": val,
            "latency_ms": int((time.perf_counter() - started) * 1000),
            "fee_usd": 0.0,
        }

# Tool registry: maps each tool name to its single shared instance.
TOOLS = dict(search=SearchTool(), db=DbTool(), calc=CalcTool())
print("Tools ready:", ", ".join(TOOLS))

In [None]:
# %% [AGENTS]
import asyncio, time
from dataclasses import dataclass

# Per-turn latency budget in milliseconds.
L_BUDGET_MS = 2500
# Per-turn cost budget in US dollars.
C_BUDGET_USD = 0.050

@dataclass
class Agent:
    """Single-turn agent that plans tool calls and applies latency/cost guards.

    Attributes:
        name: Display name of the agent.
        non_critical_tools: Names of tools that ``run`` may skip once the
            accumulated latency gets within 900 ms of ``L_BUDGET_MS``.
    """

    name: str
    non_critical_tools: tuple = ("db",)

    def available_tools(self):
        """Return the names of all tools registered in ``TOOLS``."""
        return list(TOOLS.keys())

    def call(self, tool, **kwargs):
        """Synchronously invoke one registered tool and return its result dict.

        Works both from plain synchronous code and from code already running
        inside an event loop (a coroutine, a notebook cell). A running loop
        cannot be re-entered with ``run_until_complete``, so in that case the
        tool coroutine is driven by ``asyncio.run`` on a short-lived worker
        thread while the caller blocks for the result.

        Args:
            tool: Key into ``TOOLS``.
            **kwargs: Forwarded unchanged to the tool's ``invoke``.

        Raises:
            KeyError: If ``tool`` is not registered.
        """
        import concurrent.futures  # local: only the running-loop path needs it

        coro = TOOLS[tool].invoke(**kwargs)
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop in this thread: asyncio.run can own a fresh one.
            return asyncio.run(coro)
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, coro).result()

    async def run(self, prompt: str, intent: str = "general"):
        """Execute one turn: plan tools, call them, generate, apply guards.

        Args:
            prompt: User text; sent to the kernel and (truncated to 60
                characters) to the search/db tools as ``query``.
            intent: One of ``"search"``, ``"db"``, ``"rag"`` (search + db),
                ``"calc"``; any other value plans no tools.

        Returns:
            dict with ``answer``, accumulated ``latency_ms``, ``cost_usd``
            (rounded to 6 decimals), ``confidence`` and per-tool ``tools``
            results.
        """
        lat_acc = 0
        cost_acc = 0.0
        await asyncio.sleep(0.06)  # orchestrator overhead
        lat_acc += 60  # accounted as a fixed 60 ms, not measured

        plan_tools = []
        if intent in ("search", "rag"):
            plan_tools.append("search")
        if intent in ("db", "rag"):
            plan_tools.append("db")
        if intent == "calc":
            plan_tools.append("calc")

        results = {}
        for tname in plan_tools:
            # Latency guard: keep 900 ms of headroom by dropping non-critical tools.
            if lat_acc > L_BUDGET_MS - 900 and tname in self.non_critical_tools:
                results[tname] = {"ok": False, "skipped": True, "reason": "latency_guard"}
                continue
            # Pass each tool only the argument it understands; sending
            # ``query=`` to the calc tool raised TypeError.
            if tname == "calc":
                tool_kwargs = {"expr": "6*7"}
            else:
                tool_kwargs = {"query": prompt[:60]}
            res = await TOOLS[tname].invoke(**tool_kwargs)
            lat_acc += res.get("latency_ms", 0)
            cost_acc += res.get("fee_usd", 0.0)
            results[tname] = res

        # Cost guard: once tool fees exceed 60% of the budget, use the cheap
        # generation mode with a shorter output cap.
        cheap = False
        max_tokens = 160
        if cost_acc > C_BUDGET_USD * 0.6:
            cheap = True
            max_tokens = 90

        kres = await KERNEL.complete(prompt, max_tokens=max_tokens, cheap=cheap)
        lat_acc += kres.latency_ms
        cost_acc += kres.cost_usd

        if lat_acc > L_BUDGET_MS and len(kres.answer) > 140:
            kres.answer = kres.answer[:140] + " …(truncated due to latency)"
        if cost_acc > C_BUDGET_USD:
            # NOTE(review): the reported cost is clamped to the budget, which
            # hides the actual overspend from the caller — confirm intended.
            kres.tokens_out = int(kres.tokens_out * 0.7)
            cost_acc = C_BUDGET_USD

        return {
            "answer": kres.answer,
            "latency_ms": lat_acc,
            "cost_usd": round(cost_acc, 6),
            "confidence": kres.confidence,
            "tools": results,
        }

# The single agent instance that every route resolves to.
agent = Agent(name="QoS-Agent")
print(f"Agent ready: {agent.name}")

In [None]:
# %% [WIRES]
# Intent -> agent key. Every intent currently routes to the same "qos" agent.
ROUTES = dict.fromkeys(("rag", "search", "db", "calc", "general"), "qos")
# Agent key -> agent instance.
AGENT_INDEX = dict(qos=agent)

def validate_wiring():
    """Check that every route in ``ROUTES`` resolves to a usable agent.

    Returns:
        A list of problem descriptions, one per broken route: either the
        agent key is absent from ``AGENT_INDEX`` or the agent reports no
        available tools. The list is empty when all routes are fine.
    """
    problems = []
    for intent, agent_key in ROUTES.items():
        target = AGENT_INDEX.get(agent_key)
        if not target:
            problems.append(f"{intent} -> missing agent '{agent_key}'")
        elif not target.available_tools():
            problems.append(f"{intent} -> {target.name} has no available tools")
    return problems

# Summarise the routing table: how many routes exist, which agents they hit,
# and whether validation found any problems.
total_wires = len(ROUTES)
distinct_agents = len(set(ROUTES.values()))

# Group intents by the agent key they route to.
targets_by_agent = {}
for intent, key in ROUTES.items():
    targets_by_agent.setdefault(key, []).append(intent)

issues = validate_wiring()
print(f"Wires: {total_wires} (distinct agents: {distinct_agents})")
for agent_key, intents in targets_by_agent.items():
    # A route may point at an unregistered agent key; that is exactly what the
    # validator reports, so label it here instead of raising KeyError before
    # the issues are printed.
    routed_agent = AGENT_INDEX.get(agent_key)
    agent_name = routed_agent.name if routed_agent else f"<missing agent '{agent_key}'>"
    print(f"  - {agent_name} ← {len(intents)} intent(s): {', '.join(sorted(intents))}")
# Real newlines: the doubled backslashes printed a literal "\n" on one line.
print("Wiring OK" if not issues else "Wiring issues:\n- " + "\n- ".join(issues))

In [None]:
# %% [DEMO]
import asyncio, time

# Demo inputs as (intent, prompt text) pairs; demo_run processes them in order.
samples = [
    ("rag",    "Retrieve context and answer with citations."),
    ("search", "Search for product info, then summarize."),
    ("db",     "Look up a record by id=42."),
    ("calc",   "Compute 6*7 quickly."),
    ("general","Just respond concisely and keep cost low."),
]

async def demo_run():
    """Run every sample through its routed agent and collect a report.

    For the ``calc``, ``search`` and ``db`` intents one direct tool call with
    fixed demo arguments is made first and its raw result is stored under
    ``tool_demo`` (``None`` for the other intents). The tool coroutine is
    awaited directly rather than through the blocking synchronous
    ``Agent.call`` wrapper, because this coroutine already runs on an event
    loop.

    Returns:
        dict with ``elapsed_ms`` (time for the whole demo) and ``runs``, a list
        of per-sample dicts holding intent, agent name, tool demo result,
        latency, cost, confidence and an answer snippet of at most 220
        characters (suffixed with ``...`` when cut).
    """
    started = time.perf_counter()  # monotonic; unaffected by wall-clock changes
    outputs = []
    for intent, text in samples:
        ag = AGENT_INDEX[ROUTES[intent]]

        if intent == "calc":
            tool_result = await TOOLS["calc"].invoke(expr="6*7")
        elif intent == "search":
            tool_result = await TOOLS["search"].invoke(query=text[:40])
        elif intent == "db":
            tool_result = await TOOLS["db"].invoke(query="id=42")
        else:
            tool_result = None

        llm_out = await ag.run(text, intent=intent)
        answer = llm_out["answer"]
        outputs.append({
            "intent": intent,
            "agent": ag.name,
            "tool_demo": tool_result,
            "latency_ms": llm_out["latency_ms"],
            "cost_usd": llm_out["cost_usd"],
            "confidence": llm_out["confidence"],
            "snippet": answer[:220] + ("..." if len(answer) > 220 else ""),
        })
    elapsed_ms = int((time.perf_counter() - started) * 1000)
    return {"elapsed_ms": elapsed_ms, "runs": outputs}

# Run the demo whether or not an event loop is already running in this thread.
# A top-level ``await`` is only valid under IPython, and a SyntaxError is
# raised at compile time, so it can never be caught by a surrounding ``try``.
import concurrent.futures

try:
    asyncio.get_running_loop()
except RuntimeError:
    # Plain script: no loop is running, so asyncio.run can own one.
    result = asyncio.run(demo_run())
else:
    # A loop is already running here (e.g. a notebook kernel) and cannot be
    # re-entered; drive the demo on its own loop in a worker thread and wait.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as _pool:
        result = _pool.submit(lambda: asyncio.run(demo_run())).result()

print("Elapsed (ms):", result["elapsed_ms"])
for r in result["runs"]:
    print("Intent: {intent} -> Agent: {agent}\nLatency: {latency_ms} ms | Cost: ${cost_usd:.4f} | conf={confidence:.2f}\nSnippet: {snippet}".format(**r))