
# REQ→TEST Matrix — Agents & Demo

Sections: **[SETUP] · [SETUP-ENV] · [KERNEL] · [TOOLS] · [AGENTS] · [WIRES] · [DEMO]**

Mermaid diagram rendered inline below.


In [None]:

# %% [SETUP] Mermaid renderer
import json

from IPython.display import HTML, display

# Mermaid source for the REQ→TEST traceability matrix (single source of truth;
# it is injected into the page below instead of being pasted a second time).
#
# Kept as a raw string so the "\n" sequences inside node labels reach Mermaid
# as a literal backslash + "n" (intended as label line breaks) instead of being
# turned into real newlines by Python.
#
# Rectangular node labels are double-quoted because several of them contain
# parentheses, semicolons or quotes, which are not safe in unquoted flowchart
# label text; embedded double quotes are written with Mermaid's #quot; entity.
_mermaid = r"""
flowchart LR
  %% ===================== REQ-TO-TEST MATRIX =====================
  %% Target outcomes ↔ metrics (accuracy, p95 latency, $/turn)

  %% -------- REQUIREMENTS (TARGET OUTCOMES) --------
  subgraph REQ["Requirements (Target Outcomes)"]
    R1["REQ-001\n#quot;Grounded answers are accurate#quot;"]
    R2["REQ-002\n#quot;Fast responses at scale#quot;"]
    R3["REQ-003\n#quot;Cost efficiency per turn#quot;"]
  end

  %% -------- METRICS (WHAT WE MEASURE) --------
  subgraph MET["KPIs / Metrics"]
    M1["Accuracy@\nFactuality ≥ 0.90\nCitation coverage ≥ 0.85"]
    M2["Latency@\np95 ≤ 1200 ms\nError rate < 0.5%"]
    M3["Cost@\n$/turn ≤ $0.012\nTokens/turn budget"]
  end

  %% -------- TESTS (HOW WE VERIFY) --------
  subgraph TST["Tests / Evaluations"]
    T1["Eval-Fact:\nPF/SK batch judge\n(extractive + LLM-as-judge)"]
    T2["Eval-Cite:\nCoverage + link validity"]
    T3["Perf-Load:\nRPS ramp; p95, p99"]
    T4["Chaos-Retry:\nBreaker, backoff, errors"]
    T5["Cost-Meter:\nper-intent token caps"]
    T6["Budget-Guard:\ncheaper-model fallback"]
  end

  %% -------- EVIDENCE / REPORTING --------
  subgraph OBS["Evidence & Reporting"]
    DSH["Dashboard:\nweekly trendlines"]
    RUNS[(Eval run store)]
    TRC[(Per-turn traces)]
    GATE["CI Gate:\nblock on regression"]
  end

  %% -------- TRACEABILITY LINKS --------
  R1 --> M1
  R1 --> T1
  R1 --> T2

  R2 --> M2
  R2 --> T3
  R2 --> T4

  R3 --> M3
  R3 --> T5
  R3 --> T6

  %% Metrics feed tests & evidence
  T1 --> RUNS --> DSH
  T2 --> RUNS
  T3 --> TRC --> DSH
  T4 --> TRC
  T5 --> TRC
  T6 --> TRC

  %% Release control
  DSH --> GATE

  %% ---------------- NOTES ----------------
  %% - Update thresholds per tenant/intent in config-as-code.
  %% - Gate deploys on: (Factuality < 0.90) OR (p95 > SLO) OR ($/turn > budget).

  %% ---------------- STYLES ----------------
  classDef req fill:#e8f0fe,stroke:#1a73e8,stroke-width:2px,color:#0b468c;
  classDef met fill:#fef9c3,stroke:#f59e0b,stroke-width:2px,color:#7c2d12;
  classDef tst fill:#ecfeff,stroke:#06b6d4,stroke-width:2px,color:#134e4a;
  classDef obs fill:#f5f3ff,stroke:#8b5cf6,stroke-width:2px,color:#4c1d95;

  class REQ,R1,R2,R3 req
  class MET,M1,M2,M3 met
  class TST,T1,T2,T3,T4,T5,T6 tst
  class OBS,DSH,RUNS,TRC,GATE obs
"""

# Plain (non-f) string: the JavaScript below is full of braces, which an
# f-string would try to parse as replacement fields. The diagram is spliced in
# through the __DIAGRAM__ placeholder instead.
_mermaid_html_template = """
<div id='mrmmd' class='mermaid' style='background:#fff;padding:12px;border:1px solid #ddd;border-radius:8px;'></div>
<script>
(function() {
  const el = document.getElementById('mrmmd');
  el.textContent = __DIAGRAM__;
  function render() {
    // The page has already loaded by the time this runs, so startOnLoad would
    // never fire; render this element explicitly instead.
    window.mermaid.initialize({ startOnLoad: false, securityLevel: 'loose' });
    window.mermaid.run({ nodes: [el] });
  }
  if (window.mermaid) { render(); return; }
  var s = document.createElement('script');
  s.src = "https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js";
  s.onload = render;
  document.body.appendChild(s);
})();
</script>
"""

# json.dumps yields a valid JavaScript string literal (backslashes, quotes and
# non-ASCII characters escaped). "</" is additionally escaped so the diagram
# text can never terminate the surrounding <script> element early.
_mermaid_js_literal = json.dumps(_mermaid.strip()).replace("</", "<\\/")

display(HTML(_mermaid_html_template.replace("__DIAGRAM__", _mermaid_js_literal)))


In [None]:

# %% [SETUP-ENV]
import getpass
import os

# Azure OpenAI connection settings. setdefault() only fills in a value when the
# variable is not already present in the environment.
for _env_name, _env_default in (
    ('AZURE_OPENAI_ENDPOINT', 'https://4th-openai-resource.openai.azure.com'),
    ('AZURE_OPENAI_DEPLOYMENT', 'gpt-35-turbo'),
    ('AZURE_OPENAI_API_VERSION', '2024-10-21'),
):
    os.environ.setdefault(_env_name, _env_default)

# Ask for the API key only when it is unset or empty.
if not os.environ.get('AZURE_OPENAI_API_KEY'):
    try:
        _entered_key = getpass.getpass('Enter AZURE_OPENAI_API_KEY (hidden): ')
        os.environ['AZURE_OPENAI_API_KEY'] = _entered_key.strip()
    except Exception:
        # Prompting failed (e.g. a non-interactive context): fall back to a
        # placeholder key so the rest of the notebook can still run.
        os.environ['AZURE_OPENAI_API_KEY'] = 'stub-key'

print('Azure OpenAI env ready (key is session-only).')


In [None]:

# %% [KERNEL]
class MiniKernel:
    """Stand-in kernel that answers every prompt with a fixed-format stub reply."""

    async def invoke_prompt(self, text: str):
        """Return ``{"message": ...}`` echoing at most the first 120 characters of *text*.

        Prompts longer than 120 characters are cut and suffixed with "...".
        No model is called; the reply depends only on *text*.
        """
        preview = text[:120]
        if len(text) > 120:
            preview = preview + "..."
        return {"message": "[stub-llm] ok: " + preview}


kernel = MiniKernel()
print("Kernel ready (stubbed).")


In [None]:

# %% [TOOLS]
import random

def tool_accuracy_eval(expected=0.90, cite_cov=0.85, **kwargs) -> str:
    """Stubbed accuracy evaluation that reports random scores against targets.

    Args:
        expected: Minimum factuality score required for a "pass".
        cite_cov: Minimum citation-coverage score required for a "pass".
        **kwargs: Accepted and ignored.

    Returns:
        A one-line summary with the drawn scores, the targets and a status of
        "pass" (both targets met) or "warn".
    """
    # Factuality is drawn before coverage; keep this order so seeded runs
    # produce the same numbers.
    factuality = round(random.uniform(0.88, 0.98), 3)
    coverage = round(random.uniform(0.82, 0.92), 3)

    if factuality >= expected and coverage >= cite_cov:
        verdict = "pass"
    else:
        verdict = "warn"

    return (
        f"stub:accuracy fac={factuality} cov={coverage} "
        f"target=({expected},{cite_cov}) status={verdict}"
    )

def tool_latency_probe(p95_target_ms=1200, **kwargs) -> str:
    """Stubbed latency probe that reports a random p95 against a target.

    Args:
        p95_target_ms: Highest p95 latency (milliseconds) that still passes.
        **kwargs: Accepted and ignored.

    Returns:
        A one-line summary with the drawn p95 (an integer in 900..1600), the
        target and a status of "pass" or "warn".
    """
    observed_p95 = random.randint(900, 1600)
    if observed_p95 <= p95_target_ms:
        verdict = "pass"
    else:
        verdict = "warn"
    return f"stub:latency p95={observed_p95}ms target<={p95_target_ms} status={verdict}"

def tool_cost_meter(budget_usd=0.012, **kwargs) -> str:
    """Stubbed cost meter that reports a random per-turn cost against a budget.

    Args:
        budget_usd: Highest per-turn cost (USD) that still passes.
        **kwargs: Accepted and ignored.

    Returns:
        A one-line summary with the drawn cost (rounded to 4 decimals), the
        budget and a status of "pass" or "warn".
    """
    per_turn_cost = round(random.uniform(0.006, 0.018), 4)
    if per_turn_cost <= budget_usd:
        verdict = "pass"
    else:
        verdict = "warn"
    return f"stub:cost ${per_turn_cost}/turn budget<={budget_usd} status={verdict}"

# Registry of the stub tools, keyed by the short name agents use to refer to them.
TOOLS = dict(
    accuracy_eval=tool_accuracy_eval,
    latency_probe=tool_latency_probe,
    cost_meter=tool_cost_meter,
)
print("Tools:", ", ".join(sorted(TOOLS)))


In [None]:

# %% [AGENTS]
class BaseAgent:
    """Common behaviour for agents: tool lookup via TOOLS and prompting via a kernel.

    Subclasses override ``name`` (used in error messages) and ``skills``
    (names of the tools the agent advertises).
    """

    name = "Base"
    skills = []

    def __init__(self, kernel):
        # Object exposing an awaitable ``invoke_prompt(text)``; see run().
        self.kernel = kernel

    def available_tools(self):
        """Return the entries of ``skills`` that are registered in TOOLS, in order."""
        registered = []
        for skill in self.skills:
            if skill in TOOLS:
                registered.append(skill)
        return registered

    def call(self, tool_name: str, **kwargs):
        """Look up *tool_name* in TOOLS and call it with ``**kwargs``.

        Raises:
            ValueError: If TOOLS has no (truthy) entry for *tool_name*.
        """
        tool = TOOLS.get(tool_name)
        if tool:
            return tool(**kwargs)
        raise ValueError(f"Tool not found: {tool_name}")

    async def run(self, user_text: str) -> str:
        """Send *user_text* to the kernel and return the reply's "message".

        Any exception raised while prompting or reading the reply is reported
        as a string instead of being propagated.
        """
        try:
            reply = await self.kernel.invoke_prompt(user_text)
            return reply["message"]
        except Exception as exc:
            return f"[agent:{self.name}] LLM call failed: {exc}"

class Agent_PVA_APIM(BaseAgent):
    """Agent advertising the ``latency_probe`` tool."""

    name = "PVA/APIM (Latency focus)"
    skills = ["latency_probe"]


class Agent_Orchestrator(BaseAgent):
    """Agent that advertises no tools."""

    name = "Orchestrator (SK/LangGraph)"
    skills = []


class Agent_Tools_RAG(BaseAgent):
    """Agent advertising the ``accuracy_eval`` tool."""

    name = "Tools / RAG (Accuracy)"
    skills = ["accuracy_eval"]


class Agent_DataPlane(BaseAgent):
    """Agent that advertises no tools."""

    name = "Data Plane (Residency)"
    skills = []


class Agent_Ops(BaseAgent):
    """Agent advertising the ``cost_meter`` tool."""

    name = "Ops / CI-CD (Cost)"
    skills = ["cost_meter"]


# One instance per agent class, all sharing the same kernel.
agent_pva_apim = Agent_PVA_APIM(kernel)
agent_orchestrator = Agent_Orchestrator(kernel)
agent_tools_rag = Agent_Tools_RAG(kernel)
agent_data_plane = Agent_DataPlane(kernel)
agent_ops = Agent_Ops(kernel)

print(
    "Agents:",
    [
        agent.name
        for agent in (
            agent_pva_apim,
            agent_orchestrator,
            agent_tools_rag,
            agent_data_plane,
            agent_ops,
        )
    ],
)


In [None]:

# %% [WIRES]
# Requirement label ("<id>: <title>") -> key of the agent responsible for it.
ROUTES = {
    f"{req_id}: {title}": agent_key
    for req_id, title, agent_key in (
        ("REQ-001", "Grounded answers are accurate", "tools_rag"),
        ("REQ-002", "Fast responses at scale", "pva_apim"),
        ("REQ-003", "Cost efficiency per turn", "ops"),
    )
}
# Agent key (as used in ROUTES) -> agent instance.
AGENT_INDEX = dict(
    pva_apim=agent_pva_apim,
    orchestrator=agent_orchestrator,
    tools_rag=agent_tools_rag,
    data_plane=agent_data_plane,
    ops=agent_ops,
)

def validate_wiring(routes=None, agent_index=None):
    """Check that every routed requirement reaches an agent with usable tools.

    Args:
        routes: Mapping of requirement label -> agent key. Defaults to the
            module-level ROUTES when omitted.
        agent_index: Mapping of agent key -> agent object exposing ``name``
            and ``available_tools()``. Defaults to the module-level
            AGENT_INDEX when omitted.

    Returns:
        A list of human-readable problem strings, in route order; empty when
        nothing is wrong.
    """
    # Resolve the defaults at call time so later rebinding of the globals is
    # picked up, exactly as the zero-argument form always did.
    routes = ROUTES if routes is None else routes
    agent_index = AGENT_INDEX if agent_index is None else agent_index

    problems = []
    for req, key in routes.items():
        agent = agent_index.get(key)
        # Compare against None explicitly: an agent object that happens to be
        # falsy must not be reported as a missing key.
        if agent is None:
            problems.append(f"{req} -> missing agent key '{key}'")
            continue
        if not agent.available_tools():
            # not all agents must have tools for this matrix; still report
            problems.append(f"{req} -> {agent.name} has no available tools")
    return problems

# Headline counts for the routing table.
total_wires = len(ROUTES)
distinct_agents = len({*ROUTES.values()})
# Agents present in AGENT_INDEX that no requirement routes to.
unreferenced_agents = sorted(set(AGENT_INDEX).difference(ROUTES.values()))

# Group requirement labels by the agent key they route to (first-seen order).
targets_by_agent = {}
for req, key in ROUTES.items():
    if key not in targets_by_agent:
        targets_by_agent[key] = []
    targets_by_agent[key].append(req)

issues = validate_wiring()

print(f"Wires: {total_wires} (distinct agents: {distinct_agents})")
for agent_key, reqs in targets_by_agent.items():
    agent = AGENT_INDEX.get(agent_key)
    # Fall back to the raw key when the route points at an unknown agent.
    agent_name = getattr(agent, "name", agent_key)
    print(f"  - {agent_name} ← {len(reqs)} requirement(s): {', '.join(reqs)}")
if unreferenced_agents:
    print(f"Unreferenced agents: {', '.join(unreferenced_agents)}")
if issues:
    print("Wiring issues:\n- " + "\n- ".join(issues))
else:
    print("Wiring OK")


In [None]:

# %% [DEMO]
import asyncio, time, nest_asyncio
nest_asyncio.apply()

async def demo_run():
    """Run one tool call and one prompt per requirement and collect the results.

    For each requirement in the plan below, the mapped agent (looked up in
    AGENT_INDEX) first calls its tool and is then prompted with the sample
    text.

    Returns:
        A dict with:
          - "elapsed_ms": total run time in whole milliseconds.
          - "runs": one dict per requirement with keys "req", "agent",
            "tool_result" and "llm_result" (the reply, cut to 220 characters
            plus "..." when longer).
    """
    # (requirement id, prompt text, AGENT_INDEX key, tool name, tool kwargs).
    # Keeping the mapping explicit per requirement means an unexpected id can
    # never fall through to some other requirement's agent.
    plan = [
        ("REQ-001", "Verify factuality and citations meet thresholds.",
         "tools_rag", "accuracy_eval", {"expected": 0.90, "cite_cov": 0.85}),
        ("REQ-002", "Check p95 latency is under SLO during load.",
         "pva_apim", "latency_probe", {"p95_target_ms": 1200}),
        ("REQ-003", "Ensure $/turn is within budget for this intent.",
         "ops", "cost_meter", {"budget_usd": 0.012}),
    ]
    preview_limit = 220

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments while the demo runs.
    started = time.perf_counter()
    outputs = []
    for rid, text, agent_key, tool_name, tool_kwargs in plan:
        agent = AGENT_INDEX[agent_key]
        tool_out = agent.call(tool_name, **tool_kwargs)
        llm_out = await agent.run(text)
        outputs.append({
            "req": rid,
            "agent": agent.name,
            "tool_result": tool_out,
            "llm_result": llm_out[:preview_limit]
            + ("..." if len(llm_out) > preview_limit else ""),
        })
    elapsed_ms = int((time.perf_counter() - started) * 1000)
    return {"elapsed_ms": elapsed_ms, "runs": outputs}

# Only the loop probe lives inside the try: a RuntimeError raised by the demo
# itself must surface instead of silently triggering a second run.
try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    # No event loop is running in this thread: let asyncio create one.
    result = asyncio.run(demo_run())
else:
    # A loop is already running; drive the demo on it. This relies on the
    # nest_asyncio.apply() call at the top of this cell.
    result = loop.run_until_complete(demo_run())

print("Elapsed (ms):", result["elapsed_ms"])
for r in result["runs"]:
    print(f"REQ: {r['req']} -> Agent: {r['agent']}")
    print(f"  Tool: {r['tool_result']}")
    print(f"  LLM:  {r['llm_result']}")
