# Assembled Notebook — SK Agents


In [None]:
# %% [SETUP]
# Package install (unpin; upgrade SK)
!pip install -U semantic-kernel
!pip -q uninstall -y pydrive2


In [None]:
# %% [SETUP-ENV]
# Environment variables for Azure OpenAI (replace placeholder strings with real values)
import os, getpass

# Defaults only apply when the variable is not already present in the environment.
os.environ.setdefault("AZURE_OPENAI_ENDPOINT",    "https://YOUR-AOAI.openai.azure.com")
os.environ.setdefault("AZURE_OPENAI_API_KEY",     "")
os.environ.setdefault("AZURE_OPENAI_API_VERSION", "2024-10-21")
os.environ.setdefault("AZURE_OPENAI_DEPLOYMENT",  "gpt-35-turbo")  # your deployment name

# Prompt for the key only when none is configured (a whitespace-only value counts as unset).
if not os.getenv("AZURE_OPENAI_API_KEY", "").strip():
    try:
        entered_key = getpass.getpass("Enter AZURE_OPENAI_API_KEY (hidden): ").strip()
    except Exception as exc:
        # Stay best-effort (non-interactive runs cannot prompt), but say so instead of
        # failing silently and surfacing later as an opaque authentication error.
        print(f"[WARN] Could not prompt for AZURE_OPENAI_API_KEY ({type(exc).__name__}: {exc}); "
              "set it in the environment and re-run this cell.")
    else:
        os.environ["AZURE_OPENAI_API_KEY"] = entered_key

# The default endpoint above is a placeholder, not a reachable resource.
if "YOUR-AOAI" in os.getenv("AZURE_OPENAI_ENDPOINT", ""):
    print("[WARN] AZURE_OPENAI_ENDPOINT is still the placeholder value; replace it with your resource URL.")

# Never echo the key itself — only whether one is present.
print("AZURE_OPENAI_ENDPOINT :", os.getenv("AZURE_OPENAI_ENDPOINT"))
print("AZURE_OPENAI_DEPLOYMENT:", os.getenv("AZURE_OPENAI_DEPLOYMENT"))
print("API key set?           :", "yes" if bool(os.getenv("AZURE_OPENAI_API_KEY")) else "no")


In [None]:
# %% [KERNEL]
import os
from semantic_kernel import Kernel

# Preferred path: use the Azure-specific chat completion connector.
try:
    from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
    _azure_cls = AzureChatCompletion
    _use_shim = False
except Exception as e:
    # Fallback shim to remain runnable across SK versions: OpenAIChatCompletion is
    # imported under the Azure name so the rest of this cell is unchanged.
    # NOTE(review): the shim class is still constructed below with Azure-style keyword
    # arguments (deployment_name, endpoint) — confirm OpenAIChatCompletion accepts them,
    # otherwise this fallback only moves the failure to the constructor call.
    from semantic_kernel.connectors.ai.open_ai import OpenAIChatCompletion as AzureChatCompletion  # type: ignore
    _azure_cls = AzureChatCompletion
    _use_shim = True

# Module-level kernel shared by the tool and agent cells further down.
kernel = Kernel()

# Build the chat service from the environment variables prepared in the SETUP-ENV cell.
service = _azure_cls(
    service_id="azure",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    deployment_name=os.getenv("AZURE_OPENAI_DEPLOYMENT"),  # AOAI deployment name
    endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    # AZURE_OPENAI_API_VERSION is not passed explicitly here.
    # NOTE(review): presumably the connector reads it from the environment — TODO confirm.
    # api_version can be set explicitly if needed:
    # api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-10-21")
)

kernel.add_service(service)
# Reports whether the fallback import path was taken.
print("Kernel ready (Azure OpenAI) | shim:", _use_shim)


In [None]:
# %% [TOOLS]
# Tools map to the pipeline: RAG packet build, execution, metering, etc.
# These are minimal, safe stubs you can later replace with real integrations.

import time
from typing import Any, Dict, List

def tool_pf_run(config: Dict[str, Any]) -> Dict[str, Any]:
    """Stand-in for the Promptflow/SK batch runner.

    Performs no work: it reports one successful batch and echoes back a
    shallow copy of ``config`` so the caller's mapping is not shared.
    """
    config_copy = dict(config)
    report: Dict[str, Any] = {"status": "ok", "batches": 1}
    report["config"] = config_copy
    return report

def tool_build_rag_packet(query: str, k: int = 4) -> Dict[str, Any]:
    """Retrieval stub: return a canned grounding packet for ``query``.

    ``k`` is only echoed back under ``"topk"``; the two citations and the
    context string are fixed placeholders, independent of the inputs.
    """
    stub_refs = (("sk://stub/doc1", 1), ("sk://stub/doc2", 3))
    return {
        "query": query,
        "topk": k,
        "citations": [{"doc": doc, "chunk": chunk} for doc, chunk in stub_refs],
        "context": "stubbed grounding packet",
    }

def _exec_llm_loop():
    """Return the background event loop used for kernel calls, starting it on first use."""
    import asyncio
    import threading

    loop = getattr(_exec_llm_loop, "_loop", None)
    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        # Daemon thread: it must never keep the interpreter/kernel alive at shutdown.
        threading.Thread(target=loop.run_forever, name="tool_exec_llm-loop", daemon=True).start()
        _exec_llm_loop._loop = loop
    return loop


def tool_exec_llm(prompt: str, need_citations: bool = False) -> Dict[str, Any]:
    """Run ``prompt`` through ``kernel.invoke_prompt`` and return the answer synchronously.

    The coroutine is executed on a dedicated background event loop. The previous
    ``get_event_loop().run_until_complete(...)`` raised "This event loop is already
    running" whenever this function was called from Jupyter or from an ``async``
    caller, so every answer degraded to an "[exec error]" string. One long-lived
    loop is reused for all calls so that async resources created during one call
    are not left attached to a closed loop on the next.

    Args:
        prompt: Text sent to the kernel.
        need_citations: When true, a single stub citation is attached to the result.

    Returns:
        ``{"answer": str, "citations": list}``. Failures are never raised; they are
        reported in ``answer`` with an "[exec error]" prefix. If the kernel has no
        ``invoke_prompt`` attribute, the first 200 characters of the prompt are echoed.
    """
    try:
        if hasattr(kernel, "invoke_prompt"):
            import asyncio

            async def _call() -> str:
                resp = await kernel.invoke_prompt(prompt)
                return str(resp)

            # Blocks the calling thread until the background loop finishes the call;
            # exceptions raised by the coroutine are re-raised by .result().
            text = asyncio.run_coroutine_threadsafe(_call(), _exec_llm_loop()).result()
        else:
            text = "[no kernel.invoke_prompt] " + prompt[:200]
    except Exception as e:
        text = f"[exec error] {e} | prompt: {prompt[:160]}"
    cites = [{"doc": "sk://stub/doc1"}] if need_citations else []
    return {"answer": text, "citations": cites}

def tool_meter(tokens_in: int, tokens_out: int, latency_ms: int) -> Dict[str, Any]:
    """Package usage figures for one call together with a nominal cost.

    The cost is the combined token count divided by one million, rounded to
    six decimal places; the three inputs are passed through unchanged.
    """
    total_tokens = tokens_in + tokens_out
    return {
        "tokens_in": tokens_in,
        "tokens_out": tokens_out,
        "latency_ms": latency_ms,
        "cost_usd": round(total_tokens / 1e6, 6),
    }

# Registry of tool callables keyed by the short name the agents look them up by.
TOOLS = dict(
    pf_run=tool_pf_run,
    rag_packet=tool_build_rag_packet,
    exec_llm=tool_exec_llm,
    meter=tool_meter,
)
print("Tools ready:", list(TOOLS))


In [None]:
# %% [AGENTS]
import asyncio

class AgentPlanner:
    """Planning agent that currently returns one fixed plan."""

    def __init__(self, kernel):
        """Keep a reference to ``kernel`` and set the agent's name and system text."""
        self.kernel = kernel
        self.name = "Planner"
        self.system = "You plan eval runs and select steps based on config."

    async def run(self, cfg: dict) -> dict:
        """Return the plan for an eval run; ``cfg`` is accepted but not consulted."""
        # Simple plan: always run PF, build RAG, then EXEC
        fixed_steps = ["pf_run", "rag_packet", "exec"]
        token_budget = 20000
        return {"steps": fixed_steps, "budget_tokens": token_budget}

class AgentExecutor:
    """Execution agent that forwards prompts to the registered ``exec_llm`` tool."""

    def __init__(self, kernel):
        """Keep a reference to ``kernel`` and set the agent's name."""
        self.kernel = kernel
        self.name = "Executor"

    async def run(self, prompt: str, citations: bool = False) -> dict:
        """Call ``TOOLS["exec_llm"]`` with ``prompt`` and return whatever it returns."""
        exec_tool = TOOLS["exec_llm"]
        outcome = exec_tool(prompt, need_citations=citations)
        return outcome

class AgentMetrics:
    """Scoring helpers used by the eval loop (factuality, citation presence, latency)."""

    def factuality(self, expected_list, answer_text):
        """Score whether every expected fragment appears in the answer.

        Returns 0.5 when nothing is expected (no basis to judge), 1.0 when all
        fragments occur in ``answer_text`` case-insensitively, otherwise 0.0.
        Fragments are converted with ``str()`` so non-string expectations
        (e.g. numbers) are matched instead of raising.
        """
        if not expected_list:
            return 0.5
        haystack = (answer_text or "").lower()
        return 1.0 if all(str(s).lower() in haystack for s in expected_list) else 0.0

    def citation(self, required, citations):
        """Return 1.0 unless citations are required and none were supplied."""
        return 1.0 if (not required or citations) else 0.0

    def latency(self, start_ns=None, end_ns=None):
        """Return the elapsed time in whole milliseconds, never negative.

        Returns 0 when either timestamp is missing. A timestamp of 0 is a valid
        reading and is NOT treated as missing (the earlier truthiness check did).
        """
        if start_ns is None or end_ns is None:
            return 0
        return max(0, int((end_ns - start_ns) / 1e6))

class AgentStorage:
    """Persistence placeholder: nothing is written anywhere yet."""

    async def save(self, results):
        """Report how many result rows would have been saved."""
        # TODO: Persist to blob/db if desired
        saved_count = len(results)
        return {"saved": saved_count}

class AgentAlerts:
    """Alerting placeholder: alerts are printed rather than delivered."""

    async def notify(self, alert):
        """Print ``alert`` with an "[ALERT]" prefix and report success."""
        # TODO: Wire to Teams/Email if desired
        delivered = True
        print("[ALERT]", alert)
        return delivered

class AgentDashboard:
    """Dashboard placeholder: nothing is pushed anywhere yet."""

    async def publish(self, results):
        """Report how many result rows would have been published."""
        # TODO: Push to dashboard/wandb/etc.
        published_count = len(results)
        return {"published": published_count}

# Module-level agent singletons used by the eval cells below.
# The planner and executor share the kernel; the remaining agents take no arguments.
agent_planner, agent_exec = AgentPlanner(kernel), AgentExecutor(kernel)
agent_metrics = AgentMetrics()
agent_storage = AgentStorage()
agent_alerts = AgentAlerts()
agent_dashboard = AgentDashboard()

print(
    "Agents:",
    ["agent_planner", "agent_exec", "agent_metrics", "agent_storage", "agent_alerts", "agent_dashboard"],
)


In [None]:
# %% [WIRES]
# Placeholder wiring; expand as needed.
# Edges of the pipeline graph, each written "source->target"; every edge is enabled.
WIRES = dict.fromkeys(
    ("planner->pf_run", "planner->rag_packet", "planner->exec", "exec->meter"),
    True,
)
print("Wires:", WIRES)


In [None]:
# %% [PATCH: robust eval + normalization]
import time
from typing import Any, Dict, List

def _normalize_item(item: Any) -> Dict[str, Any]:
    if isinstance(item, str):
        return {"prompt": item, "expected_contains": [], "citations_required": False, "refusal_expected": False}
    d = dict(item)
    d["prompt"] = d.get("prompt") or d.get("input") or d.get("query") or d.get("text") or ""
    exp = d.get("expected_contains", d.get("expected", d.get("must_include", [])))
    if exp is None: exp = []
    if isinstance(exp, str): exp = [exp]
    d["expected_contains"] = list(exp)
    d["citations_required"] = bool(d.get("citations_required") or d.get("needs_citation") or d.get("citation", False))
    d["refusal_expected"]   = bool(d.get("refusal_expected") or d.get("should_refuse") or d.get("refusal", False))
    return d

def _coerce_answer(ans: Any) -> Dict[str, Any]:
    if isinstance(ans, dict):
        text = ans.get("answer", ans.get("text", ""))
        cites = ans.get("citations", ans.get("cites", [])) or []
        if isinstance(cites, dict): cites = list(cites.values())
        return {"answer": str(text), "citations": list(cites)}
    return {"answer": str(ans), "citations": []}

async def eval_nightly():
    """Run every test case through the executor, score it, and report.

    Test cases come from a module-level ``TESTCASES`` if one is defined and
    non-empty; otherwise a single built-in case is used. Each case is
    normalized, executed via ``agent_exec``, and scored with ``agent_metrics``.
    A case raises an alert when factuality is below 0.5 or when citations were
    required but not supplied.

    Storage, dashboard publishing and alert notification are best-effort and
    independent of each other: a failure in one is reported with a "[WARN]"
    line and does not prevent the others from running. (Previously one shared
    ``try/except: pass`` meant a storage failure silently suppressed all alert
    notifications.)

    Returns:
        dict with ``summary`` (total, alerts, avg_latency_ms, avg_factuality),
        ``alerts``, ``actions``, ``elapsed_ms`` and the per-case ``results``.
    """
    t0 = time.time_ns()
    # Try to read preloaded TESTCASES; fallback to one case
    tc_raw = globals().get("TESTCASES") or [
        {"prompt": "Summarize the eval pipeline in one sentence.", "expected_contains": ["eval","metrics"], "citations_required": False}
    ]
    tc = [_normalize_item(x) for x in tc_raw]

    results = []
    alerts = []
    for item in tc:
        start_ns = time.time_ns()
        out = await agent_exec.run(item["prompt"], citations=item["citations_required"])
        ans = _coerce_answer(out)
        end_ns = time.time_ns()

        fact = agent_metrics.factuality(item.get("expected_contains", []), ans["answer"])
        cite = agent_metrics.citation(item.get("citations_required", False), ans.get("citations", []))
        lat  = agent_metrics.latency(start_ns, end_ns)

        results.append({
            "prompt": item["prompt"],
            "answer": ans["answer"],
            "citations": ans.get("citations", []),
            "factuality": fact,
            "citation_ok": cite,
            "latency_ms": lat,
        })

        if fact < 0.5 or (item.get("citations_required", False) and not cite):
            alerts.append({
                "prompt": item["prompt"][:160],
                # Low factuality takes precedence when both conditions hold.
                "reason": "low_factuality" if fact < 0.5 else "missing_citations",
                "metrics": {"factuality": fact, "citation_ok": cite, "latency_ms": lat},
            })

    async def _best_effort(label, make_call):
        """Await one optional side effect; report a failure instead of raising or hiding it."""
        try:
            await make_call()
        except Exception as exc:
            print(f"[WARN] {label} failed: {type(exc).__name__}: {exc}")

    # Optional publish: each step is isolated so one failure cannot mask the rest.
    await _best_effort("storage.save", lambda: agent_storage.save(results))
    await _best_effort("dashboard.publish", lambda: agent_dashboard.publish(results))
    for a in alerts:
        await _best_effort("alerts.notify", lambda a=a: agent_alerts.notify(a))

    elapsed_ms = int((time.time_ns()-t0)/1e6)
    n_cases = max(1, len(results))  # avoid division by zero on an empty run
    return {
        "summary": {
            "total": len(results),
            "alerts": len(alerts),
            "avg_latency_ms": int(sum(r["latency_ms"] for r in results)/n_cases),
            "avg_factuality": round(sum(r["factuality"] for r in results)/n_cases, 3),
        },
        "alerts": alerts,
        "actions": ["open_pr_prompts"] if alerts else [],
        "elapsed_ms": elapsed_ms,
        "results": results,
    }
print("Patched eval_nightly ready.")


In [None]:
# %% [DEMO]
import time, asyncio
t0 = time.time()

# In Jupyter, a loop is already running; use top-level await
result = await eval_nightly()

print("Eval summary:", result["summary"])
print("Alerts:", result["alerts"])
print("Actions:", result["actions"])
print("Elapsed (ms):", int((time.time()-t0)*1000))

# Peek at first 3 results
for i, r in enumerate(result["results"][:3], 1):
    print(f"\nCase {i}")
    print(" prompt   :", (r["prompt"][:120] + ("…" if len(r["prompt"])>120 else "")))
    print(" factual  :", r["factuality"], " citation_ok:", r["citation_ok"], " latency_ms:", r["latency_ms"])
    print(" answer   :", (r["answer"][:200] + ("…" if len(r["answer"])>200 else "")))
