
# Runbook Flow — Agents & Tools Notebook

This notebook was generated from a Mermaid **Runbook Flow** diagram.  
It creates **stubbed agents & tools** that correspond to the runbook stages (detect/triage, hotfix, rollback, index rebuild, secret rotation, comms/postmortem, observability).  
The Mermaid is included below **only for human reference**; code cells do **not** execute Mermaid.


In [None]:
# %% [SETUP]
!pip -q install -U semantic-kernel
!pip -q uninstall -y pydrive2
print("Setup complete.")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.6/74.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.5/88.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m893.8/893.8 kB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.2/93.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m217.9/217.9 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.7/126.7 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:

# %% [ENV] Azure OpenAI environment (kept out of source control)
# Only the API key is kept out of the notebook: it is read from the environment or
# prompted for. Endpoint, API version and deployment name have in-notebook defaults.
import os, getpass

# setdefault() leaves any value that is already exported untouched, so the literals
# below act purely as fallbacks.
os.environ.setdefault("AZURE_OPENAI_ENDPOINT",   "https://4th-openai-resource.openai.azure.com")
os.environ.setdefault("AZURE_OPENAI_API_VERSION","2024-10-21")
os.environ.setdefault("AZURE_OPENAI_DEPLOYMENT","gpt-35-turbo")

# Prompt only when the key is missing or empty; input is not echoed.
if not os.getenv("AZURE_OPENAI_API_KEY"):
    try:
        os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass("Enter AZURE_OPENAI_API_KEY (hidden): ").strip()
    except Exception:
        # Non-interactive env; leave empty (agents will degrade to stubs)
        # NOTE(review): the empty key is still handed to AzureChatCompletion in the
        # [KERNEL] cell — confirm that constructor tolerates an empty api_key.
        os.environ["AZURE_OPENAI_API_KEY"] = ""

# Only the deployment name is printed; the key is never echoed.
print("Azure env set. Deployment:", os.getenv("AZURE_OPENAI_DEPLOYMENT"))


Enter AZURE_OPENAI_API_KEY (hidden): ··········
Azure env set. Deployment: gpt-35-turbo


In [None]:

# %% [KERNEL] Create SK kernel with Azure OpenAI (per requested pattern)
# Builds one Kernel and registers a single Azure OpenAI chat service on it, using
# the AZURE_OPENAI_* environment variables populated by the [ENV] cell.
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
import os

kernel = Kernel()

service = AzureChatCompletion(
    service_id="azure",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    deployment_name=os.getenv("AZURE_OPENAI_DEPLOYMENT"),  # AOAI deployment name
    endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    # api_version optional; using env default
    # NOTE(review): presumably the connector reads AZURE_OPENAI_API_VERSION from the
    # environment when api_version is omitted — confirm against the installed version.
)

# The `kernel` object is what every agent receives in the [AGENTS] cell.
kernel.add_service(service)
print("Kernel ready (Azure OpenAI)")


Kernel ready (Azure OpenAI)


In [None]:

# %% [TOOLS] Runbook actions — light stubs (replace with real implementations)
from datetime import datetime, timezone
from typing import Any, Dict, Optional

def tool_alert_ingest(source:str="pager", details:Optional[Dict[str,Any]]=None):
    """Stub: acknowledge an incoming alert.

    Args:
        source: Label of the alerting system; echoed back unchanged.
        details: Optional alert payload. ``None`` (or any empty value) is
            reported as an empty dict.

    Returns:
        dict with ``status`` (always ``"received"``), ``source``, ``at``
        (timezone-aware UTC timestamp in ISO-8601 form) and ``details``.
    """
    # datetime.utcnow() is deprecated since Python 3.12 and yields a naive value;
    # an aware UTC datetime makes the offset explicit in the serialized timestamp.
    received_at = datetime.now(timezone.utc).isoformat()
    return {"status":"received","source":source,"at":received_at,"details":details or {}}

def tool_triage(severity:str="S1", blast:str="limited"):
    """Stub triage: echo severity and blast radius, and always decide to continue."""
    return dict(severity=severity, blast_radius=blast, decision="continue")

def tool_healthcheck():
    """Stub health probe: return a fixed snapshot that is flagged as unhealthy."""
    snapshot = dict(latency_ms=320, error_rate=0.8, cost_per_turn=0.002)
    snapshot["healthy"] = False
    return snapshot

# Hotfix path
def tool_create_hotfix_branch(name="hotfix/quick-patch"):
    """Stub: report that the hotfix branch `name` was created."""
    return dict(branch=name, created=True)

def tool_apply_patch(desc="patch applied"):
    """Stub: report a patch as applied, echoing its description `desc`."""
    return dict(result="patched", desc=desc, tests="added")

def tool_ci_pipeline():
    """Stub: report every CI gate (lint, security scan, eval gates) as passing."""
    gates = ("lint", "sec_scan", "eval_gates")
    return {gate: "ok" for gate in gates}

def tool_canary(percent=10):
    """Stub: report a canary rollout started at `percent` percent of traffic."""
    return dict(canary_percent=percent, started=True)

def tool_monitor_kpis():
    """Stub: report that errors are down and KPIs improved."""
    return dict(errors_down=True, kpi_improved=True)

def tool_promote_all():
    """Stub: report the release as promoted to all traffic."""
    return dict(traffic="100%", status="promoted")

# Rollback
def tool_find_last_good(tag="orchestrator-2025.01.15"):
    """Stub: return the last known-good release tag (simply echoes `tag`)."""
    return dict(tag=tag)

def tool_swap_blue_green():
    """Stub: report the blue/green swap as done."""
    return dict(swap="done")

def tool_verify_kpis():
    """Stub: report KPIs as recovered."""
    return dict(recovered=True)

# Index rebuild
def tool_shadow_drain():
    """Stub: report read-only mode with the shadow enabled."""
    return dict(mode="read-only", shadow="enabled")

def tool_reindex_job(scope="docs"):
    """Stub: report a reindex job over `scope` as completed."""
    return dict(scope=scope, status="completed")

def tool_validate_index():
    """Stub: report both the coverage and drift checks as OK."""
    return dict(coverage_ok=True, drift_ok=True)

def tool_switch_alias():
    """Stub: report the alias as switched."""
    return dict(alias="switched")

# Secret rotation
def tool_rotate_secret(name="AOAI_API_KEY"):
    """Stub: report the secret `name` as rotated to version "vNext"."""
    return dict(secret=name, version="vNext", rotated=True)

def tool_refresh_consumers():
    """Stub: report consuming apps as restarted."""
    return dict(apps_restarted=True)

def tool_audit_kv():
    """Stub: report zero anomalies from the audit."""
    return dict(anomalies=0)

def tool_smoke_auth():
    """Stub: report a passing auth smoke test (HTTP 200, as a string)."""
    return dict(status="ok", http="200")

# Comms & postmortem
def tool_comms_update(stage="in-progress", impact="limited"):
    """Stub: report a stakeholder notification sent, echoing `stage` and `impact`."""
    return dict(notified=True, stage=stage, impact=impact)

def tool_postmortem(rca="config"):
    """Stub: return a postmortem record with root cause `rca` and fixed actions/owners."""
    record = dict(rca=rca)
    record["actions"] = ["tests", "alerts"]
    record["owners"] = ["ops", "dev"]
    return record

# Observability
def tool_dash_event(event:str):
    """Stub: build a dashboard/telemetry event record.

    Args:
        event: Event name; echoed back unchanged.

    Returns:
        dict with ``event`` and ``time`` (timezone-aware UTC timestamp in
        ISO-8601 form).
    """
    # datetime.utcnow() is deprecated since Python 3.12 and yields a naive value;
    # an aware UTC datetime makes the offset explicit in the serialized timestamp.
    emitted_at = datetime.now(timezone.utc).isoformat()
    return {"event":event,"time":emitted_at}

# Registry of every runbook tool, keyed by the tool function's own name.
# Order matters only for the preview printed below (first six entries).
TOOLS = {
    fn.__name__: fn
    for fn in (
        # detect/triage
        tool_alert_ingest,
        tool_triage,
        tool_healthcheck,
        # hotfix
        tool_create_hotfix_branch,
        tool_apply_patch,
        tool_ci_pipeline,
        tool_canary,
        tool_monitor_kpis,
        tool_promote_all,
        # rollback
        tool_find_last_good,
        tool_swap_blue_green,
        tool_verify_kpis,
        # index rebuild
        tool_shadow_drain,
        tool_reindex_job,
        tool_validate_index,
        tool_switch_alias,
        # secret rotation
        tool_rotate_secret,
        tool_refresh_consumers,
        tool_audit_kv,
        tool_smoke_auth,
        # comms / pm
        tool_comms_update,
        tool_postmortem,
        # obs
        tool_dash_event,
    )
}
print("Tools ready:", list(TOOLS)[:6], "...")


Tools ready: ['tool_alert_ingest', 'tool_triage', 'tool_healthcheck', 'tool_create_hotfix_branch', 'tool_apply_patch', 'tool_ci_pipeline'] ...


In [None]:

# %% [AGENTS] Simple agent classes (LLM call is optional & safe-failing)
from typing import List

def _llm_probe(kernel, system:str, user:str)->str:
    # We avoid raising if service is missing/invalid; just return a stub.
    try:
        # semantic-kernel 1.x: a minimal prompt run via a function is more complex;
        # here we just return a canned response to keep the demo robust.
        return f"[LLM:stub] {system} — ack: {user[:60]}"
    except Exception as e:
        return f"[LLM:error] {e}"

class BaseAgent:
    """Common behaviour for the runbook agents.

    Subclasses override the class attributes below. Tools are looked up by id in
    the module-level TOOLS registry.
    """
    # Display name; also the fallback persona when system_message is empty.
    name: str = "agent"
    # Tool ids this agent declares; reported by available_tools().
    tools: List[str] = []
    # Persona text handed to the LLM probe.
    system_message: str = ""

    def __init__(self, kernel):
        """Keep a reference to the kernel passed in."""
        self.kernel = kernel

    def available_tools(self)->List[str]:
        """Return the declared tool ids that are present in TOOLS, in declared order."""
        present = []
        for tool_id in self.tools:
            if tool_id in TOOLS:
                present.append(tool_id)
        return present

    def call(self, tool_name:str, **kwargs):
        """Invoke the tool registered as `tool_name` with `kwargs`.

        Raises:
            ValueError: if TOOLS has no (truthy) entry for `tool_name`.
        """
        tool_fn = TOOLS.get(tool_name)
        if tool_fn:
            return tool_fn(**kwargs)
        raise ValueError(f"Tool not found: {tool_name}")

    def run(self, user_text:str)->str:
        """Send `user_text` to the LLM probe using system_message, or name when that is empty."""
        persona = self.system_message if self.system_message else self.name
        return _llm_probe(self.kernel, persona, user_text)

class DetectTriageAgent(BaseAgent):
    """Agent for the detect/triage stage."""
    name = "Detect & Triage"
    tools = [
        "tool_alert_ingest",
        "tool_triage",
        "tool_healthcheck",
    ]
    system_message = "You triage alerts and decide remediation paths."

class HotfixAgent(BaseAgent):
    """Agent for the hotfix path."""
    name = "Hotfix"
    tools = [
        "tool_create_hotfix_branch",
        "tool_apply_patch",
        "tool_ci_pipeline",
        "tool_canary",
        "tool_monitor_kpis",
        "tool_promote_all",
    ]
    system_message = "You implement safe hotfixes under guardrails."

class RollbackAgent(BaseAgent):
    """Agent for the rollback path."""
    name = "Rollback"
    tools = [
        "tool_find_last_good",
        "tool_swap_blue_green",
        "tool_verify_kpis",
    ]
    system_message = "You perform blue/green swap or route revert."

class IndexRebuildAgent(BaseAgent):
    """Agent for the index rebuild path."""
    name = "Index Rebuild"
    tools = [
        "tool_shadow_drain",
        "tool_reindex_job",
        "tool_validate_index",
        "tool_switch_alias",
    ]
    system_message = "You rebuild search indexes with shadow + alias swap."

class SecretRotationAgent(BaseAgent):
    """Agent for the secret rotation path."""
    name = "Secret Rotation"
    tools = [
        "tool_rotate_secret",
        "tool_refresh_consumers",
        "tool_audit_kv",
        "tool_smoke_auth",
    ]
    system_message = "You rotate secrets/keys and verify consumers."

class CommsPostmortemAgent(BaseAgent):
    """Agent for communications and postmortem."""
    name = "Comms & Postmortem"
    tools = [
        "tool_comms_update",
        "tool_postmortem",
    ]
    system_message = "You communicate status and run postmortems."

class ObservabilityAgent(BaseAgent):
    """Agent for observability events."""
    name = "Observability"
    tools = [
        "tool_dash_event",
    ]
    system_message = "You emit telemetry events."

# Instantiate one agent per runbook stage, all sharing the same kernel.
# The short keys are what the demo cell uses to look agents up.
agents = {
    key: agent_cls(kernel)
    for key, agent_cls in (
        ("detect", DetectTriageAgent),
        ("hotfix", HotfixAgent),
        ("rollback", RollbackAgent),
        ("index", IndexRebuildAgent),
        ("secret", SecretRotationAgent),
        ("comms", CommsPostmortemAgent),
        ("obs", ObservabilityAgent),
    )
}
print("Agents:", list(agents.keys()))


Agents: ['detect', 'hotfix', 'rollback', 'index', 'secret', 'comms', 'obs']


In [None]:

# %% [DEMO] Simulate a typical incident: degraded service -> hotfix -> comms
# Walks one scripted path through the stub tools. Every tool returns canned data,
# so the "decision" to take the hotfix path is fixed by this script rather than
# computed from the health-check result.
from pprint import pprint

# Short aliases for the agents. `rollback`, `indexer` and `secret` are bound here
# but not exercised by this scenario.
detect = agents["detect"]
obs = agents["obs"]
hotfix = agents["hotfix"]
rollback = agents["rollback"]
indexer = agents["index"]
secret = agents["secret"]
comms = agents["comms"]

print("=== 1) Detect & Triage ===")
pprint(detect.call("tool_alert_ingest", source="pager", details={"error_rate": "spike"}))
pprint(detect.call("tool_triage", severity="S1", blast="limited"))
hc = detect.call("tool_healthcheck")
pprint(hc)
print(detect.run("Analyze alert and advise path."))

print("\n=== 2) Decide path (degraded but stable -> hotfix) ===")
pprint(obs.call("tool_dash_event", event="incident_started"))

print("\n=== 3) Hotfix Path ===")
pprint(hotfix.call("tool_create_hotfix_branch"))
pprint(hotfix.call("tool_apply_patch", desc="Fix null-check in orchestrator"))
pprint(hotfix.call("tool_ci_pipeline"))
pprint(hotfix.call("tool_canary", percent=10))
pprint(hotfix.call("tool_monitor_kpis"))
# run() returns a plain string: print it like the triage summary above. pprint
# would wrap a long string into a parenthesised tuple of quoted fragments.
print(hotfix.run("Summarize hotfix plan in one sentence."))
promoted = hotfix.call("tool_promote_all")
pprint(promoted)

print("\n=== 4) Comms & Postmortem ===")
pprint(comms.call("tool_comms_update", stage="restored", impact="reduced errors"))
pprint(comms.call("tool_postmortem", rca="code"))
pprint(obs.call("tool_dash_event", event="incident_resolved"))

print("\nDemo complete ✓")


=== 1) Detect & Triage ===
{'at': '2025-11-09T11:16:07.002489',
 'details': {'error_rate': 'spike'},
 'source': 'pager',
 'status': 'received'}
{'blast_radius': 'limited', 'decision': 'continue', 'severity': 'S1'}
{'cost_per_turn': 0.002, 'error_rate': 0.8, 'healthy': False, 'latency_ms': 320}
[LLM:stub] You triage alerts and decide remediation paths. — ack: Analyze alert and advise path.

=== 2) Decide path (degraded but stable -> hotfix) ===
{'event': 'incident_started', 'time': '2025-11-09T11:16:07.003072'}

=== 3) Hotfix Path ===
{'branch': 'hotfix/quick-patch', 'created': True}
{'desc': 'Fix null-check in orchestrator',
 'result': 'patched',
 'tests': 'added'}
{'eval_gates': 'ok', 'lint': 'ok', 'sec_scan': 'ok'}
{'canary_percent': 10, 'started': True}
{'errors_down': True, 'kpi_improved': True}
('[LLM:stub] You implement safe hotfixes under guardrails. — ack: Summarize '
 'hotfix plan in one sentence.')
{'status': 'promoted', 'traffic': '100%'}

=== 4) Comms & Postmortem ===
{'imp

  return {"status":"received","source":source,"at":datetime.utcnow().isoformat(),"details":details or {}}
  return {"event":event,"time":datetime.utcnow().isoformat()}


## Wiring

This section declares **agent ↔ tool** hookups and validates them against what exists in the notebook at runtime.

- Edit the `WIRES` dictionary to map each agent's display name (its `name` attribute, e.g. `"Hotfix"`) to the tool ids it is allowed to use.
- The validator will:
  - list discovered agents (`agent_*` instances) and available `TOOLS`
  - verify tool names referenced by each agent exist
  - print a tidy report without raising hard errors (so your demo keeps running)


In [None]:
# --- WIRING: map agents to allowed tools and optional notes ---
# Feel free to edit this mapping. Keys are agent *names* (str), values include:
#   - tools: list[str] of tool ids from TOOLS
#   - notes: freeform string (optional)
# Keys are compared against each agent's display name (its `name` attribute), so
# they must match strings such as "Hotfix" or "Index Rebuild" exactly.
WIRES = {
    # Examples (uncomment and adjust to match your agents / tool ids):
    # NOTE: the tool ids in these examples are placeholders; none of them is defined
    # in this notebook's TOOLS dict, so replace them with real ids
    # (e.g. "tool_ci_pipeline") or the validator will warn about them.
    # "Detect & Triage": {"tools": [], "notes": "Receives alerts, no tools"},
    # "Hotfix":          {"tools": ["tool_http", "tool_ci_trigger"], "notes": "Can trigger CI/canary"},
    # "Rollback":        {"tools": ["tool_deploy_swap"], "notes": "Blue/Green slot swap"},
    # "Index Rebuild":   {"tools": ["tool_index_rebuild", "tool_ai_search"], "notes": "Reindex path"},
    # "Secret Rotation": {"tools": ["tool_kv_rotate"], "notes": "Key Vault rotation"},
    # "Comms & Postmortem": {"tools": ["tool_ticket","tool_chatops"], "notes": "Stakeholder updates"},
}

def _discover_agents():
    '''Return a list of (var_name, instance, display_name) for variables named agent_*'''
    found = []
    g = globals()
    for k, v in sorted(g.items()):
        if not k.startswith("agent_"):
            continue
        name = getattr(v, "name", k)
        found.append((k, v, str(name)))
    return found

def _available_tools():
    '''Return a sorted list of tool ids from global TOOLS if present; else stub empty dict.'''
    g = globals()
    tools = g.get("TOOLS")
    if not isinstance(tools, dict):
        return [], {}
    return sorted(tools.keys()), tools

def _wired_tools(cfg):
    '''Return the tool ids of one WIRES entry as a list; a missing/None entry or "tools" value gives [].'''
    if not isinstance(cfg, dict):
        return []
    return list(cfg.get("tools") or [])

def validate_wiring():
    '''Print a wiring report and return it as a dict.

    Compares the global WIRES mapping against the agents returned by
    _discover_agents() and the registry returned by _available_tools().
    Problems are printed as warnings, never raised: a WIRES entry that is None,
    or whose "tools" value is None, is treated as having no tools.

    Returns:
        dict with keys "agents_found" (display names), "tools_found" (sorted
        tool ids), "unknown_agents" (WIRES keys matching no agent name),
        "bad_tools" (list of (agent, tool) pairs absent from TOOLS) and
        "wires" (the WIRES object itself).
    '''
    agents = _discover_agents()
    tool_ids, tools = _available_tools()
    print("=== Wiring Validation ===")
    print(f"Discovered agents: {len(agents)}")
    for var, inst, disp in agents:
        print(f"  - {var} → name='{disp}'")
    print(f"Available tools: {len(tool_ids)}")
    if tool_ids:
        print("  " + ", ".join(tool_ids))

    if not agents:
        print("\nNote: No agent instances (`agent_*`) found yet. Define them above this cell and re-run.")
    if not tools:
        print("Note: No global TOOLS dict found. Define TOOLS = { 'tool_id': fn, ... } above this cell.")

    # Cross-check WIRES
    print("\n--- Cross‑check WIRES ---")
    agent_names = set(disp for _,_,disp in agents)
    unknown_agents = [a for a in WIRES.keys() if a not in agent_names]
    if unknown_agents:
        print("Warning: The following WIRES entries don't match any discovered agent names:")
        for a in unknown_agents:
            print(f"  • '{a}'")
    else:
        print("All WIRES entries correspond to discovered agents.")

    # Normalise each entry's tool list once so the check and the table agree.
    wired = {a: _wired_tools(cfg) for a, cfg in WIRES.items()}

    bad_tools = [(a, t) for a, tlist in wired.items() for t in tlist if t not in tools]
    if bad_tools:
        print("\nWarning: Unknown tool ids referenced in WIRES (not present in TOOLS):")
        for a, t in bad_tools:
            print(f"  • agent '{a}' → tool '{t}'")
    else:
        if WIRES:
            print("All WIRES tool references exist in TOOLS.")
        else:
            print("WIRES is empty. Add mappings above to enforce allowlists per agent.")

    # Present a compact table
    print("\n--- Wiring Table ---")
    if not WIRES:
        print("(empty)  -> edit WIRES to add entries")
    else:
        for a, cfg in WIRES.items():
            # str() keeps the join from failing on a non-string tool id.
            tlist = ", ".join(str(t) for t in wired[a]) or "—"
            notes = (cfg.get("notes") if isinstance(cfg, dict) else None) or ""
            print(f"• {a:30s} | tools: {tlist:40s} | {notes}")

    return {
        "agents_found": [disp for _,_,disp in agents],
        "tools_found": tool_ids,
        "unknown_agents": unknown_agents,
        "bad_tools": bad_tools,
        "wires": WIRES,
    }

# Auto-run validation to show immediate feedback when the cell executes
_wiring_report = validate_wiring()