In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "HgdguHRs8Tdg"
   },
   "source": [
    "## Operação Helius - FASE 3: Relatório de Simulação\n",
    "**Simulação de Falhas Catastróficas, Backpressure e Resiliência de IA**\n",
    "\n",
    "Este notebook implementa a validação do **Blueprint de Arquitetura Resiliente** (Fases 1, 2 e 3) comparando-o com a arquitetura **Legada** (Dia 0) sob três cenários de falha.\n",
    "\n",
    "**Métricas de Resiliência (KPIs):**\n",
    "- `RTO_steps` (Recovery Time Objective): Passos de tempo para a propagação da falha parar.\n",
    "- `failed_critical_fraction`: Fração de nós críticos (IA, Dados) que falharam.\n",
    "- `max_queue_length`: Pico de backlog na simulação de backpressure (MQTT).\n",
    "- `avg_invalid_rate`: (Vista pelo Cliente) Taxa de outputs de IA inválidos/erráticos.\n",
    "- `fallback_activation_ratio`: (Resiliente) % de simulações em que o modo de degradação controlada (Fallback) foi ativado.\n",
    "\n",
    "**Entregáveis:**\n",
    "- Geração de `deliverable/3_simulation_report.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "lYh7TIup9XKx"
   },
   "outputs": [],
   "source": [
    "%pip install simpy pandas networkx plotly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "PqF-g-b29b6u"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import networkx as nx\n",
    "import simpy\n",
    "import random\n",
    "import numpy as np\n",
    "import plotly.express as px\n",
    "import plotly.io as pio\n",
    "import os\n",
    "from typing import Set, Tuple, Dict, List\n",
    "\n",
    "pio.templates.default = \"plotly_dark\"\n",
    "print(\"Bibliotecas importadas e configuradas.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "mE0A4-s59fV-"
   },
   "source": [
    "### 1. Definição das Arquiteturas (Legada vs. Resiliente)\n",
    "\n",
    "Definimos as topologias de grafos (NetworkX) para as duas arquiteturas. A arquitetura Legada reflete os SPOFs e a alta centralidade (como em `service_00`) observados no Incidente Dia 0."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "sUABdO3c9i7c"
   },
   "outputs": [],
   "source": [
    "def create_legacy_graph() -> nx.DiGraph:\n",
    "    \"\"\"Cria o grafo da arquitetura Legada (Single-Region, SPOF).\"\"\"\n",
    "    G = nx.DiGraph()\n",
    "    \n",
    "    # Nós Críticos (Todos em us-east-1, como no main.tf)\n",
    "    G.add_node(\"mlflow_tracker\", region=\"us-east-1\", type=\"critical_control\")\n",
    "    G.add_node(\"grafana_core\", region=\"us-east-1\", type=\"critical_control\") # service_00 (alta centralidade)\n",
    "    G.add_node(\"reco_engine\", region=\"us-east-1\", type=\"critical_ia\")      # service_10 (alta vulnerabilidade)\n",
    "    G.add_node(\"llm_router\", region=\"us-east-1\", type=\"critical_ia\")\n",
    "    G.add_node(\"telemetry_gateway\", region=\"us-east-1\", type=\"critical_ingest\")\n",
    "    G.add_node(\"rds_db\", region=\"us-east-1\", type=\"critical_data\") # SPOF (Não Multi-AZ na prática)\n",
    "\n",
    "    # Nós de Borda (Edge)\n",
    "    G.add_node(\"edge_device_1\", region=\"edge\", type=\"edge\")\n",
    "    G.add_node(\"edge_device_2\", region=\"edge\", type=\"edge\")\n",
    "\n",
    "    # Dependências (Alta Centralidade no Grafana/Telemetry)\n",
    "    G.add_edge(\"edge_device_1\", \"telemetry_gateway\")\n",
    "    G.add_edge(\"edge_device_2\", \"telemetry_gateway\")\n",
    "    \n",
    "    G.add_edge(\"telemetry_gateway\", \"rds_db\")\n",
    "    G.add_edge(\"telemetry_gateway\", \"grafana_core\") # Alta centralidade\n",
    "    G.add_edge(\"telemetry_gateway\", \"reco_engine\")  # Causa o Data Drift\n",
    "    \n",
    "    G.add_edge(\"reco_engine\", \"rds_db\")\n",
    "    G.add_edge(\"reco_engine\", \"mlflow_tracker\") # Dependência de Startup (SPOF)\n",
    "    G.add_edge(\"llm_router\", \"reco_engine\")\n",
    "    \n",
    "    G.add_edge(\"grafana_core\", \"rds_db\") # service_00 (alta centralidade)\n",
    "    return G\n",
    "\n",
    "\n",
    "def create_resilient_graph() -> nx.DiGraph:\n",
    "    \"\"\"Cria o grafo da arquitetura Resiliente (Multi-Region, HA, Fallback).\"\"\"\n",
    "    G = nx.DiGraph()\n",
    "\n",
    "    # --- Região Primária (us-east-1) ---\n",
    "    G.add_node(\"reco_engine_p\", region=\"us-east-1\", type=\"critical_ia\", has_fallback=True)\n",
    "    G.add_node(\"llm_router_p\", region=\"us-east-1\", type=\"critical_ia\", has_fallback=True)\n",
    "    G.add_node(\"telemetry_gateway_p\", region=\"us-east-1\", type=\"critical_ingest\", has_circuit_breaker=True)\n",
    "    G.add_node(\"grafana_core_p\", region=\"us-east-1\", type=\"critical_control\", replicas=3)\n",
    "\n",
    "    # --- Região Secundária (us-west-2) ---\n",
    "    G.add_node(\"reco_engine_s\", region=\"us-west-2\", type=\"critical_ia\", has_fallback=True)\n",
    "    G.add_node(\"llm_router_s\", region=\"us-west-2\", type=\"critical_ia\", has_fallback=True)\n",
    "    G.add_node(\"grafana_core_s\", region=\"us-west-2\", type=\"critical_control\", replicas=3)\n",
    "\n",
    "    # --- Recursos Globais / Multi-AZ / Replicados ---\n",
    "    G.add_node(\"kafka_buffer\", region=\"global\", type=\"critical_buffer\")\n",
    "    G.add_node(\"rds_db_multi_az\", region=\"global\", type=\"critical_data\", replicas=2) # Simula RDS Multi-AZ\n",
    "    G.add_node(\"mlflow_s3_crr\", region=\"global\", type=\"critical_data\", replicas=2) # Simula S3 CRR\n",
    "\n",
    "    # Nós de Borda (Edge)\n",
    "    G.add_node(\"edge_device_1\", region=\"edge\", type=\"edge\")\n",
    "    G.add_node(\"edge_device_2\", region=\"edge\", type=\"edge\")\n",
    "\n",
    "    # --- Fluxo de Dependências Resiliente ---\n",
    "    G.add_edge(\"edge_device_1\", \"kafka_buffer\")\n",
    "    G.add_edge(\"edge_device_2\", \"kafka_buffer\")\n",
    "    \n",
    "    G.add_edge(\"telemetry_gateway_p\", \"kafka_buffer\") # Consome do buffer\n",
    "    \n",
    "    # Dependências de IA (agora em recursos globais)\n",
    "    G.add_edge(\"reco_engine_p\", \"rds_db_multi_az\")\n",
    "    G.add_edge(\"reco_engine_p\", \"mlflow_s3_crr\")\n",
    "    G.add_edge(\"reco_engine_s\", \"rds_db_multi_az\")\n",
    "    G.add_edge(\"reco_engine_s\", \"mlflow_s3_crr\")\n",
    "    \n",
    "    # Roteamento (Primário -> Secundário)\n",
    "    G.add_edge(\"llm_router_p\", \"reco_engine_p\")\n",
    "    G.add_edge(\"llm_router_p\", \"reco_engine_s\") # Link de Failover\n",
    "    G.add_edge(\"llm_router_s\", \"reco_engine_s\")\n",
    "    G.add_edge(\"llm_router_s\", \"reco_engine_p\") # Link de Failover\n",
    "    \n",
    "    G.add_edge(\"grafana_core_p\", \"rds_db_multi_az\")\n",
    "    G.add_edge(\"grafana_core_s\", \"rds_db_multi_az\")\n",
    "    return G\n",
    "\n",
    "print(\"Modelos de grafo (Legado e Resiliente) definidos.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1yVlA9_W9qjB"
   },
   "source": [
    "### 2. Funções de Simulação (Copiadas de `sim/`)\n",
    "\n",
    "Copiamos as funções-base de `network_failure_sim.py` e `backpressure_sim.py` para tornar o notebook autônomo."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "q11RkSjV9wFk"
   },
   "outputs": [],
   "source": [
    "def simulate_network_failure(graph: nx.DiGraph, p_node: float, p_propagate: float, initial_failed_nodes: Set = None) -> Tuple[int, Set[int]]:\n",
    "    \"\"\"(Copied from network_failure_sim.py) Simulate node failures and their propagation.\n",
    "\n",
    "    Args:\n",
    "        graph: Directed graph; a failed node can take down the nodes returned by\n",
    "            graph.neighbors() for it (its successors).\n",
    "        p_node: Per-node probability of a spontaneous initial failure. Only used\n",
    "            when initial_failed_nodes is None or empty.\n",
    "        p_propagate: Probability that a failure spreads to each healthy successor.\n",
    "        initial_failed_nodes: Optional set of nodes that start out failed. The set\n",
    "            is copied, so the object passed by the caller is never modified.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (recovery_time, failed): recovery_time is the number of propagation\n",
    "        waves (BFS levels) processed before the failure stopped spreading, and\n",
    "        failed is the set of every failed node. Returns (0, set()) when no node\n",
    "        fails initially.\n",
    "    \"\"\"\n",
    "    # Work on a copy: adding propagated failures must not alter the input set.\n",
    "    failed: Set = set(initial_failed_nodes) if initial_failed_nodes else set()\n",
    "\n",
    "    if not failed:\n",
    "        for node in graph.nodes:\n",
    "            if random.random() < p_node:\n",
    "                failed.add(node)\n",
    "\n",
    "    if not failed:\n",
    "        return 0, set()  # No initial failure\n",
    "\n",
    "    frontier = list(failed)\n",
    "    recovery_time = 0\n",
    "\n",
    "    # Level-by-level BFS: every pass expands one complete wave of failed nodes,\n",
    "    # so nodes that fail during a wave are always expanded in the next one.\n",
    "    while frontier:\n",
    "        recovery_time += 1\n",
    "        next_frontier = []\n",
    "        for current_node in frontier:\n",
    "            # The failure spreads to the successors (dependencies) of the node\n",
    "            for neighbor in graph.neighbors(current_node):\n",
    "                if neighbor not in failed and random.random() < p_propagate:\n",
    "                    failed.add(neighbor)\n",
    "                    next_frontier.append(neighbor)\n",
    "        frontier = next_frontier\n",
    "\n",
    "    return recovery_time, failed\n",
    "\n",
    "\n",
    "def simulate_queue_backpressure(arrival_rate: float, service_rate: float, capacity: int, sim_time: float, queue_threshold: int) -> Dict[str, float]:\n",
    "    \"\"\"(Adapted from backpressure_sim.py) Simulate an M/M/c queue with SimPy.\n",
    "\n",
    "    Args:\n",
    "        arrival_rate: Rate given to random.expovariate for inter-arrival times.\n",
    "        service_rate: Rate given to random.expovariate for service times.\n",
    "        capacity: Capacity of the simpy.Resource (number of parallel servers).\n",
    "        sim_time: Simulated time at which env.run stops.\n",
    "        queue_threshold: Queue length above which waiting time is accumulated.\n",
    "\n",
    "    Returns:\n",
    "        Dict with max_queue_length (largest queue length sampled when a request\n",
    "        is issued, 0 if nothing arrived) and time_queue_above_threshold (sum of\n",
    "        the waits of requests that obtained a server while the queue length was\n",
    "        still above queue_threshold).\n",
    "    \"\"\"\n",
    "    env = simpy.Environment()\n",
    "    server = simpy.Resource(env, capacity)\n",
    "    queue_lengths = []\n",
    "    time_above_threshold = 0.0\n",
    "\n",
    "    def request(env, server):\n",
    "        # Without nonlocal, the += below would make the name local to request()\n",
    "        # and raise UnboundLocalError the first time the threshold is exceeded.\n",
    "        nonlocal time_above_threshold\n",
    "        arrive = env.now\n",
    "        with server.request() as req:\n",
    "            queue_lengths.append(len(server.queue))\n",
    "            yield req\n",
    "            wait = env.now - arrive\n",
    "            if len(server.queue) > queue_threshold:\n",
    "                time_above_threshold += wait  # Simplification: the wait counts as time above the threshold\n",
    "\n",
    "            service_time = random.expovariate(service_rate)\n",
    "            yield env.timeout(service_time)\n",
    "\n",
    "    def setup(env, server):\n",
    "        # Arrival generator: spawns one request process per exponential inter-arrival gap.\n",
    "        while True:\n",
    "            yield env.timeout(random.expovariate(arrival_rate))\n",
    "            env.process(request(env, server))\n",
    "\n",
    "    env.process(setup(env, server))\n",
    "    env.run(until=sim_time)\n",
    "\n",
    "    return {\n",
    "        \"max_queue_length\": max(queue_lengths) if queue_lengths else 0,\n",
    "        \"time_queue_above_threshold\": time_above_threshold\n",
    "    }\n",
    "\n",
    "print(\"Funções de simulação (NetworkX, SimPy) prontas.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "GvX17pWj939A"
   },
   "source": [
    "### 3. Lógica de Simulação: Cenários e Métricas\n",
    "\n",
    "Aqui implementamos a lógica principal que executa os 3 cenários de falha para ambas as arquiteturas."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "e_zYt0qY9-Qd"
   },
   "outputs": [],
   "source": [
    "def get_critical_nodes(G: nx.DiGraph) -> Set[str]:\n",
    "    \"\"\"Identifica nós críticos (IA, Dados, Controle).\"\"\"\n",
    "    return {n for n, d in G.nodes(data=True) if d.get(\"type\", \"\").startswith(\"critical\")}\n",
    "\n",
    "def run_single_simulation(architecture: str, scenario: str, sim_params: Dict) -> Dict:\n",
    "    \"\"\"\n",
    "    Executa uma única simulação para uma arquitetura e cenário.\n",
    "    Aqui é onde a lógica do Blueprint (Fallback, Circuit Breaker) é modelada.\n",
    "    \"\"\"\n",
    "    \n",
    "    if architecture == \"legacy\":\n",
    "        G = create_legacy_graph()\n",
    "    else: # resilient\n",
    "        G = create_resilient_graph()\n",
    "\n",
    "    critical_nodes = get_critical_nodes(G)\n",
    "    results = {\n",
    "        \"architecture\": architecture,\n",
    "        \"scenario\": scenario,\n",
    "        \"RTO_steps\": 0,\n",
    "        \"failed_nodes_count\": 0,\n",
    "        \"failed_critical_fraction\": 0.0,\n",
    "        \"max_queue_length\": 0,\n",
    "        \"time_queue_above_threshold\": 0.0,\n",
    "        \"avg_invalid_rate\": 0.0,\n",
    "        \"fallback_activation_ratio\": 0.0\n",
    "    }\n",
    "\n",
    "    # --- CENÁRIO 1: FALHA REGIONAL (us-east-1) ---\n",
    "    if scenario == \"regional_outage\":\n",
    "        # Perda total da us-east-1\n",
    "        initial_failures = {n for n, d in G.nodes(data=True) if d.get(\"region\") == \"us-east-1\"}\n",
    "        \n",
    "        if architecture == \"legacy\":\n",
    "            # Na arquitetura legada, a propagação é total (p_propagate=1.0)\n",
    "            rto, failed = simulate_network_failure(G, 0.0, 1.0, initial_failures)\n",
    "        else: # resilient\n",
    "            # Na arquitetura resiliente, a propagação é contida.\n",
    "            # O failover (Fase 1) é ativado, e a propagação é baixa (p_propagate=0.1)\n",
    "            rto, failed = simulate_network_failure(G, 0.0, 0.1, initial_failures)\n",
    "            # Os serviços de IA ativam o fallback (Fase 2)\n",
    "            results[\"fallback_activation_ratio\"] = 1.0 \n",
    "        \n",
    "        results[\"RTO_steps\"] = rto\n",
    "        results[\"failed_nodes_count\"] = len(failed)\n",
    "        failed_critical = failed.intersection(critical_nodes)\n",
    "        if critical_nodes:\n",
    "            results[\"failed_critical_fraction\"] = len(failed_critical) / len(critical_nodes)\n",
    "    \n",
    "    # --- CENÁRIO 2: BACKPRESSURE MQTT ---\n",
    "    elif scenario == \"mqtt_backpressure\":\n",
    "        # Simula duplicação de mensagens (taxa de chegada 2x maior que o serviço)\n",
    "        arrival_rate = 200.0\n",
    "        service_rate = 100.0\n",
    "        \n",
    "        if architecture == \"legacy\":\n",
    "            # Capacidade limitada, sem buffer (ex: 10 workers)\n",
    "            sim_results = simulate_queue_backpressure(arrival_rate, service_rate, 10, 100.0, 50)\n",
    "        else: # resilient\n",
    "            # Kafka (Fase 2) absorve o pico (capacidade muito maior, ex: 100 workers)\n",
    "            # E o Circuit Breaker (Fase 3) limita o tempo de espera.\n",
    "            sim_results = simulate_queue_backpressure(arrival_rate, service_rate, 100, 100.0, 50)\n",
    "            # O Control Plane ativa o fallback se o backlog for muito alto\n",
    "            if sim_results[\"max_queue_length\"] > 50:\n",
    "                 results[\"fallback_activation_ratio\"] = 1.0\n",
    "\n",
    "        results.update(sim_results)\n",
    "\n",
    "    # --- CENÁRIO 3: FALHA DE IA (DRIFT) ---\n",
    "    elif scenario == \"ia_drift\":\n",
    "        # Simula Data Drift causando 50% de outputs inválidos\n",
    "        raw_invalid_rate = 0.50\n",
    "        \n",
    "        if architecture == \"legacy\":\n",
    "            # O Drift é passado diretamente ao cliente\n",
    "            results[\"avg_invalid_rate\"] = raw_invalid_rate\n",
    "            results[\"fallback_activation_ratio\"] = 0.0\n",
    "        else: # resilient\n",
    "            # O py-llm-shield (Fase 3) detecta o drift (taxa > 5%)\n",
    "            # O LLM Router (Fase 2) ativa o Fail-Open (Confiança 0.0)\n",
    "            alert_threshold = 0.05\n",
    "            if raw_invalid_rate > alert_threshold:\n",
    "                results[\"fallback_activation_ratio\"] = 1.0\n",
    "                results[\"avg_invalid_rate\"] = 0.0 # O cliente recebe o fallback seguro\n",
    "            else:\n",
    "                results[\"avg_invalid_rate\"] = raw_invalid_rate\n",
    "                results[\"fallback_activation_ratio\"] = 0.0\n",
    "                \n",
    "    return results\n",
    "\n",
    "print(\"Lógica de simulação principal (run_single_simulation) definida.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "A4K70qW4AD-A"
   },
   "source": [
    "### 4. Execução da Simulação (Monte Carlo)\n",
    "\n",
    "Executamos N simulações para cada combinação de arquitetura e cenário para obter resultados estatisticamente relevantes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "n8t5w-5wAFq5"
   },
   "outputs": [],
   "source": [
    "N_SIMULATIONS = 100 # Number of Monte Carlo runs per scenario\n",
    "OUTPUT_CSV = \"deliverable/3_simulation_report.csv\"\n",
    "RANDOM_SEED = 42 # Fixed seed so a fresh Restart & Run All reproduces the same report\n",
    "\n",
    "all_results = []\n",
    "scenarios = [\"regional_outage\", \"mqtt_backpressure\", \"ia_drift\"]\n",
    "architectures = [\"legacy\", \"resilient\"]\n",
    "sim_params = {} # Reserved for future sensitivity-analysis parameters\n",
    "\n",
    "# Both simulators draw from the stdlib random module, so seeding it is enough.\n",
    "random.seed(RANDOM_SEED)\n",
    "\n",
    "# Report progress roughly every 10%; never below 1 so that N_SIMULATIONS < 10\n",
    "# does not trigger a modulo-by-zero.\n",
    "progress_step = max(1, N_SIMULATIONS // 10)\n",
    "\n",
    "print(f\"Iniciando {N_SIMULATIONS} simulações de Monte Carlo para {len(scenarios)} cenários e {len(architectures)} arquiteturas...\")\n",
    "\n",
    "for i in range(N_SIMULATIONS):\n",
    "    if i % progress_step == 0:\n",
    "        print(f\"... completou {i}/{N_SIMULATIONS} execuções ...\")\n",
    "        \n",
    "    for arch in architectures:\n",
    "        for scen in scenarios:\n",
    "            res = run_single_simulation(arch, scen, sim_params)\n",
    "            res[\"run_id\"] = i\n",
    "            all_results.append(res)\n",
    "\n",
    "df_results = pd.DataFrame(all_results)\n",
    "\n",
    "print(f\"Simulação concluída. Total de {len(df_results)} resultados gerados.\")\n",
    "display(df_results.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "J1xI2xJ4AH_g"
   },
   "source": [
    "### 5. Análise de Métricas de Resiliência (Resultados Agregados)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "oYvA9S4CAK-1"
   },
   "outputs": [],
   "source": [
    "df_agg = df_results.groupby([\"architecture\", \"scenario\"]).mean(numeric_only=True)\n",
    "\n",
    "# Métricas de Resiliência (Menor é Melhor)\n",
    "cols_lower_is_better = [\n",
    "    \"RTO_steps\", \n",
    "    \"failed_critical_fraction\", \n",
    "    \"max_queue_length\", \n",
    "    \"time_queue_above_threshold\", \n",
    "    \"avg_invalid_rate\"\n",
    "]\n",
    "\n",
    "# Métricas de Contenção (Maior é Melhor)\n",
    "cols_higher_is_better = [\n",
    "    \"fallback_activation_ratio\"\n",
    "]\n",
    "\n",
    "df_report = df_agg[cols_lower_is_better + cols_higher_is_better].reset_index()\n",
    "display(df_report)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "qR6zYF7cAPy1"
   },
   "source": [
    "### 6. Visualização dos Resultados (Legada vs. Resiliente)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "V476n5A1AUp4"
   },
   "outputs": [],
   "source": [
    "def plot_metric(metric_name, title_name):\n",
    "    fig = px.bar(\n",
    "        df_report,\n",
    "        x=\"scenario\", \n",
    "        y=metric_name, \n",
    "        color=\"architecture\", \n",
    "        barmode=\"group\",\n",
    "        title=title_name,\n",
    "        labels={\"architecture\": \"Arquitetura\", \"scenario\": \"Cenário de Falha\"}\n",
    "    )\n",
    "    fig.show()\n",
    "\n",
    "print(\"--- Relatório de Métricas de Resiliência (Menor é Melhor) ---\")\n",
    "plot_metric(\"failed_critical_fraction\", \"Fração de Nós Críticos que Falharam (Falha Regional)\")\n",
    "plot_metric(\"max_queue_length\", \"Pico Máximo da Fila de Backlog (Falha MQTT)\")\n",
    "plot_metric(\"avg_invalid_rate\", \"Taxa de Erro de IA (Drift) Vista pelo Cliente\")\n",
    "\n",
    "print(\"\\n--- Relatório de Métricas de Contenção (Maior é Melhor) ---\")\n",
    "plot_metric(\"fallback_activation_ratio\", \"Taxa de Ativação do Modo Fallback/Contenção\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "KjyRz8vl9C8G"
   },
   "source": [
    "### 7. SALVAR CSV COMO deliverable/3_simulation_report.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "C71z40V_9F3I"
   },
   "outputs": [],
   "source": [
    "os.makedirs(\"deliverable\", exist_ok=True)\n",
    "df_results.to_csv(OUTPUT_CSV, index=False)\n",
    "\n",
    "print(f\"Relatório de simulação salvo em: {OUTPUT_CSV}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

NameError: name 'null' is not defined

In [None]:
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "lYh7TIup9XKx"
   },
   "outputs": [],
   "source": [
    "!pip install simpy pandas networkx plotly"
   ]
  }

SyntaxError: unmatched '}' (ipython-input-1300973493.py, line 10)

In [None]:
  ,
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "PqF-g-b29b6u"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import networkx as nx\n",
    "import simpy\n",
    "import random\n",
    "import numpy as np\n",
    "import plotly.express as px\n",
    "import plotly.io as pio\n",
    "import os\n",
    "from typing import Set, Tuple, Dict, List\n",
    "\n",
    "pio.templates.default = \"plotly_dark\"\n",
    "print(\"Bibliotecas importadas e configuradas.\")"
   ]
  }

NameError: name 'null' is not defined

In [None]:
  ,
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "sUABdO3c9i7c"
   },
   "outputs": [],
   "source": [
    "def create_legacy_graph() -> nx.DiGraph:\n",
    "    \"\"\"Cria o grafo da arquitetura Legada (Single-Region, SPOF).\"\"\"\n",
    "    G = nx.DiGraph()\n",
    "    \n",
    "    # Nós Críticos (Todos em us-east-1, como no main.tf)\n",
    "    G.add_node(\"mlflow_tracker\", region=\"us-east-1\", type=\"critical_control\")\n",
    "    G.add_node(\"grafana_core\", region=\"us-east-1\", type=\"critical_control\") # service_00 (alta centralidade)\n",
    "    G.add_node(\"reco_engine\", region=\"us-east-1\", type=\"critical_ia\")      # service_10 (alta vulnerabilidade)\n",
    "    G.add_node(\"llm_router\", region=\"us-east-1\", type=\"critical_ia\")\n",
    "    G.add_node(\"telemetry_gateway\", region=\"us-east-1\", type=\"critical_ingest\")\n",
    "    G.add_node(\"rds_db\", region=\"us-east-1\", type=\"critical_data\") # SPOF (Não Multi-AZ na prática)\n",
    "\n",
    "    # Nós de Borda (Edge)\n",
    "    G.add_node(\"edge_device_1\", region=\"edge\", type=\"edge\")\n",
    "    G.add_node(\"edge_device_2\", region=\"edge\", type=\"edge\")\n",
    "\n",
    "    # Dependências (Alta Centralidade no Grafana/Telemetry)\n",
    "    G.add_edge(\"edge_device_1\", \"telemetry_gateway\")\n",
    "    G.add_edge(\"edge_device_2\", \"telemetry_gateway\")\n",
    "    \n",
    "    G.add_edge(\"telemetry_gateway\", \"rds_db\")\n",
    "    G.add_edge(\"telemetry_gateway\", \"grafana_core\") # Alta centralidade\n",
    "    G.add_edge(\"telemetry_gateway\", \"reco_engine\")  # Causa o Data Drift\n",
    "    \n",
    "    G.add_edge(\"reco_engine\", \"rds_db\")\n",
    "    G.add_edge(\"reco_engine\", \"mlflow_tracker\") # Dependência de Startup (SPOF)\n",
    "    G.add_edge(\"llm_router\", \"reco_engine\")\n",
    "    \n",
    "    G.add_edge(\"grafana_core\", \"rds_db\") # service_00 (alta centralidade)\n",
    "    return G\n",
    "\n",
    "\n",
    "def create_resilient_graph() -> nx.DiGraph:\n",
    "    \"\"\"Cria o grafo da arquitetura Resiliente (Multi-Region, HA, Fallback).\"\"\"\n",
    "    G = nx.DiGraph()\n",
    "\n",
    "    # --- Região Primária (us-east-1) ---\n",
    "    G.add_node(\"reco_engine_p\", region=\"us-east-1\", type=\"critical_ia\", has_fallback=True)\n",
    "    G.add_node(\"llm_router_p\", region=\"us-east-1\", type=\"critical_ia\", has_fallback=True)\n",
    "    G.add_node(\"telemetry_gateway_p\", region=\"us-east-1\", type=\"critical_ingest\", has_circuit_breaker=True)\n",
    "    G.add_node(\"grafana_core_p\", region=\"us-east-1\", type=\"critical_control\", replicas=3)\n",
    "\n",
    "    # --- Região Secundária (us-west-2) ---\n",
    "    G.add_node(\"reco_engine_s\", region=\"us-west-2\", type=\"critical_ia\", has_fallback=True)\n",
    "    G.add_node(\"llm_router_s\", region=\"us-west-2\", type=\"critical_ia\", has_fallback=True)\n",
    "    G.add_node(\"grafana_core_s\", region=\"us-west-2\", type=\"critical_control\", replicas=3)\n",
    "\n",
    "    # --- Recursos Globais / Multi-AZ / Replicados ---\n",
    "    G.add_node(\"kafka_buffer\", region=\"global\", type=\"critical_buffer\")\n",
    "    G.add_node(\"rds_db_multi_az\", region=\"global\", type=\"critical_data\", replicas=2) # Simula RDS Multi-AZ\n",
    "    G.add_node(\"mlflow_s3_crr\", region=\"global\", type=\"critical_data\", replicas=2) # Simula S3 CRR\n",
    "\n",
    "    # Nós de Borda (Edge)\n",
    "    G.add_node(\"edge_device_1\", region=\"edge\", type=\"edge\")\n",
    "    G.add_node(\"edge_device_2\", region=\"edge\", type=\"edge\")\n",
    "\n",
    "    # --- Fluxo de Dependências Resiliente ---\n",
    "    G.add_edge(\"edge_device_1\", \"kafka_buffer\")\n",
    "    G.add_edge(\"edge_device_2\", \"kafka_buffer\")\n",
    "    \n",
    "    G.add_edge(\"telemetry_gateway_p\", \"kafka_buffer\") # Consome do buffer\n",
    "    \n",
    "    # Dependências de IA (agora em recursos globais)\n",
    "    G.add_edge(\"reco_engine_p\", \"rds_db_multi_az\")\n",
    "    G.add_edge(\"reco_engine_p\", \"mlflow_s3_crr\")\n",
    "    G.add_edge(\"reco_engine_s\", \"rds_db_multi_az\")\n",
    "    G.add_edge(\"reco_engine_s\", \"mlflow_s3_crr\")\n",
    "    \n",
    "    # Roteamento (Primário -> Secundário)\n",
    "    G.add_edge(\"llm_router_p\", \"reco_engine_p\")\n",
    "    G.add_edge(\"llm_router_p\", \"reco_engine_s\") # Link de Failover\n",
    "    G.add_edge(\"llm_router_s\", \"reco_engine_s\")\n",
    "    G.add_edge(\"llm_router_s\", \"reco_engine_p\") # Link de Failover\n",
    "    \n",
    "    G.add_edge(\"grafana_core_p\", \"rds_db_multi_az\")\n",
    "    G.add_edge(\"grafana_core_s\", \"rds_db_multi_az\")\n",
    "    return G\n",
    "\n",
    "print(\"Modelos de grafo (Legado e Resiliente) definidos.\")"
   ]
  }

NameError: name 'null' is not defined

In [None]:
  ,
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "q11RkSjV9wFk"
   },
   "outputs": [],
   "source": [
    "def simulate_network_failure(graph: nx.DiGraph, p_node: float, p_propagate: float, initial_failed_nodes: Set = None) -> Tuple[int, Set[int]]:\n",
    "    \"\"\"(Copied from network_failure_sim.py) Simulate node failures and their propagation.\n",
    "\n",
    "    Args:\n",
    "        graph: Directed graph; a failed node can take down the nodes returned by\n",
    "            graph.neighbors() for it (its successors).\n",
    "        p_node: Per-node probability of a spontaneous initial failure. Only used\n",
    "            when initial_failed_nodes is None or empty.\n",
    "        p_propagate: Probability that a failure spreads to each healthy successor.\n",
    "        initial_failed_nodes: Optional set of nodes that start out failed. The set\n",
    "            is copied, so the object passed by the caller is never modified.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (recovery_time, failed): recovery_time is the number of propagation\n",
    "        waves (BFS levels) processed before the failure stopped spreading, and\n",
    "        failed is the set of every failed node. Returns (0, set()) when no node\n",
    "        fails initially.\n",
    "    \"\"\"\n",
    "    # Work on a copy: adding propagated failures must not alter the input set.\n",
    "    failed: Set = set(initial_failed_nodes) if initial_failed_nodes else set()\n",
    "\n",
    "    if not failed:\n",
    "        for node in graph.nodes:\n",
    "            if random.random() < p_node:\n",
    "                failed.add(node)\n",
    "\n",
    "    if not failed:\n",
    "        return 0, set()  # No initial failure\n",
    "\n",
    "    frontier = list(failed)\n",
    "    recovery_time = 0\n",
    "\n",
    "    # Level-by-level BFS: every pass expands one complete wave of failed nodes,\n",
    "    # so nodes that fail during a wave are always expanded in the next one.\n",
    "    while frontier:\n",
    "        recovery_time += 1\n",
    "        next_frontier = []\n",
    "        for current_node in frontier:\n",
    "            # The failure spreads to the successors (dependencies) of the node\n",
    "            for neighbor in graph.neighbors(current_node):\n",
    "                if neighbor not in failed and random.random() < p_propagate:\n",
    "                    failed.add(neighbor)\n",
    "                    next_frontier.append(neighbor)\n",
    "        frontier = next_frontier\n",
    "\n",
    "    return recovery_time, failed\n",
    "\n",
    "\n",
    "def simulate_queue_backpressure(arrival_rate: float, service_rate: float, capacity: int, sim_time: float, queue_threshold: int) -> Dict[str, float]:\n",
    "    \"\"\"(Adapted from backpressure_sim.py) Simulate an M/M/c queue with SimPy.\n",
    "\n",
    "    Args:\n",
    "        arrival_rate: Rate given to random.expovariate for inter-arrival times.\n",
    "        service_rate: Rate given to random.expovariate for service times.\n",
    "        capacity: Capacity of the simpy.Resource (number of parallel servers).\n",
    "        sim_time: Simulated time at which env.run stops.\n",
    "        queue_threshold: Queue length above which waiting time is accumulated.\n",
    "\n",
    "    Returns:\n",
    "        Dict with max_queue_length (largest queue length sampled when a request\n",
    "        is issued, 0 if nothing arrived) and time_queue_above_threshold (sum of\n",
    "        the waits of requests that obtained a server while the queue length was\n",
    "        still above queue_threshold).\n",
    "    \"\"\"\n",
    "    env = simpy.Environment()\n",
    "    server = simpy.Resource(env, capacity)\n",
    "    queue_lengths = []\n",
    "    time_above_threshold = 0.0\n",
    "\n",
    "    def request(env, server):\n",
    "        # Without nonlocal, the += below would make the name local to request()\n",
    "        # and raise UnboundLocalError the first time the threshold is exceeded.\n",
    "        nonlocal time_above_threshold\n",
    "        arrive = env.now\n",
    "        with server.request() as req:\n",
    "            queue_lengths.append(len(server.queue))\n",
    "            yield req\n",
    "            wait = env.now - arrive\n",
    "            if len(server.queue) > queue_threshold:\n",
    "                time_above_threshold += wait  # Simplification: the wait counts as time above the threshold\n",
    "\n",
    "            service_time = random.expovariate(service_rate)\n",
    "            yield env.timeout(service_time)\n",
    "\n",
    "    def setup(env, server):\n",
    "        # Arrival generator: spawns one request process per exponential inter-arrival gap.\n",
    "        while True:\n",
    "            yield env.timeout(random.expovariate(arrival_rate))\n",
    "            env.process(request(env, server))\n",
    "\n",
    "    env.process(setup(env, server))\n",
    "    env.run(until=sim_time)\n",
    "\n",
    "    return {\n",
    "        \"max_queue_length\": max(queue_lengths) if queue_lengths else 0,\n",
    "        \"time_queue_above_threshold\": time_above_threshold\n",
    "    }\n",
    "\n",
    "print(\"Funções de simulação (NetworkX, SimPy) prontas.\")"
   ]
  }

NameError: name 'null' is not defined

In [None]:
  ,
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "e_zYt0qY9-Qd"
   },
   "outputs": [],
   "source": [
    "def get_critical_nodes(G: nx.DiGraph) -> Set[str]:\n",
    "    \"\"\"Identifica nós críticos (IA, Dados, Controle).\"\"\"\n",
    "    return {n for n, d in G.nodes(data=True) if d.get(\"type\", \"\").startswith(\"critical\")}\n",
    "\n",
    "def run_single_simulation(architecture: str, scenario: str, sim_params: Dict) -> Dict:\n",
    "    \"\"\"\n",
    "    Run a single simulation for one architecture under one failure scenario.\n",
    "\n",
    "    This is where the Blueprint logic (Fallback, Circuit Breaker) is modelled.\n",
    "\n",
    "    Args:\n",
    "        architecture: \"legacy\" or \"resilient\".\n",
    "        scenario: \"regional_outage\", \"mqtt_backpressure\" or \"ia_drift\".\n",
    "        sim_params: Optional overrides for sensitivity analysis. Recognised keys\n",
    "            (defaults in parentheses): \"arrival_rate\" (200.0), \"service_rate\" (100.0),\n",
    "            \"raw_invalid_rate\" (0.50), \"alert_threshold\" (0.05).\n",
    "            An empty dict or None keeps every default.\n",
    "\n",
    "    Returns:\n",
    "        Dict holding the KPIs of this run. Metrics that the scenario does not\n",
    "        compute keep their zero default.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if `architecture` or `scenario` is not one of the known values.\n",
    "    \"\"\"\n",
    "    # Fail fast: a misspelled architecture used to run as \"resilient\" and a\n",
    "    # misspelled scenario used to return an all-zero row, silently skewing the report.\n",
    "    if architecture not in (\"legacy\", \"resilient\"):\n",
    "        raise ValueError(f\"Unknown architecture: {architecture!r}\")\n",
    "    if scenario not in (\"regional_outage\", \"mqtt_backpressure\", \"ia_drift\"):\n",
    "        raise ValueError(f\"Unknown scenario: {scenario!r}\")\n",
    "\n",
    "    params = sim_params or {}\n",
    "    is_legacy = architecture == \"legacy\"\n",
    "\n",
    "    G = create_legacy_graph() if is_legacy else create_resilient_graph()\n",
    "\n",
    "    critical_nodes = get_critical_nodes(G)\n",
    "    results = {\n",
    "        \"architecture\": architecture,\n",
    "        \"scenario\": scenario,\n",
    "        \"RTO_steps\": 0,\n",
    "        \"failed_nodes_count\": 0,\n",
    "        \"failed_critical_fraction\": 0.0,\n",
    "        \"max_queue_length\": 0,\n",
    "        \"time_queue_above_threshold\": 0.0,\n",
    "        \"avg_invalid_rate\": 0.0,\n",
    "        \"fallback_activation_ratio\": 0.0\n",
    "    }\n",
    "\n",
    "    # --- SCENARIO 1: REGIONAL OUTAGE (us-east-1) ---\n",
    "    if scenario == \"regional_outage\":\n",
    "        # Total loss of us-east-1: every node tagged with that region starts failed\n",
    "        initial_failures = {n for n, d in G.nodes(data=True) if d.get(\"region\") == \"us-east-1\"}\n",
    "\n",
    "        if is_legacy:\n",
    "            # In the legacy architecture propagation is total (p_propagate=1.0)\n",
    "            rto, failed = simulate_network_failure(G, 0.0, 1.0, initial_failures)\n",
    "        else:\n",
    "            # In the resilient architecture propagation is contained:\n",
    "            # failover (Phase 1) kicks in and propagation is low (p_propagate=0.1)\n",
    "            rto, failed = simulate_network_failure(G, 0.0, 0.1, initial_failures)\n",
    "            # The AI services activate the fallback (Phase 2)\n",
    "            results[\"fallback_activation_ratio\"] = 1.0\n",
    "\n",
    "        results[\"RTO_steps\"] = rto\n",
    "        results[\"failed_nodes_count\"] = len(failed)\n",
    "        failed_critical = failed.intersection(critical_nodes)\n",
    "        # Guard against a graph without critical nodes (division by zero)\n",
    "        if critical_nodes:\n",
    "            results[\"failed_critical_fraction\"] = len(failed_critical) / len(critical_nodes)\n",
    "\n",
    "    # --- SCENARIO 2: MQTT BACKPRESSURE ---\n",
    "    elif scenario == \"mqtt_backpressure\":\n",
    "        # Message duplication: by default the arrival rate is 2x the service rate\n",
    "        arrival_rate = params.get(\"arrival_rate\", 200.0)\n",
    "        service_rate = params.get(\"service_rate\", 100.0)\n",
    "\n",
    "        if is_legacy:\n",
    "            # Limited capacity, no buffer (e.g. 10 workers)\n",
    "            sim_results = simulate_queue_backpressure(arrival_rate, service_rate, 10, 100.0, 50)\n",
    "        else:\n",
    "            # Kafka (Phase 2) absorbs the spike (much larger capacity, e.g. 100 workers)\n",
    "            # and the Circuit Breaker (Phase 3) bounds the waiting time.\n",
    "            sim_results = simulate_queue_backpressure(arrival_rate, service_rate, 100, 100.0, 50)\n",
    "            # The Control Plane activates the fallback when the backlog peak is too high\n",
    "            if sim_results[\"max_queue_length\"] > 50:\n",
    "                results[\"fallback_activation_ratio\"] = 1.0\n",
    "\n",
    "        # Merge the queue KPIs returned by the helper into this run's row\n",
    "        results.update(sim_results)\n",
    "\n",
    "    # --- SCENARIO 3: AI FAILURE (DRIFT) ---\n",
    "    elif scenario == \"ia_drift\":\n",
    "        # Data drift: by default 50% of the model outputs are invalid\n",
    "        raw_invalid_rate = params.get(\"raw_invalid_rate\", 0.50)\n",
    "        # py-llm-shield (Phase 3) flags drift above this rate (default 5%)\n",
    "        alert_threshold = params.get(\"alert_threshold\", 0.05)\n",
    "\n",
    "        if not is_legacy and raw_invalid_rate > alert_threshold:\n",
    "            # The LLM Router (Phase 2) activates Fail-Open (confidence 0.0):\n",
    "            # the client receives the safe fallback instead of invalid outputs\n",
    "            results[\"fallback_activation_ratio\"] = 1.0\n",
    "            results[\"avg_invalid_rate\"] = 0.0\n",
    "        else:\n",
    "            # Legacy (no detection) or drift below the threshold:\n",
    "            # the raw invalid rate is passed straight to the client\n",
    "            results[\"avg_invalid_rate\"] = raw_invalid_rate\n",
    "            results[\"fallback_activation_ratio\"] = 0.0\n",
    "\n",
    "    return results\n",
    "\n",
    "print(\"Lógica de simulação principal (run_single_simulation) definida.\")"
   ]
  }

NameError: name 'null' is not defined

In [None]:
  ,
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "n8t5w-5wAFq5"
   },
   "outputs": [],
   "source": [
    "N_SIMULATIONS = 100 # Number of Monte Carlo runs per scenario\n",
    "OUTPUT_CSV = \"deliverable/3_simulation_report.csv\"\n",
    "RANDOM_SEED = 42 # Fixed seed so a fresh Restart & Run All reproduces the same report\n",
    "\n",
    "# Seed both generators imported by this notebook before any stochastic call.\n",
    "random.seed(RANDOM_SEED)\n",
    "np.random.seed(RANDOM_SEED)\n",
    "\n",
    "all_results = []\n",
    "scenarios = [\"regional_outage\", \"mqtt_backpressure\", \"ia_drift\"]\n",
    "architectures = [\"legacy\", \"resilient\"]\n",
    "sim_params = {} # Future parameters for sensitivity analysis\n",
    "\n",
    "# Report progress roughly every 10% of the runs. max(1, ...) prevents a\n",
    "# modulo-by-zero (ZeroDivisionError) when N_SIMULATIONS is smaller than 10.\n",
    "progress_step = max(1, N_SIMULATIONS // 10)\n",
    "\n",
    "print(f\"Iniciando {N_SIMULATIONS} simulações de Monte Carlo para {len(scenarios)} cenários e {len(architectures)} arquiteturas...\")\n",
    "\n",
    "for i in range(N_SIMULATIONS):\n",
    "    if i % progress_step == 0:\n",
    "        print(f\"... completou {i}/{N_SIMULATIONS} execuções ...\")\n",
    "\n",
    "    # One row per (run, architecture, scenario); run_id ties the rows of a run together\n",
    "    for arch in architectures:\n",
    "        for scen in scenarios:\n",
    "            res = run_single_simulation(arch, scen, sim_params)\n",
    "            res[\"run_id\"] = i\n",
    "            all_results.append(res)\n",
    "\n",
    "# Build the frame once from the list of records (no row-by-row concat)\n",
    "df_results = pd.DataFrame(all_results)\n",
    "\n",
    "print(f\"Simulação concluída. Total de {len(df_results)} resultados gerados.\")\n",
    "display(df_results.head())"
   ]
  }

NameError: name 'null' is not defined

In [None]:
  ,
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "oYvA9S4CAK-1"
   },
   "outputs": [],
   "source": [
    "# Resilience metrics (lower is better)\n",
    "cols_lower_is_better = [\n",
    "    \"RTO_steps\",\n",
    "    \"failed_critical_fraction\",\n",
    "    \"max_queue_length\",\n",
    "    \"time_queue_above_threshold\",\n",
    "    \"avg_invalid_rate\",\n",
    "]\n",
    "\n",
    "# Containment metrics (higher is better)\n",
    "cols_higher_is_better = [\"fallback_activation_ratio\"]\n",
    "\n",
    "# Mean of every numeric column for each (architecture, scenario) pair\n",
    "df_agg = (\n",
    "    df_results\n",
    "    .groupby(by=[\"architecture\", \"scenario\"])\n",
    "    .mean(numeric_only=True)\n",
    ")\n",
    "\n",
    "# Keep only the KPI columns and turn the group keys back into regular columns\n",
    "df_report = df_agg.loc[:, [*cols_lower_is_better, *cols_higher_is_better]].reset_index()\n",
    "display(df_report)"
   ]
  }

NameError: name 'null' is not defined

In [None]:
  ,
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "V476n5A1AUp4"
   },
   "outputs": [],
   "source": [
    "def plot_metric(metric_name, title_name):\n",
    "    \"\"\"Show a grouped bar chart of one aggregated KPI from the global `df_report`.\n",
    "\n",
    "    Scenarios go on the x axis and the bars within each scenario are coloured\n",
    "    by architecture.\n",
    "\n",
    "    Args:\n",
    "        metric_name: Column of `df_report` plotted on the y axis.\n",
    "        title_name: Title shown above the chart.\n",
    "    \"\"\"\n",
    "    axis_labels = {\"architecture\": \"Arquitetura\", \"scenario\": \"Cenário de Falha\"}\n",
    "    chart = px.bar(\n",
    "        df_report,\n",
    "        title=title_name,\n",
    "        x=\"scenario\",\n",
    "        y=metric_name,\n",
    "        color=\"architecture\",\n",
    "        barmode=\"group\",\n",
    "        labels=axis_labels,\n",
    "    )\n",
    "    chart.show()\n",
    "\n",
    "# Resilience KPIs: one chart per metric, in the order they appear in the report\n",
    "print(\"--- Relatório de Métricas de Resiliência (Menor é Melhor) ---\")\n",
    "for kpi_column, chart_title in [\n",
    "    (\"failed_critical_fraction\", \"Fração de Nós Críticos que Falharam (Falha Regional)\"),\n",
    "    (\"max_queue_length\", \"Pico Máximo da Fila de Backlog (Falha MQTT)\"),\n",
    "    (\"avg_invalid_rate\", \"Taxa de Erro de IA (Drift) Vista pelo Cliente\"),\n",
    "]:\n",
    "    plot_metric(kpi_column, chart_title)\n",
    "\n",
    "# Containment KPI: how often the fallback mode was activated\n",
    "print(\"\\n--- Relatório de Métricas de Contenção (Maior é Melhor) ---\")\n",
    "plot_metric(metric_name=\"fallback_activation_ratio\", title_name=\"Taxa de Ativação do Modo Fallback/Contenção\")"
   ]
  }

NameError: name 'null' is not defined

In [None]:
  ,
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "C71z40V_9F3I"
   },
   "outputs": [],
   "source": [
    "# Create the directory OUTPUT_CSV actually points to, instead of a hard-coded\n",
    "# \"deliverable\" literal that goes stale as soon as OUTPUT_CSV is changed.\n",
    "output_dir = os.path.dirname(OUTPUT_CSV)\n",
    "if output_dir:  # empty when OUTPUT_CSV is a bare file name in the working directory\n",
    "    os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "# Persist the raw per-run results (not the aggregated report)\n",
    "df_results.to_csv(OUTPUT_CSV, index=False)\n",
    "\n",
    "print(f\"Relatório de simulação salvo em: {OUTPUT_CSV}\")"
   ]
  }
 ]
}

IndentationError: unexpected indent (ipython-input-1267994729.py, line 16)