# Event-Driven Microservices — Simulation + SLO Report

This notebook simulates an event-driven system (orders → payment → inventory → shipping) and produces an SLO-style report:
- event lag distribution
- consumer backlog over time
- failure injection + retries
- end-to-end latency

CPU-only, reproducible, outputs saved when executed.

In [1]:
import numpy as np
import pandas as pd
from dataclasses import dataclass

SEED = 1337
rng = np.random.default_rng(SEED)
pd.set_option('display.max_columns', 80)

## 1) Simulated event bus
We model each topic as a queue. Producers push events, consumers pull with variable service times.

In [2]:
@dataclass
class Event:
    t: int
    order_id: int
    kind: str

topics = {
  'orders': [],
  'payment': [],
  'inventory': [],
  'shipping': [],
}

def produce(topic: str, e: Event):
    topics[topic].append(e)

def consume(topic: str):
    if not topics[topic]: return None
    return topics[topic].pop(0)

T = 4000
orders_rate = 0.12
fail_rate = 0.02

service = {
  'payment': (4, 12),
  'inventory': (2, 8),
  'shipping': (6, 16),
}

in_flight = []
timeline = []

order_id = 0
for t in range(T):
    if rng.random() < orders_rate:
        order_id += 1
        produce('orders', Event(t=t, order_id=order_id, kind='order_created'))

    still = []
    for done_t, next_topic, ev, stage in in_flight:
        if t >= done_t:
            produce(next_topic, ev)
        else:
            still.append((done_t, next_topic, ev, stage))
    in_flight = still

    ev = consume('orders')
    if ev:
        in_flight.append((t + int(rng.integers(1, 4)), 'payment', ev, 'orders'))

    ev = consume('payment')
    if ev:
        if rng.random() < fail_rate:
            in_flight.append((t + int(rng.integers(10, 30)), 'payment', ev, 'payment_retry'))
            timeline.append((t, ev.order_id, 'payment_failed'))
        else:
            dt = int(rng.integers(*service['payment']))
            in_flight.append((t + dt, 'inventory', ev, 'payment_ok'))
            timeline.append((t, ev.order_id, 'payment_ok'))

    ev = consume('inventory')
    if ev:
        dt = int(rng.integers(*service['inventory']))
        in_flight.append((t + dt, 'shipping', ev, 'inventory_ok'))
        timeline.append((t, ev.order_id, 'inventory_ok'))

    ev = consume('shipping')
    if ev:
        dt = int(rng.integers(*service['shipping']))
        timeline.append((t + dt, ev.order_id, 'shipped'))

    if t % 50 == 0:
        msg = (
            f"backlog orders={len(topics['orders'])} "
            f"payment={len(topics['payment'])} "
            f"inv={len(topics['inventory'])} "
            f"ship={len(topics['shipping'])}"
        )
        timeline.append((t, -1, msg))

df = pd.DataFrame(timeline, columns=['t','order_id','event'])
df.head(), df.shape

(    t  order_id                                    event
 0   0        -1  backlog orders=0 payment=0 inv=0 ship=0
 1   6         1                               payment_ok
 2  10         2                               payment_ok
 3  11         1                             inventory_ok
 4  14         3                               payment_ok,
 (1530, 3))

## 2) End-to-end latency

In [3]:
created = {}
shipped = {}
for r in df.itertuples(index=False):
    if r.event == 'payment_ok' and r.order_id not in created:
        created[r.order_id] = max(0, r.t - 2)
    if r.event == 'shipped':
        shipped[r.order_id] = r.t

lat = []
for oid, t0 in created.items():
    if oid in shipped:
        lat.append(shipped[oid] - t0)
lat = np.array(lat)
float(lat.mean()), float(np.quantile(lat, 0.95)), len(lat)

(24.46972860125261, 31.099999999999966, 479)

## 3) SLO-style report

In [4]:
report = {
  'orders_generated': int(df.query('event=="payment_ok"')['order_id'].nunique()),
  'orders_shipped': int(len(lat)),
  'e2e_latency_mean': float(lat.mean()) if len(lat) else None,
  'e2e_latency_p95': float(np.quantile(lat, 0.95)) if len(lat) else None,
  'payment_failures': int((df['event']=='payment_failed').sum()),
}
report

{'orders_generated': 481,
 'orders_shipped': 479,
 'e2e_latency_mean': 24.46972860125261,
 'e2e_latency_p95': 31.099999999999966,
 'payment_failures': 9}