# Day 04: Low-Latency System Design for Trading

## Week 22 - System Design

---

### Learning Objectives
- Understand latency fundamentals and measurement techniques
- Master high-resolution timing and profiling in Python
- Implement lock-free data structures and memory-efficient patterns
- Compare synchronous vs asynchronous processing
- Build low-latency components for trading systems

### Why Latency Matters in Trading
- **Microseconds matter**: In HFT, a 1μs advantage can mean winning or losing trades
- **Competitive edge**: Lower latency = better fills, tighter spreads, more alpha
- **Risk management**: Delayed signals can lead to significant losses
- **Regulatory requirements**: MiFID II requires timestamp accuracy to microseconds

### Latency Budget Example (Typical HFT System)
| Component | Target Latency |
|-----------|----------------|
| Market Data Parsing | < 1 μs |
| Signal Generation | < 5 μs |
| Risk Check | < 2 μs |
| Order Encoding | < 1 μs |
| Network (colocation) | < 10 μs |
| **Total Tick-to-Trade** | **< 20 μs** |

## 1. Import Required Libraries

In [1]:
import time
import asyncio
import threading
import multiprocessing
from multiprocessing import Process, Queue as MPQueue
from collections import deque
from queue import Queue, PriorityQueue
import cProfile
import pstats
from io import StringIO
from functools import wraps
from dataclasses import dataclass, field
from typing import Any, Callable, List, Optional, Dict
import statistics
import random
import socket
import struct
import heapq
from contextlib import contextmanager
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import warnings
warnings.filterwarnings('ignore')

# Scientific computing
import numpy as np
import pandas as pd

# For visualization
import matplotlib.pyplot as plt

print("✓ All libraries imported successfully")
print(f"Python version: {__import__('sys').version}")
print(f"NumPy version: {np.__version__}")

✓ All libraries imported successfully
Python version: 3.14.2 (main, Dec  5 2025, 16:49:16) [Clang 17.0.0 (clang-1700.4.4.1)]
NumPy version: 2.4.1


## 2. Measuring Latency Basics

### High-Resolution Timing in Python

Python offers several timing mechanisms with different resolutions:
- `time.time()` - Wall clock time (microsecond resolution)
- `time.perf_counter()` - Performance counter (nanosecond resolution)
- `time.perf_counter_ns()` - Performance counter in nanoseconds (best for latency)
- `time.monotonic()` - Monotonic clock (cannot go backwards)

**For trading systems, always use `time.perf_counter_ns()`**

In [2]:
# Compare different timing methods
def compare_timing_methods():
    """Compare resolution and overhead of different timing methods."""
    methods = {
        'time.time()': time.time,
        'time.perf_counter()': time.perf_counter,
        'time.perf_counter_ns()': time.perf_counter_ns,
        'time.monotonic()': time.monotonic,
        'time.monotonic_ns()': time.monotonic_ns
    }
    
    results = {}
    n_samples = 10000
    
    for name, method in methods.items():
        # Measure overhead of calling the timing function itself
        start = time.perf_counter_ns()
        for _ in range(n_samples):
            _ = method()
        end = time.perf_counter_ns()
        
        overhead_ns = (end - start) / n_samples
        results[name] = overhead_ns
    
    print("Timing Method Overhead Comparison")
    print("=" * 50)
    for name, overhead in results.items():
        print(f"{name:25s}: {overhead:8.2f} ns/call")
    
    return results

overhead_results = compare_timing_methods()

Timing Method Overhead Comparison
time.time()              :    51.45 ns/call
time.perf_counter()      :    51.46 ns/call
time.perf_counter_ns()   :    56.65 ns/call
time.monotonic()         :    51.21 ns/call
time.monotonic_ns()      :    65.32 ns/call


In [3]:
class LatencyTracker:
    """High-precision latency tracking for trading systems."""
    
    def __init__(self, name: str, warmup_iterations: int = 100):
        self.name = name
        self.warmup_iterations = warmup_iterations
        self.latencies_ns: List[int] = []
        self._warmup_done = False
        self._warmup_count = 0
    
    @contextmanager
    def measure(self):
        """Context manager for measuring latency."""
        start = time.perf_counter_ns()
        yield
        end = time.perf_counter_ns()
        latency = end - start
        
        # Skip warmup iterations (JIT, cache warming)
        if self._warmup_count < self.warmup_iterations:
            self._warmup_count += 1
        else:
            self.latencies_ns.append(latency)
    
    def record(self, latency_ns: int):
        """Manually record a latency measurement."""
        if self._warmup_count < self.warmup_iterations:
            self._warmup_count += 1
        else:
            self.latencies_ns.append(latency_ns)
    
    def get_statistics(self) -> Dict[str, float]:
        """Calculate latency statistics."""
        if not self.latencies_ns:
            return {}
        
        sorted_latencies = sorted(self.latencies_ns)
        n = len(sorted_latencies)
        
        return {
            'name': self.name,
            'count': n,
            'mean_ns': statistics.mean(sorted_latencies),
            'median_ns': statistics.median(sorted_latencies),
            'std_ns': statistics.stdev(sorted_latencies) if n > 1 else 0,
            'min_ns': sorted_latencies[0],
            'max_ns': sorted_latencies[-1],
            'p50_ns': sorted_latencies[int(n * 0.50)],
            'p90_ns': sorted_latencies[int(n * 0.90)],
            'p99_ns': sorted_latencies[int(n * 0.99)],
            'p999_ns': sorted_latencies[int(n * 0.999)] if n >= 1000 else sorted_latencies[-1],
        }
    
    def print_report(self):
        """Print a formatted latency report."""
        stats = self.get_statistics()
        if not stats:
            print(f"No measurements for {self.name}")
            return
        
        print(f"\n{'='*60}")
        print(f"Latency Report: {stats['name']}")
        print(f"{'='*60}")
        print(f"Samples: {stats['count']:,}")
        print(f"\n{'Statistic':<15} {'Nanoseconds':>15} {'Microseconds':>15}")
        print("-" * 47)
        
        for key in ['mean', 'median', 'std', 'min', 'max', 'p50', 'p90', 'p99', 'p999']:
            ns_key = f'{key}_ns'
            if ns_key in stats:
                ns_val = stats[ns_key]
                us_val = ns_val / 1000
                print(f"{key.upper():<15} {ns_val:>15,.0f} {us_val:>15,.2f}")

# Demonstrate the latency tracker
tracker = LatencyTracker("demo_operation", warmup_iterations=50)

# Simulate some operations
for i in range(10000):
    with tracker.measure():
        # Simulate work
        _ = sum(range(100))

tracker.print_report()


Latency Report: demo_operation
Samples: 9,950

Statistic           Nanoseconds    Microseconds
-----------------------------------------------
MEAN                        548            0.55
MEDIAN                      500            0.50
STD                       3,222            3.22
MIN                         417            0.42
MAX                     227,416          227.42
P50                         500            0.50
P90                         500            0.50
P99                         625            0.62
P999                      4,333            4.33


In [4]:
def latency_benchmark(warmup: int = 100, iterations: int = 10000):
    """Decorator to benchmark function latency."""
    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            tracker = LatencyTracker(func.__name__, warmup_iterations=warmup)
            result = None
            
            for _ in range(warmup + iterations):
                with tracker.measure():
                    result = func(*args, **kwargs)
            
            tracker.print_report()
            return result
        return wrapper
    return decorator

# Example usage
@latency_benchmark(warmup=100, iterations=5000)
def simulated_order_validation(order_size: float, max_size: float = 1_000_000) -> bool:
    """Simulate order validation logic."""
    # Bounds check
    if order_size <= 0 or order_size > max_size:
        return False
    # Simulate additional checks
    _ = order_size * 1.001  # Apply some calculation
    return True

# Run the benchmark
_ = simulated_order_validation(1000.0)


Latency Report: simulated_order_validation
Samples: 5,000

Statistic           Nanoseconds    Microseconds
-----------------------------------------------
MEAN                        270            0.27
MEDIAN                      209            0.21
STD                         658            0.66
MIN                         166            0.17
MAX                      28,458           28.46
P50                         209            0.21
P90                         250            0.25
P99                         708            0.71
P999                     12,208           12.21


## 3. Memory Access Patterns and Cache Optimization

### Why Cache Matters

Modern CPUs have a memory hierarchy:
- **L1 Cache**: ~1-4 cycles, ~32-64 KB
- **L2 Cache**: ~10-20 cycles, ~256-512 KB  
- **L3 Cache**: ~40-75 cycles, ~8-32 MB
- **Main Memory (RAM)**: ~100-300 cycles

**Cache-friendly code can be 10-100x faster!**

Key principles:
1. **Spatial locality**: Access memory sequentially
2. **Temporal locality**: Reuse recently accessed data
3. **Avoid cache thrashing**: Don't exceed cache size in tight loops

In [5]:
def benchmark_memory_access_patterns():
    """
    Demonstrate the massive performance difference between 
    cache-friendly and cache-unfriendly memory access.
    """
    # Create a large 2D array (row-major storage in NumPy/C)
    rows, cols = 10000, 10000
    matrix = np.random.randn(rows, cols)
    
    results = {}
    
    # Row-major access (cache-friendly for C-contiguous arrays)
    tracker_row = LatencyTracker("row_major_access", warmup_iterations=2)
    for _ in range(5):
        with tracker_row.measure():
            total = 0.0
            for i in range(rows):
                for j in range(cols):
                    total += matrix[i, j]
    results['row_major'] = tracker_row.get_statistics()
    
    # Column-major access (cache-unfriendly for C-contiguous arrays)
    tracker_col = LatencyTracker("col_major_access", warmup_iterations=2)
    for _ in range(5):
        with tracker_col.measure():
            total = 0.0
            for j in range(cols):
                for i in range(rows):
                    total += matrix[i, j]
    results['col_major'] = tracker_col.get_statistics()
    
    # NumPy vectorized (best - uses SIMD and optimal memory access)
    tracker_vec = LatencyTracker("numpy_vectorized", warmup_iterations=2)
    for _ in range(5):
        with tracker_vec.measure():
            total = np.sum(matrix)
    results['vectorized'] = tracker_vec.get_statistics()
    
    # Print comparison
    print("\nMemory Access Pattern Comparison")
    print("=" * 60)
    print(f"Matrix size: {rows:,} x {cols:,} = {rows*cols:,} elements")
    print(f"Memory size: {matrix.nbytes / 1024 / 1024:.1f} MB")
    print()
    
    baseline = results['row_major']['mean_ns']
    for pattern, stats in results.items():
        mean_ms = stats['mean_ns'] / 1_000_000
        speedup = baseline / stats['mean_ns']
        print(f"{pattern:20s}: {mean_ms:10.2f} ms  (speedup: {speedup:6.1f}x)")
    
    return results

cache_results = benchmark_memory_access_patterns()


Memory Access Pattern Comparison
Matrix size: 10,000 x 10,000 = 100,000,000 elements
Memory size: 762.9 MB

row_major           :   12551.09 ms  (speedup:    1.0x)
col_major           :   11548.30 ms  (speedup:    1.1x)
vectorized          :      19.56 ms  (speedup:  641.8x)


In [6]:
def benchmark_struct_of_arrays_vs_array_of_structs():
    """
    Compare SoA (Struct of Arrays) vs AoS (Array of Structs) patterns.
    SoA is typically more cache-efficient for columnar operations.
    """
    n_orders = 100000
    
    # Array of Structs (AoS) - like a list of dictionaries
    @dataclass
    class Order:
        order_id: int
        price: float
        quantity: int
        side: int  # 0=buy, 1=sell
    
    # Create AoS
    orders_aos = [
        Order(
            order_id=i,
            price=100.0 + random.random(),
            quantity=random.randint(1, 1000),
            side=random.randint(0, 1)
        )
        for i in range(n_orders)
    ]
    
    # Struct of Arrays (SoA) - columnar layout
    orders_soa = {
        'order_id': np.arange(n_orders, dtype=np.int64),
        'price': 100.0 + np.random.random(n_orders),
        'quantity': np.random.randint(1, 1000, n_orders),
        'side': np.random.randint(0, 2, n_orders)
    }
    
    # Benchmark: Calculate total buy value (price * quantity for buy orders)
    
    # AoS approach
    tracker_aos = LatencyTracker("AoS_calculation", warmup_iterations=10)
    for _ in range(100):
        with tracker_aos.measure():
            total_buy_value = sum(
                o.price * o.quantity 
                for o in orders_aos 
                if o.side == 0
            )
    
    # SoA approach with NumPy
    tracker_soa = LatencyTracker("SoA_calculation", warmup_iterations=10)
    for _ in range(100):
        with tracker_soa.measure():
            buy_mask = orders_soa['side'] == 0
            total_buy_value = np.sum(
                orders_soa['price'][buy_mask] * orders_soa['quantity'][buy_mask]
            )
    
    aos_stats = tracker_aos.get_statistics()
    soa_stats = tracker_soa.get_statistics()
    
    print("\nStruct of Arrays vs Array of Structs")
    print("=" * 60)
    print(f"Orders: {n_orders:,}")
    print(f"\nAoS (list of objects):")
    print(f"  Mean: {aos_stats['mean_ns']/1000:.2f} μs, P99: {aos_stats['p99_ns']/1000:.2f} μs")
    print(f"\nSoA (columnar NumPy):")
    print(f"  Mean: {soa_stats['mean_ns']/1000:.2f} μs, P99: {soa_stats['p99_ns']/1000:.2f} μs")
    print(f"\nSpeedup: {aos_stats['mean_ns']/soa_stats['mean_ns']:.1f}x")

benchmark_struct_of_arrays_vs_array_of_structs()


Struct of Arrays vs Array of Structs
Orders: 100,000

AoS (list of objects):
  Mean: 4375.86 μs, P99: 5758.00 μs

SoA (columnar NumPy):
  Mean: 1024.28 μs, P99: 1129.58 μs

Speedup: 4.3x


## 4. Lock-Free Data Structures

### Why Lock-Free?
- **Contention**: Locks create serialization points that increase latency
- **Priority inversion**: High-priority threads blocked by low-priority lock holders
- **Deadlocks**: Lock ordering bugs can halt the system
- **Context switches**: Lock contention causes expensive OS context switches

### Lock-Free Alternatives
- `collections.deque` - Thread-safe for append/pop operations
- Atomic operations via `threading.Lock` with minimal hold time
- Ring buffers with atomic indices
- Compare-and-swap (CAS) patterns

In [7]:
class LockFreeRingBuffer:
    """
    A simple lock-free single-producer single-consumer (SPSC) ring buffer.
    Uses atomic-like operations with memory barriers implied by Python's GIL.
    
    In production C++/Rust, you'd use std::atomic with proper memory ordering.
    """
    
    def __init__(self, capacity: int):
        # Power of 2 for fast modulo via bitwise AND
        self.capacity = 1 << (capacity - 1).bit_length()
        self.mask = self.capacity - 1
        self.buffer = [None] * self.capacity
        self.head = 0  # Write position (producer)
        self.tail = 0  # Read position (consumer)
    
    def push(self, item) -> bool:
        """Add item to buffer. Returns False if full."""
        next_head = (self.head + 1) & self.mask
        if next_head == self.tail:
            return False  # Buffer full
        self.buffer[self.head] = item
        self.head = next_head
        return True
    
    def pop(self):
        """Remove and return item. Returns None if empty."""
        if self.tail == self.head:
            return None  # Buffer empty
        item = self.buffer[self.tail]
        self.tail = (self.tail + 1) & self.mask
        return item
    
    def __len__(self):
        return (self.head - self.tail) & self.mask


class PreallocatedObjectPool:
    """
    Object pool to avoid allocation latency in hot paths.
    Pre-allocate objects and reuse them.
    """
    
    def __init__(self, factory: Callable, initial_size: int = 1000):
        self.factory = factory
        self.pool = deque([factory() for _ in range(initial_size)])
        self.allocated = 0
    
    def acquire(self):
        """Get an object from the pool."""
        if self.pool:
            return self.pool.pop()
        self.allocated += 1
        return self.factory()
    
    def release(self, obj):
        """Return an object to the pool."""
        self.pool.append(obj)
    
    @contextmanager
    def borrow(self):
        """Context manager for automatic release."""
        obj = self.acquire()
        try:
            yield obj
        finally:
            self.release(obj)


# Demonstrate ring buffer vs standard queue
def benchmark_queue_implementations():
    """Compare different queue implementations."""
    n_operations = 100000
    
    # Standard Queue (thread-safe with locks)
    std_queue = Queue()
    tracker_std = LatencyTracker("threading.Queue", warmup_iterations=1000)
    
    for i in range(n_operations):
        with tracker_std.measure():
            std_queue.put(i)
            _ = std_queue.get()
    
    # collections.deque (lock-free for single operations)
    dq = deque()
    tracker_deque = LatencyTracker("collections.deque", warmup_iterations=1000)
    
    for i in range(n_operations):
        with tracker_deque.measure():
            dq.append(i)
            _ = dq.popleft()
    
    # Custom ring buffer
    ring = LockFreeRingBuffer(1024)
    tracker_ring = LatencyTracker("LockFreeRingBuffer", warmup_iterations=1000)
    
    for i in range(n_operations):
        with tracker_ring.measure():
            ring.push(i)
            _ = ring.pop()
    
    # Print results
    print("\nQueue Implementation Comparison")
    print("=" * 60)
    print(f"Operations: {n_operations:,} push+pop pairs\n")
    
    for tracker in [tracker_std, tracker_deque, tracker_ring]:
        stats = tracker.get_statistics()
        print(f"{stats['name']:25s}")
        print(f"  Mean: {stats['mean_ns']:,.0f} ns, P99: {stats['p99_ns']:,.0f} ns")

benchmark_queue_implementations()


Queue Implementation Comparison
Operations: 100,000 push+pop pairs

threading.Queue          
  Mean: 1,042 ns, P99: 1,166 ns
collections.deque        
  Mean: 138 ns, P99: 167 ns
LockFreeRingBuffer       
  Mean: 277 ns, P99: 333 ns


In [8]:
# Benchmark object pool vs fresh allocation
def benchmark_object_pool():
    """Compare allocation latency with and without object pooling."""
    
    @dataclass
    class OrderMessage:
        """Represents an order message."""
        order_id: int = 0
        symbol: str = ""
        price: float = 0.0
        quantity: int = 0
        side: str = "BUY"
        timestamp: int = 0
    
    n_operations = 50000
    
    # Without pool - fresh allocation each time
    tracker_alloc = LatencyTracker("fresh_allocation", warmup_iterations=1000)
    for i in range(n_operations):
        with tracker_alloc.measure():
            msg = OrderMessage(
                order_id=i,
                symbol="AAPL",
                price=150.0,
                quantity=100,
                side="BUY",
                timestamp=time.perf_counter_ns()
            )
            # Simulate using the object
            _ = msg.price * msg.quantity
            # Object goes out of scope (GC pressure)
    
    # With pool - reuse pre-allocated objects
    pool = PreallocatedObjectPool(OrderMessage, initial_size=100)
    tracker_pool = LatencyTracker("object_pool", warmup_iterations=1000)
    
    for i in range(n_operations):
        with tracker_pool.measure():
            with pool.borrow() as msg:
                msg.order_id = i
                msg.symbol = "AAPL"
                msg.price = 150.0
                msg.quantity = 100
                msg.side = "BUY"
                msg.timestamp = time.perf_counter_ns()
                _ = msg.price * msg.quantity
    
    alloc_stats = tracker_alloc.get_statistics()
    pool_stats = tracker_pool.get_statistics()
    
    print("\nObject Pool vs Fresh Allocation")
    print("=" * 60)
    print(f"Operations: {n_operations:,}\n")
    print(f"Fresh allocation:")
    print(f"  Mean: {alloc_stats['mean_ns']:,.0f} ns, P99: {alloc_stats['p99_ns']:,.0f} ns")
    print(f"\nObject pool:")
    print(f"  Mean: {pool_stats['mean_ns']:,.0f} ns, P99: {pool_stats['p99_ns']:,.0f} ns")
    print(f"\nSpeedup: {alloc_stats['mean_ns']/pool_stats['mean_ns']:.2f}x")
    print(f"P99 improvement: {alloc_stats['p99_ns']/pool_stats['p99_ns']:.2f}x")

benchmark_object_pool()


Object Pool vs Fresh Allocation
Operations: 50,000

Fresh allocation:
  Mean: 656 ns, P99: 750 ns

Object pool:
  Mean: 1,136 ns, P99: 1,250 ns

Speedup: 0.58x
P99 improvement: 0.60x


## 5. Async vs Sync Processing Comparison

### When to Use Async
- **I/O bound workloads**: Network calls, file I/O, database queries
- **Many concurrent connections**: Thousands of market data feeds
- **Non-blocking requirements**: Don't want to block main event loop

### When to Use Sync
- **CPU bound workloads**: Signal computation, risk calculations
- **Low latency requirements**: Async adds ~1-5 μs overhead
- **Simple sequential logic**: Avoid complexity where not needed

### Trading System Pattern
Most HFT systems use a **hybrid approach**:
- Async for market data ingestion (many feeds)
- Sync for the hot path (signal → order)
- Separate threads for different concerns

In [9]:
async def async_io_simulation(delay_ms: float = 0.1):
    """Simulate an async I/O operation."""
    await asyncio.sleep(delay_ms / 1000)
    return time.perf_counter_ns()

def sync_io_simulation(delay_ms: float = 0.1):
    """Simulate a sync I/O operation."""
    time.sleep(delay_ms / 1000)
    return time.perf_counter_ns()

async def benchmark_async_vs_sync():
    """Compare async and sync processing patterns."""
    n_operations = 100
    delay_ms = 0.5
    
    # Synchronous sequential processing
    print("Running synchronous sequential benchmark...")
    start_sync = time.perf_counter_ns()
    sync_results = []
    for _ in range(n_operations):
        result = sync_io_simulation(delay_ms)
        sync_results.append(result)
    end_sync = time.perf_counter_ns()
    sync_total_ms = (end_sync - start_sync) / 1_000_000
    
    # Asynchronous concurrent processing
    print("Running asynchronous concurrent benchmark...")
    start_async = time.perf_counter_ns()
    tasks = [async_io_simulation(delay_ms) for _ in range(n_operations)]
    async_results = await asyncio.gather(*tasks)
    end_async = time.perf_counter_ns()
    async_total_ms = (end_async - start_async) / 1_000_000
    
    print("\nAsync vs Sync I/O Comparison")
    print("=" * 60)
    print(f"Operations: {n_operations}, Simulated I/O delay: {delay_ms} ms each")
    print(f"\nSynchronous (sequential):")
    print(f"  Total time: {sync_total_ms:.2f} ms")
    print(f"  Per operation: {sync_total_ms/n_operations:.2f} ms")
    print(f"\nAsynchronous (concurrent):")
    print(f"  Total time: {async_total_ms:.2f} ms")
    print(f"  Per operation: {async_total_ms/n_operations:.2f} ms")
    print(f"\nSpeedup: {sync_total_ms/async_total_ms:.1f}x")
    print(f"Theoretical max: {n_operations}x (fully parallel)")

# Run async benchmark
await benchmark_async_vs_sync()

Running synchronous sequential benchmark...
Running asynchronous concurrent benchmark...

Async vs Sync I/O Comparison
Operations: 100, Simulated I/O delay: 0.5 ms each

Synchronous (sequential):
  Total time: 63.70 ms
  Per operation: 0.64 ms

Asynchronous (concurrent):
  Total time: 1.20 ms
  Per operation: 0.01 ms

Speedup: 53.0x
Theoretical max: 100x (fully parallel)


In [10]:
# Measure async overhead for CPU-bound work
def benchmark_async_overhead():
    """
    Measure the overhead of async for CPU-bound work.
    Shows why sync is preferred for the hot path.
    """
    def cpu_work():
        """Simulate CPU-bound work."""
        total = 0
        for i in range(1000):
            total += i * i
        return total
    
    async def async_cpu_work():
        """Same work wrapped in async."""
        return cpu_work()
    
    n_iterations = 10000
    
    # Pure sync
    tracker_sync = LatencyTracker("sync_cpu_work", warmup_iterations=100)
    for _ in range(n_iterations):
        with tracker_sync.measure():
            _ = cpu_work()
    
    # Async with event loop overhead
    async def run_async_benchmark():
        tracker_async = LatencyTracker("async_cpu_work", warmup_iterations=100)
        for _ in range(n_iterations):
            with tracker_async.measure():
                _ = await async_cpu_work()
        return tracker_async
    
    tracker_async = asyncio.get_event_loop().run_until_complete(run_async_benchmark())
    
    sync_stats = tracker_sync.get_statistics()
    async_stats = tracker_async.get_statistics()
    
    print("\nAsync Overhead for CPU-bound Work")
    print("=" * 60)
    print(f"Iterations: {n_iterations:,}")
    print(f"\nSynchronous:")
    print(f"  Mean: {sync_stats['mean_ns']:,.0f} ns")
    print(f"  P99:  {sync_stats['p99_ns']:,.0f} ns")
    print(f"\nAsynchronous:")
    print(f"  Mean: {async_stats['mean_ns']:,.0f} ns")
    print(f"  P99:  {async_stats['p99_ns']:,.0f} ns")
    print(f"\nAsync overhead: {async_stats['mean_ns'] - sync_stats['mean_ns']:,.0f} ns ({(async_stats['mean_ns']/sync_stats['mean_ns'] - 1)*100:.1f}%)")

benchmark_async_overhead()

RuntimeError: This event loop is already running

## 6. Message Queue Latency Benchmarking

### Message Queue Design Considerations
- **In-memory vs Persistent**: In-memory for latency, persistent for reliability
- **FIFO vs Priority**: Priority queues add log(n) overhead
- **Bounded vs Unbounded**: Bounded prevents memory exhaustion
- **Single vs Multi-producer/consumer**: Affects locking requirements

In [None]:
class LowLatencyMessageQueue:
    """
    A low-latency in-memory message queue for trading systems.
    Optimized for single-producer single-consumer pattern.
    """
    
    def __init__(self, capacity: int = 65536):
        self.buffer = LockFreeRingBuffer(capacity)
        self.sequence_number = 0
        self.stats = {
            'enqueue_count': 0,
            'dequeue_count': 0,
            'drops': 0
        }
    
    def enqueue(self, message: Any, timestamp_ns: Optional[int] = None) -> bool:
        """Add message to queue with optional timestamp."""
        if timestamp_ns is None:
            timestamp_ns = time.perf_counter_ns()
        
        envelope = (self.sequence_number, timestamp_ns, message)
        success = self.buffer.push(envelope)
        
        if success:
            self.sequence_number += 1
            self.stats['enqueue_count'] += 1
        else:
            self.stats['drops'] += 1
        
        return success
    
    def dequeue(self) -> Optional[tuple]:
        """Remove and return message with metadata."""
        result = self.buffer.pop()
        if result is not None:
            self.stats['dequeue_count'] += 1
            seq, enqueue_time, message = result
            dequeue_time = time.perf_counter_ns()
            queue_latency_ns = dequeue_time - enqueue_time
            return (seq, message, queue_latency_ns)
        return None
    
    def __len__(self):
        return len(self.buffer)


class PriorityMessageQueue:
    """
    Priority queue for messages (e.g., cancel orders before new orders).
    Uses heapq which has O(log n) insert/extract.
    """
    
    def __init__(self):
        self.heap = []
        self.sequence = 0
    
    def enqueue(self, priority: int, message: Any) -> None:
        """Add message with priority (lower = higher priority)."""
        timestamp = time.perf_counter_ns()
        # (priority, sequence, timestamp, message)
        heapq.heappush(self.heap, (priority, self.sequence, timestamp, message))
        self.sequence += 1
    
    def dequeue(self) -> Optional[tuple]:
        """Get highest priority message."""
        if not self.heap:
            return None
        priority, seq, enqueue_time, message = heapq.heappop(self.heap)
        dequeue_time = time.perf_counter_ns()
        return (priority, message, dequeue_time - enqueue_time)
    
    def __len__(self):
        return len(self.heap)


def benchmark_message_queues():
    """Compare different message queue implementations."""
    n_messages = 50000
    
    results = {}
    
    # 1. Simple deque (baseline)
    dq = deque()
    tracker_deque = LatencyTracker("deque_fifo", warmup_iterations=1000)
    
    for i in range(n_messages):
        enqueue_time = time.perf_counter_ns()
        with tracker_deque.measure():
            dq.append((i, enqueue_time))
            seq, ts = dq.popleft()
    results['deque'] = tracker_deque.get_statistics()
    
    # 2. Low-latency queue with timestamps
    llq = LowLatencyMessageQueue(capacity=4096)
    tracker_llq = LatencyTracker("LowLatencyQueue", warmup_iterations=1000)
    
    for i in range(n_messages):
        with tracker_llq.measure():
            llq.enqueue(f"message_{i}")
            _ = llq.dequeue()
    results['low_latency'] = tracker_llq.get_statistics()
    
    # 3. Priority queue
    pq = PriorityMessageQueue()
    tracker_pq = LatencyTracker("PriorityQueue", warmup_iterations=1000)
    
    for i in range(n_messages):
        priority = random.randint(0, 10)
        with tracker_pq.measure():
            pq.enqueue(priority, f"message_{i}")
            _ = pq.dequeue()
    results['priority'] = tracker_pq.get_statistics()
    
    # Print comparison
    print("\nMessage Queue Comparison")
    print("=" * 70)
    print(f"{'Queue Type':<25} {'Mean (ns)':<15} {'P99 (ns)':<15} {'P999 (ns)':<15}")
    print("-" * 70)
    
    for name, stats in results.items():
        print(f"{name:<25} {stats['mean_ns']:<15,.0f} {stats['p99_ns']:<15,.0f} {stats['p999_ns']:<15,.0f}")
    
    return results

queue_results = benchmark_message_queues()

## 7. Connection Pooling Implementation

### Why Connection Pooling?
- **TCP handshake latency**: ~1-2 RTT (round trip time)
- **TLS handshake**: Additional ~2-3 RTT
- **Connection setup**: Memory allocation, socket buffer setup
- **Keep-alive**: Reuse established connections

### Trading System Connections
- Exchange gateways (FIX, Binary protocols)
- Market data feeds  
- Database connections
- Internal service mesh

In [None]:
class SimulatedConnection:
    """Simulates a network connection with setup latency."""
    
    _connection_id = 0
    
    def __init__(self, host: str, port: int, setup_latency_us: float = 100):
        self.host = host
        self.port = port
        self.setup_latency_us = setup_latency_us
        self.id = SimulatedConnection._connection_id
        SimulatedConnection._connection_id += 1
        self.is_connected = False
        self.request_count = 0
    
    def connect(self):
        """Simulate connection establishment."""
        # Simulate TCP/TLS handshake latency
        time.sleep(self.setup_latency_us / 1_000_000)
        self.is_connected = True
    
    def send(self, data: bytes) -> bytes:
        """Simulate sending data and receiving response."""
        if not self.is_connected:
            self.connect()
        self.request_count += 1
        # Simulate some processing time (1 μs)
        time.sleep(0.000001)
        return b"OK"
    
    def close(self):
        """Close the connection."""
        self.is_connected = False


class ConnectionPool:
    """
    Connection pool for low-latency request handling.
    Pre-establishes and maintains connections.
    """
    
    def __init__(self, host: str, port: int, pool_size: int = 10,
                 connection_factory: Callable = None):
        self.host = host
        self.port = port
        self.pool_size = pool_size
        self.factory = connection_factory or (
            lambda: SimulatedConnection(host, port)
        )
        
        # Available connections
        self.available = deque()
        # All connections (for cleanup)
        self.all_connections = []
        # Lock for thread safety
        self.lock = threading.Lock()
        
        # Pre-warm the pool
        self._initialize_pool()
    
    def _initialize_pool(self):
        """Pre-create and connect all connections."""
        print(f"Initializing connection pool with {self.pool_size} connections...")
        start = time.perf_counter_ns()
        
        for _ in range(self.pool_size):
            conn = self.factory()
            conn.connect()
            self.available.append(conn)
            self.all_connections.append(conn)
        
        init_time_ms = (time.perf_counter_ns() - start) / 1_000_000
        print(f"Pool initialized in {init_time_ms:.2f} ms")
    
    @contextmanager
    def get_connection(self):
        """Get a connection from the pool."""
        conn = None
        with self.lock:
            if self.available:
                conn = self.available.popleft()
        
        if conn is None:
            # Pool exhausted, create new connection (slow path)
            conn = self.factory()
            conn.connect()
            self.all_connections.append(conn)
        
        try:
            yield conn
        finally:
            # Return to pool
            with self.lock:
                self.available.append(conn)
    
    def close_all(self):
        """Close all connections."""
        for conn in self.all_connections:
            conn.close()


def benchmark_connection_pooling():
    """Compare pooled vs non-pooled connection performance."""
    n_requests = 1000
    
    # Without pooling - new connection per request
    tracker_no_pool = LatencyTracker("no_pooling", warmup_iterations=10)
    
    for _ in range(n_requests):
        with tracker_no_pool.measure():
            conn = SimulatedConnection("localhost", 8080, setup_latency_us=50)
            conn.connect()
            _ = conn.send(b"test")
            conn.close()
    
    # With pooling - reuse connections
    pool = ConnectionPool("localhost", 8080, pool_size=5,
                         connection_factory=lambda: SimulatedConnection("localhost", 8080, setup_latency_us=50))
    
    tracker_pool = LatencyTracker("with_pooling", warmup_iterations=10)
    
    for _ in range(n_requests):
        with tracker_pool.measure():
            with pool.get_connection() as conn:
                _ = conn.send(b"test")
    
    pool.close_all()
    
    no_pool_stats = tracker_no_pool.get_statistics()
    pool_stats = tracker_pool.get_statistics()
    
    print("\nConnection Pooling Comparison")
    print("=" * 60)
    print(f"Requests: {n_requests:,}")
    print(f"\nWithout pooling (new connection each time):")
    print(f"  Mean: {no_pool_stats['mean_ns']/1000:.2f} μs")
    print(f"  P99:  {no_pool_stats['p99_ns']/1000:.2f} μs")
    print(f"\nWith pooling (connection reuse):")
    print(f"  Mean: {pool_stats['mean_ns']/1000:.2f} μs")
    print(f"  P99:  {pool_stats['p99_ns']/1000:.2f} μs")
    print(f"\nLatency reduction: {(1 - pool_stats['mean_ns']/no_pool_stats['mean_ns'])*100:.1f}%")
    print(f"Speedup: {no_pool_stats['mean_ns']/pool_stats['mean_ns']:.1f}x")

benchmark_connection_pooling()

## 8. Batch Processing vs Single Processing

### The Latency-Throughput Tradeoff
- **Single processing**: Lowest latency per item, lower throughput
- **Batch processing**: Higher latency per item, higher throughput
- **Adaptive batching**: Balance based on current load

### When to Batch in Trading
- **Market data normalization**: Combine ticks before processing
- **Risk aggregation**: Batch position updates
- **Order submission**: Some exchanges accept batch orders
- **Logging/Analytics**: Always batch non-critical writes

In [None]:
def process_single(item: float) -> float:
    """Process a single item (simulate computation)."""
    return item * 1.001 + 0.01

def process_batch(items: np.ndarray) -> np.ndarray:
    """Process items in batch (vectorized)."""
    return items * 1.001 + 0.01

def benchmark_batch_vs_single():
    """Compare single vs batch processing."""
    n_items = 100000
    data = np.random.randn(n_items)
    
    # Single item processing
    tracker_single = LatencyTracker("single_processing", warmup_iterations=100)
    results_single = []
    
    for item in data[:10000]:  # Subset for single processing
        with tracker_single.measure():
            result = process_single(item)
        results_single.append(result)
    
    # Batch processing - various batch sizes
    batch_results = {}
    
    for batch_size in [10, 100, 1000, 10000]:
        tracker_batch = LatencyTracker(f"batch_{batch_size}", warmup_iterations=10)
        
        for i in range(0, n_items, batch_size):
            batch = data[i:i+batch_size]
            with tracker_batch.measure():
                _ = process_batch(batch)
        
        stats = tracker_batch.get_statistics()
        # Calculate per-item latency
        stats['per_item_ns'] = stats['mean_ns'] / batch_size
        batch_results[batch_size] = stats
    
    single_stats = tracker_single.get_statistics()
    
    print("\nBatch vs Single Processing Comparison")
    print("=" * 70)
    print(f"\nSingle item processing:")
    print(f"  Per-item latency: {single_stats['mean_ns']:.0f} ns")
    
    print(f"\nBatch processing:")
    print(f"{'Batch Size':<15} {'Batch (ns)':<15} {'Per-Item (ns)':<15} {'Speedup':<10}")
    print("-" * 55)
    
    for batch_size, stats in batch_results.items():
        speedup = single_stats['mean_ns'] / stats['per_item_ns']
        print(f"{batch_size:<15} {stats['mean_ns']:<15,.0f} {stats['per_item_ns']:<15,.1f} {speedup:<10.1f}x")

benchmark_batch_vs_single()

In [None]:
class AdaptiveBatcher:
    """
    Adaptive batching based on queue depth and time.
    Balances latency and throughput dynamically.
    """
    
    def __init__(self, 
                 min_batch_size: int = 1,
                 max_batch_size: int = 100,
                 max_wait_us: float = 100,
                 processor: Callable = None):
        self.min_batch_size = min_batch_size
        self.max_batch_size = max_batch_size
        self.max_wait_ns = int(max_wait_us * 1000)
        self.processor = processor or (lambda x: x)
        
        self.buffer = []
        self.last_flush_time = time.perf_counter_ns()
        self.stats = {
            'total_items': 0,
            'total_batches': 0,
            'batch_sizes': []
        }
    
    def add(self, item) -> Optional[List]:
        """Add item, return processed batch if ready."""
        self.buffer.append(item)
        
        current_time = time.perf_counter_ns()
        time_since_flush = current_time - self.last_flush_time
        
        # Flush conditions
        should_flush = (
            len(self.buffer) >= self.max_batch_size or
            (len(self.buffer) >= self.min_batch_size and 
             time_since_flush >= self.max_wait_ns)
        )
        
        if should_flush:
            return self.flush()
        return None
    
    def flush(self) -> List:
        """Process and return current batch."""
        if not self.buffer:
            return []
        
        batch = self.buffer
        self.buffer = []
        self.last_flush_time = time.perf_counter_ns()
        
        self.stats['total_items'] += len(batch)
        self.stats['total_batches'] += 1
        self.stats['batch_sizes'].append(len(batch))
        
        return self.processor(batch)
    
    def get_stats(self) -> Dict:
        """Get batching statistics."""
        if self.stats['batch_sizes']:
            return {
                **self.stats,
                'avg_batch_size': statistics.mean(self.stats['batch_sizes']),
                'min_batch_size_seen': min(self.stats['batch_sizes']),
                'max_batch_size_seen': max(self.stats['batch_sizes'])
            }
        return self.stats


def demo_adaptive_batching():
    """Demonstrate adaptive batching behavior."""
    
    def batch_processor(items):
        """Simulate batch processing."""
        return [item * 2 for item in items]
    
    batcher = AdaptiveBatcher(
        min_batch_size=5,
        max_batch_size=50,
        max_wait_us=500,
        processor=batch_processor
    )
    
    # Simulate varying load
    print("Simulating adaptive batching with varying load...")
    print("-" * 50)
    
    # High load period - should batch to max_batch_size
    print("\nHigh load period (items arriving rapidly):")
    for i in range(200):
        result = batcher.add(i)
        if result:
            print(f"  Batch processed: {len(result)} items")
    
    # Low load period - should flush on timeout
    print("\nLow load period (items arriving slowly):")
    for i in range(10):
        result = batcher.add(i)
        if result:
            print(f"  Batch processed: {len(result)} items")
        time.sleep(0.0001)  # 100 μs between items
    
    # Flush remaining
    remaining = batcher.flush()
    if remaining:
        print(f"  Final flush: {len(remaining)} items")
    
    stats = batcher.get_stats()
    print(f"\nBatching Statistics:")
    print(f"  Total items: {stats['total_items']}")
    print(f"  Total batches: {stats['total_batches']}")
    print(f"  Avg batch size: {stats['avg_batch_size']:.1f}")

demo_adaptive_batching()