In [None]:
# === Environment Setup ===
import os, sys, math, time, random, json, textwrap, warnings, timeit
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
try:
    from numba import njit, prange, vectorize
    NUMBA_AVAILABLE = True
except ImportError:
    # Numba is optional. The stand-ins below keep decorated code runnable as
    # plain Python/NumPy. Each decorator accepts every call form Numba does:
    # bare (`@njit`), keyword-only (`@njit(cache=True)`), and with a leading
    # signature (`@njit("f8(f8)")`, `@vectorize(["f8(f8)"])`). A non-callable
    # first argument is therefore treated as a signature, not as the function.
    def njit(func=None, **kwargs):
        """Fallback for `numba.njit`: return the function unchanged."""
        if callable(func):
            return func
        return lambda f: f

    def prange(*args, **kwargs):
        """Fallback for `numba.prange`: an ordinary sequential `range`."""
        return range(*args, **kwargs)

    def vectorize(func=None, **kwargs):
        """Fallback for `numba.vectorize`: wrap the function in `np.vectorize`."""
        if callable(func):
            return np.vectorize(func)
        return lambda f: np.vectorize(f)

    NUMBA_AVAILABLE = False
try:
    import dask
    from dask import delayed
    DASK_AVAILABLE = True
except ImportError:
    DASK_AVAILABLE = False
import multiprocessing as mp
from IPython.display import display, Markdown

# --- Configuration ---
# Plot appearance: white-grid style sheet, larger fonts, large high-DPI figures.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['font.size'] = 14
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['figure.dpi'] = 150
# Array printing: 4 decimals, no scientific notation, wide lines.
np.set_printoptions(precision=4, suppress=True, linewidth=120)

# --- Utility Functions ---
def note(msg):
    """Display `msg` in the notebook as a Markdown-rendered info-alert `<div>`."""
    alert_html = f"<div class='alert alert-block alert-info'>📝 **Note:** {msg}</div>"
    display(Markdown(alert_html))
def sec(title):
    """Print `title` upper-cased between two 80-character rules of '='."""
    rule = "=" * 80
    print(f"\n{rule}\n| {title.upper()} |\n{rule}")

note("Environment initialized for High-Performance Computing.")

# Part 7: Advanced Computational Methods
## Chapter 7.6: High-Performance Computing for Economics

### Introduction: Why and When to Optimize

As economic models become more complex and datasets larger, computational cost can become a significant bottleneck. Standard Python, an interpreted language, can be orders of magnitude slower than compiled languages like C++ or Fortran for number-crunching tasks, especially those involving explicit `for` loops.

**High-Performance Computing (HPC)** provides techniques to break through these barriers. Mastering these tools is essential for the modern computational economist. However, it is crucial to heed the famous adage: **"Premature optimization is the root of all evil."**

The workflow for a computational project should always be:
1.  **Write it correctly:** First, write clear, simple, and well-tested code.
2.  **Profile it:** If the correct code is too slow, use a profiler to identify exactly *which functions* are the bottlenecks.
3.  **Optimize the bottleneck:** Only after identifying a specific bottleneck should you apply the advanced tools discussed in this chapter.

This chapter provides a hands-on introduction to the most common HPC techniques for economic research, focusing on JIT compilation, parallel computing, and GPU acceleration.

## 1. Just-In-Time (JIT) Compilation with Numba

When a vectorized NumPy solution is not feasible, **Numba** is the first line of defense against slow loops. Numba is a **Just-In-Time (JIT) compiler** that translates a subset of Python and NumPy code into fast, optimized machine code via a simple function decorator.

The `@njit` decorator is the recommended mode. It stands for **"no-python JIT,"** which guarantees that the entire function is compiled to machine code without any calls back to the slow Python interpreter. If Numba cannot compile some part of the function (e.g., because you used an unsupported feature such as an arbitrary Python object or a dictionary with mixed value types), it will raise an error. This forces you to use data structures (primarily NumPy arrays and simple scalar types) that can be efficiently compiled.

In [None]:
sec("Numba for Accelerating Loops: A Monte Carlo Example")

def monte_carlo_pi_python(n_samples):
    """Estimate pi in pure Python by sampling points in the unit square.

    Draws `n_samples` points (x, y) with `random.random()` and counts how many
    fall strictly inside the unit circle; the hit fraction times 4 is the
    estimate. Raises ZeroDivisionError when `n_samples` is 0.
    """
    hits = 0
    for _ in range(n_samples):
        # Draw x before y so the random stream is consumed in a fixed order.
        x = random.random()
        y = random.random()
        if x**2 + y**2 < 1.0:
            hits += 1
    return 4.0 * hits / n_samples

# The @njit decorator compiles the function to machine code.
# - `parallel=True`: Numba will attempt to automatically parallelize loops.
# - `cache=True`: The compiled function is saved to disk to avoid re-compilation on subsequent runs.
@njit(parallel=True, cache=True)
def monte_carlo_pi_numba(n_samples):
    """Estimate pi by Monte Carlo sampling; the compiled counterpart of `monte_carlo_pi_python`.

    Draws `n_samples` points (x, y) with `np.random.rand()` and counts those
    strictly inside the unit circle; returns 4 * hits / n_samples.
    When Numba is not installed, the fallback `njit`/`prange` defined in the
    setup cell make this run as an ordinary sequential Python loop.
    """
    acc = 0
    # `prange` is Numba's parallel range, which is enabled by `parallel=True`.
    # `acc` is only ever incremented inside the loop, i.e. it is a simple
    # counter accumulated across iterations.
    for i in prange(n_samples):
        x, y = np.random.rand(), np.random.rand()
        if x**2 + y**2 < 1.0: acc += 1
    return 4.0 * acc / n_samples

# Benchmark: time one run of each implementation on the same sample count.
n = 10_000_000
if NUMBA_AVAILABLE:
    py_time = timeit.timeit(lambda: monte_carlo_pi_python(n), number=1)
    # The first call to a Numba function has a compilation overhead, so we
    # trigger compilation with a tiny input before timing the real run.
    monte_carlo_pi_numba(1)
    numba_time = timeit.timeit(lambda: monte_carlo_pi_numba(n), number=1)
    note(f"Numba provides a {py_time / numba_time:.1f}x speedup over pure Python for this task.")
else:
    # Without Numba the "compiled" function is the same Python loop, so a
    # speedup figure would be meaningless; say so instead of staying silent.
    note("Numba is not installed. Skipping Numba benchmark.")

## 2. Parallel Computing: Theory and Practice

### 2.1 The Limits of Parallelization: Amdahl's Law
**Amdahl's Law** provides a sobering formula for the maximum possible speedup from parallelizing a task. The core idea is that the portion of a program that is inherently sequential acts as a bottleneck, limiting the gains from adding more processors. 

Let $P$ be the proportion of a program that can be parallelized, and $(1-P)$ the proportion that must be run sequentially. The maximum speedup from using $N$ processors is:
$$ \text{Speedup}(N, P) = \frac{1}{(1-P) + \frac{P}{N}} $$ 

As the number of processors $N$ approaches infinity, the term $\frac{P}{N}$ goes to zero. This means the maximum theoretical speedup is limited to $1 / (1-P)$. For example, if 95% of your code can be parallelized ($P = 0.95$), the maximum speedup you can ever achieve is $1 / (1 - 0.95) = 20$, i.e. a 20x speedup, regardless of how many cores you use. This highlights the importance of minimizing the sequential parts of a program.

![Amdahl's Law Visualization](../../images/high_performance_python/AmdahlsLaw.png)

### 2.2 The Global Interpreter Lock (GIL) and Multiprocessing

A common desire is to use multiple CPU cores with Python's `threading` module. However, the standard Python interpreter (CPython) has a feature called the **Global Interpreter Lock (GIL)**. The GIL is a lock that prevents multiple native threads from executing Python bytecodes at the same time within a single process. It was implemented to simplify memory management and prevent race conditions in C extensions, but it has the side effect of making multithreading ineffective for **CPU-bound** code (code that is limited by the speed of the CPU).

To truly parallelize CPU-bound tasks, we bypass the GIL by using **multiprocessing**. With multiprocessing, the operating system runs each task in a separate process, each with its own Python interpreter and memory space. This is ideal for **"embarrassingly parallel"** problems where tasks are independent and require little communication, such as parameter sweeps or bootstrapping.

In [None]:
sec("Parallelism with Multiprocessing")

# This function must be defined at the top level of the module to be 'pickleable' (serializable) by multiprocessing.
def run_simulation(params, n_steps=1_000_000):
    """Run one deliberately CPU-bound toy simulation.

    Parameters
    ----------
    params : tuple
        `(sim_id, alpha, beta)`; `sim_id` is passed through unchanged so that
        results can be matched to their inputs.
    n_steps : int, optional
        Number of loop iterations (default 1,000,000). `pool.map` only passes
        `params`, so workers use the default.

    Returns
    -------
    tuple
        `(sim_id, result)` where `result` is the sum over `i` in
        `range(n_steps)` of `sin(i * alpha) * cos(i * beta)`.
    """
    sim_id, alpha, beta = params
    result = 0
    # The explicit Python loop is intentional: it stands in for expensive,
    # non-vectorizable work so that the benefit of extra processes is visible.
    for i in range(n_steps):
        result += np.sin(i * alpha) * np.cos(i * beta)
    return sim_id, result

# The `if __name__ == '__main__':` block is essential for multiprocessing on some platforms.
# It prevents child processes from re-importing and re-executing the script, which would lead to an infinite loop.
# NOTE: the multiprocessing docs warn that Pool examples may not work from an
# interactive interpreter, because workers must be able to import `__main__`.
# If this cell fails in a notebook, move `run_simulation` into an importable module.
if __name__ == '__main__':
    param_grid = [(i, alpha, beta) for i, (alpha, beta) in enumerate(np.random.rand(8, 2))]
    note(f"Running a parameter sweep with {len(param_grid)} simulations...")

    # `time.perf_counter()` is the clock intended for measuring elapsed
    # intervals; `time.time()` can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    # We create a pool of 4 worker processes.
    with mp.Pool(processes=4) as pool:
        # `pool.map` distributes the `param_grid` across the worker processes
        # and returns the results in the same order as the inputs.
        results = pool.map(run_simulation, param_grid)
    end_time = time.perf_counter()
    note(f"Multiprocessing execution time: {end_time - start_time:.2f}s")

### 2.3 High-Level Parallelism with `Dask`
`Dask` is a modern library for parallel computing that can scale from a single machine to a distributed cluster. A key concept in Dask is the **delayed** object. When you wrap a function call with `delayed`, Dask builds a **symbolic computation graph** representing all tasks and their dependencies. This "lazy evaluation" allows Dask's intelligent scheduler to optimize the workflow before executing it, which can lead to better performance.

In [None]:
sec("Dask Delayed Example")

if DASK_AVAILABLE:
    @delayed
    def inc(x):
        """Return x + 1 after a short pause that simulates work."""
        time.sleep(0.1) # Simulate work
        return x + 1

    @delayed
    def add(x, y):
        """Return x + y after a short pause that simulates work."""
        time.sleep(0.1) # Simulate work
        return x + y

    # Calling a delayed function records a task instead of running it.
    # `inc(1)` and `inc(2)` do not depend on each other, so a scheduler is
    # free to run them concurrently; `add` depends on both.
    x = inc(1)
    y = inc(2)
    total = add(x, y)

    # At this point, no computation has happened. `total` is a delayed object.
    note("Visualizing the Dask computation graph:")
    # `display` shows the Delayed object itself. To render the task graph as
    # an image, call `total.visualize()` (needs the optional graphviz dependency).
    display(total)

    # To run the computation, we call .compute()
    # `time.perf_counter()` is the clock intended for measuring elapsed intervals.
    start_time = time.perf_counter()
    result = total.compute()
    end_time = time.perf_counter()
    note(f"Dask execution time: {end_time - start_time:.2f}s, Result: {result}")
else:
    note("Dask is not installed. Skipping Dask example.")

## 3. An Alternative: `Cython`

`Cython` offers another path to high performance by allowing you to write C-like static types directly in Python-like code. Cython code is translated into optimized C or C++ and then compiled into a Python extension module. This gives you fine-grained control over performance-critical sections of your code.

**When to use Cython vs. Numba:**
*   **Numba:** Best for accelerating numerical algorithms that are expressed as Python functions with loops over NumPy arrays. It's often easier to get started with, requiring just a decorator.
*   **Cython:** More flexible and powerful. It's a good choice when you need to interact with external C libraries, work with complex data structures not supported by Numba, or when you need to optimize code that is not purely numerical.


## 4. GPU Computing for Massively Parallel Problems

For problems that fit the **SIMD (Single Instruction, Multiple Data)** paradigm—where the same operation is applied to many data points simultaneously—we can use a **Graphics Processing Unit (GPU)**. GPUs contain thousands of simpler cores, making them exceptionally fast for tasks like large matrix multiplications.

*   **`CuPy`**: A near-complete clone of the NumPy API that executes on an NVIDIA GPU. For code already written using NumPy, you can often achieve massive speedups by simply adding `import cupy as cp` and replacing the `np.` calls with their `cp.` equivalents.
*   **`JAX`**: A newer library from Google that combines a NumPy-like API with its own JIT compiler, automatic differentiation, and the ability to run transparently on CPUs, GPUs, and TPUs.

In [None]:
sec("GPU Acceleration Example: NumPy vs. CuPy")
try:
    import cupy as cp
except ImportError:
    CUPY_AVAILABLE = False
else:
    # Importing CuPy can succeed on a machine without a usable GPU, so also
    # confirm that at least one CUDA device is visible before using it.
    try:
        CUPY_AVAILABLE = cp.cuda.runtime.getDeviceCount() > 0
    except cp.cuda.runtime.CUDARuntimeError:
        CUPY_AVAILABLE = False

if CUPY_AVAILABLE:
    # Create large random matrices on both CPU (NumPy) and GPU (CuPy)
    size = 4000
    np_A, np_B = np.random.rand(size, size), np.random.rand(size, size)
    cp_A, cp_B = cp.asarray(np_A), cp.asarray(np_B)

    def _gpu_matmul():
        """Multiply the GPU matrices and wait until the result is computed.

        CuPy launches kernels asynchronously, so without the synchronize call
        the timer would stop before the GPU has finished the work.
        """
        cp_A @ cp_B
        cp.cuda.runtime.deviceSynchronize()

    note(f"Timing matrix multiplication for a {size}x{size} matrix...")
    # Time NumPy
    numpy_time = timeit.timeit(lambda: np_A @ np_B, number=10)
    # Warm-up: the first GPU call pays one-off initialisation costs that
    # should not be counted in the benchmark.
    _gpu_matmul()
    # Time CuPy; synchronisation happens inside the timed callable.
    cupy_time = timeit.timeit(_gpu_matmul, number=10)

    print(f"NumPy (CPU) time: {numpy_time:.4f} seconds")
    print(f"CuPy (GPU) time:  {cupy_time:.4f} seconds")
    print(f"GPU Speedup:      {numpy_time / cupy_time:.1f}x")
else:
    note("CuPy is not installed or no compatible GPU is found. Skipping GPU example.")

## 5. Profiling: Finding the Bottleneck

Optimization efforts should always be guided by **profiling**. A profiler is a tool that measures where a program spends its time. Instead of guessing, you can get precise data on which functions are the performance bottlenecks.

Python's built-in `cProfile` module provides function-level profiling. For more granular detail, the third-party `line_profiler` library provides line-by-line profiling, which is invaluable for identifying the single slowest line within a larger function.

In [None]:
sec("Profiling with cProfile")
import cProfile, pstats

# Define two functions to simulate a workflow
def slow_function():
    """Deliberately slow step: pause for 0.1 s, then take 10,000 square roots."""
    time.sleep(0.1)
    # The computed values are discarded; only the time spent matters here.
    for value in range(10**4):
        math.sqrt(value)

def fast_function():
    """Deliberately trivial step: does no work and returns None."""
    return None

def main_workflow():
    """Run `slow_function` 5 times, then `fast_function` 100 times."""
    slow_calls, fast_calls = 5, 100
    for _ in range(slow_calls):
        slow_function()
    for _ in range(fast_calls):
        fast_function()

# Profile `main_workflow`. Using the profiler as a context manager enables it
# on entry and always disables it on exit, even if the profiled code raises.
with cProfile.Profile() as profiler:
    main_workflow()

# Print the stats, sorted by cumulative time (time in a function plus
# everything it calls), which surfaces the most expensive call paths first.
stats = pstats.Stats(profiler).sort_stats('cumulative')
stats.print_stats(10) # Print the top 10 offenders
note("The profiler output clearly shows that nearly all the execution time is spent inside `slow_function`, making it the obvious target for optimization efforts.")