In [None]:
# === Environment Setup ===
import os, sys, math, time, random, json, textwrap, warnings, timeit
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
try:
    from numba import njit, prange, vectorize
    NUMBA_AVAILABLE = True
except ImportError:
    # Numba is missing: install pure-Python stand-ins so that decorated code in
    # the rest of the notebook still runs (uncompiled and serial).
    def njit(func=None, **kwargs):
        """No-op replacement for `numba.njit`.

        Works both as a bare decorator (`@njit`) and as a decorator factory
        (`@njit(parallel=True)` or `@njit("f8(f8)")`). The first argument is
        only treated as the function to decorate when it is callable, so a
        signature string or list is not mistaken for the function.
        """
        if callable(func):
            return func
        return lambda f: f

    def prange(*args, **kwargs):
        """Serial replacement for `numba.prange`: forwards everything to `range`."""
        return range(*args, **kwargs)

    def vectorize(func=None, **kwargs):
        """Replacement for `numba.vectorize` backed by `np.vectorize`.

        Accepts the same two call styles as the `njit` stand-in; a leading
        signature list such as `@vectorize(["float64(float64)"])` is ignored.
        """
        if callable(func):
            return np.vectorize(func)
        return lambda f: np.vectorize(f)

    NUMBA_AVAILABLE = False
try:
    import dask
    from dask import delayed
    DASK_AVAILABLE = True
except ImportError:
    DASK_AVAILABLE = False
import multiprocessing as mp
from IPython.display import display, Markdown

# --- Configuration ---
# Matplotlib: grid style plus larger fonts and high-resolution figures.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'font.size': 14,
    'figure.figsize': (12, 8),
    'figure.dpi': 150,
})
# NumPy: print 4 decimals, no scientific notation, wrap at 120 characters.
np.set_printoptions(precision=4, linewidth=120, suppress=True)

# --- Utility Functions ---
def note(msg):
    """Render `msg` as an info-styled Markdown call-out in the notebook output."""
    html = f"<div class='alert alert-block alert-info'>📝 **Note:** {msg}</div>"
    display(Markdown(html))
def sec(title):
    """Print `title` in upper case between two 80-character rules of '='."""
    rule = 80 * '='
    print(f"\n{rule}\n| {title.upper()} |\n{rule}")

# Last statement of the setup cell: show a visible confirmation in the notebook output.
note("Environment initialized for High-Performance Computing.")

# High-Performance Computing for Economics

### Introduction: Why and When to Optimize

As economic models become more complex and datasets grow, the time it takes to run code can become a major research bottleneck. Standard Python, while excellent for rapid development, is an interpreted language and can be orders of magnitude slower than compiled languages like C++ or Fortran for number-crunching tasks, especially those involving explicit `for` loops.

**High-Performance Computing (HPC)** provides a set of techniques to break through these computational barriers. However, it is crucial to follow a disciplined workflow and heed the famous adage from computer scientist Donald Knuth: **"Premature optimization is the root of all evil."**

The workflow for a computational project should always be:
1.  **Write it correctly:** First, write clear, simple, and well-tested code that verifiably produces the correct result.
2.  **Profile it:** If the correct code is too slow for your needs, use a profiler to identify exactly *which functions or lines of code* are the bottlenecks.
3.  **Optimize the bottleneck:** Only after identifying a specific, measured bottleneck should you apply the advanced tools discussed in this chapter to that specific part of the code.

This chapter provides a hands-on introduction to the most common HPC techniques used in economic research, focusing on Just-In-Time (JIT) compilation, parallel computing, and GPU acceleration.

## 1. Just-In-Time (JIT) Compilation with Numba

When a vectorized NumPy solution is not obvious or is too memory-intensive, **Numba** is the first line of defense against slow loops. Numba is a **Just-In-Time (JIT) compiler** that translates a subset of Python and NumPy code into fast, optimized machine code at runtime.

This is achieved through a simple function decorator, `@njit`. This decorator stands for **"no-python JIT,"** a mode that guarantees the entire function is compiled to machine code without any calls back to the slow Python interpreter. If Numba cannot compile some part of the function (e.g., because you used an unsupported feature such as a pandas DataFrame or an arbitrary Python object), it will raise an error. This forces you to use data structures (primarily NumPy arrays and simple scalar types) that can be compiled to highly efficient machine code.

In [None]:
# Print the console banner that introduces this cell's output.
sec("Numba for Accelerating Loops: A Monte Carlo Example")

# Baseline for the comparison: a Monte Carlo estimate of pi written as an
# ordinary interpreted loop. Slow in pure Python, but easy to accelerate.
def monte_carlo_pi_python(n_samples):
    """Approximate pi by sampling random points with a plain Python loop.

    Draws `n_samples` points uniformly from the unit square, counts those
    strictly inside the unit circle, and returns four times the hit fraction.
    Raises ZeroDivisionError when `n_samples` is 0.
    """
    hits = 0
    # The interpreted loop is the bottleneck: every iteration pays for
    # dynamic typing and function-call overhead.
    for _ in range(n_samples):
        x = random.random()
        y = random.random()
        inside_circle = x**2 + y**2 < 1.0
        if inside_circle:
            hits += 1
    return 4.0 * hits / n_samples

# The @njit decorator tells Numba to compile this function.
# - `parallel=True` allows Numba to automatically parallelize the loop.
# - `cache=True` asks Numba to save the compiled function to disk so that a later
#   session can skip the compilation step (it does not make the calls themselves faster).
# If Numba is not installed, the fallback `njit`/`prange` defined in the setup cell
# are used instead and this runs as an ordinary serial Python function.
@njit(parallel=True, cache=True)
def monte_carlo_pi_numba(n_samples):
    """Estimates pi using a Numba-compiled, parallel loop.

    Same algorithm as `monte_carlo_pi_python`: draw `n_samples` points in the
    unit square, count those strictly inside the unit circle, and return
    four times the hit fraction. Dividing by `n_samples` means 0 is not a
    valid input.
    """
    acc = 0
    # `prange` is Numba's parallel range, which works like range() but splits the
    # work across multiple CPU cores. `acc += 1` is the only cross-iteration
    # update, i.e. a simple sum reduction over the loop.
    for i in prange(n_samples):
        # Draw the point with NumPy's random functions.
        # NOTE(review): Numba's documentation lists both `random` and `np.random`
        # as supported in nopython mode, so this is a choice rather than a
        # requirement - confirm against the installed Numba version.
        x, y = np.random.rand(), np.random.rand()
        if x**2 + y**2 < 1.0:
            acc += 1
    return 4.0 * acc / n_samples

# Number of random points used by both implementations in the benchmark.
n = 10_000_000
if not NUMBA_AVAILABLE:
    note("Numba is not installed. Skipping performance comparison.")
else:
    # Baseline: one timed run of the interpreted implementation.
    py_time = timeit.Timer(lambda: monte_carlo_pi_python(n)).timeit(number=1)

    # Compilation happens on the first call and is a one-off cost, so trigger
    # it with a tiny input to keep it out of the measurement below.
    print("Warming up Numba (compiling the function)...")
    monte_carlo_pi_numba(1)

    # One timed run of the already-compiled implementation.
    numba_time = timeit.Timer(lambda: monte_carlo_pi_numba(n)).timeit(number=1)

    print(f"Pure Python time: {py_time:.4f}s")
    print(f"Numba time:       {numba_time:.4f}s")
    note(f"Numba provides a **{py_time / numba_time:.1f}x** speedup over pure Python for this task.")

## 2. Parallel Computing: Theory and Practice

### 2.1 The Limits of Parallelization: Amdahl's Law
Before diving into parallel coding, it's important to understand its theoretical limits. **Amdahl's Law**, formulated by computer architect Gene Amdahl in 1967, specifies the maximum possible speedup from parallelizing a task. Let $P$ be the proportion of a program that can be parallelized (and $1-P$ be the proportion that is inherently serial). The maximum speedup from using $N$ processors is:

$$ \text{Speedup}(N, P) = \frac{1}{(1-P) + \frac{P}{N}} $$ 

The key insight is that the serial portion $(1-P)$ acts as a permanent bottleneck. As the number of processors $N$ approaches infinity, the term $P/N$ goes to zero, and the maximum speedup is limited to $1 / (1-P)$.

For example, if 10% of your code is inherently serial ($P=0.9$), you can never achieve more than a 10x speedup, no matter how many cores you throw at the problem.

### 2.2 The Global Interpreter Lock (GIL) and Multiprocessing

A common source of confusion when trying to parallelize Python code is the **Global Interpreter Lock (GIL)**. In the standard CPython interpreter, the GIL is a mutex (a type of lock) that prevents multiple native threads from executing Python bytecodes at the same time within a single process. This means that for **CPU-bound** code (tasks limited by CPU speed, like our pi calculation), Python's `threading` module provides no performance gain and can even make things slower due to overhead.

To achieve true parallelism for CPU-bound tasks, we must bypass the GIL. The standard library's **`multiprocessing`** module does this by running each task in a separate process, managed by the operating system. Each process has its own Python interpreter and memory space, so the GIL of one process does not block the others. This approach is ideal for **"embarrassingly parallel"** problems where tasks are independent and require little communication (e.g., parameter sweeps, bootstrapping, or Monte Carlo simulations).

In [None]:
# Print the console banner that introduces this cell's output.
sec("Parallelism with Multiprocessing")

# This function must be defined at the top level of a module so that it can be "pickled"
# (serialized) and sent to the worker processes.
def run_simulation(params, n_iter=1_000_000):
    """Run one dummy, CPU-intensive simulation.

    Parameters
    ----------
    params : tuple
        `(sim_id, alpha, beta)`. `sim_id` is passed through unchanged so the
        caller can match results to inputs; `alpha` and `beta` scale the
        arguments of the sine and cosine terms.
    n_iter : int, optional
        Number of loop iterations, i.e. the amount of work. Defaults to
        1,000,000, the value that was previously hard-coded, so existing
        single-argument callers such as `pool.map` behave as before.

    Returns
    -------
    tuple
        `(sim_id, result)` where `result` is the sum over `i` in
        `range(n_iter)` of `sin(i * alpha) * cos(i * beta)`.
    """
    sim_id, alpha, beta = params
    result = 0
    # This loop is a placeholder for a more complex calculation. It stays an
    # explicit Python loop on purpose so that each task is CPU-bound.
    for i in range(n_iter):
        result += np.sin(i * alpha) * np.cos(i * beta)
    return sim_id, result

# The `if __name__ == '__main__':` guard is essential for multiprocessing on some platforms
# (like Windows and macOS). It prevents worker processes from re-importing and re-executing
# the script's main code, which would lead to an infinite loop of process creation.
if __name__ == '__main__': 
    # Create a grid of parameters for our simulation.
    # Each tuple in the list is a separate, independent task.
    n_sims = mp.cpu_count() # Run one simulation per available CPU core
    param_grid = [(i, alpha, beta) for i, (alpha, beta) in enumerate(np.random.rand(n_sims, 2))]
    note(f"Running a parameter sweep with {len(param_grid)} simulations on {n_sims} cores...")
    
    # Time the serial execution first for comparison.
    # `time.perf_counter()` is used rather than `time.time()` because it is a
    # monotonic, high-resolution clock meant for measuring elapsed intervals;
    # the wall clock can jump if the system time is adjusted.
    start_serial = time.perf_counter()
    serial_results = [run_simulation(p) for p in param_grid]
    end_serial = time.perf_counter()
    serial_time = end_serial - start_serial
    print(f"Serial execution time: {serial_time:.2f}s")

    # Now, execute in parallel. The timed region deliberately includes pool
    # start-up and shutdown, since that overhead is part of the real cost.
    start_parallel = time.perf_counter()
    # `mp.Pool` creates a pool of worker processes. Using a `with` statement ensures
    # the pool is properly closed afterwards.
    with mp.Pool(processes=n_sims) as pool:
        # `pool.map` is like the built-in `map` function but it distributes the
        # tasks in `param_grid` across the worker processes.
        parallel_results = pool.map(run_simulation, param_grid)
    end_parallel = time.perf_counter()
    parallel_time = end_parallel - start_parallel
    print(f"Multiprocessing execution time: {parallel_time:.2f}s")
    
    note(f"Parallel execution provides a **{serial_time / parallel_time:.2f}x** speedup.")

### 2.3 High-Level Parallelism with `Dask`

`Dask` is a modern, powerful library for parallel computing in Python. It can replicate the functionality of `multiprocessing` with a more flexible API and can also scale from a single machine to a distributed cluster of many machines.

A core concept in Dask is the **delayed** object. When you wrap a function call with `dask.delayed`, Dask does not execute it immediately. Instead, it builds a **symbolic computation graph** (a directed acyclic graph or DAG) representing all tasks and their dependencies. This "lazy evaluation" allows Dask's intelligent scheduler to analyze the entire workflow and optimize its execution, which can lead to better performance by minimizing data movement and maximizing parallelism.

## 3. GPU Computing for Massively Parallel Problems

For certain types of problems that fit the **SIMD (Single Instruction, Multiple Data)** paradigm—like large matrix multiplications, image processing, or deep learning—we can leverage a **Graphics Processing Unit (GPU)**. Whereas a CPU has a handful of complex cores, a consumer GPU contains thousands of simpler cores, which makes it exceptionally fast for these specific tasks.

Two popular libraries for GPU computing in Python are:
*   **`CuPy`**: A library that provides a near-complete clone of the NumPy API but executes operations on an NVIDIA GPU. For code already written using NumPy, you can often achieve massive speedups by replacing `import numpy as np` with `import cupy as cp`, calling `cp.` functions in place of `np.` ones, and moving your arrays to the GPU (e.g., with `cp.asarray`).
*   **`JAX`**: A newer library from Google that combines a NumPy-like API with its own JIT compiler (similar to Numba), automatic differentiation (for machine learning), and the ability to run transparently on CPUs, GPUs, and TPUs.

In [None]:
sec("GPU Acceleration Example: NumPy vs. CuPy")
# Probe for CuPy and for a usable GPU in two separate steps. The CUDA error
# class lives inside `cp`, so it can only be named in an `except` clause once
# the import has succeeded; naming it alongside ImportError would raise a
# NameError precisely when CuPy is missing.
try:
    import cupy as cp
except ImportError:
    CUPY_AVAILABLE = False
else:
    try:
        # Get GPU device name
        gpu_name = cp.cuda.runtime.getDeviceProperties(0)['name'].decode('utf-8')
    except cp.cuda.runtime.CUDARuntimeError:
        # CuPy is installed but no CUDA device could be queried.
        CUPY_AVAILABLE = False
    else:
        CUPY_AVAILABLE = True

if CUPY_AVAILABLE:
    note(f"Found compatible GPU: {gpu_name}")
    # Create large random matrices on both CPU (NumPy) and GPU (CuPy)
    size = 4000
    np_A, np_B = np.random.rand(size, size).astype(np.float32), np.random.rand(size, size).astype(np.float32)
    
    # Transfer data to the GPU
    cp_A, cp_B = cp.asarray(np_A), cp.asarray(np_B)
    
    note(f"Timing matrix multiplication for a {size}x{size} matrix...")
    # Time NumPy on the CPU
    numpy_time = timeit.timeit(lambda: np_A @ np_B, number=10)

    def _gpu_matmul():
        """Multiply on the GPU and block until the device has finished."""
        _ = cp_A @ cp_B
        # GPU work is queued asynchronously, so the call above can return
        # before the product exists. Synchronizing inside the timed callable
        # makes the timer cover the computation, not just the kernel launch.
        cp.cuda.runtime.deviceSynchronize()

    # Untimed warm-up: waits for the host-to-device transfers above and absorbs
    # one-off initialization costs so they do not distort the measurement.
    _gpu_matmul()
    cupy_time = timeit.timeit(_gpu_matmul, number=10)

    print(f"NumPy (CPU) time: {numpy_time:.4f} seconds")
    print(f"CuPy (GPU) time:  {cupy_time:.4f} seconds")
    note(f"GPU provides a **{numpy_time / cupy_time:.1f}x** speedup for this operation.")
else:
    note("CuPy is not installed or no compatible NVIDIA GPU is found. Skipping GPU example.")

## 4. Profiling: Finding the Bottleneck

As emphasized in the introduction, optimization efforts should always be guided by **profiling**. A profiler is a tool that measures where a program spends its time and how often different functions are called. Instead of guessing, you can get precise, empirical data on which functions are the performance bottlenecks.

Python's built-in **`cProfile`** module provides function-level profiling. It is a good first step for getting a high-level overview of your program's performance. For more granular detail, third-party libraries like `line_profiler` can provide line-by-line profiling statistics, which is invaluable for identifying the single slowest line within a larger function.

In [None]:
# Print the console banner that introduces this cell's output.
sec("Profiling with cProfile")
# cProfile collects the timing data below; pstats sorts and prints it.
import cProfile, pstats

# Define two functions to simulate a simple workflow
def slow_function():
    """Simulated bottleneck: a short blocking wait followed by some arithmetic."""
    # Stand-in for I/O-bound work: block for a tenth of a second.
    time.sleep(0.1)
    # Stand-in for CPU-bound work: compute 10,000 square roots and discard them.
    _ = list(map(math.sqrt, range(10**4)))

def fast_function():
    """Do nothing; a cheap call that adds almost nothing to total runtime."""
    return None

def main_workflow():
    """Program entry point: run the slow step 5 times, then the fast step 100 times."""
    slow_calls, fast_calls = 5, 100
    # Few calls, yet this is where the time goes.
    for _ in range(slow_calls):
        slow_function()
    # Many calls, yet their total cost is negligible.
    for _ in range(fast_calls):
        fast_function()

# Step 1: build the profiler that will record the call timings.
profiler = cProfile.Profile()

# Step 2: only the code executed between enable() and disable() is recorded.
profiler.enable()
main_workflow()
profiler.disable()

# Step 3: report.
# Load the recorded data into a pstats.Stats object, then order it by
# 'cumulative' time so the functions with the largest total cost come first.
stats = pstats.Stats(profiler)
stats.sort_stats('cumulative')

# Limit the printed table to the top 10 entries.
stats.print_stats(10)

note("The profiler output (see `cumtime` column) clearly shows that nearly all the execution time is spent inside `slow_function`, making it the obvious and only target for optimization efforts.")