In [5]:
import cupy as cp
import numpy as np
import time

def mc_price_option_cupy(S0, K, r, sigma, T, M, I, seed=None):
    """
    GPU-accelerated Monte Carlo pricing of a European call using CuPy.

    Simulates ``I`` price paths over ``M`` equal time steps of length
    ``dt = T / M`` with the multiplicative update

        S[t] = S[t-1] * exp((r - 0.5 * sigma**2) * dt + sigma * sqrt(dt) * eps)

    where ``eps`` is a fresh vector of standard-normal draws per step, then
    discounts the mean terminal payoff ``max(S[-1] - K, 0)`` by ``exp(-r * T)``.

    Parameters
    ----------
    S0 : float
        Starting price written to the first row of every path.
    K : float
        Strike subtracted from the terminal price in the payoff.
    r : float
        Rate used in both the per-step drift and the discount factor.
    sigma : float
        Volatility used in the per-step drift and diffusion terms.
    T : float
        Time horizon, split into ``M`` steps. Must be non-negative.
    M : int
        Number of time steps. Must be at least 1.
    I : int
        Number of simulated paths. Must be at least 1.
    seed : int, optional
        When given, passed to ``cp.random.seed`` before any draw so repeated
        calls produce the same paths. This reseeds CuPy's global generator,
        so it also affects later ``cp.random`` calls made by the caller.
        When ``None`` (default) the generator state is left untouched.

    Returns
    -------
    C0 : 0-d cupy.ndarray
        Discounted mean payoff; convert with ``float(C0)`` to get a host value.
    S : cupy.ndarray of shape (M + 1, I)
        All simulated paths, one column per path, still resident on the GPU.

    Raises
    ------
    ValueError
        If ``M < 1``, ``I < 1`` or ``T < 0``.
    """
    import math

    # Fail early with a clear message instead of a ZeroDivisionError (M == 0)
    # or a silently NaN price (I == 0, T < 0).
    if M < 1:
        raise ValueError(f"M must be at least 1 time step, got {M}")
    if I < 1:
        raise ValueError(f"I must be at least 1 path, got {I}")
    if T < 0:
        raise ValueError(f"T must be non-negative, got {T}")

    if seed is not None:
        cp.random.seed(seed)

    dt = T / M

    # Per-step constants are plain host scalars: computing them with math
    # avoids launching GPU kernels for single numbers.
    drift = (r - 0.5 * sigma ** 2) * dt
    diffusion = sigma * math.sqrt(dt)
    discount = math.exp(-r * T)

    # Every row is written below (row 0 here, rows 1..M in the loop), so the
    # buffer does not need to be zero-filled first.
    S = cp.empty((M + 1, I))
    S[0] = S0

    # Advance all I paths one time step per iteration.
    for t in range(1, M + 1):
        eps = cp.random.standard_normal(I)
        S[t] = S[t - 1] * cp.exp(drift + diffusion * eps)

    # Payoff on the terminal prices, averaged and discounted.
    payoff = cp.maximum(S[-1] - K, 0)
    C0 = discount * cp.mean(payoff)

    # Block until the GPU work queued above has finished, so callers timing
    # this function measure the computation itself. Device() with no argument
    # refers to the current device rather than a hard-coded device 0.
    cp.cuda.Device().synchronize()

    return C0, S

def main():
    """
    Price one option with ``mc_price_option_cupy`` and print the result.

    Uses fixed example parameters (spot 100, strike 105, rate 5%,
    volatility 12%, horizon 0.5, 1000 time steps, 50,000 paths), performs
    one discarded warm-up call followed by the call whose price is reported.
    Takes no arguments and returns nothing; the only output is printed text.
    """
    S0 = 100.0
    K = 105.0
    r = 0.05
    sigma = 0.12
    T = 0.5
    M = 1000
    I = 50_000

    # Warm-up call; its result is discarded.
    mc_price_option_cupy(S0, K, r, sigma, T, M, I)

    # Reported run. Only the price (first element) is kept: the simulated
    # paths are not used here, so no reference to that large GPU array is
    # held while the results are printed.
    C0 = mc_price_option_cupy(S0, K, r, sigma, T, M, I)[0]

    # Convert the returned price to a host-side Python float.
    C0_cpu = float(C0)

    print(f">> Initial Stock Price: {S0}")
    print("="*28)
    print(f">> GPU European Option Value: {C0_cpu}")
    print("="*28)

In [6]:
def run_gpu_benchmark(I_values=(10000, 25000, 50000, 75000, 100000),
                      M_values=(100, 500, 1000, 2500, 5000),
                      fixed_M=1000, fixed_I=50000):
    """
    Time ``mc_price_option_cupy`` while scaling paths, then time steps.

    Prints two tables. The first varies the number of paths over
    ``I_values`` with ``fixed_M`` time steps; the second varies the number
    of time steps over ``M_values`` with ``fixed_I`` paths. Each row shows:

    * Execution Time - wall-clock time of the ``mc_price_option_cupy`` call.
    * Total Time     - execution time plus copying the returned path matrix
                       to host memory with ``cp.asnumpy``.

    One small warm-up call is made before any measurement.

    Parameters
    ----------
    I_values : iterable of int, optional
        Path counts for the first table.
    M_values : iterable of int, optional
        Step counts for the second table.
    fixed_M : int, optional
        Number of time steps used throughout the first table.
    fixed_I : int, optional
        Number of paths used throughout the second table.

    Calling the function with no arguments reproduces the original
    hard-coded benchmark.
    """
    S0, K, r, sigma, T = 100.0, 105.0, 0.05, 0.12, 0.5
    header = f"{'Dimension':<10} | {'Value':<10} | {'Total Time (s)':<15} | {'Execution Time (s)':<18}"

    def sweep(label, cases):
        # Print one table; each case is (displayed value, steps M, paths I).
        print("\n" + header)
        print("-" * 65)
        for value, steps, paths in cases:
            t0 = time.perf_counter()
            # Simulation and pricing on the GPU.
            S_gpu = mc_price_option_cupy(S0, K, r, sigma, T, steps, paths)[1]
            t1 = time.perf_counter()

            # Device-to-host copy; the host array is only needed for timing,
            # so it is not bound to a name and can be freed immediately.
            cp.asnumpy(S_gpu)
            t2 = time.perf_counter()

            # Output: Total (t2-t0) and Execution (t1-t0)
            print(f"{label:<10} | {value:<10} | {t2 - t0:<15.4f} | {t1 - t0:<18.4f}")

            # Drop the device matrix before the next (possibly larger) run
            # allocates its own, so two matrices are never alive at once.
            del S_gpu

    print("--- STARTING GPU BENCHMARK (LEVEL B) ---")

    # Warm-up run; result discarded.
    mc_price_option_cupy(S0, K, r, sigma, T, 100, 1000)

    # 1. Path scaling (fixed number of steps)
    sweep("I", [(paths, fixed_M, paths) for paths in I_values])

    # 2. Step scaling (fixed number of paths)
    sweep("M", [(steps, steps, fixed_I) for steps in M_values])

# Run the benchmark defined above; it prints the path-scaling table
# followed by the step-scaling table.
run_gpu_benchmark()

--- STARTING GPU BENCHMARK (LEVEL B) ---

Dimension  | Value      | Total Time (s)  | Execution Time (s)
-----------------------------------------------------------------
I          | 10000      | 0.1471          | 0.1154            
I          | 25000      | 0.1911          | 0.1173            
I          | 50000      | 0.2908          | 0.1340            
I          | 75000      | 0.3365          | 0.1179            
I          | 100000     | 0.4098          | 0.1184            

Dimension  | Value      | Total Time (s)  | Execution Time (s)
-----------------------------------------------------------------
M          | 100        | 0.0417          | 0.0223            
M          | 500        | 0.1333          | 0.0591            
M          | 1000       | 0.2674          | 0.1187            
M          | 2500       | 0.6658          | 0.2952            
M          | 5000       | 1.3165          | 0.5823            
