# Convolution Analysis Notebook

This notebook analyzes the **Gaussian Convolution (Blur)** algorithm.

**Key Feature**: This algorithm has two GPU implementations:
1.  **Naive (`conv`)**: Reads neighbors directly from Global Memory (Slow).
2.  **Optimized (`shared_conv`)**: Pre-loads the image tile into **Shared Memory** (fast on-chip memory shared by the threads of a block) to minimize Global Memory access.

Tasks:
1.  **Visual Verification**: Ensure both implementations produce the same blur.
2.  **Optimization Benchmark**: Measure the speedup of Shared Memory vs Naive.
3.  **Standard Benchmarks**: Resolution & Block Size scaling.

In [None]:
import os
import time
import matplotlib.pyplot as plt
import cv2
import numpy as np

# Config & Paths
# The notebook is expected to be launched from a sub-folder of the project,
# so the project root is the parent of the current working directory.
PROJECT_ROOT = os.path.abspath("..")
IMAGES_DIR = os.path.join(PROJECT_ROOT, "images")
INPUT_CLEAN = os.path.join(IMAGES_DIR, "input.jpg")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "analysis_output")

# exist_ok avoids the check-then-create race of a separate os.path.exists()
# test and makes the cell safe to re-run.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Project Root: {PROJECT_ROOT}")
print(f"Input Clean Image: {INPUT_CLEAN}")

In [None]:
def compile_gpu(tile_width):
    """Re-compile the GPU code with a specific TILE_WIDTH.

    Runs ``make clean``, re-configures CMake in ``src/gpu/build`` with
    ``-DTILE_WIDTH`` and ``-DTILE_HEIGHT`` both set to *tile_width* (passed
    through ``CMAKE_CUDA_FLAGS``), then runs ``make``.

    Args:
        tile_width: Value used for both the TILE_WIDTH and TILE_HEIGHT
            compile-time definitions.

    Returns:
        True if the configure and build steps both exited with status 0,
        False otherwise. A failure is also reported on stdout so that a
        stale or missing binary is not benchmarked unnoticed.

    Side effects:
        Creates the build directory and its ``output`` sub-directory if they
        do not exist, and always leaves the process working directory at
        ``PROJECT_ROOT`` (later cells rely on that).
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import subprocess

    print(f"[Build] Compiling GPU with TILE_WIDTH={tile_width}...")
    build_dir = os.path.join(PROJECT_ROOT, "src/gpu/build")
    # Creates build_dir as well, since it is the parent of "output".
    os.makedirs(os.path.join(build_dir, "output"), exist_ok=True)

    cuda_flags = f"-DTILE_WIDTH={tile_width} -DTILE_HEIGHT={tile_width}"
    # (argv, required): "make clean" is allowed to fail because there is no
    # Makefile yet on the very first configure.
    steps = [
        (["make", "clean"], False),
        (["cmake", f"-DCMAKE_CUDA_FLAGS={cuda_flags}", ".."], True),
        (["make"], True),
    ]

    succeeded = True
    try:
        for argv, required in steps:
            try:
                ret_code = subprocess.run(argv, cwd=build_dir).returncode
            except OSError as exc:
                # The tool itself could not be started (e.g. not installed).
                ret_code = exc
            if ret_code != 0 and required:
                print(f"[Error] Build step '{argv[0]}' failed for "
                      f"TILE_WIDTH={tile_width} (RetCode: {ret_code})")
                succeeded = False
                # Do not run "make" against a failed configure.
                break
    finally:
        os.chdir(PROJECT_ROOT)
    return succeeded

def run_algo(start_dir, image_path, algo, params):
    """Run the algorithm binary and return its wall-clock execution time.

    The executable ``main`` inside *start_dir* is launched with *start_dir*
    as its working directory and the arguments
    ``<image_path> <algo> <params...>``.

    Args:
        start_dir: Directory containing the ``main`` executable (relative
            paths are resolved against the current working directory).
        image_path: Path of the input image, passed verbatim as the first
            argument (no shell expansion is applied).
        algo: Algorithm name passed as the second argument.
        params: Extra arguments as one string; split on whitespace using
            shell-like quoting rules.

    Returns:
        Elapsed seconds (float) for the whole child process, including its
        start-up, or None if the binary is missing, could not be launched,
        or exited with a non-zero status.
    """
    # Local imports so this cell does not depend on the notebook's import cell.
    import shlex
    import subprocess

    # Absolute paths keep the binary lookup independent of the current cwd.
    abs_start_dir = os.path.abspath(start_dir)
    binary_path = os.path.join(abs_start_dir, "main")

    if not os.path.exists(binary_path):
        print(f"[Error] Binary not found at {binary_path}")
        return None

    # Ensure executable permissions (Linux/Mac)
    if os.name != 'nt':
        try:
            os.chmod(binary_path, os.stat(binary_path).st_mode | 0o111)
        except OSError:
            # Best effort: if we may not change the mode, the launch below
            # reports the real problem.
            pass

    # An argv list (no shell) keeps spaces, quotes and '$' in image_path intact.
    argv = [binary_path, str(image_path), str(algo), *shlex.split(str(params))]

    # perf_counter is monotonic, unlike time.time(), so clock adjustments
    # cannot distort the measurement.
    start_time = time.perf_counter()
    try:
        # cwd= runs the child in the build dir without changing our own cwd.
        ret_code = subprocess.run(argv, cwd=abs_start_dir).returncode
    except OSError as exc:
        ret_code = exc
    elapsed = time.perf_counter() - start_time

    if ret_code != 0:
        print(f"[Error] Execution failed for {algo} in {start_dir} (RetCode: {ret_code})")
        return None
    return elapsed

## 1. Visual Verification & Optimization Check
Compare CPU, GPU (Naive), and GPU (Shared) outputs.

In [None]:
# Build both binaries, run CPU / GPU-naive / GPU-shared convolution on the
# clean input image, show the four images side by side and report the
# naive-vs-shared GPU timing.

# Ensure basic compilation (default block size 16)
compile_gpu(16)

# Build CPU
print("[Build] Compiling CPU...")
cpu_build_dir = os.path.join(PROJECT_ROOT, "src/cpu/build")
gpu_build_dir = os.path.join(PROJECT_ROOT, "src/gpu/build")
# Creates cpu_build_dir as well, since it is the parent of "output".
os.makedirs(os.path.join(cpu_build_dir, "output"), exist_ok=True)

os.chdir(cpu_build_dir)
try:
    if os.system("cmake .. && make") != 0:
        print("[Error] CPU build failed; the CPU result may be missing or stale.")
finally:
    # Later cells rely on the working directory being the project root.
    os.chdir(PROJECT_ROOT)

# Passed to the binaries as their third argument; also used by the benchmark cells.
params = "7"

if os.path.exists(INPUT_CLEAN):
    gpu_out_dir = os.path.join(gpu_build_dir, "output")
    naive_raw_path = os.path.join(gpu_out_dir, "conv_result.jpg")
    naive_path = os.path.join(gpu_out_dir, "conv_naive_result.jpg")
    shared_path = os.path.join(gpu_out_dir, "shared_conv_result.jpg")
    cpu_path = os.path.join(cpu_build_dir, "output", "cpu_conv_result.jpg")

    print("Running CPU Conv...")
    t_cpu_check = run_algo(cpu_build_dir, INPUT_CLEAN, "conv", params)

    print("Running GPU Naive Conv...")
    t_naive = run_algo(gpu_build_dir, INPUT_CLEAN, "conv", params)
    # Keep the naive output under its own name. Only rename when the run
    # succeeded and produced the file; os.replace also overwrites a result
    # left over from a previous execution of this cell.
    if t_naive is not None and os.path.exists(naive_raw_path):
        os.replace(naive_raw_path, naive_path)

    print("Running GPU Shared Conv...")
    t_shared = run_algo(gpu_build_dir, INPUT_CLEAN, "shared_conv", params)
    # Note: shared_conv saves to shared_conv_result.jpg usually

    # Load Results. cv2.imread returns None for a missing/unreadable file, and
    # a failed run is treated as "no image" so stale files are never shown.
    panels = [
        ("Original", cv2.imread(INPUT_CLEAN)),
        ("CPU Blur", cv2.imread(cpu_path) if t_cpu_check is not None else None),
        ("GPU Naive", cv2.imread(naive_path) if t_naive is not None else None),
        ("GPU Shared", cv2.imread(shared_path) if t_shared is not None else None),
    ]

    # Visualize
    fig, ax = plt.subplots(1, 4, figsize=(20, 6))
    for a, (title, img) in zip(ax, panels):
        if img is None:
            a.text(0.5, 0.5, "Result not available",
                   ha='center', va='center', transform=a.transAxes)
        else:
            # OpenCV loads BGR; matplotlib expects RGB.
            a.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        a.set_title(title)
        a.axis('off')
    plt.savefig(os.path.join(OUTPUT_DIR, 'convolution_visual_verification.png'))
    plt.show()

    if t_naive is not None:
        print(f"Naive GPU Time:  {t_naive:.4f}s")
    else:
        print("Naive GPU Time:  N/A (Failed)")
    if t_shared is not None:
        print(f"Shared GPU Time: {t_shared:.4f}s")
    else:
        print("Shared GPU Time: N/A (Failed)")

    if t_naive is None or t_shared is None:
        # A failed run says nothing about the optimization itself.
        print("Optimization comparison skipped (a GPU run failed).")
    elif 0 < t_shared < t_naive:
        print(f"Optimization Speedup: {t_naive/t_shared:.2f}x (Shared Memory Win!)")
    else:
        print("Optimization Neutral/Loss (Check Tile Size vs Kernel Size)")
else:
    print("⚠️ ERROR: input.jpg not found! Please upload it to 'images/' folder.")

## 2. Benchmarking: Resolution Scaling
We resize the image and run the **Optimized (Shared)** version.

In [None]:
# Resolution scaling benchmark: resize the input to NxN, time the CPU binary
# and the shared-memory GPU binary on it, then plot times and speedup.
resolutions = [512, 1024, 2048] # Add 4096 if you have a 4K image
cpu_times = []
gpu_times = []
speedups = []
# Resolutions for which BOTH runs succeeded. Plotting against this list keeps
# the x values the same length as the timing lists when a run fails.
measured_resolutions = []

cpu_build_dir = os.path.join(PROJECT_ROOT, "src/cpu/build")
gpu_build_dir = os.path.join(PROJECT_ROOT, "src/gpu/build")

# cv2.imread returns None when the file cannot be decoded.
base_img = cv2.imread(INPUT_CLEAN) if os.path.exists(INPUT_CLEAN) else None

if base_img is None:
    print(f"[Error] Could not read input image at {INPUT_CLEAN}; skipping resolution benchmark.")
else:
    for res in resolutions:
        print(f"Benchmarking Resolution: {res}x{res}...")

        # Create Temp Image (forced to a square, so the aspect ratio may change)
        temp_img_path = os.path.join(OUTPUT_DIR, f"temp_{res}.jpg")
        cv2.imwrite(temp_img_path, cv2.resize(base_img, (res, res)))

        try:
            # Run (Using shared_conv for GPU)
            t_cpu = run_algo(cpu_build_dir, temp_img_path, "conv", params)
            t_gpu = run_algo(gpu_build_dir, temp_img_path, "shared_conv", params)
        finally:
            # Cleanup even if a run raised, so temp files never pile up.
            if os.path.exists(temp_img_path):
                os.remove(temp_img_path)

        if t_cpu and t_gpu:
            measured_resolutions.append(res)
            cpu_times.append(t_cpu)
            gpu_times.append(t_gpu)
            speedups.append(t_cpu / t_gpu)
            print(f"  CPU: {t_cpu:.4f}s | GPU: {t_gpu:.4f}s | Speedup: {t_cpu/t_gpu:.2f}x")
        else:
            print(f"  Skipped {res}x{res}: CPU or GPU run failed.")

    if not measured_resolutions:
        print("[Error] No resolution could be benchmarked; nothing to plot.")
    else:
        # Plot Res Scaling
        fig, ax1 = plt.subplots(figsize=(10, 5))

        ax1.set_xlabel('Resolution (NxN)')
        ax1.set_ylabel('Execution Time (s)', color='tab:blue')
        ax1.plot(measured_resolutions, cpu_times, label='CPU Time', color='tab:blue', marker='o')
        ax1.plot(measured_resolutions, gpu_times, label='GPU Time', color='tab:cyan', marker='o')
        ax1.tick_params(axis='y', labelcolor='tab:blue')
        ax1.legend(loc='upper left')

        # Second y-axis so the speedup is readable next to the raw times.
        ax2 = ax1.twinx()
        ax2.set_ylabel('Speedup Factor', color='tab:orange')
        ax2.plot(measured_resolutions, speedups, label='Speedup', color='tab:orange', marker='x', linestyle='--')
        ax2.tick_params(axis='y', labelcolor='tab:orange')
        ax2.legend(loc='upper right')

        plt.title("Performance vs Resolution (Shared Conv)")
        plt.grid(True)
        plt.xticks(measured_resolutions)
        plt.savefig(os.path.join(OUTPUT_DIR, 'convolution_resolution_benchmark.png'))
        plt.show()

## 3. Benchmarking: GPU Block Size Optimization
We use a fixed 2048x2048 test image (the largest resolution of the previous step) and vary the CUDA TILE_WIDTH, rebuilding the GPU binary for each value.

In [None]:
# Block size benchmark: rebuild the GPU binary for each TILE_WIDTH, time the
# shared-memory convolution on one large image and plot the speedup over a
# single CPU baseline run.
block_sizes = [4, 8, 16, 32]
block_speedups = []
# Block sizes whose GPU run failed; they get a 0-height bar labelled "N/A".
failed_block_sizes = set()

cpu_build_dir = os.path.join(PROJECT_ROOT, "src/cpu/build")
gpu_build_dir = os.path.join(PROJECT_ROOT, "src/gpu/build")

# Create one large consolidated test image
test_res = 2048
temp_img_path = os.path.join(OUTPUT_DIR, "temp_block_test_conv.jpg")
# cv2.imread returns None when the file cannot be decoded.
base_img = cv2.imread(INPUT_CLEAN) if os.path.exists(INPUT_CLEAN) else None

if base_img is None:
    print(f"[Error] Could not read input image at {INPUT_CLEAN}; skipping block size benchmark.")
else:
    cv2.imwrite(temp_img_path, cv2.resize(base_img, (test_res, test_res)))

    try:
        # Run CPU Baseline (Once)
        print("Running CPU Baseline (for Speedup Calculation)...")
        t_cpu = run_algo(cpu_build_dir, temp_img_path, "conv", params)

        if not t_cpu:
            # Without a baseline no speedup can be computed.
            print("  CPU Time: N/A (Failed) - skipping block size benchmark.")
        else:
            print(f"  CPU Time: {t_cpu:.4f}s")

            # NOTE: the GPU binary stays built with the last block size of
            # this list once the loop is done.
            for bs in block_sizes:
                compile_gpu(bs)
                print(f"Benchmarking Block Size: {bs}x{bs}...")
                t_gpu = run_algo(gpu_build_dir, temp_img_path, "shared_conv", params)

                if not t_gpu:
                    # run_algo returns None on failure; keep the list
                    # aligned with block_sizes instead of crashing.
                    failed_block_sizes.add(bs)
                    block_speedups.append(0)
                    print("  GPU Time: N/A (Failed)")
                    continue

                speedup = t_cpu / t_gpu
                block_speedups.append(speedup)
                print(f"  GPU Time: {t_gpu:.4f}s | Speedup: {speedup:.2f}x")
    finally:
        # Cleanup even if a build or run raised.
        if os.path.exists(temp_img_path):
            os.remove(temp_img_path)

    if block_speedups:
        # Plot Block Size vs Speedup
        plt.figure(figsize=(8, 6))
        bars = plt.bar([str(b) for b in block_sizes], block_speedups, color='darkorange')
        plt.xlabel('Block Size (NxN)')
        plt.ylabel('Speedup Factor (vs CPU)')
        plt.title(f'GPU Optimization: Block Size vs Speedup (Shared Conv @ {test_res}x{test_res})')
        plt.grid(axis='y')

        # Add value labels
        for bar, bs in zip(bars, block_sizes):
            height = bar.get_height()
            label = 'N/A' if bs in failed_block_sizes else f'{height:.1f}x'
            plt.text(bar.get_x() + bar.get_width()/2., height,
                     label,
                     ha='center', va='bottom')

        plt.savefig(os.path.join(OUTPUT_DIR, 'convolution_block_size_benchmark.png'))
        plt.show()