# Setup

In [38]:
import numpy as np
import pandas as pd
import random as rd
import os

# Number of input feature columns written to every generated dataset.
FEATURES = 10

# Pool of single-letter column names; only the first FEATURES letters are used.
cols = "abcdefghijkmnopqrstuv"
columns = [letter for letter in cols[:FEATURES]]

# One benchmark scenario per entry:
#   (number of rows, number of functions, max expression depth, scenario name)
test_configs = [
    # small
    (1_000, 100, 2, "small_simple"),
    (5_000, 200, 2, "small_medium"),
    # medium
    (10_000, 500, 3, "medium_balanced"),
    (20_000, 1_000, 3, "medium_complex"),
    # large
    (50_000, 1_000, 4, "large_moderate"),
    (100_000, 1_000, 4, "large_standard"),
    (100_000, 2_000, 4, "large_heavy"),
    # extra large
    (200_000, 2_000, 5, "xlarge_extreme"),
]

# Vocabulary used when assembling random expressions.
unary_funs = ["sinf", "cosf", "sqrtf"]
operators = ["+", "-"]

def random_program(depth=4):
    """Generate a random expression string with a bounded nesting depth.

    A single draw from ``rd.randint(0, 100)`` selects the node kind:

    * below 30, or when ``depth`` is 0: a leaf, i.e. a column placeholder
      of the form ``_<column>_`` with the column picked from ``columns``;
    * below 80: a unary call ``<fn>(<sub-expression>)`` with ``fn`` picked
      from ``unary_funs``;
    * otherwise: a binary node ``(<left>) <op> (<right>)`` with ``op``
      picked from ``operators``.

    Args:
        depth: Remaining nesting budget; each recursive call passes
            ``depth - 1`` and recursion stops at 0.

    Returns:
        The expression as a string.
    """
    roll = rd.randint(0, 100)

    # Leaf node: forced once the depth budget is exhausted.
    if depth == 0 or roll < 30:
        column = rd.choice(columns)
        return "_" + column + "_"

    # Unary node: choose the function first, then build its argument.
    if roll < 80:
        func_name = rd.choice(unary_funs)
        argument = random_program(depth - 1)
        return f"{func_name}({argument})"

    # Binary node: operator first, then left and right operands in order.
    operator = rd.choice(operators)
    left = random_program(depth - 1)
    right = random_program(depth - 1)
    return f"({left}) {operator} ({right})"

# Seed both random sources used below (``random`` drives random_program,
# ``numpy.random`` drives the data) so that re-running this cell regenerates
# the same datasets and expressions and benchmark runs stay comparable.
SEED = 42
rd.seed(SEED)
np.random.seed(SEED)

os.makedirs("test_cases", exist_ok=True)

print("=" * 80)
print("Generating test cases for CPU vs GPU comparison")
print("=" * 80 + "\n")

for n_rows, n_functions, depth, name in test_configs:
    print(f"Creating {name}:")
    print(f"  - Rows: {n_rows:,}")
    print(f"  - Functions: {n_functions:,}")
    print(f"  - Depth: {depth}")

    # Uniform random features; the target is sin(a) + cos(b) plus a small
    # uniform noise term scaled by 0.001.
    x = np.random.rand(n_rows, FEATURES)
    df = pd.DataFrame(x, columns=columns)
    df["y"] = np.sin(df["a"].values) + np.cos(df["b"].values) + np.random.rand(n_rows) * 0.001

    csv_file = f"test_cases/data_{name}.csv"
    df.to_csv(csv_file, index=False)
    print(f"  ✓ Created {csv_file}")

    # One random candidate expression per line of the functions file.
    functions = [random_program(depth) for _ in range(n_functions)]

    functions_file = f"test_cases/functions_{name}.txt"
    with open(functions_file, "w") as fh:
        for func in functions:
            fh.write(func + "\n")
    print(f"  ✓ Created {functions_file}")

    avg_len = sum(len(func) for func in functions) / len(functions)
    print(f"  ✓ Avg expression length: {avg_len:.1f} chars")
    print()

print("=" * 80)
print("Summary of test cases:")
print("=" * 80)
print(f"{'Name':<20} {'Rows':<12} {'Functions':<12} {'Complexity':<12} {'GPU Expected'}")
print("-" * 80)

for n_rows, n_functions, depth, name in test_configs:
    # "Complexity" here is simply rows x functions.
    complexity = n_rows * n_functions
    if complexity > 5_000_000:
        gpu_wins = "✓ Yes"
    elif complexity < 1_000_000:
        gpu_wins = "✗ No"
    else:
        gpu_wins = "? Maybe"
    print(f"{name:<20} {n_rows:<12,} {n_functions:<12,} {complexity:<12,} {gpu_wins}")

print("\n" + "=" * 80)
print("Test files created in ./test_cases/")
print("Use these files to benchmark CPU vs GPU performance")
print("=" * 80)

        




Generating test cases for CPU vs GPU comparison

Creating small_simple:
  - Rows: 1,000
  - Functions: 100
  - Depth: 2
  ✓ Created test_cases/data_small_simple.csv
  ✓ Created test_cases/functions_small_simple.txt
  ✓ Avg expression length: 12.7 chars

Creating small_medium:
  - Rows: 5,000
  - Functions: 200
  - Depth: 2
  ✓ Created test_cases/data_small_medium.csv
  ✓ Created test_cases/functions_small_medium.txt
  ✓ Avg expression length: 14.6 chars

Creating medium_balanced:
  - Rows: 10,000
  - Functions: 500
  - Depth: 3
  ✓ Created test_cases/data_medium_balanced.csv
  ✓ Created test_cases/functions_medium_balanced.txt
  ✓ Avg expression length: 18.5 chars

Creating medium_complex:
  - Rows: 20,000
  - Functions: 1,000
  - Depth: 3
  ✓ Created test_cases/data_medium_complex.csv
  ✓ Created test_cases/functions_medium_complex.txt
  ✓ Avg expression length: 17.9 chars

Creating large_moderate:
  - Rows: 50,000
  - Functions: 1,000
  - Depth: 4
  ✓ Created test_cases/data_large_mo

# Benchmark

In [39]:
import numpy as np
import pandas as pd
import torch
import time
import os
import glob

# Number of compiled expressions evaluated together in one stacked batch.
TASK_BATCH = 32

# Run on CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Only forward evaluation is needed, so autograd is disabled globally.
torch.set_grad_enabled(False)

# Maps the function names that appear in expression strings to torch ops.
OPS = dict(
    sinf=torch.sin,
    cosf=torch.cos,
    tanf=torch.tan,
    sqrtf=torch.sqrt,
    expf=torch.exp,
)

def benchmark_sequential(csv_file, functions_file):
    """Score every expression with NumPy, one at a time, and time the search.

    Each line of ``functions_file`` is turned into a NumPy expression
    (``sinf`` -> ``np.sin`` and so on, ``_<col>_`` -> the column's value
    array), evaluated, and scored by mean squared error against the ``y``
    column of ``csv_file``.

    The error is computed over all rows on plain arrays, so an expression
    that yields NaN on any row gets a NaN score. Such expressions are
    excluded from the search instead of being scored on the remaining rows
    only.

    Args:
        csv_file: Path to a CSV file containing the feature columns and ``y``.
        functions_file: Path to a text file with one expression per line;
            blank lines are ignored.

    Returns:
        Tuple ``(elapsed_seconds, best_mse, best_expression)``. When no
        expression produces a non-NaN score the result is
        ``(elapsed_seconds, float("inf"), None)``.
    """
    df = pd.read_csv(csv_file)
    with open(functions_file) as fh:
        funs = [stripped for stripped in (raw.strip() for raw in fh) if stripped]

    # Plain ndarray target: ndarray.mean() propagates NaN, whereas the mean of
    # a pandas Series silently skips NaN rows.
    target = df["y"].values
    # Explicit namespace for eval so it does not depend on closure scoping.
    namespace = {"np": np, "df": df}

    def score(line):
        for u in ["sinf", "cosf", "tanf", "sqrtf", "expf"]:
            line = line.replace(u, f"np.{u[:-1]}")
        for c in df.columns:
            line = line.replace(f"_{c}_", f'(df["{c}"].values)')
        # NOTE(security): eval executes arbitrary code from functions_file;
        # only use this with expression files you generated yourself.
        prediction = eval(line, namespace)
        return np.square(np.subtract(prediction, target)).mean()

    start = time.time()
    # NaN results are handled explicitly below, so silence the matching
    # "invalid value" floating-point warnings while scoring.
    with np.errstate(invalid="ignore"):
        scored = [(score(line), line) for line in funs]
    valid = [pair for pair in scored if not np.isnan(pair[0])]
    # Tuple ordering keeps the original tie-break (lexicographically smaller
    # expression wins on equal error).
    r = min(valid, default=(float("inf"), None))
    elapsed = time.time() - start

    return elapsed, r[0], r[1]

def generate_kernel_code(expr, input_cols):
    """Render an expression string as the source of a ``kernel_func``.

    Every key of ``OPS`` found in ``expr`` is rewritten as a lookup
    ``OPS['<key>']``, and every ``_<col>_`` placeholder for a column in
    ``input_cols`` is rewritten as ``X['<col>']``.

    Args:
        expr: Expression text using ``OPS`` key names and ``_<col>_``
            placeholders.
        input_cols: Column names whose placeholders should be substituted.

    Returns:
        Source text defining ``kernel_func(X, OPS)`` that returns the
        rewritten expression, with a leading and a trailing newline.
    """
    body = expr

    # Function names first, in OPS iteration order.
    for op_name in OPS.keys():
        lookup = f"OPS['{op_name}']"
        body = body.replace(op_name, lookup)

    # Then the column placeholders.
    for col in input_cols:
        placeholder = f"_{col}_"
        body = body.replace(placeholder, f"X['{col}']")

    source_lines = [
        "",
        "def kernel_func(X, OPS):",
        f"    return {body}",
        "",
    ]
    return "\n".join(source_lines)

def compile_kernel(expr, input_cols):
    """Compile an expression string into a callable ``kernel_func(X, OPS)``.

    The source is produced by ``generate_kernel_code`` and executed with an
    empty globals dict; the resulting function is read back from a separate
    locals dict.

    Args:
        expr: Expression text to compile.
        input_cols: Column names passed through to ``generate_kernel_code``.

    Returns:
        The ``kernel_func`` function object defined by the generated source.
    """
    source = generate_kernel_code(expr, input_cols).strip()
    namespace = {}
    # Definitions made by the executed source land in `namespace`.
    exec(source, {}, namespace)
    return namespace["kernel_func"]

def benchmark_parallel(csv_file, functions_file):
    """Score every expression with torch in batches and time the search.

    All columns of ``csv_file`` are loaded as float64 tensors on ``device``.
    The last column is the target; the others are inputs. Expressions are
    compiled before the timer starts, then evaluated ``TASK_BATCH`` at a
    time, with the mean squared error computed per expression.

    Args:
        csv_file: Path to a CSV file whose last column is the target.
        functions_file: Path to a text file with one expression per line;
            blank lines are ignored.

    Returns:
        Tuple ``(elapsed_seconds, best_mse, best_expression)``. An expression
        whose error is NaN never replaces the current best, so the result is
        ``(elapsed_seconds, float("inf"), None)`` when nothing scores below
        infinity.
    """
    df = pd.read_csv(csv_file)
    with open(functions_file) as fh:
        funs = [stripped for stripped in (raw.strip() for raw in fh) if stripped]

    cols = list(df.columns)
    target_col = cols[-1]
    input_cols = cols[:-1]

    X = {c: torch.tensor(df[c].values, dtype=torch.float64, device=device)
         for c in cols}
    y = X[target_col]

    # Compilation happens before the timer starts.
    compiled = [compile_kernel(f, input_cols) for f in funs]

    start = time.time()

    best_err = float('inf')
    best_expr = None

    for i in range(0, len(compiled), TASK_BATCH):
        block = compiled[i:i + TASK_BATCH]
        preds = torch.stack([kernel(X, OPS) for kernel in block])
        errs = torch.mean((preds - y) ** 2, dim=1)

        # One host transfer per batch instead of one .item() call (and one
        # device synchronisation) per expression.
        for j, err_val in enumerate(errs.tolist()):
            # NaN compares False here, so NaN-producing expressions are skipped.
            if err_val < best_err:
                best_err = err_val
                best_expr = funs[i + j]

        del preds, errs
        if device.type == "cuda":
            torch.cuda.empty_cache()

    elapsed = time.time() - start

    return elapsed, best_err, best_expr


print("=" * 100)
print(f"CPU vs GPU Benchmark - Device: {device}")
print("=" * 100 + "\n")

test_files = sorted(glob.glob("test_cases/data_*.csv"))

if not test_files:
    print("ERROR: No test files found in test_cases/")
    print("Run 'python generate_inputs.py' first!")
    # Raise instead of calling exit(): this stops execution right here, so the
    # summary code below never runs on an empty result list.
    raise SystemExit(1)

results = []

for csv_file in test_files:
    name = os.path.basename(csv_file).replace("data_", "").replace(".csv", "")
    functions_file = csv_file.replace("data_", "functions_").replace(".csv", ".txt")

    df = pd.read_csv(csv_file)
    n_rows = len(df)
    with open(functions_file) as fh:
        n_functions = sum(1 for _ in fh)

    print(f"Testing: {name}")
    print(f"  Rows: {n_rows:,}, Functions: {n_functions:,}")

    # CPU
    print("  Running CPU version...", end=" ", flush=True)
    cpu_time, cpu_mse, cpu_expr = benchmark_sequential(csv_file, functions_file)
    print(f"✓ {cpu_time:.4f}s")

    # GPU
    print("  Running GPU version...", end=" ", flush=True)
    gpu_time, gpu_mse, gpu_expr = benchmark_parallel(csv_file, functions_file)
    print(f"✓ {gpu_time:.4f}s")

    # Speedup; a zero GPU time (timer resolution) would otherwise divide by zero.
    speedup = cpu_time / gpu_time if gpu_time > 0 else float("inf")
    winner = "GPU" if speedup > 1.0 else "CPU"

    print(f"  Speedup: {speedup:.2f}x ({winner} wins!)")
    print()

    results.append({
        "name": name,
        "rows": n_rows,
        "functions": n_functions,
        "cpu_time": cpu_time,
        "gpu_time": gpu_time,
        "speedup": speedup,
        "winner": winner,
        "mse": cpu_mse
    })


print("=" * 100)
print("BENCHMARK RESULTS SUMMARY")
print("=" * 100)
print(f"{'Test Case':<20} {'Rows':<10} {'Funcs':<8} {'CPU(s)':<10} {'GPU(s)':<10} {'Speedup':<10} {'Winner'}")
print("-" * 100)

for r in results:
    # Attach the "x" suffix before padding so it stays next to the number.
    speedup_label = f"{r['speedup']:.2f}x"
    print(f"{r['name']:<20} {r['rows']:<10,} {r['functions']:<8,} "
          f"{r['cpu_time']:<10.4f} {r['gpu_time']:<10.4f} "
          f"{speedup_label:<10} {r['winner']}")

print("\n" + "=" * 100)
print("KEY INSIGHTS:")
print("=" * 100)

cpu_wins = sum(1 for r in results if r['winner'] == 'CPU')
gpu_wins = sum(1 for r in results if r['winner'] == 'GPU')

best = max(results, key=lambda x: x['speedup'])
worst = min(results, key=lambda x: x['speedup'])

print(f"CPU wins: {cpu_wins}/{len(results)}")
print(f"GPU wins: {gpu_wins}/{len(results)}")
print(f"Best speedup: {best['speedup']:.2f}x ({best['name']})")
print(f"Worst speedup: {worst['speedup']:.2f}x ({worst['name']})")

# The files were processed in alphabetical order; look for the CPU -> GPU
# switch along increasing workload (rows x functions) instead.
by_workload = sorted(results, key=lambda x: x['rows'] * x['functions'])

transition = None
for previous, current in zip(by_workload, by_workload[1:]):
    if previous['winner'] == 'CPU' and current['winner'] == 'GPU':
        transition = current
        break

if transition:
    print(f"\nTransition point: {transition['name']}")
    print(f"  → {transition['rows']:,} rows × {transition['functions']:,} functions")
    print(f"  → Complexity: {transition['rows'] * transition['functions']:,}")

print("=" * 100)


CPU vs GPU Benchmark - Device: cuda

Testing: large_heavy
  Rows: 100,000, Functions: 2,000
  Running CPU version... 



✓ 5.5208s
  Running GPU version... ✓ 0.4816s
  Speedup: 11.46x (GPU wins!)

Testing: large_moderate
  Rows: 50,000, Functions: 1,000
  Running CPU version... 



✓ 1.3574s
  Running GPU version... ✓ 0.1545s
  Speedup: 8.79x (GPU wins!)

Testing: large_standard
  Rows: 100,000, Functions: 1,000
  Running CPU version... 



✓ 2.4130s
  Running GPU version... ✓ 0.2855s
  Speedup: 8.45x (GPU wins!)

Testing: medium_balanced
  Rows: 10,000, Functions: 500
  Running CPU version... 



✓ 0.2361s
  Running GPU version... ✓ 0.0246s
  Speedup: 9.61x (GPU wins!)

Testing: medium_complex
  Rows: 20,000, Functions: 1,000
  Running CPU version... 



✓ 0.6445s
  Running GPU version... ✓ 0.0448s
  Speedup: 14.39x (GPU wins!)

Testing: small_medium
  Rows: 5,000, Functions: 200
  Running CPU version... ✓ 0.0674s
  Running GPU version... ✓ 0.0062s
  Speedup: 10.96x (GPU wins!)

Testing: small_simple
  Rows: 1,000, Functions: 100
  Running CPU version... ✓ 0.0321s
  Running GPU version... ✓ 0.0031s
  Speedup: 10.33x (GPU wins!)





Testing: xlarge_extreme
  Rows: 200,000, Functions: 2,000
  Running CPU version... 



✓ 10.9501s
  Running GPU version... ✓ 0.5911s
  Speedup: 18.53x (GPU wins!)

BENCHMARK RESULTS SUMMARY
Test Case            Rows       Funcs    CPU(s)     GPU(s)     Speedup    Winner
----------------------------------------------------------------------------------------------------
large_heavy          100,000    2,000    5.5208     0.4816     11.46     x GPU
large_moderate       50,000     1,000    1.3574     0.1545     8.79      x GPU
large_standard       100,000    1,000    2.4130     0.2855     8.45      x GPU
medium_balanced      10,000     500      0.2361     0.0246     9.61      x GPU
medium_complex       20,000     1,000    0.6445     0.0448     14.39     x GPU
small_medium         5,000      200      0.0674     0.0062     10.96     x GPU
small_simple         1,000      100      0.0321     0.0031     10.33     x GPU
xlarge_extreme       200,000    2,000    10.9501    0.5911     18.53     x GPU

KEY INSIGHTS:
CPU wins: 0/8
GPU wins: 8/8
Best speedup: 18.53x (xlarge_extreme)
Wo