# In [None]:
import time
import pandas as pd
import numpy as np
import os
import sys

# In [None]:
# Ensure the project root (the parent of this file's directory) is on the
# import path so the ``src`` package can be imported below.
# ``__file__`` is not defined in interactive/notebook sessions, so fall back
# to the current working directory there instead of raising NameError.
try:
    _here = os.path.dirname(__file__)
except NameError:
    _here = os.getcwd()
sys.path.append(os.path.join(_here, '..'))

from src.exact_engine import ExactQueryEngine
from src.approx_engine import ApproxQueryEngine

def relative_error(df_true, df_est, key_col, val_col_true, val_col_est):
    """Return the mean per-group relative error of an estimate.

    The two frames are left-joined on ``key_col`` so that every group present
    in ``df_true`` contributes to the mean.

    Args:
        df_true: DataFrame holding the exact values in ``val_col_true``.
        df_est: DataFrame holding the estimated values in ``val_col_est``.
        key_col: Column name (or list of names) to join the frames on.
        val_col_true: Name of the exact-value column in ``df_true``.
        val_col_est: Name of the estimated-value column in ``df_est``.

    Returns:
        The mean over all groups of ``|est - true| / |true|``. Groups missing
        from ``df_est`` are treated as an estimate of 0, and groups whose true
        value is 0 contribute an error of 0.
    """
    merged = df_true.merge(df_est, on=key_col, how='left')
    # A group absent from the estimate counts as an estimate of zero.
    estimated = merged[val_col_est].fillna(0)
    actual = merged[val_col_true]
    nonzero = actual != 0
    # Divide by the magnitude of the true value: dividing by the signed value
    # yields negative "errors" for negative truths, which cancel in the mean.
    rel_err = ((estimated - actual).abs() / actual.abs()).where(nonzero, 0.0)
    return rel_err.mean()

def run_benchmark(path='data/sample_data.csv', agg='SUM', col='value', group_by=['group'], out_csv='benchmarks/results.csv'):
    """Benchmark the approximate engine against the exact engine.

    For each sampling fraction, an ``ApproxQueryEngine`` is built and queried,
    the elapsed time (construction plus query) is recorded, and the mean
    relative error against the exact result is computed. One line per
    fraction is printed and all rows are written to ``out_csv``.

    Args:
        path: CSV file to load the data from.
        agg: Aggregate to benchmark: 'SUM', 'COUNT', 'AVG' or 'MEAN'.
        col: Column passed to the engines as the value column.
        group_by: Grouping columns passed to the engines and used as the join
            key for the error computation. NOTE: the default is a shared list
            object; it is never mutated here.
        out_csv: Destination CSV for the results table.

    Returns:
        DataFrame with columns ``frac``, ``time_sec`` and ``rel_error``.

    Raises:
        ValueError: If ``agg`` is not one of the supported aggregates. This is
            checked before any data is loaded or queried.
    """
    # Result column names for each aggregate: (exact column, estimate column).
    # NOTE(review): these presumably match what the two engines emit -- confirm
    # against src.exact_engine / src.approx_engine.
    columns_by_agg = {
        'SUM': ('sum', 'est_sum'),
        'COUNT': ('count', 'est_count'),
        'AVG': ('avg', 'sample_mean'),
        'MEAN': ('avg', 'sample_mean'),
    }
    # Validate up front: the mapping does not depend on the sampling fraction,
    # and failing here avoids running the queries only to reject ``agg`` later.
    if agg not in columns_by_agg:
        raise ValueError('agg not supported in benchmark')
    val_true, val_est = columns_by_agg[agg]

    df = pd.read_csv(path)
    exact = ExactQueryEngine(df)
    true = exact.query(agg, col=col, group_by=group_by)

    fractions = [0.01, 0.02, 0.05, 0.1, 0.2]
    results = []

    for f in fractions:
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        approx = ApproxQueryEngine(df, sample_frac=f)
        est = approx.query(agg, col=col, group_by=group_by)
        dur = time.perf_counter() - start
        err = relative_error(true, est, key_col=group_by, val_col_true=val_true, val_col_est=val_est)
        results.append({'frac': f, 'time_sec': dur, 'rel_error': err})
        print(f'frac={f:.2f}: time={dur:.4f}s rel_error={err:.4f}')

    df_results = pd.DataFrame(results)
    # Create the output directory if needed so to_csv does not fail on a
    # fresh checkout.
    out_dir = os.path.dirname(out_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_results.to_csv(out_csv, index=False)
    print(f'\nSaved benchmark results to {out_csv}')
    return df_results

if __name__ == '__main__':
    # Run the benchmark with its default arguments when executed as a script.
    run_benchmark()