In [None]:
import pandas as pd
import numpy as np
from typing import List, Optional
from .sampler import uniform_sample

In [None]:
class ApproxQueryEngine:
    """Answer COUNT / SUM / AVG queries approximately from a row sample.

    The engine keeps a reference to the full frame and a sample drawn from it
    with ``uniform_sample``. Totals computed on the sample are scaled by
    ``1 / sample_frac`` to estimate the corresponding totals on the full
    frame; means are reported directly from the sample.

    Attributes are plain and public: ``df`` (the full frame), ``sample_frac``,
    ``random_state`` and ``sample`` (the current sample).
    """

    def __init__(self, df: pd.DataFrame, sample_frac: float = 0.1, random_state: int = 42):
        """Store the frame and sampling parameters and draw the initial sample.

        Args:
            df: Full data set to approximate.
            sample_frac: Value forwarded to ``uniform_sample`` as ``frac`` and
                used as the scaling factor in ``query`` (presumably the
                fraction of rows kept -- confirm against ``uniform_sample``).
            random_state: Seed forwarded to ``uniform_sample``.
        """
        self.df = df
        self.sample_frac = sample_frac
        self.random_state = random_state
        # Precompute sample for the prototype (can be refreshed/streamed later)
        self.sample = uniform_sample(df, frac=sample_frac, random_state=random_state)

    def refresh_sample(self, frac: Optional[float] = None) -> None:
        """Redraw ``self.sample`` from ``self.df``.

        Args:
            frac: New sampling fraction. When given it replaces
                ``self.sample_frac`` before the sample is redrawn; when
                ``None`` the current fraction is reused.

        NOTE(review): the stored ``random_state`` is passed unchanged, so if
        ``uniform_sample`` is deterministic for a given seed the redraw will
        presumably return the same rows unless ``frac`` changes -- confirm.
        """
        if frac is not None:
            self.sample_frac = frac
        self.sample = uniform_sample(self.df, frac=self.sample_frac, random_state=self.random_state)

    def query(
        self,
        agg: str,
        col: Optional[str] = None,
        group_by: Optional[List[str]] = None,
        where: Optional[str] = None,
    ) -> pd.DataFrame:
        """Estimate an aggregate from the current sample.

        Args:
            agg: Aggregate name, case-insensitive: ``COUNT``, ``SUM``, or
                ``AVG`` / ``MEAN``.
            col: Column to aggregate. Required for SUM and AVG; ignored by
                COUNT, which counts rows.
            group_by: Columns to group by. ``None`` or an empty list yields a
                single-row, ungrouped result.
            where: Optional filter expression handed to ``DataFrame.query``
                and applied to the sample before aggregating.

        Returns:
            A DataFrame. Grouped results have one row per group and start
            with the group-by columns; ungrouped results have one row.

            * COUNT: ``sample_count``, ``est_count``, ``se`` (grouped results
              also carry ``rel_error_pct``).
            * SUM: ``sample_sum``, ``est_sum``, ``se`` (grouped results also
              carry ``sample_count`` and ``sample_var``).
            * AVG: ``sample_mean``, ``se`` (grouped results also carry
              ``sample_count`` and ``sample_var``).

        Raises:
            ValueError: If ``agg`` is not recognised, or SUM / AVG is
                requested without ``col``.
        """
        s = self.sample
        if where is not None:
            s = s.query(where)

        gb = s.groupby(group_by) if group_by else None
        f = self.sample_frac

        agg = agg.upper()
        if agg == 'COUNT':
            return self._estimate_count(s, gb, f)

        if agg == 'SUM':
            if col is None:
                raise ValueError('SUM needs a column')
            return self._estimate_sum(s, gb, col, f)

        if agg in ('AVG', 'MEAN'):
            if col is None:
                raise ValueError('AVG needs a column')
            return self._estimate_mean(s, gb, col)

        raise ValueError(f'Unknown agg: {agg}')

    @staticmethod
    def _estimate_count(sample, grouped, frac):
        """Scale sample row counts by ``1 / frac``, with a simple SE."""
        if grouped is not None:
            c = grouped.size().reset_index(name='sample_count')
            c['est_count'] = (c['sample_count'] / frac).round().astype(int)
            # simple approximate standard error for count estimate
            c['se'] = np.sqrt(c['sample_count'] * (1 - frac)) / frac
            # NOTE(review): this compares the rounded estimate with the
            # unrounded one, so it only reflects rounding, not sampling
            # error. Kept as-is because callers may read the column.
            c['rel_error_pct'] = (
                np.abs(c['est_count'] - c['sample_count'] / frac)
                / c['est_count'].replace(0, np.nan) * 100
            )
            return c

        n_rows = len(sample)
        return pd.DataFrame([{
            'sample_count': n_rows,
            'est_count': int(round(n_rows / frac)),
            'se': np.sqrt(n_rows * (1 - frac)) / frac,
        }])

    @staticmethod
    def _estimate_sum(sample, grouped, col, frac):
        """Scale the sample sum of ``col`` by ``1 / frac``, with a heuristic SE."""
        # Rough SE using the sample variance:
        #   se(sum) ~ sqrt(sample_var * n_sample) * (1 / frac)
        # This is a simple heuristic for the prototype.
        if grouped is not None:
            values = grouped[col]
            sample_sum = values.sum()
            # All per-group Series share the groupby index, so they align
            # row-for-row without explicit merges.
            out = pd.DataFrame({
                'sample_sum': sample_sum,
                'est_sum': sample_sum / frac,
                'sample_count': values.count(),
                'sample_var': values.var(ddof=0),
            })
            out['se'] = np.sqrt(out['sample_var'] * out['sample_count']) / frac
            return out.reset_index()

        values = sample[col]
        sample_sum = values.sum()
        # Count non-null values only: sum() and var() skip NaN, and the
        # grouped path uses count() as well, so len(sample) would overstate n.
        n_valid = values.count()
        se = np.sqrt(values.var(ddof=0) * n_valid) / frac
        return pd.DataFrame([{'sample_sum': sample_sum, 'est_sum': sample_sum / frac, 'se': se}])

    @staticmethod
    def _estimate_mean(sample, grouped, col):
        """Report the sample mean of ``col`` with SE ``sqrt(var / n)``."""
        # The sample mean is reported directly (no 1 / frac scaling).
        if grouped is not None:
            values = grouped[col]
            out = pd.DataFrame({
                'sample_mean': values.mean(),
                'sample_count': values.count(),
                'sample_var': values.var(ddof=0),
            })
            out['se'] = np.sqrt(out['sample_var'] / out['sample_count'])
            return out.reset_index()

        values = sample[col]
        # Non-null count, matching what mean() and var() actually used.
        n_valid = values.count()
        se = np.sqrt(values.var(ddof=0) / n_valid)
        return pd.DataFrame([{'sample_mean': values.mean(), 'se': se}])