## Hardware Details
[GCP](https://cloud.google.com/) VM: [n1-highmem-16](https://cloud.google.com/compute/docs/machine-types#n1_machine_types) (16 vCPUs, 104 GB memory)

In [1]:
%%bash
# Report the CPU architecture of the machine the benchmarks below run on.
lscpu

In [2]:
%%bash
# Print only the first line of /proc/meminfo; head reads the file directly.
head -n 1 /proc/meminfo

## Basic functions

In [3]:
import pandas as pd
import numpy as np
import random
import string
import gc

from siuba import _, summarize, group_by
from siuba.experimental.pd_groups import fast_summarize

In [4]:
def createTable(rowCount, seed=None):
    """Build a random benchmark table with ``rowCount`` rows.

    Columns:
        bucket: two lowercase ASCII letters, used as the grouping key
            (26 * 26 = 676 possible values, drawn uniformly).
        weight: float drawn uniformly from [0, 2).
        qty:    integer drawn uniformly from [0, 100).
        risk:   integer drawn uniformly from [0, 10).

    Args:
        rowCount: number of rows to generate (0 yields an empty table).
        seed: optional seed for ``numpy.random.default_rng``. Pass a value to
            get the same table on every call; ``None`` (the default) leaves
            the table unseeded.

    Returns:
        pandas.DataFrame with the four columns above, in that order.
    """
    # Reclaim memory the caller has just released (e.g. a previous table
    # dropped with ``del t``) before allocating the new table.
    gc.collect()
    rng = np.random.default_rng(seed)

    # Enumerate the 676 possible keys once and sample them by index instead of
    # building each row's key with a Python-level call. The distribution is
    # the same (uniform over all two-letter keys) but the per-row work is
    # vectorized, which matters at the 100M / 1B row sizes.
    letters = string.ascii_lowercase
    all_buckets = np.array([a + b for a in letters for b in letters], dtype=object)

    return pd.DataFrame({'bucket': all_buckets[rng.integers(all_buckets.size, size=rowCount)],
                         'weight': rng.uniform(0, 2, rowCount),
                         'qty': rng.integers(100, size=rowCount),
                         'risk': rng.integers(10, size=rowCount)})

In [5]:
def executeQueryFastSummarize(t):
    """Aggregate ``t`` per ``bucket`` with siuba's ``fast_summarize``.

    The summary has these columns per bucket:
        NR:          count of ``bucket`` values
        TOTAL_QTY:   sum of ``qty``
        AVG_QTY:     mean of ``qty``
        TOTAL_RISK:  sum of ``risk``
        AVG_RISK:    mean of ``risk``
        W_AVG_QTY:   sum(qty * weight) / sum(weight)
        W_AVG_RISK:  sum(risk * weight) / sum(weight)

    Args:
        t: table with ``bucket``, ``weight``, ``qty`` and ``risk`` columns.

    Returns:
        The result of piping the grouped table into ``fast_summarize``.
    """
    by_bucket = t >> group_by(_.bucket)

    # Keyword order is the order the summary columns are requested in.
    aggregates = {
        'NR': _.bucket.count(),
        'TOTAL_QTY': _.qty.sum(),
        'AVG_QTY': _.qty.mean(),
        'TOTAL_RISK': _.risk.sum(),
        'AVG_RISK': _.risk.mean(),
        'W_AVG_QTY': (_.qty * _.weight).sum() / _.weight.sum(),
        'W_AVG_RISK': (_.risk * _.weight).sum() / _.weight.sum(),
    }
    return by_bucket >> fast_summarize(**aggregates)

In [6]:
# keep executeQueryApply just for reference
def my_agg(x):
    """Summarize one group of rows into a single labelled Series.

    Args:
        x: frame holding one group's rows. It is read through the attributes
            ``bucket``, ``weight``, ``qty`` and ``risk``, so all four columns
            (including the grouping column) must be present.

    Returns:
        pandas.Series indexed by NR, TOTAL_QTY, AVG_QTY, TOTAL_RISK, AVG_RISK,
        W_AVG_QTY and W_AVG_RISK, in that order. The two W_AVG_* entries are
        ``weight``-weighted averages computed with ``numpy.average``.
    """
    qty, risk, weight = x.qty, x.risk, x.weight

    labels = ['NR', 'TOTAL_QTY', 'AVG_QTY', 'TOTAL_RISK',
              'AVG_RISK', 'W_AVG_QTY', 'W_AVG_RISK']
    values = [
        x.bucket.count(),
        qty.sum(),
        qty.mean(),
        risk.sum(),
        risk.mean(),
        np.average(qty, weights=weight),
        np.average(risk, weights=weight),
    ]
    return pd.Series(dict(zip(labels, values)), index=labels)

def executeQueryApply(t):
    """Reference implementation: aggregate ``t`` per ``bucket`` via ``groupby().apply``.

    ``my_agg`` is called once per bucket and the per-group Series are
    assembled into a frame indexed by ``bucket``.

    Args:
        t: table with ``bucket``, ``weight``, ``qty`` and ``risk`` columns.

    Returns:
        pandas.DataFrame with one row per bucket; NR, TOTAL_QTY and TOTAL_RISK
        are cast to int64.
    """
    # NOTE(review): my_agg reads x.bucket, so each group passed to apply must
    # still contain the grouping column - confirm this holds for the pandas
    # version in use.
    per_bucket = t.groupby('bucket').apply(my_agg)

    # Counts and sums are cast back to int64, presumably because returning
    # one Series per group upcasts them alongside the float averages.
    integer_columns = ('NR', 'TOTAL_QTY', 'TOTAL_RISK')
    return per_bucket.astype({column: 'int64' for column in integer_columns})

## Demonstrate equality of methods

In [7]:
from pandas.testing import assert_frame_equal

# Both implementations must produce the same per-bucket aggregates.
t = createTable(10*10000)

# Compare with `bucket` as the index on both sides, so every row is checked
# against the same group key rather than only the row at the same position
# (dropping the key would leave the group labels themselves unverified).
assert_frame_equal(
    executeQueryFastSummarize(t).set_index("bucket"),
    executeQueryApply(t)
)

## Row Number 10k

In [8]:
# 10k-row table used by the two timing cells that follow.
t = createTable(10_000)

In [9]:
# Time the siuba fast_summarize implementation on the current table `t`.
%timeit executeQueryFastSummarize(t)

26.4 ms ± 3.63 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
# Time the groupby().apply reference implementation on the same table `t`.
%timeit executeQueryApply(t)

1.34 s ± 122 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Row Number 100k

In [11]:
# Drop the previous table first: createTable runs gc.collect() before it
# allocates, so the old rows can be reclaimed ahead of the new ones.
del t
t = createTable(100_000)

In [12]:
# Time the siuba fast_summarize implementation on the current table `t`.
%timeit executeQueryFastSummarize(t)

36.9 ms ± 628 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
# Time the groupby().apply reference implementation on the same table `t`.
%timeit executeQueryApply(t)

1.19 s ± 16.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Row Number 1M

In [14]:
# Drop the previous table first: createTable runs gc.collect() before it
# allocates, so the old rows can be reclaimed ahead of the new ones.
del t
t = createTable(1_000_000)

In [15]:
# Time the siuba fast_summarize implementation on the current table `t`.
%timeit executeQueryFastSummarize(t)

175 ms ± 2.78 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
# Time the groupby().apply reference implementation on the same table `t`.
%timeit executeQueryApply(t)

1.65 s ± 9.68 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Row Number 10M

In [17]:
# Drop the previous table first: createTable runs gc.collect() before it
# allocates, so the old rows can be reclaimed ahead of the new ones.
del t
t = createTable(10_000_000)

In [18]:
# Time the siuba fast_summarize implementation on the current table `t`.
%timeit executeQueryFastSummarize(t)

1.51 s ± 18.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
# Time the groupby().apply reference implementation on the same table `t`.
%timeit executeQueryApply(t)

7.2 s ± 460 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Row Number 100M

In [26]:
# Drop the previous table first: createTable runs gc.collect() before it
# allocates, so the old rows can be reclaimed ahead of the new ones.
# NOTE(review): the saved run of this cell was interrupted (see the
# KeyboardInterrupt output), so `t` may be undefined for the cells below.
del t
t = createTable(100_000_000)

KeyboardInterrupt: 

In [None]:
# -n 1 -r 10: one call per measurement, ten measurements (fast_summarize path).
%timeit -n 1 -r 10 executeQueryFastSummarize(t)

In [None]:
# -n 1 -r 10: one call per measurement, ten measurements (groupby().apply path).
%timeit -n 1 -r 10  executeQueryApply(t)

## Row Number 1B

In [None]:
# Drop the previous table first: createTable runs gc.collect() before it
# allocates, so the old rows can be reclaimed ahead of the new ones.
# NOTE(review): this cell has no recorded execution in the saved notebook;
# confirm a 1B-row table fits in memory on the target machine before running.
del t
t = createTable(1_000_000_000)

In [None]:
# -n 1 -r 10: one call per measurement, ten measurements (fast_summarize path).
%timeit -n 1 -r 10 executeQueryFastSummarize(t)

In [None]:
# -n 1 -r 10: one call per measurement, ten measurements (groupby().apply path).
%timeit -n 1 -r 10 executeQueryApply(t)