In [1]:
import pandas as pd
import numpy as np
import random
import string
import gc

In [2]:
def createTable(rowCount):
    gc.collect()
    return pd.DataFrame({'bucket': [''.join(random.choices(string.ascii_lowercase, k=2)) for _ in range(rowCount)],
                  'weight': [random.uniform(0, 2) for _ in range(rowCount)],
                  'qty': [random.randint(0, 100) for _ in range(rowCount)],
                  'risk': [random.randint(0, 10) for _ in range(rowCount)]})

In [3]:
def fn(t):
    res = t.groupby('bucket').agg({'bucket': len, 'qty': [sum, np.mean], 'risk': [sum, np.mean]})
    res.columns = res.columns.map('_'.join)
    return res.rename(columns={'bucket_len':'NR', 'qty_sum':'TOTAL_QTY','qty_mean':'AVG_QTY', 
                        'risk_sum':'TOTAL_RISK','risk_mean':'AVG_RISK'}).join(
        t.groupby('bucket').apply(lambda g: np.average(g.qty, weights=g.weight)).to_frame('W_AVG_QTY')).join(
        t.groupby('bucket').apply(lambda g: np.average(g.risk, weights=g.weight)).to_frame('W_AVG_RISK'))


In [4]:
def my_agg(x):
    data = {'NR': x.bucket.count(),
            'TOTAL_QTY': x.qty.sum(),
            'AVG_QTY': x.qty.mean(),
            'TOTAL_RISK': x.risk.sum(),
            'AVG_RISK': x.risk.mean(),
            'W_AVG_QTY':  np.average(x.qty, weights=x.weight),
            'W_AVG_RISK':  np.average(x.risk, weights=x.weight)
           }
    return pd.Series(data, index=['NR', 'TOTAL_QTY', 'AVG_QTY', 'TOTAL_RISK', 
                                  'AVG_RISK', 'W_AVG_QTY', 'W_AVG_RISK'])

## Row Number 10k

In [5]:
t = createTable(10 * 1000)

In [6]:
%timeit fn(t)

298 ms ± 3.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%timeit t.groupby('bucket').apply(my_agg).astype({'NR': 'int64', 'TOTAL_QTY': 'int64', 'TOTAL_RISK': 'int64'})

1.04 s ± 1.69 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Row Number 100k

In [8]:
t = createTable(100 * 1000)

In [9]:
%timeit fn(t)

351 ms ± 5.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%timeit t.groupby('bucket').apply(my_agg).astype({'NR': 'int64', 'TOTAL_QTY': 'int64', 'TOTAL_RISK': 'int64'})

1.09 s ± 6.36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Row Number 1M

In [11]:
t = createTable(1000 * 1000)

In [12]:
%timeit fn(t)

1.1 s ± 6.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%timeit t.groupby('bucket').apply(my_agg).astype({'NR': 'int64', 'TOTAL_QTY': 'int64', 'TOTAL_RISK': 'int64'})

1.53 s ± 27.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Row Number 10M

In [14]:
t = createTable(10 * 1000 * 1000)

In [15]:
%timeit fn(t)

14.5 s ± 97.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%timeit t.groupby('bucket').apply(my_agg).astype({'NR': 'int64', 'TOTAL_QTY': 'int64', 'TOTAL_RISK': 'int64'})

7.97 s ± 68.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Row Number 100M

In [17]:
t = createTable(100 * 1000 * 1000)

In [18]:
%timeit fn(t)

2min 17s ± 1 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%timeit t.groupby('bucket').apply(my_agg).astype({'NR': 'int64', 'TOTAL_QTY': 'int64', 'TOTAL_RISK': 'int64'})

1min 9s ± 212 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Row Number 1000M

In [None]:
t = createTable(1000 * 1000 * 1000)

In [None]:
%timeit fn(t)

In [None]:
%timeit t.groupby('bucket').apply(my_agg).astype({'NR': 'int64', 'TOTAL_QTY': 'int64', 'TOTAL_RISK': 'int64'})