In [2]:
import numpy as np
import pandas as pd
import timeit
# from numba import njit

In [2]:
df = pd.DataFrame({
    'first category': [0, 1, 2, 0, 1, 1, 0], 
    'y': np.arange(0, .7, .1)
})
df

Unnamed: 0,first category,y
0,0,0.0
1,1,0.1
2,2,0.2
3,0,0.3
4,1,0.4
5,1,0.5
6,0,0.6


In [3]:
group_means = df.groupby('first category')['y'].apply(np.mean)
group_means

first category
0    0.300000
1    0.333333
2    0.200000
Name: y, dtype: float64

In [4]:
df['mean'] = df.groupby('first category')['y'].transform(np.mean)
df

Unnamed: 0,first category,y,mean
0,0,0.0,0.3
1,1,0.1,0.333333
2,2,0.2,0.2
3,0,0.3,0.3
4,1,0.4,0.333333
5,1,0.5,0.333333
6,0,0.6,0.3


In [9]:
n_decimals = 3
n_iters = 100

n_obs = 10**4
n_categories = 10**3

first_category = np.random.choice(n_categories, n_obs)
np.random.seed(2016)
y = np.random.normal(0, 1, n_obs)

df = pd.DataFrame({'first category': first_category,
                   'y': y})
                     
start = time.perf_counter()
grouped = df.groupby('first category')
pandas_answer = grouped.apply(np.mean)
print('time to compute group means once with Pandas: {0}'\
      .format(round(time.perf_counter() - start, n_decimals)))

start = time.perf_counter()
for i in range(n_iters):
    grouped['y'].apply(np.mean)
print('time to compute group means {0} times with Pandas: {1}'\
      .format(n_iters, round(time.perf_counter() - start, n_decimals)))

time to compute group means once with Pandas: 1.105
time to compute group means 100 times with Pandas: 15.737


## Faster operations without Pandas: The Groupby Class

In [10]:
class Groupby:
    def __init__(self, keys):
        _, self.keys_as_int = np.unique(keys, return_inverse = True)
        self.n_keys = max(self.keys_as_int) + 1
        self.set_indices()
        
    def set_indices(self):
        self.indices = [[] for i in range(self.n_keys)]
        for i, k in enumerate(self.keys_as_int):
            self.indices[k].append(i)
        self.indices = [np.array(elt) for elt in self.indices]
        
    def apply(self, function, vector, broadcast):
        if broadcast:
            result = np.zeros(len(vector))
            for idx in self.indices:
                result[idx] = function(vector[idx])
        else:
            result = np.zeros(self.n_keys)
            for k, idx in enumerate(self.indices):
                result[self.keys_as_int[k]] = function(vector[idx])

        return result

In [37]:
n_obs = 10**7
n_categories = 10**4

In [38]:
df = pd.DataFrame(np.random.randint(0, 100, size=(n_obs, 4)), columns=list('ABCD'))

In [39]:
df['category'] = np.random.choice(n_categories, n_obs)

In [40]:
df

Unnamed: 0,A,B,C,D,category
0,64,62,76,2,9424
1,28,86,82,11,579
2,5,45,15,89,8000
3,84,94,48,80,7576
4,41,88,4,80,1841
...,...,...,...,...,...
9999995,45,96,49,21,7233
9999996,97,46,94,7,2529
9999997,64,53,50,76,9645
9999998,91,17,26,12,1573


In [62]:
%%time
grouped = Groupby(df['category'])

Wall time: 11.8 s


<__main__.Groupby at 0x204b44ea948>

In [65]:
%%time
grouped.apply(np.mean, df['A'], broadcast=True)

Wall time: 635 ms


array([54.8       , 48.33333333, 41.77777778, ...,  0.        ,
        0.        ,  0.        ])

In [71]:
%%time
df.groupby('category')['A'].apply(np.mean)

Wall time: 5.72 s


category
0       48.951196
1       49.814556
2       50.042574
3       49.415370
4       49.213402
          ...    
9995    49.621782
9996    48.583333
9997    49.524772
9998    48.487487
9999    48.846686
Name: A, Length: 10000, dtype: float64

In [72]:
df

Unnamed: 0,A,B,C,D,category
0,64,62,76,2,9424
1,28,86,82,11,579
2,5,45,15,89,8000
3,84,94,48,80,7576
4,41,88,4,80,1841
...,...,...,...,...,...
9999995,45,96,49,21,7233
9999996,97,46,94,7,2529
9999997,64,53,50,76,9645
9999998,91,17,26,12,1573


In [5]:
n_obs = 10**7
n_categories = 10**4

In [6]:
df = pd.DataFrame(np.random.randint(0, 100, size=(n_obs, 4)), columns=list('ABCD'))

In [7]:
df['category'] = np.random.choice(n_categories, n_obs)

In [8]:
df

Unnamed: 0,A,B,C,D,category
0,80,26,53,87,97
1,74,49,39,52,9402
2,8,3,58,28,4818
3,38,60,49,88,4817
4,43,87,58,27,6885
...,...,...,...,...,...
9999995,51,59,2,94,3475
9999996,56,8,61,39,7673
9999997,44,78,60,93,9701
9999998,72,67,18,42,9325


In [9]:
# @njit
def groupby_apply_njit(data, func):
    """
    The first column of data is group id. The second is y and the third is x.
    """
    ngroups = int(data[-1,0])+1   # Number of groups
    nrows = data.shape[0]    # Number of rows
    reslist = []
    istart = 0
    for k in range(ngroups):
        # Find start and end rows of the group
        # (istart point to the start and iend-1 point to the end
        iend = istart + 1
        while iend < nrows and data[iend-1,0] == data[iend,0]:
            iend += 1
        # Apply the function to the numpy array in the group
        res = func(data[istart:iend,1:])
        reslist.append(np.hstack((np.array([k]), res)))
        # Move to the next group
        istart = iend
    return reslist

In [49]:
%%time
df = df.sort_values(by=['category'])
# lambda x: np.mean(x, axis=0)
temp = groupby_apply(df[['category', 'A', 'B']].to_numpy(), lambda x: list(x))

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)

In [41]:
df

Unnamed: 0,A,B,C,D,category
6732922,73,27,24,61,0
3242494,34,44,23,5,0
2143077,81,53,87,44,0
4285886,63,14,28,47,0
2991932,59,7,81,34,0
...,...,...,...,...,...
8021493,72,75,4,26,9999
1797011,21,38,88,63,9999
753926,36,78,57,93,9999
5653644,33,81,46,89,9999


In [48]:
temp

[array([0, <class 'list'>], dtype=object),
 array([1, <class 'list'>], dtype=object),
 array([2, <class 'list'>], dtype=object),
 array([3, <class 'list'>], dtype=object),
 array([4, <class 'list'>], dtype=object),
 array([5, <class 'list'>], dtype=object),
 array([6, <class 'list'>], dtype=object),
 array([7, <class 'list'>], dtype=object),
 array([8, <class 'list'>], dtype=object),
 array([9, <class 'list'>], dtype=object),
 array([10, <class 'list'>], dtype=object),
 array([11, <class 'list'>], dtype=object),
 array([12, <class 'list'>], dtype=object),
 array([13, <class 'list'>], dtype=object),
 array([14, <class 'list'>], dtype=object),
 array([15, <class 'list'>], dtype=object),
 array([16, <class 'list'>], dtype=object),
 array([17, <class 'list'>], dtype=object),
 array([18, <class 'list'>], dtype=object),
 array([19, <class 'list'>], dtype=object),
 array([20, <class 'list'>], dtype=object),
 array([21, <class 'list'>], dtype=object),
 array([22, <class 'list'>], dtype=object)

In [38]:
temp

[array([ 0.        , 49.80320856, 51.57860963]),
 array([ 1.        , 49.77077077, 50.95495495]),
 array([ 2.        , 48.66830226, 50.24435721]),
 array([ 3.        , 49.00996016, 48.70916335]),
 array([ 4.       , 48.5106599, 48.4213198]),
 array([ 5.        , 49.75834176, 50.48129424]),
 array([ 6.        , 49.96028513, 49.02342159]),
 array([ 7.        , 49.95643756, 49.45982575]),
 array([ 8.        , 49.96908213, 48.10434783]),
 array([ 9.        , 48.23036649, 50.66492147]),
 array([10.        , 49.5050813 , 51.02337398]),
 array([11.        , 50.77723735, 48.35603113]),
 array([12.        , 48.39656912, 50.61049445]),
 array([13.        , 51.25609756, 49.2449187 ]),
 array([14.        , 51.07021063, 49.60882648]),
 array([15.        , 48.3781344 , 47.60381143]),
 array([16.        , 49.94305019, 49.05212355]),
 array([17.        , 50.89020772, 49.27002967]),
 array([18.        , 48.91173305, 50.87405813]),
 array([19.        , 49.28367347, 49.46632653]),
 array([20.        , 49

In [32]:
temp = pd.DataFrame(temp, columns=['category', 'A', 'B'])

ValueError: 3 columns passed, passed data had 2 columns

In [None]:
temp['category'] = temp['category'].astype('int64')

In [None]:
temp.head()

In [24]:
%%time
temp2 = df.groupby('category')[['A', 'B']].apply(lambda x: np.mean(x, axis=0))
temp2 = temp2.reset_index()

Wall time: 9.35 s


In [25]:
temp2.head()

Unnamed: 0,category,A,B
0,0,49.803209,51.57861
1,1,49.770771,50.954955
2,2,48.668302,50.244357
3,3,49.00996,48.709163
4,4,48.51066,48.42132


In [26]:
pd.testing.assert_frame_equal(temp, temp2)

In [47]:
first_category = np.random.choice(n_categories, n_obs)
np.random.seed(2016)
y = np.random.normal(0, 1, n_obs)

In [53]:
len(np.unique(first_category))

10000

In [54]:
len(first_category)

10000000

In [49]:
y

array([ 0.29485409,  0.50803768,  0.7047403 , ...,  0.27281381,
        0.60083149, -1.25487632])

In [11]:
unique_keys, indices = np.unique(['a', 'b', 'a', 'a', 'b'], return_inverse = True)
# array([{'b', 'a'}], dtype=object)
print(unique_keys) 
# [0, 1, 0, 0, 1]
print(indices) 
# array(['a', 'b', 'a', 'a', 'b'], dtype='<U1')
print(unique_keys[indices]) 

['a' 'b']
[0 1 0 0 1]
['a' 'b' 'a' 'a' 'b']


In [20]:
start = timeit.timeit()
grouped = Groupby(first_category)

group_means = Groupby(first_category).apply(np.mean, y, broadcast=False)
print('time to compute group means once with Grouped: {0}'\
      .format(round(timeit.timeit() - start, n_decimals)))

start = timeit.timeit()
grouped = Groupby(first_category)
for i in range(n_iters):
    grouped.apply(np.mean, y, broadcast=False)
    
print('time to compute group means {0} times with Grouped: {1}'\
      .format(n_iters, timeit.timeit()-start, n_decimals))

time to compute group means once with Grouped: -0.007
time to compute group means 100 times with Grouped: 0.0022116999999752807
