In [1]:
import numpy as np
import pandas as pd
import timeit
# from numba import njit
from joblib import Parallel, delayed

### Create `DataFrame`

In [2]:
n_obs = 10**7
# n_categories = 10**4

In [3]:
np.random.seed(0)
df = pd.DataFrame(np.random.randint(0, 100000, size=(n_obs, 4)), columns=list('ABCD'))

In [4]:
np.random.seed(0)
n_categories = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
df['category'] = np.random.choice(n_categories, n_obs, p=[0.25, 0.05, 0.1, 0.15, 0.05, 0.01, 0.03, 0.01, 0.15, 0.2])

In [5]:
df['category'] = df['category'] + '11111111111'

In [6]:
df

Unnamed: 0,A,B,C,D,category
0,68268,43567,42613,45891,D11111111111
1,21243,95939,97639,41993,I11111111111
2,86293,55026,80471,80966,F11111111111
3,48600,39512,52620,80186,D11111111111
4,17089,32230,18983,89688,D11111111111
...,...,...,...,...,...
9999995,49020,65141,60451,63899,J11111111111
9999996,94605,7258,1077,19023,I11111111111
9999997,21936,66721,50069,82394,I11111111111
9999998,51091,51063,80563,35622,D11111111111


## Faster operations without Pandas: The Groupby Class

In [7]:
class Groupby:
    def __init__(self, dataframe, key_name):
        self.dataframe = dataframe
        self.key_name = key_name
        self.key, self.index_of_key = np.unique(self.dataframe[self.key_name], return_inverse = True)
        self.set_indices()
        
    def set_indices(self):
        self.indices = [ [] for i in range( len(self.key) ) ]  # 不可以 [[]]*n 因為這樣 list 裡面的 list 會指向同一個物件
        for idx, val in enumerate(self.index_of_key):
            self.indices[val].append(idx)
            
        self.indices = [np.array(elt) for elt in self.indices]  # 有了這步會讓運算速度差很多
         
        
    def apply(self, function, vector, broadcast):
        if broadcast:
            result = np.full(len(vector), np.nan)
            for idx in self.indices:
                result[idx] = function(vector[idx])
            
        else:
            result = {}
            for k, idx in enumerate(self.indices):
                result[self.key[k]] = function(vector[idx])

        return result

## Homemade Class

In [8]:
%%time
grouped = Groupby(df, 'category')

CPU times: user 10.9 s, sys: 128 ms, total: 11.1 s
Wall time: 11.1 s


In [9]:
%%time
temp = grouped.apply(np.mean, df['A'], broadcast=False)

CPU times: user 201 ms, sys: 51.8 ms, total: 253 ms
Wall time: 252 ms


In [10]:
temp

{'A11111111111': 50019.983059673286,
 'B11111111111': 50074.96950511512,
 'C11111111111': 50017.36289841316,
 'D11111111111': 49970.515621664745,
 'E11111111111': 50051.83476501165,
 'F11111111111': 50015.94665474744,
 'G11111111111': 50076.83655810439,
 'H11111111111': 49821.369441471004,
 'I11111111111': 50007.956103722565,
 'J11111111111': 50024.362795893525}

In [11]:
%%time
temp = grouped.apply(list, df['A'], broadcast=False)

CPU times: user 602 ms, sys: 70.1 ms, total: 672 ms
Wall time: 671 ms


In [12]:
temp['A11111111111'][0:10]

[20006, 96591, 20737, 50624, 84355, 98611, 99438, 469, 38040, 84523]

## Pandas

In [13]:
%%time
grouped = df.groupby('category')

CPU times: user 96 µs, sys: 3.21 ms, total: 3.31 ms
Wall time: 3.32 ms


In [14]:
%%time
temp = grouped['A'].apply(np.mean)

CPU times: user 405 ms, sys: 47.1 ms, total: 452 ms
Wall time: 451 ms


In [15]:
temp

category
A11111111111    50019.983060
B11111111111    50074.969505
C11111111111    50017.362898
D11111111111    49970.515622
E11111111111    50051.834765
F11111111111    50015.946655
G11111111111    50076.836558
H11111111111    49821.369441
I11111111111    50007.956104
J11111111111    50024.362796
Name: A, dtype: float64

In [16]:
%%time
temp = grouped['A'].apply(list)

CPU times: user 533 ms, sys: 50 ms, total: 583 ms
Wall time: 583 ms


In [17]:
temp

category
A11111111111    [20006, 96591, 20737, 50624, 84355, 98611, 994...
B11111111111    [56894, 34009, 44889, 55913, 99036, 48798, 664...
C11111111111    [797, 40800, 82127, 94201, 63413, 90718, 39335...
D11111111111    [68268, 48600, 17089, 67699, 41504, 44259, 138...
E11111111111    [11723, 23306, 57368, 10100, 49747, 27479, 846...
F11111111111    [86293, 11605, 79285, 88009, 96622, 1918, 4209...
G11111111111    [55153, 33920, 90749, 18728, 60565, 2173, 4247...
H11111111111    [82457, 13729, 19991, 3703, 79391, 51284, 9814...
I11111111111    [21243, 90535, 86107, 7012, 79701, 51939, 5971...
J11111111111    [83966, 47954, 92288, 80163, 19340, 4420, 8638...
Name: A, dtype: object