In [2]:
import pandas as pd
import numpy as np


# groupby mechanics

## 分类统计

In [3]:
# Small demo frame: two label columns to group on plus two columns of
# standard-normal noise (unseeded, so the values change on every run).
df1 = pd.DataFrame(
    {
        'key1': list('aabba'),
        'key2': ['one', 'two'] * 2 + ['one'],
        'data1': np.random.randn(5),
        'data2': np.random.randn(5),
    }
)
df1

Unnamed: 0,data1,data2,key1,key2
0,-0.048543,-0.022294,a,one
1,-2.138261,-0.755281,a,two
2,-1.089498,0.055245,b,one
3,-0.942707,-0.269355,b,two
4,-1.319833,-1.617892,a,one


In [14]:
# Mean of data1 for every (key1, key2) combination; the result is a Series
# named 'data1' indexed by a two-level (key1, key2) MultiIndex.
mean1 = df1.groupby(['key1', 'key2'])['data1'].mean()
mean1

key1  key2
a     one    -0.684188
      two    -2.138261
b     one    -1.089498
      two    -0.942707
Name: data1, dtype: float64

In [17]:
# Mean of data1 per (key1, key2) pair, with key2 pivoted into the columns.
# as_index=True is the default and group_keys only matters for apply(), so
# both are left out without changing the result.
(
    df1[['key1', 'data1', 'key2']]
    .groupby(['key1', 'key2'])
    .mean()
    .unstack()
)

Unnamed: 0_level_0,data1,data1
key2,one,two
key1,Unnamed: 1_level_2,Unnamed: 2_level_2
a,-0.684188,-2.138261
b,-1.089498,-0.942707


In [15]:
# Pivot the innermost index level (key2) of the grouped means into columns.
mean1.unstack(level='key2')

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.684188,-2.138261
b,-1.089498,-0.942707


In [19]:
# Group keys do not have to be columns of the frame: any sequences with one
# entry per row work. Here data1 is averaged per (state, year) pair.
states = ['ohio', 'california', 'california', 'ohio', 'ohio']
years = np.array([2005, 2005, 2006, 2005, 2006])
df1['data1'].groupby([states, years]).mean()

california  2005   -2.138261
            2006   -1.089498
ohio        2005   -0.495625
            2006   -1.319833
Name: data1, dtype: float64

In [29]:
# A simpler approach: pass the column names and count the non-null values
# of every remaining column per (key1, key2) group.
df1.groupby(['key1', 'key2']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,2,2
a,two,1,1
b,one,1,1
b,two,1,1


In [32]:
# Number of rows in each (key1, key2) group, returned as a Series.
df1.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

## 迭代分出来的类

In [34]:
# Iterating a GroupBy yields (key, sub-frame) pairs; with two grouping
# columns the key is a tuple such as ('a', 'one').
for name, group in df1.groupby(['key1', 'key2']):
    print(name, group, sep='\n')

('a', 'one')
      data1     data2 key1 key2
0 -0.048543 -0.022294    a  one
4 -1.319833 -1.617892    a  one
('a', 'two')
      data1     data2 key1 key2
1 -2.138261 -0.755281    a  two
('b', 'one')
      data1     data2 key1 key2
2 -1.089498  0.055245    b  one
('b', 'two')
      data1     data2 key1 key2
3 -0.942707 -0.269355    b  two


In [39]:
# Materialise the groups as a {key1 value: sub-frame} dict, then look up
# the rows belonging to key1 == 'b'.
pieces = {key: frame for key, frame in df1.groupby('key1')}
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,-1.089498,0.055245,b,one
3,-0.942707,-0.269355,b,two


In [41]:
# Show the dtype of each df1 column; these dtypes serve as the grouping key
# in the next cell.
df1.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [46]:
# Split df1's columns by dtype and display the sub-frame of the first group.
# DataFrame.groupby(..., axis=1) has been deprecated since pandas 2.1, so the
# column labels are grouped by dtype name instead and each group of labels is
# then used to select the matching columns.
column_dtypes = df1.dtypes.astype(str)  # e.g. 'float64', 'object'
frames_by_dtype = [
    (dtype_name, df1[labels.index.tolist()])
    for dtype_name, labels in column_dtypes.groupby(column_dtypes)
]
# Groups come back sorted by dtype name, so index 0 is the float64 columns.
frames_by_dtype[0][1]

Unnamed: 0,data1,data2
0,-0.048543,-0.022294
1,-2.138261,-0.755281
2,-1.089498,0.055245
3,-0.942707,-0.269355
4,-1.319833,-1.617892


In [50]:
# NOTE(review): `is` tests object identity, and each side of the comparison
# builds a fresh object, so this evaluates to False (as the recorded output
# shows). It does not tell us whether the two selections produce the same
# numbers; to check that, compare their aggregated values instead.
df1.groupby('key1')['data1'] is df1.groupby('key1')[['data1']]# original note: "same result as [['data1']]"

False

In [51]:
# 6x4 frame of standard-normal noise with labelled rows and columns
# (unseeded, so the values change on every run).
df2 = pd.DataFrame(
    np.random.randn(6, 4),
    index='one two three four five six'.split(),
    columns=list('abcd'),
)

In [52]:
# Display the frame that the mapping-based groupby examples below operate on.
df2

Unnamed: 0,a,b,c,d
one,0.011484,0.383645,-0.355455,0.169154
two,-0.792145,-0.532953,-0.982333,1.299576
three,-0.735642,-0.481439,-1.128822,-1.803897
four,-0.943761,-1.252705,-0.357421,0.487231
five,-0.344771,0.373569,0.229179,-0.203846
six,0.61131,-0.214866,0.100822,0.169901


In [57]:
# Column label -> colour group. 'e' matches no column of df2 and is ignored.
mapping = {'a':'red','b':'blue','c':'red','d':'blue','e':'yellow'}
# Row label -> size group. Rows 'five' and 'six' are deliberately unmapped.
mapping2 = {'one':'small','two':'small','three':'small','four':'large'}
# Average the columns within each colour group. groupby(..., axis=1) has been
# deprecated since pandas 2.1; the supported spelling is to transpose, group
# the (former) column labels as rows, and transpose back.
df2.T.groupby(mapping).mean().T

Unnamed: 0,blue,red
one,0.276399,-0.171985
two,0.383312,-0.887239
three,-1.142668,-0.932232
four,-0.382737,-0.650591
five,0.084862,-0.057796
six,-0.022483,0.356066


In [60]:
# First average the rows per size group (mapping2), then average the columns
# per colour group (mapping). The column-wise step transposes, groups and
# transposes back because groupby(..., axis=1) has been deprecated since
# pandas 2.1.
(
    df2.groupby(mapping2).mean()
    .T.groupby(mapping).mean()
    .T
)

Unnamed: 0,blue,red
large,-0.382737,-0.650591
small,-0.160986,-0.663819


In [61]:
# The same column -> colour mapping as a Series (dict keys become the index),
# which groupby accepts just like the dict.
s1 = pd.Series(data=mapping)
s1

a       red
b      blue
c       red
d      blue
e    yellow
dtype: object

In [63]:
# Average df2's columns per colour group, using the Series s1 as the grouping
# key. groupby(..., axis=1) has been deprecated since pandas 2.1, so the frame
# is transposed, grouped on its (former) column labels, and transposed back.
df2.T.groupby(s1).mean().T

Unnamed: 0,blue,red
one,0.276399,-0.171985
two,0.383312,-0.887239
three,-1.142668,-0.932232
four,-0.382737,-0.650591
five,0.084862,-0.057796
six,-0.022483,0.356066


In [66]:
# A function passed as the key is called on each index label; here rows are
# grouped by the length of their label ('one' -> 3, 'three' -> 5, ...).
df2.groupby(by=len).count()

Unnamed: 0,a,b,c,d
3,3,3,3,3
4,2,2,2,2
5,1,1,1,1


In [72]:
# Different kinds of keys can be mixed: group rows by their size label from
# mapping2 and by the length of their index label at the same time.
df2.groupby([mapping2, len]).count()

Unnamed: 0,Unnamed: 1,a,b,c,d
large,4,1,1,1,1
small,3,2,2,2,2
small,5,1,1,1,1


In [94]:
# 5x5 frame of standard-normal noise whose rows carry a two-level index:
# 'cty' (country code) and 'tensor' (an integer label).
country_tensor_index = pd.MultiIndex.from_arrays(
    [['US'] * 3 + ['JP'] * 2, [1, 3, 5, 1, 3]],
    names=['cty', 'tensor'],
)
df3 = pd.DataFrame(
    np.random.normal(loc=0, scale=1, size=(5, 5)),
    index=country_tensor_index,
)

In [98]:
# Count the rows per country by grouping on the 'cty' level of the row
# MultiIndex. Row-wise grouping is the default, and passing the `axis`
# keyword explicitly has been deprecated since pandas 2.1, so it is omitted.
df3.groupby(level='cty').count()

Unnamed: 0_level_0,0,1,2,3,4
cty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
JP,2,2,2,2,2
US,3,3,3,3,3
