Groupby操作延伸

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Seed NumPy's global RNG so the random columns (and every result derived
# from them) are identical on each Restart & Run All.
np.random.seed(42)

# Demo frame: two categorical key columns (A, B) and two numeric columns
# (C, D) filled with standard-normal draws.
df = pd.DataFrame({'A':['foo','bar','foo','bar','foo','bar','foo','foo'],
                  'B':['one','one','two','three','two','two','one','three'],
                  'C':np.random.randn(8),
                  'D':np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,0.301242,2.243248
1,bar,one,2.448059,-0.386413
2,foo,two,-0.077218,-1.04083
3,bar,three,0.055938,-1.174322
4,foo,two,0.454188,0.518382
5,bar,two,-0.208626,-0.534132
6,foo,one,-0.088971,-1.072454
7,foo,three,-1.646909,0.204324


In [6]:
# Split df into groups keyed by the values of column A.
grouped = df.groupby('A')

In [8]:
# Number of non-null entries per remaining column within each group.
grouped.count()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,3,3,3
foo,5,5,5


In [10]:
# Group on the combination of two key columns; each distinct (A, B) pair is a group.
grouped = df.groupby(['A','B'])

In [11]:
# Number of non-null entries per remaining column within each group.
grouped.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,1
bar,three,1,1
bar,two,1,1
foo,one,2,2
foo,three,1,1
foo,two,2,2


In [12]:
def get_letter_type(letter):
    """Classify a label as a vowel or a non-vowel.

    Parameters
    ----------
    letter : str
        The label to classify; compared case-insensitively.

    Returns
    -------
    str
        'a' when ``letter`` is exactly one of the vowels a/e/i/o/u,
        otherwise 'b'.
    """
    # Use set membership rather than a substring test: `x in 'aeiou'` is
    # also True for '' and for runs such as 'ei', which are not single vowels.
    if letter.lower() in {'a', 'e', 'i', 'o', 'u'}:
        return 'a'
    return 'b'
# Group the *columns* of df by the label get_letter_type assigns to each
# column name. `groupby(..., axis=1)` is deprecated in recent pandas, so the
# frame is transposed and its (former column) index is grouped instead.
grouped = df.T.groupby(get_letter_type)
# Transpose the counts back so that row 0 of the original frame is selected:
# how many of its columns fall under each label.
grouped.count().T.iloc[0]

a    1
b    3
Name: 0, dtype: int64

In [17]:
# Series whose index labels repeat (8, 7, 5 each occur twice), so that
# grouping by index level has something to combine.
s = pd.Series(data=[1, 2, 3] * 2, index=[8, 7, 5] * 2)
s

8    1
7    2
5    3
8    1
7    2
5    3
dtype: int64

In [18]:
# Group the Series by its index labels (index level 0).
grouped = s.groupby(level=0)

In [19]:
# First value encountered within each group.
grouped.first()

5    3
7    2
8    1
dtype: int64

In [20]:
# Last value encountered within each group.
grouped.last()

5    3
7    2
8    1
dtype: int64

In [21]:
# Sum of the values within each group.
grouped.sum()

5    6
7    4
8    2
dtype: int64

In [22]:
# Small frame with a two-valued key column X and an integer column Y,
# used to demonstrate pulling out a single group.
df2 = pd.DataFrame({
    'X': list('ABAB'),
    'Y': list(range(1, 5)),
})
df2

Unnamed: 0,X,Y
0,A,1
1,B,2
2,A,3
3,B,4


In [23]:
# Rows of df2 whose X value is 'A'. The key is passed as a scalar rather
# than a one-element list: with a list key, recent pandas expects the group
# name as a tuple and warns on a bare scalar.
df2.groupby('X').get_group('A')

Unnamed: 0,X,Y
0,A,1
2,A,3


In [24]:
# Rows of df2 whose X value is 'B'. The key is passed as a scalar rather
# than a one-element list: with a list key, recent pandas expects the group
# name as a tuple and warns on a bare scalar.
df2.groupby('X').get_group('B')

Unnamed: 0,X,Y
1,B,2
3,B,4


In [25]:
# Two parallel label lists: the outer labels each appear twice in a row,
# the inner labels alternate 'one'/'two'.
arrays = [
    [name for name in ('bar', 'baz', 'foo', 'qux') for _ in range(2)],
    ['one', 'two'] * 4,
]

In [32]:
# Build a two-level index from the parallel label lists; the levels are
# named 'First' and 'Second' so they can later be referenced by name.
index = pd.MultiIndex.from_arrays(arrays, names=['First','Second'])

In [33]:
# Eight random draws labelled by the two-level index. The values differ on
# every run unless NumPy's global RNG has been seeded beforehand.
s = pd.Series(np.random.randn(8), index=index)
s

First  Second
bar    one      -0.607613
       two       1.764775
baz    one       0.384537
       two      -0.199080
foo    one      -0.260317
       two      -1.563677
qux    one       0.488453
       two      -0.822126
dtype: float64

In [34]:
# Group by the outer index level, selected by position, and sum each group.
grouped = s.groupby(level=0)
grouped.sum()

First
bar    1.157162
baz    0.185458
foo   -1.823994
qux   -0.333673
dtype: float64

In [35]:
# Same grouping as by position 0, but selecting the level by its name.
grouped = s.groupby(level='First')
grouped.sum()

First
bar    1.157162
baz    0.185458
foo   -1.823994
qux   -0.333673
dtype: float64

In [36]:
# Group by the inner index level, selected by position, and sum each group.
grouped = s.groupby(level=1)
grouped.sum()

Second
one    0.005060
two   -0.820108
dtype: float64

In [37]:
# Same grouping as by position 1, but selecting the level by its name.
grouped = s.groupby(level='Second')
grouped.sum()

Second
one    0.005060
two   -0.820108
dtype: float64

In [38]:
# Redisplay the demo frame for reference before the aggregation examples.
df

Unnamed: 0,A,B,C,D
0,foo,one,0.301242,2.243248
1,bar,one,2.448059,-0.386413
2,foo,two,-0.077218,-1.04083
3,bar,three,0.055938,-1.174322
4,foo,two,0.454188,0.518382
5,bar,two,-0.208626,-0.534132
6,foo,one,-0.088971,-1.072454
7,foo,three,-1.646909,0.204324


In [39]:
# Sum the numeric columns for every (A, B) pair; the keys become a
# two-level row index. The aggregation is named by its string alias because
# passing the NumPy callable (np.sum) triggers a FutureWarning in recent pandas.
grouped = df.groupby(['A','B'])
grouped.aggregate('sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,2.448059,-0.386413
bar,three,0.055938,-1.174322
bar,two,-0.208626,-0.534132
foo,one,0.212271,1.170794
foo,three,-1.646909,0.204324
foo,two,0.376971,-0.522447


In [40]:
# as_index=False keeps the group keys as ordinary columns instead of moving
# them into the row index. The aggregation is named by its string alias
# because passing the NumPy callable (np.sum) triggers a FutureWarning in
# recent pandas.
grouped = df.groupby(['A','B'], as_index=False)
grouped.aggregate('sum')

Unnamed: 0,A,B,C,D
0,bar,one,2.448059,-0.386413
1,bar,three,0.055938,-1.174322
2,bar,two,-0.208626,-0.534132
3,foo,one,0.212271,1.170794
4,foo,three,-1.646909,0.204324
5,foo,two,0.376971,-0.522447


In [42]:
# Alternative way to get the keys back as columns: aggregate with the keys
# in the index, then move the index levels back into columns.
df.groupby(['A','B']).sum().reset_index()

Unnamed: 0,A,B,C,D
0,bar,one,2.448059,-0.386413
1,bar,three,0.055938,-1.174322
2,bar,two,-0.208626,-0.534132
3,foo,one,0.212271,1.170794
4,foo,three,-1.646909,0.204324
5,foo,two,0.376971,-0.522447


In [44]:
# Number of rows in each (A, B) group.
grouped = df.groupby(['A','B'])
grouped.size()

A    B    
bar  one      1
     three    1
     two      1
foo  one      2
     three    1
     two      2
dtype: int64

In [46]:
# Summary statistics per group for each numeric column; only the first five
# groups are shown.
grouped.describe().head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,C,C,C,C,C,C,C,C,D,D,D,D,D,D,D,D
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
bar,one,1.0,2.448059,,2.448059,2.448059,2.448059,2.448059,2.448059,1.0,-0.386413,,-0.386413,-0.386413,-0.386413,-0.386413,-0.386413
bar,three,1.0,0.055938,,0.055938,0.055938,0.055938,0.055938,0.055938,1.0,-1.174322,,-1.174322,-1.174322,-1.174322,-1.174322,-1.174322
bar,two,1.0,-0.208626,,-0.208626,-0.208626,-0.208626,-0.208626,-0.208626,1.0,-0.534132,,-0.534132,-0.534132,-0.534132,-0.534132,-0.534132
foo,one,2.0,0.106135,0.275922,-0.088971,0.008582,0.106135,0.203689,0.301242,2.0,0.585397,2.344555,-1.072454,-0.243528,0.585397,1.414323,2.243248
foo,three,1.0,-1.646909,,-1.646909,-1.646909,-1.646909,-1.646909,-1.646909,1.0,0.204324,,0.204324,0.204324,0.204324,0.204324,0.204324


In [52]:
# Apply several aggregations to column C at once; each becomes one output
# column. String aliases are used instead of np.sum / np.mean / np.std:
# passing the NumPy callables triggers a FutureWarning in recent pandas, and
# 'std' keeps pandas' sample standard deviation (ddof=1).
grouped = df.groupby('A')
grouped['C'].agg(['sum', 'mean', 'std'])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000023393015E80>


Unnamed: 0_level_0,sum,mean,std
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2.295371,0.765124,1.463456
foo,-1.057668,-0.211534,0.836584


In [70]:
# Sum, mean and standard deviation of column C per group. String aliases are
# used instead of np.sum / np.mean / np.std: passing the NumPy callables
# triggers a FutureWarning in recent pandas, and 'std' keeps pandas' sample
# standard deviation (ddof=1).
grouped['C'].agg(['sum', 'mean', 'std'])

Unnamed: 0_level_0,sum,mean,std
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2.295371,0.765124,1.463456
foo,-1.057668,-0.211534,0.836584
