In [1]:
%pylab
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


# 第9章 数据聚合与分组运算


对数据集进行分组并对各组应用一个函数（无论是聚合还是转换），这是数据分析工作中的重要环节。在数据集准备好后，通常任务就是**计算分组统计**或**生成透视表**。

pandas提供了一个灵活高效的groupby功能，以一种自然方式对数据集进行切片、切块、摘要等操作。

本章内容：
- 根据一个或多个键（可以是函数、数组或DataFrame列名）拆分pandas对象。
- 计算分组摘要统计，如计数、平均值、标准差，或用户自定义函数。
- 对DataFrame的列应用各种函数
- 应用组内转换或其他运算，如规格化、线性回归、排名或选取子集等
- 计算透视表或交叉表
- 执行分位数分析以及其他分组分析

## GroupBy技术
分组运算术语“split-apply-combine”拆分-应用-合并。
1. pandas对象中的数据根据提供的一个或多个键被**拆分**为多组。
2. 将一个函数**应用**到各个分组并产生一个新值。
3. 所有结果被**合并**到最终的结果对象中。

分组键可以有多种形式，且类型不必相同：
- 列表或数组，其长度与待分组的轴一样
- 表示DataFrame某个列名的值
- 字典或Series，给出待分组轴上的值与分组名之间的对应关系
- 函数，用于处理轴索引或索引中的各个标签

In [2]:
# 以表格型数据集为例
df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
               'key2': ['one', 'two', 'one', 'two', 'one'],
               'data1': np.random.randn(5),
               'data2': np.random.randn(5)})
df

Unnamed: 0,data1,data2,key1,key2
0,-0.121208,-1.250004,a,one
1,1.49758,0.373815,a,two
2,-0.845303,1.454581,b,one
3,-0.451083,0.25801,b,two
4,1.689103,0.574249,a,one


In [3]:
# 按key1分组，并计算data1列的平均值
# 访问data1，根据key1调用groupby
grouped = df['data1'].groupby(df['key1'])
grouped
# grouped是一个GroupBy对象，但没有进行任何计算，只含有一些有关分组键df['key1']的中间数据
# 换句话说，该对象已经有了接下来对各分组执行运算所需的一切信息。

<pandas.core.groupby.SeriesGroupBy object at 0x0316B630>

In [4]:
# 调用GroupBy对象的mean方法来计算平均值
grouped.mean()

key1
a    1.021825
b   -0.648193
Name: data1, dtype: float64

In [5]:
# 一次传入多个数组：
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2
a     one     0.783947
      two     1.497580
b     one    -0.845303
      two    -0.451083
Name: data1, dtype: float64

In [6]:
# unstack()行-》列，默认操作最内级
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.783947,1.49758
b,-0.845303,-0.451083


In [7]:
# 分组键不仅为Series，可以为任何长度适当的数组：
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

df['data1'].groupby([states, years]).mean()

California  2005    1.497580
            2006   -0.845303
Ohio        2005   -0.286145
            2006    1.689103
Name: data1, dtype: float64

In [8]:
# GroupBy的size方法，返回一个含有分组大小的Series：
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

### 对分组进行迭代
GroupBy对象支持迭代，可以产生一组二元元组（由分组名和数据块组成）

# Iterating a GroupBy yields (group name, sub-DataFrame) pairs.
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print name` statement is a SyntaxError on Python 3.
for name, group in df.groupby('key1'):
    print(name)
    print(group)

对于多重键的情况，元组的第一个元素将会是由键值组成的元组：

In [9]:
# With several grouping keys the group name is a tuple of key values,
# unpacked here into k1 and k2.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    # Format explicitly so the line reads "a one" on both Python 2 and 3
    # (print(k1, k2) would print a tuple on Python 2).
    print('{} {}'.format(k1, k2))
    print(group)

a one
      data1     data2 key1 key2
0 -0.121208 -1.250004    a  one
4  1.689103  0.574249    a  one
a two
     data1     data2 key1 key2
1  1.49758  0.373815    a  two
b one
      data1     data2 key1 key2
2 -0.845303  1.454581    b  one
b two
      data1    data2 key1 key2
3 -0.451083  0.25801    b  two


In [10]:
# 将这些数据片段做成一个字典
pieces = dict(list(df.groupby('key1')))
pieces['a']

Unnamed: 0,data1,data2,key1,key2
0,-0.121208,-1.250004,a,one
1,1.49758,0.373815,a,two
4,1.689103,0.574249,a,one


groupby默认在axis=0上进行分组，设置后可以在其他任何轴上分组。根据dtype对列进行分组：

In [11]:
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [12]:
grouped = df.groupby(df.dtypes, axis=1)

In [13]:
dict(list(grouped))

{dtype('float64'):       data1     data2
 0 -0.121208 -1.250004
 1  1.497580  0.373815
 2 -0.845303  1.454581
 3 -0.451083  0.258010
 4  1.689103  0.574249, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

### 选择一个或一组列
用一个或一组列名对DataFrame产生的GroupBy对象进行索引，就能实现选取部分列进行聚合的目的。
```
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
```
是以下代码的语法糖：
```
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])
```

In [14]:
# 计算data2列的平均值并以DataFrame形式得到结果：
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.337877
a,two,0.373815
b,one,1.454581
b,two,0.25801


### 通过字典或Series进行分组
除数组外，分组信息还可以其他形式存在。

In [15]:
# Five people (rows) by five columns of random data.
people = DataFrame(np.random.randn(5, 5),
                  columns=['a', 'b', 'c', 'd', 'e'],
                  index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])

# Add a few NA values: blank out columns b and c for the third row ('Wes').
# Explicit label-based .loc replaces the .ix indexer, which has been removed
# from pandas and guessed between labels and positions for the 2:3 slice.
people.loc[['Wes'], ['b', 'c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.444033,1.320775,0.077583,0.757046,-0.838681
Steve,0.10151,-1.281982,-0.383059,-0.445106,-0.189668
Wes,-1.896736,,,1.29203,-0.853865
Jim,1.247609,0.164016,-0.59889,0.328837,-2.58807
Travis,-0.396995,-0.05719,-0.873712,-1.333719,0.860323


In [16]:
# 已知列的分组关系，并希望根据分组计算列的总计：
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

# 将字典传给groupby
by_column = people.groupby(mapping, axis=1)
by_column.sum()

Unnamed: 0,blue,red
Joe,0.834629,0.926127
Steve,-0.828165,-1.37014
Wes,1.29203,-2.750601
Jim,-0.270054,-1.176444
Travis,-2.207431,0.406138


Series也有同样的功能，它可以被看做一个固定大小的映射。对于上面那个例子，如果Series作为分组键，则pandas会检查Series以确保其索引跟分组轴是对齐的：

In [17]:
map_series = Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [18]:
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


### 通过函数进行分组
任何被当做分组键的函数都会在各个索引值上被调用一次，其返回值就会被用作分组名称。

以上节为例，其索引值为人的名字，希望根据人名长度进行分组，虽然可以求取一个字符串长度数组，但其实仅仅传入len函数就可以了：

In [19]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.205094,1.484791,-0.521307,2.377913,-4.280616
5,0.10151,-1.281982,-0.383059,-0.445106,-0.189668
6,-0.396995,-0.05719,-0.873712,-1.333719,0.860323


将函数跟数组、列表、字典、Series混合使用也不是问题，因为任何东西最终都会被转换为数组：

In [20]:
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-1.896736,1.320775,0.077583,0.757046,-0.853865
3,two,1.247609,0.164016,-0.59889,0.328837,-2.58807
5,one,0.10151,-1.281982,-0.383059,-0.445106,-0.189668
6,two,-0.396995,-0.05719,-0.873712,-1.333719,0.860323


### 根据索引级别分组
通过level关键字传入级别编号或名称即可：

In [21]:
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                    [1, 3, 5, 1, 3]], names=['cty', 'tenor'])

In [22]:
hier_df = DataFrame(np.random.randn(4,5), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,1.043607,-1.381621,0.541925,1.298138,-1.560821
1,0.962947,-1.529818,0.42507,-0.709827,-1.587296
2,-0.403503,-0.558785,0.858775,-1.289721,0.286856
3,0.658067,-0.744733,-0.264428,-0.503494,-0.542814


In [23]:
hier_df.groupby(level='cty', axis=1).count()

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


## 数据聚合
聚合：任何能够从数组产生标量值的数据转换过程。之前的例子比如mean、count、min以及sum等。

**经过优化的GroupBy的方法**

**函数名** | **说明**
- | -
count | 分组中非NA值的数量
sum | 非NA值的和
mean | 非NA值的平均值
median | 非NA值的算术中位数
std、var | 无偏（分母n-1）标准差和方差
min、max | 非NA值的最小和最大值
prod | 非NA值的积
first、last | 第一个和最后一个非NA值

也可以使用自己发明的聚合运算。

运行方式：以quantile为例，它没有明确地实现于GroupBy，但它是一个Series方法，所以这里是能用的。GroupBy会高效地对Series进行切片，然后对各片调用piece.quantile()，最后将这些结果组装成最终结果。

如果要使用自己的聚合函数，只需将其传入aggregate或agg方法即可：

In [24]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` only needs ``max()`` and ``min()`` methods, so a Series, a
    DataFrame (column-wise result) or a NumPy array all work.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

grouped = df.groupby('key1')
grouped.agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.810311,1.824253
b,0.39422,1.196571


In [25]:
# 有些方法也可以在这里用，即使不是聚合运算：
grouped.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,count,3.0,3.0
a,mean,1.021825,-0.100647
a,std,0.994517,1.000405
a,min,-0.121208,-1.250004
a,25%,0.688186,-0.438094
a,50%,1.49758,0.373815
a,75%,1.593341,0.474032
a,max,1.689103,0.574249
b,count,2.0,2.0
b,mean,-0.648193,0.856295


**注意：**自定义的聚合函数要比优化过的表中的函数慢很多，因为在构造中间分组数据块时存在非常大的开销（函数调用、数据重排等）。

### 面向列的多函数应用
对不同的列使用不同的聚合函数，或一次应用多个函数。首先根据sex和smoker对tips进行分组：

In [26]:
tips = pd.read_csv('old-file/ch08/tips.csv')

# 添加“消费占总额百分比”的列
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips[:5]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [27]:
grouped = tips.groupby(['sex', 'smoker']) # 多层次索引的DataFrame
grouped_pct = grouped['tip_pct'] # 多层次索引的Series

grouped_pct.agg('mean')

sex     smoker
Female  No        0.156921
        Yes       0.182150
Male    No        0.160669
        Yes       0.152771
Name: tip_pct, dtype: float64

In [28]:
# 传入一组函数或函数名，得到的DataFrame的列就会以相应的函数命名：
grouped_pct.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,0.156921,0.036421,0.195876
Female,Yes,0.18215,0.071595,0.360233
Male,No,0.160669,0.041849,0.220186
Male,Yes,0.152771,0.090588,0.674707


不一定接受GroupBy自动给出的列名，特别是lambda函数，名称为'<lambda>',如果传入的是一个由***(name, function)***元组组成的列表，则各元组的第一个元素就会被用作DataFrame的列名（可以将这种二元元组列表看做一个有序映射）：

In [29]:
grouped_pct.agg([('foo', 'mean'), ('bar', np.std), ('ptp', peak_to_peak)])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar,ptp
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,0.156921,0.036421,0.195876
Female,Yes,0.18215,0.071595,0.360233
Male,No,0.160669,0.041849,0.220186
Male,Yes,0.152771,0.090588,0.674707


In [30]:
# 一组应用于全部列的函数，或不同的列应用不同的函数。
# One list of aggregations, applied to every selected column.
functions = ['count', 'mean', 'max']
# Select several columns with a list: grouped['a', 'b'] (an implicit tuple)
# is rejected by current pandas.
result = grouped[['tip_pct', 'total_bill']].agg(functions)

result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Female,No,54,0.156921,0.252672,54,18.105185,35.83
Female,Yes,33,0.18215,0.416667,33,17.977879,44.3
Male,No,97,0.160669,0.29199,97,19.791237,48.33
Male,Yes,60,0.152771,0.710345,60,22.2845,50.81


In [31]:
# DataFrame拥有层次化的列，这相当于分别对各列进行聚合，然后用concat将结果组装到一起
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,54,0.156921,0.252672
Female,Yes,33,0.18215,0.416667
Male,No,97,0.160669,0.29199
Male,Yes,60,0.152771,0.710345


In [32]:
# 可以传入带有自定义名称的元组列表
# (column label, function) pairs: the first element names the output column.
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]
# Select the columns with a list; indexing a GroupBy with a bare tuple of
# column names is rejected by current pandas.
grouped[['tip_pct', 'total_bill']].agg(ftuples)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Female,No,0.156921,0.001327,18.105185,53.092422
Female,Yes,0.18215,0.005126,17.977879,84.451517
Male,No,0.160669,0.001751,19.791237,76.152961
Male,Yes,0.152771,0.008206,22.2845,98.244673


In [33]:
# 不同的列应用不同的函数，向agg传入一个从列名映射到函数的字典：
grouped.agg({'tip': np.max, 'size': 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,No,5.2,140
Female,Yes,6.5,74
Male,No,9.0,263
Male,Yes,10.0,150


In [34]:
grouped.agg({'tip_pct': ['min', 'max', 'mean', 'std'], # 对tip_pct这列应用四个不同的函数，返回四个结果
            'size': 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Female,No,0.056797,0.252672,0.156921,0.036421,140
Female,Yes,0.056433,0.416667,0.18215,0.071595,74
Male,No,0.071804,0.29199,0.160669,0.041849,263
Male,Yes,0.035638,0.710345,0.152771,0.090588,150


### 以“无索引”的形式返回聚合数据
到目前为止，聚合结果都以分组键作为索引（多个键时为层次化索引）。向groupby传入as_index=False可以禁用该功能，但是这种用法比较缺乏灵活性。

In [35]:
tips.groupby(['sex', 'smoker'], as_index=False).mean()

Unnamed: 0,sex,smoker,total_bill,tip,size,tip_pct
0,Female,No,18.105185,2.773519,2.592593,0.156921
1,Female,Yes,17.977879,2.931515,2.242424,0.18215
2,Male,No,19.791237,3.113402,2.71134,0.160669
3,Male,Yes,22.2845,3.051167,2.5,0.152771


## 分组级运算和转换
聚合只是分组运算中的一种，它是数据转换的一个特例，也就是说，它接受能够将一维数组简化为标量值的函数。本节介绍transform和apply方法，能执行更多的分组运算。

例子，为一个DataFrame添加一个用于存放各个索引分组平均值的列，一个办法是先聚合再合并：

In [36]:
df

Unnamed: 0,data1,data2,key1,key2
0,-0.121208,-1.250004,a,one
1,1.49758,0.373815,a,two
2,-0.845303,1.454581,b,one
3,-0.451083,0.25801,b,two
4,1.689103,0.574249,a,one


In [37]:
k1_means = df.groupby('key1').mean().add_prefix('mean_')
k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.021825,-0.100647
b,-0.648193,0.856295


In [38]:
pd.merge(df, k1_means, left_on='key1', right_index=True) # 左侧DF中用作连接键的列为key1，右侧行索引用作其连接键

Unnamed: 0,data1,data2,key1,key2,mean_data1,mean_data2
0,-0.121208,-1.250004,a,one,1.021825,-0.100647
1,1.49758,0.373815,a,two,1.021825,-0.100647
4,1.689103,0.574249,a,one,1.021825,-0.100647
2,-0.845303,1.454581,b,one,-0.648193,0.856295
3,-0.451083,0.25801,b,two,-0.648193,0.856295


In [39]:
# transform方法
key = ['one', 'two', 'one', 'two', 'one']
people.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,-0.616566,0.631793,-0.398064,0.238453,-0.277408
two,0.67456,-0.558983,-0.490975,-0.058135,-1.388869


In [40]:
people.groupby(key).transform(np.mean) # tansform将np.mean应用到各个分组上，然后将结果放到适当的位置上

Unnamed: 0,a,b,c,d,e
Joe,-0.616566,0.631793,-0.398064,0.238453,-0.277408
Steve,0.67456,-0.558983,-0.490975,-0.058135,-1.388869
Wes,-0.616566,0.631793,-0.398064,0.238453,-0.277408
Jim,0.67456,-0.558983,-0.490975,-0.058135,-1.388869
Travis,-0.616566,0.631793,-0.398064,0.238453,-0.277408


从各组中减去平均值，先创建一个距平化函数（demeaning function），然后传给transform

In [41]:
def demean(arr):
    """Centre ``arr`` by subtracting its own mean from every element.

    The result has the same shape as ``arr``, which is what
    ``GroupBy.transform`` needs from a non-scalar function.
    """
    centre = arr.mean()
    return arr - centre

demeaned = people.groupby(key).transform(demean)
demeaned

Unnamed: 0,a,b,c,d,e
Joe,1.060599,0.688982,0.475648,0.518594,-0.561273
Steve,-0.57305,-0.722999,0.107916,-0.386971,1.199201
Wes,-1.28017,,,1.053577,-0.576457
Jim,0.57305,0.722999,-0.107916,0.386971,-1.199201
Travis,0.219571,-0.688982,-0.475648,-1.572171,1.13773


In [42]:
# 检查demeaned的分组平均值是否为0：
demeaned.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,1.850372e-17,-5.5511150000000004e-17,2.775558e-17,-7.401487e-17,0.0
two,0.0,-5.5511150000000004e-17,0.0,0.0,1.110223e-16


### apply: 一般性的“拆分-应用-合并”

跟aggregate一样，transform也是一个有条件限制的特殊函数：传入的函数只能产生两种结果，要么产生一个可以广播的标量值（如np.mean），要么产生一个相同大小的结果数组。

最一般化的GroupBy方法是apply。

回到小费数据集，根据分组选出最高的5个tip_pct值。

In [45]:
# 在指定列找出最大值，然后把这个值所在的行选取出来
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` holding the largest values of ``column``.

    Parameters
    ----------
    df : DataFrame
        Frame (or one group of a frame) to select rows from.
    n : int, default 5
        Number of rows to keep; ``n=0`` yields an empty frame.
    column : str, default 'tip_pct'
        Column to rank by.

    Returns
    -------
    DataFrame
        The selected rows, sorted ascending by ``column`` (largest last),
        keeping their original index labels.
    """
    # tail(n) rather than [-n:]: a slice starting at -0 is [0:], which
    # returned every row when n == 0.
    return df.sort_values(by=column).tail(n)

In [46]:
top(tips, n=6)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [47]:
# 如果对smoker分组并用该函数调用apply：
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


> 这里发生了什么？

top函数在DF的各个片段上调用（此处为smoker分类的两个片段），然后结果由pandas.concat组装到一起，并以分组名称进行了标记，于是，最终结果就有了一个层次化索引，其内层索引值来自原DF。

如果传给apply的函数能够接受其他参数或关键字，则可以将这些内容放在函数名后面一并传入：

In [48]:
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
No,Fri,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,Male,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Male,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Female,Yes,Thur,Lunch,4,0.115982


在GroupBy中，当你调用诸如describe之类的方法时，实际上只是应用了下面两条代码的快捷方式而已：
```python
f = lambda x: x.describe()
grouped.apply(f)
```

In [49]:
result = tips.groupby('smoker')['tip_pct'].describe()
result

smoker       
No      count    151.000000
        mean       0.159328
        std        0.039910
        min        0.056797
        25%        0.136906
        50%        0.155625
        75%        0.185014
        max        0.291990
Yes     count     93.000000
        mean       0.163196
        std        0.085119
        min        0.035638
        25%        0.106771
        50%        0.153846
        75%        0.195059
        max        0.710345
Name: tip_pct, dtype: float64

**禁止分组键**

分组键会跟原始对象的索引共同构成结果对象中的层次化索引。将group_keys=False传入groupby可禁止：

In [50]:
tips.groupby('smoker', group_keys=False).apply(top) # 还是可以看出是两个片段的排序

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


### 分位数和桶分析
pandas根据指定面元或样本分位数将数据拆分为多块的工具（如cut、qcut）。将这些函数跟groupby结合起来，就能非常轻松地实现对数据集的**桶（bucket）或分位数（quantile）分析**了。

以随机数据集为例，利用cut将其装入长度相等的bucket中：

In [51]:
frame = DataFrame({'data1': np.random.randn(1000),
                  'data2': np.random.randn(1000)})

factor = pd.cut(frame.data1, 4)

factor[:10]

0     (-0.00722, 1.494]
1    (-1.508, -0.00722]
2     (-0.00722, 1.494]
3        (1.494, 2.995]
4     (-0.00722, 1.494]
5    (-1.508, -0.00722]
6    (-1.508, -0.00722]
7    (-1.508, -0.00722]
8    (-1.508, -0.00722]
9     (-0.00722, 1.494]
Name: data1, dtype: category
Categories (4, object): [(-3.0151, -1.508] < (-1.508, -0.00722] < (-0.00722, 1.494] < (1.494, 2.995]]

由cut返回的Categorical对象（旧版pandas中称为Factor）可直接用于groupby，可以这样对data2做一些统计计算：

In [55]:
def get_stats(group):
    """Summarise ``group`` as a dict with 'min', 'max', 'count' and 'mean'.

    Each value comes from the method of the same name on ``group``.
    """
    lowest = group.min()
    highest = group.max()
    n_valid = group.count()
    average = group.mean()
    return {'min': lowest, 'max': highest,
            'count': n_valid, 'mean': average}

grouped = frame.data2.groupby(factor)

grouped.apply(get_stats)

data1                    
(-3.0151, -1.508]   count     64.000000
                    max        2.213551
                    mean      -0.037275
                    min       -2.019793
(-1.508, -0.00722]  count    438.000000
                    max        3.019414
                    mean       0.031938
                    min       -2.770366
(-0.00722, 1.494]   count    429.000000
                    max        2.939629
                    mean       0.106728
                    min       -2.502154
(1.494, 2.995]      count     69.000000
                    max        2.353987
                    mean      -0.026436
                    min       -2.811387
Name: data2, dtype: float64

In [56]:
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.0151, -1.508]",64.0,2.213551,-0.037275,-2.019793
"(-1.508, -0.00722]",438.0,3.019414,0.031938,-2.770366
"(-0.00722, 1.494]",429.0,2.939629,0.106728,-2.502154
"(1.494, 2.995]",69.0,2.353987,-0.026436,-2.811387


上面是长度相等的bucket（data1区域的范围相同），要根据样本分位数得到大小相等的bucket，使用qcut，传入labels=False即可只获取分位数的编号：

In [59]:
grouping = pd.qcut(frame.data1, 5, labels=False) # data2的数据值和data1的区间没有关系
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,200.0,3.019414,0.017839,-2.770366
1,200.0,2.282258,0.031613,-2.679929
2,200.0,2.908777,0.084622,-2.053886
3,200.0,2.457831,0.039509,-2.502154
4,200.0,2.939629,0.104243,-2.811387


### 示例：用特定于分组的值填充缺失值

对于缺失数据，有时用dropna滤除，有时用衍生值去填充，用fillna工具，如用平均值去填充NA值：

In [61]:
s = Series(np.random.randn(6))
s[::2] = np.nan
s

0         NaN
1    0.529571
2         NaN
3   -0.633795
4         NaN
5    2.191502
dtype: float64

In [66]:
s.fillna(s.mean())

0    0.695759
1    0.529571
2    0.695759
3   -0.633795
4    0.695759
5    2.191502
dtype: float64

对不同的分组填充不同的值：
将数据分组，并使用apply和一个能够对各数据块调用fillna的函数即可。

下面是美国州示例，这些州分为东西部：

In [67]:
states = ['Ohio', 'New York', 'Vermont', 'Florida', 'Oregon',
          'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = Series(np.random.randn(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan

data

Ohio          0.265855
New York     -0.518526
Vermont            NaN
Florida       1.023558
Oregon       -0.369347
Nevada             NaN
California   -0.561821
Idaho              NaN
dtype: float64

In [68]:
data.groupby(group_key).mean()

East    0.256962
West   -0.465584
dtype: float64

In [69]:
# 用分组平均值去填充NA值
def fill_mean(g):
    """Replace the missing values in ``g`` with the mean of ``g``."""
    group_mean = g.mean()
    return g.fillna(group_mean)

data.groupby(group_key).apply(fill_mean) # g表示East或West的分组

Ohio          0.265855
New York     -0.518526
Vermont       0.256962
Florida       1.023558
Oregon       -0.369347
Nevada       -0.465584
California   -0.561821
Idaho        -0.465584
dtype: float64

In [70]:
# 预定义各组的填充值，分组具有一个name属性，可以调用
fill_values = {'East': 0.5, 'West': -1}


def fill_func(g):
    """Fill the missing values in ``g`` with the preset value for its group.

    The preset is looked up in ``fill_values`` under ``g.name``.
    """
    replacement = fill_values[g.name]
    return g.fillna(replacement)

data.groupby(group_key).apply(fill_func)

Ohio          0.265855
New York     -0.518526
Vermont       0.500000
Florida       1.023558
Oregon       -0.369347
Nevada       -1.000000
California   -0.561821
Idaho        -1.000000
dtype: float64

### 示例：随机采样和排列
从一个大数据集中随机抽取样本以进行蒙特卡罗模拟（Monte Carlo simulation）或其他分析工作。“抽取”的方式有很多，其中一些的效率会比其他的高很多。一个办法是，选取np.random.permutation(N)的前K个元素，其中N为完整数据的大小，K为期望的样本大小。

作为例子，下面是构造一副英语型扑克牌的一个方式：

In [73]:
# 红桃Hearts 黑桃Spades 梅花Clubs 方片Diamonds
suits = ['H', 'S', 'C', 'D']
# Values for one suit: ace..10 count 1..10, then J/Q/K count 10 each;
# the 13 values are repeated once per suit.
# list(...) is needed because a Python 3 range cannot be concatenated with a
# list; on Python 2 it is a harmless copy.
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ['A'] + list(range(2, 11)) + ['J', 'Q', 'K']
cards = []
for suit in suits:
    # Card name = rank followed by the suit letter, e.g. 'AH', '10S'.
    cards.extend(str(num) + suit for num in base_names)

deck = Series(card_val, index=cards)  # Series of length 52, indexed by card name

In [74]:
# 现在，从整副牌中抽出5张：
def draw(deck, n=5):
    """Pick ``n`` entries of ``deck`` at random, without repeats.

    A random permutation of the positions 0..len(deck)-1 is generated and
    the entries at its first ``n`` positions are returned.
    """
    shuffled_positions = np.random.permutation(len(deck))
    picked = shuffled_positions[:n]
    return deck.take(picked)

draw(deck)

3S     3
KS    10
QS    10
5S     5
3D     3
dtype: int64

从每种花色中随机抽取两张牌，由于花色是牌名的最后一个字符，可以依据此进行分组，并使用apply：

In [87]:
def get_suit(card):
    """Return the last character of ``card`` (the suit letter of a card name)."""
    return card[-1]
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

QC    10
6C     6
AD     1
4D     4
2H     2
KH    10
3S     3
2S     2
dtype: int64

### 示例：分组加权平均数和相关系数
根据groupby的“拆分-应用-合并”范式，DF的列与列之间或两个Series之间的运算（如分组加权平均）成为一种标准作业。

以下面这个数据集为例，它含有分组键，值以及一些权重值：

In [88]:
df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
               'data': np.random.randn(8),
               'weights': np.random.rand(8)})
df

Unnamed: 0,category,data,weights
0,a,-0.017662,0.593289
1,a,-0.591703,0.864474
2,a,0.417375,0.085661
3,a,-0.397416,0.6286
4,b,-0.652509,0.802134
5,b,1.195267,0.905585
6,b,0.472329,0.696561
7,b,-0.472641,0.566812


In [89]:
# 利用category计算分组加权平均数：
grouped = df.groupby('category')
def get_wavg(g):
    """Return the average of ``g['data']`` weighted by ``g['weights']``."""
    values = g['data']
    weights = g['weights']
    return np.average(values, weights=weights)
grouped.apply(get_wavg)

category
a   -0.338879
b    0.208719
dtype: float64

In [93]:
# 实际例子，yahoo的数据集
# 其中含有标准普尔500指数（SPX字段）和几只股票的收盘价：
close_px = pd.read_csv('old-file/ch09/stock_px.csv', parse_dates=True, index_col=0)

close_px[-4:]

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [96]:
# 计算一个由日收益率（通过百分比数变化计算）与SPX之间的年度相关系数组成的DF
rets = close_px.pct_change().dropna()
def spx_corr(x):
    """Correlate every column of ``x`` with its 'SPX' column."""
    benchmark = x['SPX']
    return x.corrwith(benchmark)
by_year = rets.groupby(lambda x: x.year)
by_year.apply(spx_corr)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [97]:
# 苹果和微软的年度相关系数
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))

2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

### 示例：面向分组的线性回归
定义regress函数（利用statsmodels库）对各类数据块执行普通最小二乘法（Ordinary Least Squares，OLS）回归：