# 分组、聚合、过滤、转换

In [1]:
import inspect

import numpy as np
import pandas as pd

## 定义聚合

In [2]:
flights = pd.read_csv('flights.csv')
flights.head()

Unnamed: 0,date,airline,origin,dest,dep_time,arr_time,cancelled,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2018-01-01,UA,LAS,IAH,100,547,0,134.0,1222.0,0,0,0,0,0
1,2018-01-01,WN,DEN,PHX,515,720,0,91.0,602.0,0,0,0,0,0
2,2018-01-01,B6,JFK,BOS,550,657,0,39.0,187.0,0,83,8,0,0
3,2018-01-01,B6,DTW,BOS,600,754,0,79.0,632.0,0,0,19,0,0
4,2018-01-01,UA,LAS,EWR,600,1348,0,261.0,2227.0,0,0,0,0,0


In [7]:
# 查看 每家航空 的第一条记录
flights.groupby('airline').head(1) 

Unnamed: 0,date,airline,origin,dest,dep_time,arr_time,cancelled,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2018-01-01,UA,LAS,IAH,100,547,0,134.0,1222.0,0,0,0,0,0
1,2018-01-01,WN,DEN,PHX,515,720,0,91.0,602.0,0,0,0,0,0
2,2018-01-01,B6,JFK,BOS,550,657,0,39.0,187.0,0,83,8,0,0
8,2018-01-01,AS,SEA,SFO,605,816,0,97.0,679.0,0,0,0,0,0
10,2018-01-01,AA,DFW,DCA,610,959,0,131.0,1192.0,0,0,0,0,0
12,2018-01-01,F9,DEN,LAS,650,755,0,93.0,628.0,17,0,1,0,0
13,2018-01-01,OO,SFO,LAX,650,825,0,56.0,337.0,0,0,0,0,0
15,2018-01-01,YX,DCA,LGA,700,823,0,39.0,214.0,82,0,0,0,0
16,2018-01-01,DL,LGA,MCO,700,953,0,155.0,950.0,0,0,0,0,0
29,2018-01-01,NK,DFW,LGA,806,1226,0,163.0,1389.0,0,0,0,0,0


In [14]:
# 对每个航空公司的'arr_time'列计算平均值
flights.groupby('airline').agg({'arr_time':'mean'}).head()  # pandas.core.frame.DataFrame

Unnamed: 0_level_0,arr_time
airline,Unnamed: 1_level_1
9E,1501.621022
AA,1488.175815
AS,1478.302404
B6,1456.79717
DL,1520.030754


In [15]:
flights.groupby('airline')['arr_time'].agg('mean').head()  # pandas.core.series.Series

airline
9E    1501.621022
AA    1488.175815
AS    1478.302404
B6    1456.797170
DL    1520.030754
Name: arr_time, dtype: float64

In [16]:
flights.groupby('airline')['arr_time'].agg(np.mean).head()

  flights.groupby('airline')['arr_time'].agg(np.mean).head()


airline
9E    1501.621022
AA    1488.175815
AS    1478.302404
B6    1456.797170
DL    1520.030754
Name: arr_time, dtype: float64

In [17]:
flights.groupby('airline')['arr_time'].mean().head()

airline
9E    1501.621022
AA    1488.175815
AS    1478.302404
B6    1456.797170
DL    1520.030754
Name: arr_time, dtype: float64

In [19]:
# groupby 方法产生的是一个dataframegroupby对象
grouped = flights.groupby('airline')
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [20]:
# agg expects a function that reduces each group to a single value; np.sqrt
# does not, so pandas raises ValueError. The error is caught and printed so
# that "Restart & Run All" does not stop at this cell.
try:
    flights.groupby('airline')['arr_time'].agg(np.sqrt)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: Must produce aggregated value

## 用多个列和函数进行分组和聚合

In [4]:
# 多生成一列：通过date生成weekday
flights['date'] = pd.to_datetime(flights['date'])
flights['weekday'] = flights['date'].dt.day_name()

In [12]:
# Cancelled-flight count for each airline on each day of the week
flights.groupby(['airline', 'weekday'])['cancelled'].sum().head(7)

airline  weekday  
9E       Friday        8
         Monday        7
         Saturday      0
         Sunday        1
         Thursday      4
         Tuesday       6
         Wednesday    10
Name: cancelled, dtype: int64

In [14]:
# 分组可以是多组，选取可以是多组，聚合函数也可以是多个
flights.groupby(['airline', 'weekday'])[['dep_time', 'arr_time']].agg(['max', 'min']).head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,dep_time,dep_time,arr_time,arr_time
Unnamed: 0_level_1,Unnamed: 1_level_1,max,min,max,min
airline,weekday,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
9E,Friday,2155,550,2353,720
9E,Monday,2155,535,2346,657
9E,Saturday,2040,600,2230,716
9E,Sunday,2100,600,2356,719
9E,Thursday,2155,535,2356,657
9E,Tuesday,2100,600,2356,718
9E,Wednesday,2135,535,2252,657


In [15]:
# Group by several columns and aggregate with a dict that maps each column
# to its own list of aggregation functions
group_cols = ['origin', 'dest']
agg_dict = {
    'cancelled': ['sum', 'mean', 'size'],
    'air_time': ['mean', 'var'],
}

flights.groupby(group_cols).agg(agg_dict).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cancelled,cancelled,cancelled,air_time,air_time
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,size,mean,var
origin,dest,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ATL,BOS,7,0.023026,304,121.434343,68.543817
ATL,CLT,2,0.007634,262,42.776923,17.70294
ATL,DCA,6,0.020906,287,79.178571,45.107783
ATL,DEN,2,0.009302,215,169.533019,159.520232
ATL,DFW,2,0.00692,289,111.066202,107.523574


## 分组后去除多级索引

In [22]:
# Per airline and weekday: sum/mean of distance and min/max of air_time,
# with every result cast to int
airline_info = (
    flights
    .groupby(['airline', 'weekday'])
    .agg({'distance': ['sum', 'mean'], 'air_time': ['min', 'max']})
    .astype(int)
)
airline_info.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,distance,distance,air_time,air_time
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max
airline,weekday,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
9E,Friday,92800,583,25,221
9E,Monday,94936,612,24,216
9E,Saturday,53815,597,33,180
9E,Sunday,87996,575,26,220
9E,Thursday,88785,558,33,213


In [23]:
# 行和列都有两级索引，get_level_values(0)取出第一级
level0 = airline_info.columns.get_level_values(0)  # 0:第一行；1:第二行
level0

Index(['distance', 'distance', 'air_time', 'air_time'], dtype='object')

In [24]:
level1 = airline_info.columns.get_level_values(1)
level1

Index(['sum', 'mean', 'min', 'max'], dtype='object')

In [25]:
# 一级、二级索引拼接成新的列索引
airline_info.columns = level0 + '_' + level1
airline_info.head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,distance_sum,distance_mean,air_time_min,air_time_max
airline,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9E,Friday,92800,583,25,221
9E,Monday,94936,612,24,216
9E,Saturday,53815,597,33,180
9E,Sunday,87996,575,26,220
9E,Thursday,88785,558,33,213
9E,Tuesday,87688,584,27,224
9E,Wednesday,100132,585,30,222


In [26]:
# reset_index()可以将行索引变成单级
airline_info.reset_index().head(7)

Unnamed: 0,airline,weekday,distance_sum,distance_mean,air_time_min,air_time_max
0,9E,Friday,92800,583,25,221
1,9E,Monday,94936,612,24,216
2,9E,Saturday,53815,597,33,180
3,9E,Sunday,87996,575,26,220
4,9E,Thursday,88785,558,33,213
5,9E,Tuesday,87688,584,27,224
6,9E,Wednesday,100132,585,30,222


In [27]:
# pandas 默认会在分组运算后，将所有分组列放在索引中；将 as_index 设为 False 可以避免这么做，分组后使用 reset_index 也可以达到同样的效果
flights.groupby(['airline'], as_index=False)['distance'].agg('mean').round(0)  # as_index=False表示分组列不作为结果DataFrame的索引

Unnamed: 0,airline,distance
0,9E,585.0
1,AA,1095.0
2,AS,1411.0
3,B6,1265.0
4,DL,1099.0
5,EV,316.0
6,F9,1047.0
7,MQ,497.0
8,NK,1108.0
9,OH,391.0


In [29]:
# 上面这么做，会默认按 airline 排序；将 sort 设为 False 可以避免排序
flights.groupby(['airline'], as_index=False, sort=False)['distance'].agg('mean')

Unnamed: 0,airline,distance
0,UA,1225.780172
1,WN,821.89149
2,B6,1264.721436
3,AS,1411.189273
4,AA,1094.742357
5,F9,1047.314636
6,OO,650.460432
7,YX,603.692188
8,DL,1099.227411
9,NK,1107.986252


## 自定义聚合函数

In [30]:
college = pd.read_csv('college.csv')
college.head()

Unnamed: 0,instnm,city,stabbr,hbcu,menonly,womenonly,relaffil,satvrmid,satmtmid,distanceonly,...,ugds_2mor,ugds_nra,ugds_unkn,pptug_ef,curroper,pctpell,pctfloan,ug25abv,md_earn_wne_p10,grad_debt_mdn_supp
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
3,University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,...,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264,45500,24097.0
4,Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,...,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.127,26600,33118.5


In [31]:
college.groupby('stabbr')['ugds'].agg(['mean', 'std']).round(0).head()

Unnamed: 0_level_0,mean,std
stabbr,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,2493.0,4052.0
AL,2790.0,4658.0
AR,1644.0,3143.0
AS,1276.0,
AZ,4130.0,14894.0


In [32]:
# 求各值偏离平均值的最大标准差倍数（即最大绝对 z 分数），写一个自定义函数
def max_deviation(s):
    """Return the largest absolute z-score found in ``s``.

    Each value is centred on ``s.mean()`` and divided by ``s.std()``; the
    maximum of the absolute values of those scores is returned.
    """
    centred = s - s.mean()
    z_scores = centred / s.std()
    return z_scores.abs().max()

# agg聚合函数在调用方法时，直接引入自定义函数名
college.groupby('stabbr')['ugds'].agg(max_deviation).round(1).head()

stabbr
AK    2.6
AL    5.8
AR    6.3
AS    NaN
AZ    9.9
Name: ugds, dtype: float64

In [34]:
# 自定义聚合函数也适用于多个数值列
college.groupby('stabbr')[['ugds', 'satvrmid', 'satmtmid']].agg(max_deviation).round(1).head()

Unnamed: 0_level_0,ugds,satvrmid,satmtmid
stabbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,2.6,,
AL,5.8,1.6,1.8
AR,6.3,2.2,2.3
AS,,,
AZ,9.9,1.9,1.4


In [35]:
# Custom aggregation functions can be combined with built-in ones
(
    college
    .groupby(['stabbr', 'relaffil'])[['ugds', 'satvrmid', 'satmtmid']]
    .agg([max_deviation, 'mean', 'std'])
    .round(1)
    .head()
)


Unnamed: 0_level_0,Unnamed: 1_level_0,ugds,ugds,ugds,satvrmid,satvrmid,satvrmid,satmtmid,satmtmid,satmtmid
Unnamed: 0_level_1,Unnamed: 1_level_1,max_deviation,mean,std,max_deviation,mean,std,max_deviation,mean,std
stabbr,relaffil,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AK,0,2.1,3508.9,4539.5,,,,,,
AK,1,1.1,123.3,132.9,,555.0,,,503.0,
AL,0,5.2,3248.8,5102.4,1.6,514.9,56.5,1.7,515.8,56.7
AL,1,2.4,979.7,870.8,1.5,498.0,53.0,1.4,485.6,61.4
AR,0,5.8,1793.7,3401.6,1.9,481.1,37.9,2.0,503.6,39.0


In [36]:
# pandas使用函数名作为返回列的名字，可以直接使用rename方法修改，或通过__name__属性修改
max_deviation.__name__

'max_deviation'

In [37]:
max_deviation.__name__ = 'MaxDeviation'

In [38]:
# Same aggregation as before; the custom function's column label is taken
# from its __name__ attribute
(
    college
    .groupby(['stabbr', 'relaffil'])[['ugds', 'satvrmid', 'satmtmid']]
    .agg([max_deviation, 'mean', 'std'])
    .round(1)
    .head()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,ugds,ugds,ugds,satvrmid,satvrmid,satvrmid,satmtmid,satmtmid,satmtmid
Unnamed: 0_level_1,Unnamed: 1_level_1,MaxDeviation,mean,std,MaxDeviation,mean,std,MaxDeviation,mean,std
stabbr,relaffil,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AK,0,2.1,3508.9,4539.5,,,,,,
AK,1,1.1,123.3,132.9,,555.0,,,503.0,
AL,0,5.2,3248.8,5102.4,1.6,514.9,56.5,1.7,515.8,56.7
AL,1,2.4,979.7,870.8,1.5,498.0,53.0,1.4,485.6,61.4
AR,0,5.8,1793.7,3401.6,1.9,481.1,37.9,2.0,503.6,39.0


## 用 `*args` 和 `**kwargs` 自定义聚合函数

In [39]:
# 用inspect模块查看groupby对象的agg方法的签名
grouped = college.groupby(['stabbr', 'relaffil'])

In [40]:
# inspect is imported in the import cell at the top of the notebook
inspect.signature(grouped.agg)


<Signature (func=None, *args, engine=None, engine_kwargs=None, **kwargs)>

In [41]:
def pct_between_1_3k(s):
    """Return the fraction of values in ``s`` that lie between 1000 and 3000.

    The range test uses ``Series.between`` with its default settings; the
    mean of the resulting boolean mask is the returned fraction.
    """
    in_range = s.between(1000, 3000)
    return in_range.mean()

college.groupby(['stabbr', 'relaffil'])['ugds'].agg(pct_between_1_3k).head(9)

stabbr  relaffil
AK      0           0.142857
        1           0.000000
AL      0           0.236111
        1           0.333333
AR      0           0.279412
        1           0.111111
AS      0           1.000000
AZ      0           0.096774
        1           0.000000
Name: ugds, dtype: float64

In [42]:
def pct_between(s, low, high):
    """Return the fraction of values in ``s`` that lie between ``low`` and ``high``.

    The range test uses ``Series.between`` with its default settings; the
    mean of the resulting boolean mask is the returned fraction.
    """
    in_range = s.between(low, high)
    return in_range.mean()

# 使用自定义聚合函数，并传入最大最小值
college.groupby(['stabbr', 'relaffil'])['ugds'].agg(pct_between, 1000, 10000).head(9)

stabbr  relaffil
AK      0           0.428571
        1           0.000000
AL      0           0.458333
        1           0.375000
AR      0           0.397059
        1           0.166667
AS      0           1.000000
AZ      0           0.233871
        1           0.111111
Name: ugds, dtype: float64

In [43]:
# 显式指定最大值和最小值（使用关键字参数）
college.groupby(['stabbr', 'relaffil'])['ugds'].agg(pct_between, high=10000, low=1000).head(9)

stabbr  relaffil
AK      0           0.428571
        1           0.000000
AL      0           0.458333
        1           0.375000
AR      0           0.397059
        1           0.166667
AS      0           1.000000
AZ      0           0.233871
        1           0.111111
Name: ugds, dtype: float64

In [44]:
# 也可以混合使用位置参数和关键字参数，只要位置参数在前、关键字参数在后
college.groupby(['stabbr', 'relaffil'])['ugds'].agg(pct_between, 1000, high=10000).head(9)

stabbr  relaffil
AK      0           0.428571
        1           0.000000
AL      0           0.458333
        1           0.375000
AR      0           0.397059
        1           0.166667
AS      0           1.000000
AZ      0           0.233871
        1           0.111111
Name: ugds, dtype: float64

In [46]:
# Extra arguments cannot be combined with a list of aggregations: as the
# error message shows, the built-in mean also receives `low` and rejects it.
# The error is caught and printed so that "Restart & Run All" can continue.
try:
    college.groupby(['stabbr', 'relaffil'])['ugds'].agg(['mean', pct_between], low=100, high=1000)
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: GroupBy.mean() got an unexpected keyword argument 'low'

In [59]:
def pct_between(s, low, high):
    """Return the fraction of values in ``s`` that lie between ``low`` and ``high``.

    The range test uses ``Series.between`` with its default settings; the
    mean of the resulting boolean mask is the returned fraction.
    """
    in_range = s.between(low, high)
    return in_range.mean()

# 用闭包自定义聚合函数
def make_agg_func(func, name, *args, **kwargs):
    """Build a one-argument function that calls ``func`` with bound arguments.

    Parameters
    ----------
    func : callable
        Called as ``func(x, *args, **kwargs)`` for each ``x`` passed to the
        returned function.
    name : str
        Assigned to the returned function's ``__name__`` and ``__qualname__``.
    *args, **kwargs
        Extra positional and keyword arguments forwarded to ``func``.

    Returns
    -------
    callable
        A function of one argument that forwards to ``func``.
    """
    def wrapper(x):
        return func(x, *args, **kwargs)

    # Keep the metadata consistent: otherwise __qualname__ would remain
    # 'make_agg_func.<locals>.wrapper' and func's docstring would be lost.
    wrapper.__name__ = name
    wrapper.__qualname__ = name
    wrapper.__doc__ = getattr(func, '__doc__', None)
    return wrapper

# Bind the range limits once by keyword and once positionally
my_agg1 = make_agg_func(pct_between, 'pct_1_3k', low=1000, high=3000)
my_agg2 = make_agg_func(pct_between, 'pct_10_30k', 10000, 30000)

# 'mean' is given by name; each (label, function) tuple pairs a custom
# aggregation with its output column label
college.groupby(['stabbr', 'relaffil']).agg(
    {'ugds': ['mean', ('pct_1_3k', my_agg1), ('pct_10_30k', my_agg2)]}
).head(9)

Unnamed: 0_level_0,Unnamed: 1_level_0,ugds,ugds,ugds
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,pct_1_3k,pct_10_30k
stabbr,relaffil,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AK,0,3508.857143,0.142857,0.142857
AK,1,123.333333,0.0,0.0
AL,0,3248.774648,0.236111,0.083333
AL,1,979.722222,0.333333,0.0
AR,0,1793.691176,0.279412,0.014706
AR,1,917.785714,0.111111,0.0
AS,0,1276.0,1.0,0.0
AZ,0,4363.533898,0.096774,0.048387
AZ,1,692.75,0.0,0.0


In [60]:
# 'pct_1_3k' is the label given to the custom aggregation; it becomes a
# column name in the result
college.groupby(['stabbr', 'relaffil']).agg({'ugds': [('pct_1_3k', my_agg1)]})

Unnamed: 0_level_0,Unnamed: 1_level_0,ugds
Unnamed: 0_level_1,Unnamed: 1_level_1,pct_1_3k
stabbr,relaffil,Unnamed: 2_level_2
AK,0,0.142857
AK,1,0.000000
AL,0,0.236111
AL,1,0.333333
AR,0,0.279412
...,...,...
WI,0,0.137931
WI,1,0.360000
WV,0,0.246154
WV,1,0.375000
