# 分组、聚合、过滤、转换

In [1]:
import pandas as pd
import numpy as np

## 定义聚合

In [9]:
flights = pd.read_csv('flights.csv')
flights.head()

Unnamed: 0,date,airline,origin,dest,dep_time,arr_time,cancelled,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2018-01-01,UA,LAS,IAH,100,547,0,134.0,1222.0,0,0,0,0,0
1,2018-01-01,WN,DEN,PHX,515,720,0,91.0,602.0,0,0,0,0,0
2,2018-01-01,B6,JFK,BOS,550,657,0,39.0,187.0,0,83,8,0,0
3,2018-01-01,B6,DTW,BOS,600,754,0,79.0,632.0,0,0,19,0,0
4,2018-01-01,UA,LAS,EWR,600,1348,0,261.0,2227.0,0,0,0,0,0


In [7]:
# 查看 每家航空 的第一条记录
flights.groupby('airline').head(1) 

Unnamed: 0,date,airline,origin,dest,dep_time,arr_time,cancelled,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2018-01-01,UA,LAS,IAH,100,547,0,134.0,1222.0,0,0,0,0,0
1,2018-01-01,WN,DEN,PHX,515,720,0,91.0,602.0,0,0,0,0,0
2,2018-01-01,B6,JFK,BOS,550,657,0,39.0,187.0,0,83,8,0,0
8,2018-01-01,AS,SEA,SFO,605,816,0,97.0,679.0,0,0,0,0,0
10,2018-01-01,AA,DFW,DCA,610,959,0,131.0,1192.0,0,0,0,0,0
12,2018-01-01,F9,DEN,LAS,650,755,0,93.0,628.0,17,0,1,0,0
13,2018-01-01,OO,SFO,LAX,650,825,0,56.0,337.0,0,0,0,0,0
15,2018-01-01,YX,DCA,LGA,700,823,0,39.0,214.0,82,0,0,0,0
16,2018-01-01,DL,LGA,MCO,700,953,0,155.0,950.0,0,0,0,0,0
29,2018-01-01,NK,DFW,LGA,806,1226,0,163.0,1389.0,0,0,0,0,0


In [14]:
# 对每个航空公司的'arr_time'列计算平均值
flights.groupby('airline').agg({'arr_time':'mean'}).head()  # pandas.core.frame.DataFrame

Unnamed: 0_level_0,arr_time
airline,Unnamed: 1_level_1
9E,1501.621022
AA,1488.175815
AS,1478.302404
B6,1456.79717
DL,1520.030754


In [15]:
flights.groupby('airline')['arr_time'].agg('mean').head()  # pandas.core.series.Series

airline
9E    1501.621022
AA    1488.175815
AS    1478.302404
B6    1456.797170
DL    1520.030754
Name: arr_time, dtype: float64

In [16]:
flights.groupby('airline')['arr_time'].agg(np.mean).head()

  flights.groupby('airline')['arr_time'].agg(np.mean).head()


airline
9E    1501.621022
AA    1488.175815
AS    1478.302404
B6    1456.797170
DL    1520.030754
Name: arr_time, dtype: float64

In [17]:
flights.groupby('airline')['arr_time'].mean().head()

airline
9E    1501.621022
AA    1488.175815
AS    1478.302404
B6    1456.797170
DL    1520.030754
Name: arr_time, dtype: float64

In [19]:
# groupby 方法产生的是一个dataframegroupby对象
grouped = flights.groupby('airline')
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [20]:
# 如果agg接受的不是聚合函数，则会导致异常
flights.groupby('airline')['arr_time'].agg(np.sqrt)

ValueError: Must produce aggregated value

## 用多个列和函数进行分组和聚合

In [4]:
# 多生成一列：通过date生成weekday
flights['date'] = pd.to_datetime(flights['date'])
flights['weekday'] = flights['date'].dt.day_name()

In [12]:
# 每家航空公司在一周中每一天（按星期几）取消的航班数
flights.groupby(['airline','weekday'])['CANCELLED'.lower()].agg('sum').head(7)

airline  weekday  
9E       Friday        8
         Monday        7
         Saturday      0
         Sunday        1
         Thursday      4
         Tuesday       6
         Wednesday    10
Name: cancelled, dtype: int64

In [14]:
# 分组可以是多组，选取可以是多组，聚合函数也可以是多个
flights.groupby(['airline', 'weekday'])[['dep_time', 'arr_time']].agg(['max', 'min']).head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,dep_time,dep_time,arr_time,arr_time
Unnamed: 0_level_1,Unnamed: 1_level_1,max,min,max,min
airline,weekday,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
9E,Friday,2155,550,2353,720
9E,Monday,2155,535,2346,657
9E,Saturday,2040,600,2230,716
9E,Sunday,2100,600,2356,719
9E,Thursday,2155,535,2356,657
9E,Tuesday,2100,600,2356,718
9E,Wednesday,2135,535,2252,657


In [15]:
# 用列表和嵌套字典对多列分组和聚合
group_cols = ['origin', 'dest']
agg_dict = {'CANCELLED'.lower():['sum', 'mean', 'size'], 'air_time':['mean', 'var']}

flights.groupby(group_cols).agg(agg_dict).head()

# flights.groupby(['origin', 'dest']).agg({'CANCELLED'.lower(): ['sum', 'mean', 'size'], 'air_time':['mean', 'var']}).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cancelled,cancelled,cancelled,air_time,air_time
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,size,mean,var
origin,dest,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ATL,BOS,7,0.023026,304,121.434343,68.543817
ATL,CLT,2,0.007634,262,42.776923,17.70294
ATL,DCA,6,0.020906,287,79.178571,45.107783
ATL,DEN,2,0.009302,215,169.533019,159.520232
ATL,DFW,2,0.00692,289,111.066202,107.523574


## 分组后去除多级索引

In [22]:
airline_info = flights.groupby(['airline', 'weekday'])\
                      .agg({'distance': ['sum', 'mean'], 
                            'air_time':['min', 'max']})\
                      .astype(int)
airline_info.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,distance,distance,air_time,air_time
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max
airline,weekday,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
9E,Friday,92800,583,25,221
9E,Monday,94936,612,24,216
9E,Saturday,53815,597,33,180
9E,Sunday,87996,575,26,220
9E,Thursday,88785,558,33,213


In [23]:
# 行和列都有两级索引，get_level_values(0)取出第一级
level0 = airline_info.columns.get_level_values(0)  # 0:第一行；1:第二行
level0

Index(['distance', 'distance', 'air_time', 'air_time'], dtype='object')

In [24]:
level1 = airline_info.columns.get_level_values(1)
level1

Index(['sum', 'mean', 'min', 'max'], dtype='object')

In [25]:
# 一级、二级索引拼接成新的列索引
airline_info.columns = level0 + '_' + level1
airline_info.head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,distance_sum,distance_mean,air_time_min,air_time_max
airline,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9E,Friday,92800,583,25,221
9E,Monday,94936,612,24,216
9E,Saturday,53815,597,33,180
9E,Sunday,87996,575,26,220
9E,Thursday,88785,558,33,213
9E,Tuesday,87688,584,27,224
9E,Wednesday,100132,585,30,222


In [26]:
# reset_index()可以将行索引变成单级
airline_info.reset_index().head(7)

Unnamed: 0,airline,weekday,distance_sum,distance_mean,air_time_min,air_time_max
0,9E,Friday,92800,583,25,221
1,9E,Monday,94936,612,24,216
2,9E,Saturday,53815,597,33,180
3,9E,Sunday,87996,575,26,220
4,9E,Thursday,88785,558,33,213
5,9E,Tuesday,87688,584,27,224
6,9E,Wednesday,100132,585,30,222


In [27]:
# pandas 默认会在分组运算后，将所有分组列放在索引中；as_index 设为 False 可以避免这么做，分组后使用 reset_index 也可以达到同样的效果
flights.groupby(['airline'], as_index=False)['distance'].agg('mean').round(0)  # as_index=False表示分组列不作为结果DataFrame的索引

Unnamed: 0,airline,distance
0,9E,585.0
1,AA,1095.0
2,AS,1411.0
3,B6,1265.0
4,DL,1099.0
5,EV,316.0
6,F9,1047.0
7,MQ,497.0
8,NK,1108.0
9,OH,391.0


In [29]:
# 上面这么做，会默认对airline排序，sort设为false可以避免排序
flights.groupby(['airline'], as_index=False, sort=False)['distance'].agg('mean')

Unnamed: 0,airline,distance
0,UA,1225.780172
1,WN,821.89149
2,B6,1264.721436
3,AS,1411.189273
4,AA,1094.742357
5,F9,1047.314636
6,OO,650.460432
7,YX,603.692188
8,DL,1099.227411
9,NK,1107.986252


## 自定义聚合函数

In [30]:
college = pd.read_csv('college.csv')
college.head()

Unnamed: 0,instnm,city,stabbr,hbcu,menonly,womenonly,relaffil,satvrmid,satmtmid,distanceonly,...,ugds_2mor,ugds_nra,ugds_unkn,pptug_ef,curroper,pctpell,pctfloan,ug25abv,md_earn_wne_p10,grad_debt_mdn_supp
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
3,University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,...,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264,45500,24097.0
4,Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,...,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.127,26600,33118.5


In [31]:
college.groupby('stabbr')['ugds'].agg(['mean', 'std']).round(0).head()

Unnamed: 0_level_0,mean,std
stabbr,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,2493.0,4052.0
AL,2790.0,4658.0
AR,1644.0,3143.0
AS,1276.0,
AZ,4130.0,14894.0


In [32]:
# 远离平均值的标准差的最大个数，写一个自定义函数
def max_deviation(s):
    """Return the largest absolute standard score found in ``s``.

    Each value is expressed as its distance from the mean in units of the
    standard deviation; the maximum of the absolute distances is returned.
    """
    center = s.mean()
    spread = s.std()
    z_scores = (s - center) / spread
    return z_scores.abs().max()

# agg聚合函数在调用方法时，直接引入自定义函数名
college.groupby('stabbr')['ugds'].agg(max_deviation).round(1).head()

stabbr
AK    2.6
AL    5.8
AR    6.3
AS    NaN
AZ    9.9
Name: ugds, dtype: float64

In [34]:
# 自定义聚合函数也适用于多个数值列
college.groupby('stabbr')[['ugds', 'satvrmid', 'satmtmid']].agg(max_deviation).round(1).head()

Unnamed: 0_level_0,ugds,satvrmid,satmtmid
stabbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,2.6,,
AL,5.8,1.6,1.8
AR,6.3,2.2,2.3
AS,,,
AZ,9.9,1.9,1.4


In [35]:
# 自定义聚合函数也可以和预先定义的函数一起使用
college.groupby(['stabbr', 'relaffil'])[['ugds', 'satvrmid', 'satmtmid']]\
       .agg([max_deviation, 'mean', 'std']).round(1).head()


Unnamed: 0_level_0,Unnamed: 1_level_0,ugds,ugds,ugds,satvrmid,satvrmid,satvrmid,satmtmid,satmtmid,satmtmid
Unnamed: 0_level_1,Unnamed: 1_level_1,max_deviation,mean,std,max_deviation,mean,std,max_deviation,mean,std
stabbr,relaffil,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AK,0,2.1,3508.9,4539.5,,,,,,
AK,1,1.1,123.3,132.9,,555.0,,,503.0,
AL,0,5.2,3248.8,5102.4,1.6,514.9,56.5,1.7,515.8,56.7
AL,1,2.4,979.7,870.8,1.5,498.0,53.0,1.4,485.6,61.4
AR,0,5.8,1793.7,3401.6,1.9,481.1,37.9,2.0,503.6,39.0


In [36]:
# pandas使用函数名作为返回列的名字，可以直接使用rename方法修改，或通过__name__属性修改
max_deviation.__name__

'max_deviation'

In [37]:
max_deviation.__name__ = 'MaxDeviation'

In [38]:
# Same aggregation as before; the custom function's column label now comes
# from its updated __name__. The third selected column is the math score
# ('satmtmid'), not a repeat of 'satvrmid'.
college.groupby(['stabbr', 'relaffil'])[['ugds', 'satvrmid', 'satmtmid']]\
       .agg([max_deviation, 'mean', 'std']).round(1).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ugds,ugds,ugds,satvrmid,satvrmid,satvrmid,satmtmid,satmtmid,satmtmid
Unnamed: 0_level_1,Unnamed: 1_level_1,MaxDeviation,mean,std,MaxDeviation,mean,std,MaxDeviation,mean,std
stabbr,relaffil,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AK,0,2.1,3508.9,4539.5,,,,,,
AK,1,1.1,123.3,132.9,,555.0,,,503.0,
AL,0,5.2,3248.8,5102.4,1.6,514.9,56.5,1.7,515.8,56.7
AL,1,2.4,979.7,870.8,1.5,498.0,53.0,1.4,485.6,61.4
AR,0,5.8,1793.7,3401.6,1.9,481.1,37.9,2.0,503.6,39.0


## 用 *args 和 **kwargs 自定义聚合函数

In [39]:
# 用inspect模块查看groupby对象的agg方法的签名
grouped = college.groupby(['stabbr', 'relaffil'])

In [40]:
import inspect
inspect.signature(grouped.agg)


<Signature (func=None, *args, engine=None, engine_kwargs=None, **kwargs)>

In [41]:
def pct_between_1_3k(s):
    """Return the fraction of values in ``s`` that fall between 1000 and 3000."""
    lower, upper = 1000, 3000
    within = s.between(lower, upper)
    return within.mean()

college.groupby(['stabbr', 'relaffil'])['ugds'].agg(pct_between_1_3k).head(9)

stabbr  relaffil
AK      0           0.142857
        1           0.000000
AL      0           0.236111
        1           0.333333
AR      0           0.279412
        1           0.111111
AS      0           1.000000
AZ      0           0.096774
        1           0.000000
Name: ugds, dtype: float64

In [42]:
def pct_between(s, low, high):
    """Return the fraction of values in ``s`` that fall between ``low`` and ``high``.

    The bounds are forwarded to ``Series.between``; the mean of the resulting
    boolean mask is the share of matching entries.
    """
    in_range = s.between(low, high)
    return in_range.mean()

# 使用自定义聚合函数，并传入最大最小值
college.groupby(['stabbr', 'relaffil'])['ugds'].agg(pct_between, 1000, 10000).head(9)

stabbr  relaffil
AK      0           0.428571
        1           0.000000
AL      0           0.458333
        1           0.375000
AR      0           0.397059
        1           0.166667
AS      0           1.000000
AZ      0           0.233871
        1           0.111111
Name: ugds, dtype: float64

In [43]:
# 显式指定最大值和最小值（使用关键字参数）
college.groupby(['stabbr', 'relaffil'])['ugds'].agg(pct_between, high=10000, low=1000).head(9)

stabbr  relaffil
AK      0           0.428571
        1           0.000000
AL      0           0.458333
        1           0.375000
AR      0           0.397059
        1           0.166667
AS      0           1.000000
AZ      0           0.233871
        1           0.111111
Name: ugds, dtype: float64

In [44]:
# 也可以关键字参数和非关键字参数混合使用，只要关键字参数在后面
college.groupby(['stabbr', 'relaffil'])['ugds'].agg(pct_between, 1000, high=10000).head(9)

stabbr  relaffil
AK      0           0.428571
        1           0.000000
AL      0           0.458333
        1           0.375000
AR      0           0.397059
        1           0.166667
AS      0           1.000000
AZ      0           0.233871
        1           0.111111
Name: ugds, dtype: float64

In [46]:
# pandas 不支持在同时使用多个聚合函数时，向聚合函数传入额外参数
college.groupby(['stabbr', 'relaffil'])['ugds'].agg(['mean', pct_between],low=100, high=1000)

TypeError: GroupBy.mean() got an unexpected keyword argument 'low'

In [59]:
def pct_between(s, low, high):
    """Return the share of entries of ``s`` lying between ``low`` and ``high``."""
    mask = s.between(low, high)
    return mask.mean()

# 用闭包自定义聚合函数
def make_agg_func(func, name, *args, **kwargs):
    """Build a one-argument aggregation function with preset extra arguments.

    The returned callable takes only the data to aggregate and forwards it to
    ``func`` together with the captured ``args`` and ``kwargs``. Its
    ``__name__`` is set to ``name``.
    """
    def bound_agg(values):
        # Extra arguments are captured by the closure, so the caller only
        # has to supply the data itself.
        return func(values, *args, **kwargs)

    bound_agg.__name__ = name
    return bound_agg

my_agg1 = make_agg_func(pct_between, 'pct_1_3k', low=1000, high=3000)
my_agg2 = make_agg_func(pct_between, 'pct_10_30k', 10000, 30000)

college.groupby(['stabbr', 'relaffil']).agg({'ugds': [('mean'), ('pct_1_3k', my_agg1), ('pct_10_30k', my_agg2)]}).head(9)

Unnamed: 0_level_0,Unnamed: 1_level_0,ugds,ugds,ugds
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,pct_1_3k,pct_10_30k
stabbr,relaffil,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AK,0,3508.857143,0.142857,0.142857
AK,1,123.333333,0.0,0.0
AL,0,3248.774648,0.236111,0.083333
AL,1,979.722222,0.333333,0.0
AR,0,1793.691176,0.279412,0.014706
AR,1,917.785714,0.111111,0.0
AS,0,1276.0,1.0,0.0
AZ,0,4363.533898,0.096774,0.048387
AZ,1,692.75,0.0,0.0


In [60]:
college.groupby(['stabbr', 'relaffil']).agg({'ugds':[('pct_1_3k', my_agg1)]})  # pct_1_3k 是给自定义聚合操作指定的名称，它会成为输出结果中的一列名。

Unnamed: 0_level_0,Unnamed: 1_level_0,ugds
Unnamed: 0_level_1,Unnamed: 1_level_1,pct_1_3k
stabbr,relaffil,Unnamed: 2_level_2
AK,0,0.142857
AK,1,0.000000
AL,0,0.236111
AL,1,0.333333
AR,0,0.279412
...,...,...
WI,0,0.137931
WI,1,0.360000
WV,0,0.246154
WV,1,0.375000


## 检查分组对象

In [2]:
college = pd.read_csv('college.csv')
grouped = college.groupby(['stabbr', 'relaffil'])
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [3]:
# ⽤dir函数找到该对象所有的可用函数
[attr for attr in dir(grouped) if not attr.startswith('_')]  # 排除以下划线开头的属性/方法

['agg',
 'aggregate',
 'all',
 'any',
 'apply',
 'bfill',
 'boxplot',
 'city',
 'corr',
 'corrwith',
 'count',
 'cov',
 'cumcount',
 'cummax',
 'cummin',
 'cumprod',
 'cumsum',
 'curroper',
 'describe',
 'diff',
 'distanceonly',
 'dtypes',
 'ewm',
 'expanding',
 'ffill',
 'fillna',
 'filter',
 'first',
 'get_group',
 'grad_debt_mdn_supp',
 'groups',
 'hbcu',
 'head',
 'hist',
 'idxmax',
 'idxmin',
 'indices',
 'instnm',
 'last',
 'max',
 'md_earn_wne_p10',
 'mean',
 'median',
 'menonly',
 'min',
 'ndim',
 'ngroup',
 'ngroups',
 'nth',
 'nunique',
 'ohlc',
 'pct_change',
 'pctfloan',
 'pctpell',
 'pipe',
 'plot',
 'pptug_ef',
 'prod',
 'quantile',
 'rank',
 'relaffil',
 'resample',
 'rolling',
 'sample',
 'satmtmid',
 'satvrmid',
 'sem',
 'shift',
 'size',
 'skew',
 'stabbr',
 'std',
 'sum',
 'tail',
 'take',
 'transform',
 'ug25abv',
 'ugds',
 'ugds_2mor',
 'ugds_aian',
 'ugds_asian',
 'ugds_black',
 'ugds_hisp',
 'ugds_nhpi',
 'ugds_nra',
 'ugds_unkn',
 'ugds_white',
 'value_counts',


In [4]:
# ⽤ngroups属性查看分组的数量（数据被分成了多少个组）
grouped.ngroups 

112

In [12]:
# 查看每个分组的唯一识别标签，groups属性是一个字典，包含每个独立分组与行索引标签的对应
groups = list(grouped.groups.keys())  # grouped.groups：返回一个字典，键是组名，值是该组包含的行索引
groups[:6]

[('AK', 0), ('AK', 1), ('AL', 0), ('AL', 1), ('AR', 0), ('AR', 1)]

In [13]:
grouped.get_group(('FL', 1)).head()

Unnamed: 0,instnm,city,stabbr,hbcu,menonly,womenonly,relaffil,satvrmid,satmtmid,distanceonly,...,ugds_2mor,ugds_nra,ugds_unkn,pptug_ef,curroper,pctpell,pctfloan,ug25abv,md_earn_wne_p10,grad_debt_mdn_supp
712,The Baptist College of Florida,Graceville,FL,0.0,0.0,0.0,1,545.0,465.0,0.0,...,0.0308,0.0,0.0507,0.2291,1,0.5878,0.5602,0.3531,30800.0,20052
713,Barry University,Miami,FL,0.0,0.0,0.0,1,470.0,462.0,0.0,...,0.0164,0.0741,0.0841,0.1518,1,0.5045,0.6733,0.4361,44100.0,28250
714,Gooding Institute of Nurse Anesthesia,Panama City,FL,0.0,0.0,0.0,1,,,0.0,...,,,,,0,,,,,PrivacySuppressed
715,Bethune-Cookman University,Daytona Beach,FL,1.0,0.0,0.0,1,405.0,395.0,0.0,...,0.0198,0.0205,0.019,0.0523,1,0.7758,0.8867,0.0647,29400.0,36250
724,Johnson University Florida,Kissimmee,FL,0.0,0.0,0.0,1,480.0,470.0,0.0,...,0.0045,0.0045,0.0136,0.1636,1,0.6689,0.7384,0.2185,26300.0,20199


In [14]:
# groupby对象是⼀个可迭代对象，可以挨个查看每个独立分组
from IPython.display import display

i = 0
for name, group in grouped:
    print(name)
    display(group.head(2))
    i += 1
    if i == 5:
        break

('AK', np.int64(0))


Unnamed: 0,instnm,city,stabbr,hbcu,menonly,womenonly,relaffil,satvrmid,satmtmid,distanceonly,...,ugds_2mor,ugds_nra,ugds_unkn,pptug_ef,curroper,pctpell,pctfloan,ug25abv,md_earn_wne_p10,grad_debt_mdn_supp
60,University of Alaska Anchorage,Anchorage,AK,0.0,0.0,0.0,0,,,0.0,...,0.098,0.0181,0.0457,0.4539,1,0.2385,0.2647,0.4386,42500,19449.5
62,University of Alaska Fairbanks,Fairbanks,AK,0.0,0.0,0.0,0,,,0.0,...,0.0401,0.011,0.306,0.3887,1,0.2263,0.255,0.4519,36200,19355.0


('AK', np.int64(1))


Unnamed: 0,instnm,city,stabbr,hbcu,menonly,womenonly,relaffil,satvrmid,satmtmid,distanceonly,...,ugds_2mor,ugds_nra,ugds_unkn,pptug_ef,curroper,pctpell,pctfloan,ug25abv,md_earn_wne_p10,grad_debt_mdn_supp
61,Alaska Bible College,Palmer,AK,0.0,0.0,0.0,1,,,0.0,...,0.037,0.0,0.0,0.1481,1,0.3571,0.2857,0.4286,,PrivacySuppressed
64,Alaska Pacific University,Anchorage,AK,0.0,0.0,0.0,1,555.0,503.0,0.0,...,0.0945,0.0,0.0873,0.3745,1,0.3152,0.5297,0.491,47000.0,23250


('AL', np.int64(0))


Unnamed: 0,instnm,city,stabbr,hbcu,menonly,womenonly,relaffil,satvrmid,satmtmid,distanceonly,...,ugds_2mor,ugds_nra,ugds_unkn,pptug_ef,curroper,pctpell,pctfloan,ug25abv,md_earn_wne_p10,grad_debt_mdn_supp
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5


('AL', np.int64(1))


Unnamed: 0,instnm,city,stabbr,hbcu,menonly,womenonly,relaffil,satvrmid,satmtmid,distanceonly,...,ugds_2mor,ugds_nra,ugds_unkn,pptug_ef,curroper,pctpell,pctfloan,ug25abv,md_earn_wne_p10,grad_debt_mdn_supp
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370
10,Birmingham Southern College,Birmingham,AL,0.0,0.0,0.0,1,560.0,560.0,0.0,...,0.0051,0.0,0.0051,0.0017,1,0.192,0.4809,0.0152,44200,27000


('AR', np.int64(0))


Unnamed: 0,instnm,city,stabbr,hbcu,menonly,womenonly,relaffil,satvrmid,satmtmid,distanceonly,...,ugds_2mor,ugds_nra,ugds_unkn,pptug_ef,curroper,pctpell,pctfloan,ug25abv,md_earn_wne_p10,grad_debt_mdn_supp
128,University of Arkansas at Little Rock,Little Rock,AR,0.0,0.0,0.0,0,470.0,510.0,0.0,...,0.0755,0.0283,0.0003,0.4126,1,0.3941,0.4775,0.4062,33900,21736
129,University of Arkansas for Medical Sciences,Little Rock,AR,0.0,0.0,0.0,0,,,0.0,...,0.0281,0.007,0.0169,0.2433,1,0.3944,0.6144,0.5133,61400,12500


In [24]:
grouped.head(2)  # 包含所有组的前 2 行数据

Unnamed: 0,instnm,city,stabbr,hbcu,menonly,womenonly,relaffil,satvrmid,satmtmid,distanceonly,...,ugds_2mor,ugds_nra,ugds_unkn,pptug_ef,curroper,pctpell,pctfloan,ug25abv,md_earn_wne_p10,grad_debt_mdn_supp
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0000,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.0100,0.2607,1,0.3460,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0000,0.0000,0.2715,0.4536,1,0.6801,0.7795,0.8540,40100,23370
10,Birmingham Southern College,Birmingham,AL,0.0,0.0,0.0,1,560.0,560.0,0.0,...,0.0051,0.0000,0.0051,0.0017,1,0.1920,0.4809,0.0152,44200,27000
43,Prince Institute-Southeast,Elmhurst,IL,0.0,0.0,0.0,0,,,0.0,...,0.0000,0.0000,0.0000,0.0000,1,0.7857,0.9375,0.6569,PrivacySuppressed,20992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5289,Pacific Islands University,Mangilao,GU,0.0,0.0,0.0,1,,,0.0,...,0.0000,0.0000,0.0000,0.1846,1,0.9730,0.0000,0.2533,PrivacySuppressed,PrivacySuppressed
6439,Touro University Nevada,Henderson,NV,0.0,0.0,0.0,1,,,0.0,...,0.0323,0.0000,0.0645,0.0323,1,0.0000,0.2000,0.4000,,PrivacySuppressed
7352,Marinello School of Beauty-Henderson,Henderson,NV,,,,1,,,,...,,,,,0,,,,21200,9796.5
7404,University of the Virgin Islands-Albert A. Sheen,St. Croix,VI,,,,1,,,,...,,,,,1,,,,31800,15150


In [15]:
grouped.head(2).head(6)  # 第二次 .head(6) 没有实际过滤效果，只是为了显示前六行

Unnamed: 0,instnm,city,stabbr,hbcu,menonly,womenonly,relaffil,satvrmid,satmtmid,distanceonly,...,ugds_2mor,ugds_nra,ugds_unkn,pptug_ef,curroper,pctpell,pctfloan,ug25abv,md_earn_wne_p10,grad_debt_mdn_supp
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
10,Birmingham Southern College,Birmingham,AL,0.0,0.0,0.0,1,560.0,560.0,0.0,...,0.0051,0.0,0.0051,0.0017,1,0.192,0.4809,0.0152,44200,27000.0
43,Prince Institute-Southeast,Elmhurst,IL,0.0,0.0,0.0,0,,,0.0,...,0.0,0.0,0.0,0.0,1,0.7857,0.9375,0.6569,PrivacySuppressed,20992.0
60,University of Alaska Anchorage,Anchorage,AK,0.0,0.0,0.0,0,,,0.0,...,0.098,0.0181,0.0457,0.4539,1,0.2385,0.2647,0.4386,42500,19449.5


In [26]:
# nth 方法可以选出每个分组指定行的数据，下面选出的是第二行和最后一行
grouped.nth([1, -1]).head(8)  

Unnamed: 0,instnm,city,stabbr,hbcu,menonly,womenonly,relaffil,satvrmid,satmtmid,distanceonly,...,ugds_2mor,ugds_nra,ugds_unkn,pptug_ef,curroper,pctpell,pctfloan,ug25abv,md_earn_wne_p10,grad_debt_mdn_supp
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
10,Birmingham Southern College,Birmingham,AL,0.0,0.0,0.0,1,560.0,560.0,0.0,...,0.0051,0.0,0.0051,0.0017,1,0.192,0.4809,0.0152,44200,27000.0
62,University of Alaska Fairbanks,Fairbanks,AK,0.0,0.0,0.0,0,,,0.0,...,0.0401,0.011,0.306,0.3887,1,0.2263,0.255,0.4519,36200,19355.0
64,Alaska Pacific University,Anchorage,AK,0.0,0.0,0.0,1,555.0,503.0,0.0,...,0.0945,0.0,0.0873,0.3745,1,0.3152,0.5297,0.491,47000,23250.0
70,Empire Beauty School-Paradise Valley,Phoenix,AZ,0.0,0.0,0.0,1,,,0.0,...,0.04,0.0,0.0,0.16,0,0.6349,0.5873,0.4651,17800,9588.0
71,Empire Beauty School-Tucson,Tucson,AZ,0.0,0.0,0.0,0,,,0.0,...,0.0,0.0,0.0079,0.2222,1,0.7962,0.6615,0.4229,18200,9833.0
129,University of Arkansas for Medical Sciences,Little Rock,AR,0.0,0.0,0.0,0,,,0.0,...,0.0281,0.007,0.0169,0.2433,1,0.3944,0.6144,0.5133,61400,12500.0
134,Lyon College,Batesville,AR,0.0,0.0,0.0,1,505.0,528.0,0.0,...,0.0,0.0333,0.0638,0.0101,1,0.4578,0.674,0.0524,38600,25000.0


## 按条件过滤州（分组）

In [27]:
# 查看有多少个分组
grouped = college.groupby('stabbr')
grouped.ngroups

59

In [28]:
# nunique可以得到同样的效果
college['stabbr'].nunique()

59

In [32]:
college[['ugds', 'ugds_white']].head()

Unnamed: 0,ugds,ugds_white
0,4206.0,0.0333
1,11383.0,0.5922
2,291.0,0.299
3,5451.0,0.6988
4,4811.0,0.0158


In [34]:
def check_minority(df, threshold):
    """Tell whether the group's overall non-white share exceeds ``threshold``.

    The share is weighted by headcount: each row's ``ugds`` is multiplied by
    its non-white fraction (``1 - ugds_white``), the products are summed, and
    the sum is divided by the group's total ``ugds``.
    """
    non_white_share = 1 - df['ugds_white']  # per-row non-white fraction
    non_white_students = df['ugds'].mul(non_white_share).sum()  # non-white headcount
    all_students = df['ugds'].sum()  # total headcount
    return non_white_students / all_students > threshold

# grouped 变量有一个filter()方法，可以接受一个自定义函数，决定是否保留一个分组
college_filtered = grouped.filter(check_minority, threshold=.5)  # grouped.filter()返回原始数据中符合条件的所有行（而非聚合结果）。
college_filtered.head()

Unnamed: 0,instnm,city,stabbr,hbcu,menonly,womenonly,relaffil,satvrmid,satmtmid,distanceonly,...,ugds_2mor,ugds_nra,ugds_unkn,pptug_ef,curroper,pctpell,pctfloan,ug25abv,md_earn_wne_p10,grad_debt_mdn_supp
68,Everest College-Phoenix,Phoenix,AZ,0.0,0.0,0.0,1,,,0.0,...,0.0373,0.0,0.1026,0.4749,0,0.8291,0.7151,0.67,28600,9500
69,Collins College,Phoenix,AZ,0.0,0.0,0.0,0,,,0.0,...,0.0241,0.0,0.3855,0.3373,0,0.7205,0.8228,0.4764,25700,47000
70,Empire Beauty School-Paradise Valley,Phoenix,AZ,0.0,0.0,0.0,1,,,0.0,...,0.04,0.0,0.0,0.16,0,0.6349,0.5873,0.4651,17800,9588
71,Empire Beauty School-Tucson,Tucson,AZ,0.0,0.0,0.0,0,,,0.0,...,0.0,0.0,0.0079,0.2222,1,0.7962,0.6615,0.4229,18200,9833
72,Thunderbird School of Global Management,Glendale,AZ,0.0,0.0,0.0,0,,,0.0,...,0.0,0.0,0.0,1.0,0,0.0,0.0,0.0,118900,PrivacySuppressed


In [36]:
# 用一些不同的阈值，检查 shape 和 college['stabbr'] 的个数
college.shape

(7535, 27)

In [37]:
college_filtered.shape

(3028, 27)

In [39]:
college['stabbr'].nunique()

59

In [38]:
college_filtered['stabbr'].nunique()

20

In [40]:
college_filtered_20 = grouped.filter(check_minority, threshold=.2)
college_filtered_20.shape

(7461, 27)

In [41]:
college_filtered_20['stabbr'].nunique()

57

In [42]:
college_filtered_70 = grouped.filter(check_minority, threshold=.7)
college_filtered_70.shape

(957, 27)

In [43]:
college_filtered_70['stabbr'].nunique()

10

In [44]:
college_filtered_95 = grouped.filter(check_minority, threshold=.95)
college_filtered_95.shape

(156, 27)

## 举例1 -- 减肥对赌

In [2]:
import pandas as pd
import numpy as np

In [3]:
weight_loss = pd.read_csv('weight_loss.csv')
weight_loss.query('Month == "Jan"')  # 等同于 weight_loss[weight_loss['Month'] == 'Jan']

Unnamed: 0,Name,Month,Week,Weight
0,Bob,Jan,Week 1,197
1,Amy,Jan,Week 1,291
2,Bob,Jan,Week 2,189
3,Amy,Jan,Week 2,288
4,Bob,Jan,Week 3,189
5,Amy,Jan,Week 3,283
6,Bob,Jan,Week 4,190
7,Amy,Jan,Week 4,283


In [4]:
# 定义一个求减肥比例的函数
def find_perc_loss(s):
    """Return each value's relative change from the first value of ``s``.

    The first element is the baseline, so its own result is 0; values below
    the baseline produce negative results.
    """
    baseline = s.iloc[0]
    change = s - baseline
    return change / baseline

bob_jan = weight_loss.query('Name=="Bob" and Month=="Jan"')
find_perc_loss(bob_jan['Weight'])

0    0.000000
2   -0.040609
4   -0.040609
6   -0.035533
Name: Weight, dtype: float64

In [7]:
# 对name和month进行分组，然后使用transform方法，传入函数，对数值进行转换
pcnt_loss = weight_loss.groupby(['Name', 'Month'])['Weight'].transform(find_perc_loss)  # 对分组后的 Weight（体重）列 应用转换
pcnt_loss.head(8)

0    0.000000
1    0.000000
2   -0.040609
3   -0.010309
4   -0.040609
5   -0.027491
6   -0.035533
7   -0.027491
Name: Weight, dtype: float64

In [8]:
# transform 之后的结果，行数不变，可以赋值给原始dataframe 作为一个新列
weight_loss['Perc Weight Loss'] = pcnt_loss.round(3)
weight_loss.query('Name=="Bob" and Month in ["Jan", "Feb"]')

Unnamed: 0,Name,Month,Week,Weight,Perc Weight Loss
0,Bob,Jan,Week 1,197,0.0
2,Bob,Jan,Week 2,189,-0.041
4,Bob,Jan,Week 3,189,-0.041
6,Bob,Jan,Week 4,190,-0.036
8,Bob,Feb,Week 1,188,0.0
10,Bob,Feb,Week 2,185,-0.016
12,Bob,Feb,Week 3,183,-0.027
14,Bob,Feb,Week 4,180,-0.043


In [9]:
# 查看第四周数据
week4 = weight_loss.query('Week == "Week 4"')
week4

Unnamed: 0,Name,Month,Week,Weight,Perc Weight Loss
6,Bob,Jan,Week 4,190,-0.036
7,Amy,Jan,Week 4,283,-0.027
14,Bob,Feb,Week 4,180,-0.043
15,Amy,Feb,Week 4,275,-0.021
22,Bob,Mar,Week 4,171,-0.023
23,Amy,Mar,Week 4,267,-0.022
30,Bob,Apr,Week 4,162,-0.041
31,Amy,Apr,Week 4,258,-0.023
38,Bob,May,Week 4,153,-0.044
39,Amy,May,Week 4,250,-0.023


In [10]:
# 用 pivot重构dataframe，让amy和bob的数据并排放置
winner = week4.pivot(index='Month', columns='Name', values='Perc Weight Loss')  # pivot 方法将 长格式数据 重塑为 宽格式透视表
winner

Name,Amy,Bob
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Apr,-0.023,-0.041
Aug,-0.026,-0.053
Feb,-0.021,-0.043
Jan,-0.027,-0.036
Jul,-0.025,-0.049
Jun,-0.028,-0.046
Mar,-0.022,-0.023
May,-0.023,-0.044
Oct,-0.028,-0.052
Sep,-0.031,-0.048


In [12]:
winner['Winner'] = np.where(winner['Amy'] < winner['Bob'], 'Amy', 'Bob')


In [13]:
winner.head()

Name,Amy,Bob,Winner
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apr,-0.023,-0.041,Bob
Aug,-0.026,-0.053,Bob
Feb,-0.021,-0.043,Bob
Jan,-0.027,-0.036,Bob
Jul,-0.025,-0.049,Bob


In [14]:
winner[['Amy', 'Bob']].style.highlight_min(axis=1)

Name,Amy,Bob
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Apr,-0.023,-0.041
Aug,-0.026,-0.053
Feb,-0.021,-0.043
Jan,-0.027,-0.036
Jul,-0.025,-0.049
Jun,-0.028,-0.046
Mar,-0.022,-0.023
May,-0.023,-0.044
Oct,-0.028,-0.052
Sep,-0.031,-0.048


In [15]:
# ⽤value_counts()返回最后的比分
winner.Winner.value_counts()

Winner
Bob    10
Name: count, dtype: int64

In [16]:
# pandas 默认是按照字母顺序进行排序的
week4a = week4.copy()
month_chron = week4a['Month'].unique()
month_chron

array(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',
       'Oct'], dtype=object)

In [18]:
# 转换为categorical 变量，可以做成按时间排序
week4a['Month'] = pd.Categorical(week4a['Month'], categories=month_chron, ordered=True)  # 将月份列转换为有序分类变量
week4a.pivot(index='Month', columns='Name', values='Perc Weight Loss')

Name,Amy,Bob
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,-0.027,-0.036
Feb,-0.021,-0.043
Mar,-0.022,-0.023
Apr,-0.023,-0.041
May,-0.023,-0.044
Jun,-0.028,-0.046
Jul,-0.025,-0.049
Aug,-0.026,-0.053
Sep,-0.031,-0.048
Oct,-0.028,-0.052


## 举例2 -- 用apply计算每州的加权平均SAT分数

In [19]:
college = pd.read_csv('college.csv')
subset = ['ugds', 'satmtmid', 'satvrmid']
college2 = college.dropna(subset=subset)  # 删除在 ugds、satmtmid 和 satvrmid 列中存在缺失值的行
college.shape

(7535, 27)

In [20]:
college2.shape

(1184, 27)

In [None]:
def weighted_math_average(df):
    """Return the headcount-weighted average SAT math score, truncated to int.

    Each row's math score (``satmtmid``) is weighted by its ``ugds`` value;
    the weighted sum is divided by the total ``ugds`` of the group.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``ugds`` and ``satmtmid``.

    Returns
    -------
    int
        The weighted average with the fractional part dropped.
    """
    # Use the math column: the verbal column ('satvrmid') would silently
    # produce a verbal average under a "math" name.
    weighted_math = df['ugds'] * df['satmtmid']
    return int(weighted_math.sum() / df['ugds'].sum())

college2.groupby('stabbr').apply(weighted_math_average).head()

In [None]:
college2.groupby('stabbr').agg(weighted_math_average).head()

In [None]:
college2.groupby('stabbr')['satvrmid'].agg(weighted_math_average)

In [None]:
from collections import OrderedDict
def weighted_average(df):
    """Summarise SAT scores for one group as a Series of integers.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``ugds``, ``satmtmid`` and ``satvrmid``.

    Returns
    -------
    Series
        Integer values labelled ``weighted_math_avg``, ``weighted_verbal_avg``
        (both weighted by ``ugds``), ``math_avg``, ``verbal_avg`` (plain
        means) and ``count`` (number of rows), in that order.
    """
    total_ugds = df['ugds'].sum()
    # A plain dict keeps insertion order, which fixes the order of the labels.
    data = {
        'weighted_math_avg': (df['ugds'] * df['satmtmid']).sum() / total_ugds,
        'weighted_verbal_avg': (df['ugds'] * df['satvrmid']).sum() / total_ugds,
        'math_avg': df['satmtmid'].mean(),
        'verbal_avg': df['satvrmid'].mean(),
        'count': len(df),
    }
    # Build the Series as floats first and truncate afterwards: handing
    # fractional floats to the constructor together with an integer dtype is
    # rejected by current pandas instead of being truncated.
    return pd.Series(data).astype(int)

college2.groupby('stabbr').apply(weighted_average).head(10)

In [None]:
from collections import OrderedDict
def weighted_average(df):
    """Summarise SAT scores for one group as a Series of integers.

    Computes the ``ugds``-weighted math and verbal averages, the plain math
    and verbal means, and the row count of ``df``. ``df`` must contain the
    columns ``ugds``, ``satmtmid`` and ``satvrmid``.

    Returns a Series with the labels ``weighted_math_avg``,
    ``weighted_verbal_avg``, ``math_avg``, ``verbal_avg`` and ``count``.
    """
    data = {}  # a plain dict preserves insertion order
    weight_m = df['ugds'] * df['satmtmid']
    weight_v = df['ugds'] * df['satvrmid']
    wm_avg = weight_m.sum() / df['ugds'].sum()
    wv_avg = weight_v.sum() / df['ugds'].sum()
    data['weighted_math_avg'] = wm_avg
    data['weighted_verbal_avg'] = wv_avg
    data['math_avg'] = df['satmtmid'].mean()
    data['verbal_avg'] = df['satvrmid'].mean()
    data['count'] = len(df)
    # Truncate after construction: current pandas refuses to build an
    # integer Series directly from fractional floats.
    return pd.Series(data).astype(int)

college2.groupby('stabbr').apply(weighted_average).head(10)

In [None]:
from scipy.stats import gmean, hmean

def calculate_means(df):
    """Return a table of four kinds of mean for the SAT math and verbal scores.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``satmtmid``, ``satvrmid`` and ``ugds``.

    Returns
    -------
    DataFrame
        Indexed by ``Arithmetic``, ``Weighted`` (weights are ``ugds``),
        ``Geometric`` and ``Harmonic``; one column per score plus a ``count``
        column holding the number of rows of ``df``. All values are
        truncated to int.
    """
    df_means = pd.DataFrame(index=['Arithmetic', 'Weighted', 'Geometric', 'Harmonic'])
    # Column names are lower-case, matching the rest of this notebook.
    cols = ['satmtmid', 'satvrmid']
    for col in cols:
        arithmetic = df[col].mean()
        weighted = np.average(df[col], weights=df['ugds'])
        geometric = gmean(df[col])
        harmonic = hmean(df[col])
        # Must stay inside the loop; otherwise only the last column is kept.
        df_means[col] = [arithmetic, weighted, geometric, harmonic]
    df_means['count'] = len(df)
    return df_means.astype(int)

# Keep only states with more than one row, then build the table of means for
# each remaining state. The grouping column is the lower-case 'stabbr' in
# both groupby calls.
college2.groupby('stabbr').filter(lambda x: len(x) != 1).groupby('stabbr').apply(calculate_means).head(10)