In [2]:
# 数据聚合
import numpy as np
import pandas as pd
from pandas import DataFrame,Series

In [3]:
df = DataFrame({'key1':['a','a','b','b','a'],
               'key2':['one','two','one','two','one'],
               'data1':np.random.randn(5),
               'data2':np.random.randn(5)})
df

Unnamed: 0,data1,data2,key1,key2
0,2.259548,-0.394677,a,one
1,-1.134629,-1.321474,a,two
2,1.194774,0.047607,b,one
3,0.706641,-0.026281,b,two
4,-2.043443,1.838628,a,one


In [4]:
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)# 计算分组之后的分位数

key1
a    1.580713
b    1.145960
Name: data1, dtype: float64

In [5]:
def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped.agg(peak_to_peak) # 对分组之后的数据使用自定义聚合函数

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,4.302991,3.160102
b,0.488132,0.073889


In [6]:
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,-0.306175,2.267969,-2.043443,-1.589036,-1.134629,0.562459,2.259548,3.0,0.040825,1.624441,-1.321474,-0.858076,-0.394677,0.721975,1.838628
b,2.0,0.950707,0.345162,0.706641,0.828674,0.950707,1.072741,1.194774,2.0,0.010663,0.052247,-0.026281,-0.007809,0.010663,0.029135,0.047607


In [7]:
# 优化过的聚合函数：
# count：     非NA值的数量
# sum：       非NA值的和
# mean：      非NA值的平均数
# median：    非NA值的中位数
# std/var：   无偏（分母为n - 1）的标准差和方差
# min/max：   非NA值的最小/最大值
# prod：      非NA值的积
# first/last：第一个/最后一个非NA值

In [8]:
import seaborn as sns

In [9]:
tips = sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [10]:
# 新加一列，小费与账单金额的比例
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [11]:
# 面向列的多函数应用

In [15]:
# 根据用餐时间做分组
grouped = tips.groupby(['smoker','time'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')

smoker  time  
Yes     Lunch     0.170404
        Dinner    0.160828
No      Lunch     0.160920
        Dinner    0.158653
Name: tip_pct, dtype: float64

In [13]:
tips.groupby(['smoker','time']).agg({'tip_pct':np.mean})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct
smoker,time,Unnamed: 2_level_1
Yes,Lunch,0.170404
Yes,Dinner,0.160828
No,Lunch,0.16092
No,Dinner,0.158653


In [16]:
grouped_pct.agg(['mean','std',peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Yes,Lunch,0.170404,0.04277,0.1693
Yes,Dinner,0.160828,0.095153,0.674707
No,Lunch,0.16092,0.038989,0.19335
No,Dinner,0.158653,0.040458,0.235193


In [18]:
tips.groupby(['smoker','time']).agg({'tip_pct':['mean','std',peak_to_peak]})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,peak_to_peak
smoker,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Yes,Lunch,0.170404,0.04277,0.1693
Yes,Dinner,0.160828,0.095153,0.674707
No,Lunch,0.16092,0.038989,0.19335
No,Dinner,0.158653,0.040458,0.235193


In [21]:
#列的重命名 ，命名为foo  bar
grouped_pct.agg([('foo','mean'),('bar',np.std)])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1
Yes,Lunch,0.170404,0.04277
Yes,Dinner,0.160828,0.095153
No,Lunch,0.16092,0.038989
No,Dinner,0.158653,0.040458


In [22]:
# 对group后的两个字段分别作用functions
functions = ['count','mean','max']
result = grouped['tip_pct','total_bill'].agg(functions)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
smoker,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Yes,Lunch,23,0.170404,0.259314,23,17.39913,43.11
Yes,Dinner,70,0.160828,0.710345,70,21.859429,50.81
No,Lunch,45,0.16092,0.266312,45,17.050889,41.19
No,Dinner,106,0.158653,0.29199,106,20.09566,48.33


In [23]:
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Yes,Lunch,23,0.170404,0.259314
Yes,Dinner,70,0.160828,0.710345
No,Lunch,45,0.16092,0.266312
No,Dinner,106,0.158653,0.29199


In [24]:
ftuples = [('Durchschnitt','mean'),('Abweichung',np.var)]
grouped['tip_pct','total_bill'].agg(ftuples)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
smoker,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Yes,Lunch,0.170404,0.001829,17.39913,61.958436
Yes,Dinner,0.160828,0.009054,21.859429,104.148753
No,Lunch,0.16092,0.00152,17.050889,59.587154
No,Dinner,0.158653,0.001637,20.09566,69.604821


In [25]:
#不同的列对应不同的函数
grouped.agg({'tip' : np.max, 'size' : 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1
Yes,Lunch,5.0,51
Yes,Dinner,10.0,173
No,Lunch,6.7,113
No,Dinner,9.0,290


In [26]:
# 每一列可以对应不同数量的函数
grouped.agg({'tip_pct':['mean','sum','count','std'],
            'size':['sum','min','max']})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size,size,size
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,sum,count,std,sum,min,max
smoker,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Yes,Lunch,0.170404,3.919295,23,0.04277,51,1,4
Yes,Dinner,0.160828,11.257937,70,0.095153,173,1,5
No,Lunch,0.16092,7.241404,45,0.038989,113,1,6
No,Dinner,0.158653,16.817194,106,0.040458,290,1,6


In [29]:
# 以“无索引”的形式返回聚合数据
# 把原来的索引变成列
tips.groupby(['smoker','time'],as_index=False).mean()

Unnamed: 0,smoker,time,total_bill,tip,size,tip_pct
0,Yes,Lunch,17.39913,2.834348,2.217391,0.170404
1,Yes,Dinner,21.859429,3.066,2.471429,0.160828
2,No,Lunch,17.050889,2.673778,2.511111,0.16092
3,No,Dinner,20.09566,3.126887,2.735849,0.158653
