In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from pandas import DataFrame,Series

  from pandas.core import datetools


In [2]:
# Group-wise operations and transformations.
# Small demo frame: two key columns plus two columns of random normal draws.
# NOTE(review): no random seed is set in the visible cells, so the values in
# the saved output will not be reproduced when the cell is re-run.
df = DataFrame({'key1':['a','a','b','b','a'],
               'key2':['one','two','one','two','one'],
               'data1':np.random.randn(5),
               'data2':np.random.randn(5)})
df

Unnamed: 0,data1,data2,key1,key2
0,1.831478,-0.118929,a,one
1,-0.251455,-0.001471,a,two
2,0.304247,-0.729769,b,one
3,-0.737132,1.026276,b,two
4,0.770519,0.61001,a,one


In [3]:
# Per-key1 means of the data columns, with every column renamed 'mean_<col>'.
# The numeric columns are selected explicitly: pandas >= 2.0 no longer drops
# non-numeric columns (here 'key2') silently in groupby reductions, so an
# unrestricted .mean() raises TypeError there. The result is the same as
# the frame older pandas produced.
k1_means = df.groupby(['key1'])[['data1','data2']].mean().add_prefix('mean_')
k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.783514,0.163203
b,-0.216443,0.148254


In [6]:
# Attach the per-group means to every row of df: df's 'key1' column is
# matched against the index of k1_means.
pd.merge(df,k1_means,left_on='key1',right_index=True)

Unnamed: 0,data1,data2,key1,key2,mean_data1,mean_data2
0,1.831478,-0.118929,a,one,0.783514,0.163203
1,-0.251455,-0.001471,a,two,0.783514,0.163203
4,0.770519,0.61001,a,one,0.783514,0.163203
2,0.304247,-0.729769,b,one,-0.216443,0.148254
3,-0.737132,1.026276,b,two,-0.216443,0.148254


In [10]:
# 5x5 frame of random normal draws; rows are labelled by person name and
# columns by the letters a-e.
people = DataFrame(np.random.randn(5,5),
                  columns=['a','b','c','d','e'],
                  index=['Joe','Steve','Wes','Jim','Travis'])
people

Unnamed: 0,a,b,c,d,e
Joe,0.622658,0.458627,1.155816,0.30764,0.70038
Steve,0.325097,-0.412252,0.758,-0.013444,1.115646
Wes,-0.640393,-0.076713,0.785959,-1.065629,-0.425132
Jim,-0.248061,0.273052,-0.074628,-0.263787,1.138415
Travis,-2.347733,0.175672,1.490583,0.561797,-2.572534


In [11]:
# Grouping by `key` yields the mean of rows 1, 3 and 5 (label 'one')
# and the mean of rows 2 and 4 (label 'two').
key = ['one','two','one','two','one'] # one group label per row
people.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,-0.788489,0.185862,1.144119,-0.065398,-0.765762
two,0.038518,-0.0696,0.341686,-0.138616,1.127031


In [12]:
# Group the rows by `key`, compute each group's mean, then broadcast that
# mean back onto every row of the group (the result keeps people's shape).
# The string alias 'mean' is used instead of np.mean: recent pandas versions
# emit a FutureWarning when a NumPy reduction callable is passed to
# GroupBy.transform, and the alias selects the built-in group mean directly.
people.groupby(key).transform('mean')

Unnamed: 0,a,b,c,d,e
Joe,-0.788489,0.185862,1.144119,-0.065398,-0.765762
Steve,0.038518,-0.0696,0.341686,-0.138616,1.127031
Wes,-0.788489,0.185862,1.144119,-0.065398,-0.765762
Jim,0.038518,-0.0696,0.341686,-0.138616,1.127031
Travis,-0.788489,0.185862,1.144119,-0.065398,-0.765762


In [14]:
# 自定义填充（转换）规则，demean
def demean(arr):
    """Return ``arr`` with its own mean subtracted from every element.

    ``arr`` must provide a ``.mean()`` method and support subtraction of the
    resulting value (e.g. a pandas Series/DataFrame or a NumPy array).
    """
    center = arr.mean()
    return arr - center
demeaned = people.groupby(key).transform(demean)
demeaned

Unnamed: 0,a,b,c,d,e
Joe,1.411147,0.272765,0.011697,0.373037,1.466142
Steve,0.286579,-0.342652,0.416314,0.125171,-0.011384
Wes,0.148097,-0.262575,-0.35816,-1.000232,0.34063
Jim,-0.286579,0.342652,-0.416314,-0.125171,0.011384
Travis,-1.559244,-0.01019,0.346463,0.627194,-1.806772


In [16]:
# Every value had its group mean subtracted above, so the group means of the
# demeaned frame should be 0.
# Any tiny non-zero values shown are floating-point rounding error.
demeaned.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,-1.480297e-16,0.0,-1.110223e-16,3.700743e-17,1.480297e-16
two,0.0,0.0,0.0,0.0,0.0


In [17]:
# apply : 一般性的“拆分-应用-合并”

In [18]:
import seaborn as sns

In [19]:
# Load seaborn's 'tips' example dataset and preview its first rows.
tips = sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [20]:
# Add the tip as a fraction of the total bill as a new 'tip_pct' column
# (this modifies `tips` in place), then preview the result.
tips['tip_pct'] = tips.tip/tips.total_bill
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [21]:
# 获取小费比例最高的n条数据
def top(df,n=5,column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    The rows are sorted ascending by ``column``, so the largest value is the
    last row of the result.

    Parameters
    ----------
    df : DataFrame
        Frame to select rows from.
    n : int, default 5
        Number of rows to keep; ``n=0`` yields an empty frame.
    column : label, default 'tip_pct'
        Column to rank by.

    Returns
    -------
    DataFrame
        The last ``n`` rows of ``df`` after sorting by ``column``.
    """
    # tail(n) rather than [-n:]: the slice [-0:] equals [0:] and would return
    # every row for n == 0, whereas tail(0) returns none. For any other n the
    # two forms select the same rows.
    return df.sort_values(by=column).tail(n)
top(tips,n=3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [22]:
# Group by smoker status first, then take the 2 rows with the highest
# tip percentage within each group.
tips.groupby('smoker').apply(top,n=2,column='tip_pct')

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199


In [32]:
# For every (smoker, day) group keep the single row with the largest
# total_bill; the group keys become the outer levels of the result's index.
result = tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
result

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Yes,Thur,197,43.11,5.0,Female,Yes,Thur,Lunch,4,0.115982
Yes,Fri,95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Male,Yes,Sun,Dinner,3,0.077178
No,Thur,142,41.19,5.0,Male,No,Thur,Lunch,5,0.121389
No,Fri,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799


In [38]:
# How to read what unstack does (pandas docstring quoted below):
'''
Pivot a level of the (necessarily hierarchical) index labels, returning
    a DataFrame having a new level of column labels whose inner-most level
    consists of the pivoted index labels.
'''
# According to the `level` argument, the chosen level of a multi-level index
# (e.g. level=-1 means the innermost level) becomes a new level of the
# columns, producing a new DataFrame.
# If the DataFrame's index has only one level, the result is a Series whose
# index is multi-level.
# NOTE(review): the output saved under this cell has count/mean/std/... rows,
# which looks like the describe() frame built in a later cell rather than the
# top() result assigned just above. Presumably the cells were run out of
# order; re-run from a fresh kernel to confirm.
result.unstack('smoker')

       smoker
count  Yes        93.000000
       No        151.000000
mean   Yes         0.163196
       No          0.159328
std    Yes         0.085119
       No          0.039910
min    Yes         0.035638
       No          0.056797
25%    Yes         0.106771
       No          0.136906
50%    Yes         0.153846
       No          0.155625
75%    Yes         0.195059
       No          0.185014
max    Yes         0.710345
       No          0.291990
dtype: float64

In [45]:
# How to read what stack does (pandas docstring quoted below):
'''
Pivot a level of the (possibly hierarchical) column labels, returning a
    DataFrame (or Series in the case of an object with a single level of
    column labels) having a hierarchical index with a new inner-most level
    of row labels.
'''
# According to the `level` argument, one level of the (possibly multi-level)
# columns is pulled out and becomes the new innermost level of the index,
# producing a multi-level index.
# Below: the outermost index level (smoker) is moved to the columns with
# unstack(level=0), then stack() puts it back as the innermost index level.
result = tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
result.unstack(level=0).stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
day,Unnamed: 1_level_1,smoker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Thur,142,No,41.19,5.0,Male,No,Thur,Lunch,5.0,0.121389
Thur,197,Yes,43.11,5.0,Female,Yes,Thur,Lunch,4.0,0.115982
Fri,94,No,22.75,3.25,Female,No,Fri,Dinner,2.0,0.142857
Fri,95,Yes,40.17,4.73,Male,Yes,Fri,Dinner,4.0,0.11775
Sat,170,Yes,50.81,10.0,Male,Yes,Sat,Dinner,3.0,0.196812
Sat,212,No,48.33,9.0,Male,No,Sat,Dinner,4.0,0.18622
Sun,156,No,48.17,5.0,Male,No,Sun,Dinner,6.0,0.103799
Sun,182,Yes,45.35,3.5,Male,Yes,Sun,Dinner,3.0,0.077178


In [46]:
# Summary statistics of tip_pct within each smoker group.
# Note that this rebinds `result`, which previously held the top() frame.
result = tips.groupby('smoker')['tip_pct'].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199


In [47]:
# Move the single index level 'smoker' into the columns; because the index
# has only that one level, the result is a Series with a two-level index.
result.unstack('smoker')

       smoker
count  Yes        93.000000
       No        151.000000
mean   Yes         0.163196
       No          0.159328
std    Yes         0.085119
       No          0.039910
min    Yes         0.035638
       No          0.056797
25%    Yes         0.106771
       No          0.136906
50%    Yes         0.153846
       No          0.155625
75%    Yes         0.195059
       No          0.185014
max    Yes         0.710345
       No          0.291990
dtype: float64

In [48]:
# Reminder of DataFrame.apply's `axis` argument (quoted from the pandas docs).
'''
default:axis=0
axis : {0 or 'index', 1 or 'columns'}, default 0
        * 0 or 'index': apply function to each column
        * 1 or 'columns': apply function to each row
'''
# With the default axis=0, describe() is run on each column of `result`.
result.apply(lambda x:x.describe())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,122.0,0.161262,0.062514,0.046217,0.121838,0.154735,0.190036,0.501167
std,41.012193,0.002735,0.031968,0.014961,0.021309,0.001258,0.007103,0.295822
min,93.0,0.159328,0.03991,0.035638,0.106771,0.153846,0.185014,0.29199
25%,107.5,0.160295,0.051212,0.040928,0.114305,0.154291,0.187525,0.396578
50%,122.0,0.161262,0.062514,0.046217,0.121838,0.154735,0.190036,0.501167
75%,136.5,0.162229,0.073817,0.051507,0.129372,0.15518,0.192547,0.605756
max,151.0,0.163196,0.085119,0.056797,0.136906,0.155625,0.195059,0.710345


In [50]:
# Suppress the group keys.
# This prevents a hierarchical (multi-level) index from being built.
# 'smoker' is then not an index level; it only remains as an ordinary column.
tips.groupby('smoker',group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199


In [52]:
# Quantile and bucket analysis.
# NOTE(review): no random seed is set in the visible cells, so the bin edges
# in the saved output will differ on a re-run.
frame = DataFrame({'data1':np.random.randn(1000),
                  'data2':np.random.randn(1000)})
factor = pd.cut(frame.data1,4)# 4 equal-width bins (qcut would give equal-frequency bins)
factor[:5]# first 5 elements

0      (2.027, 3.98]
1    (0.0737, 2.027]
2    (-1.88, 0.0737]
3    (-1.88, 0.0737]
4    (0.0737, 2.027]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.841, -1.88] < (-1.88, 0.0737] < (0.0737, 2.027] < (2.027, 3.98]]

In [57]:
def get_stats(group):
    """Summarise ``group`` as a dict of its min, max, count and mean.

    ``group`` must provide ``min()``, ``max()``, ``count()`` and ``mean()``
    methods (e.g. a pandas Series). The dict keys are 'min', 'max', 'count'
    and 'mean', in that order.
    """
    stats = {}
    stats['min'] = group.min()
    stats['max'] = group.max()
    stats['count'] = group.count()
    stats['mean'] = group.mean()
    return stats
# Group data2 by the bin that the matching data1 value falls into.
grouped = frame.data2.groupby(factor)
# get_stats returns a dict per group; unstack() pivots the innermost index
# level of the result into columns.
result = grouped.apply(get_stats)
result.unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.841, -1.88]",31.0,1.840632,0.128661,-0.910717
"(-1.88, 0.0737]",520.0,3.275179,0.00125,-3.422634
"(0.0737, 2.027]",428.0,2.730142,-0.006583,-3.065804
"(2.027, 3.98]",21.0,2.299309,-0.178091,-2.379393


In [58]:
# Split data1 into 10 quantile bins labelled 'A'..'J'.
# With labels=False the bins would be identified by integer codes instead.
grouping = pd.qcut(frame.data1,10,labels=list('ABCDEFGHIJ'))
grouping.head() # bin label assigned to each element

0    J
1    G
2    E
3    B
4    H
Name: data1, dtype: category
Categories (10, object): [A < B < C < D ... G < H < I < J]

In [59]:
# Same per-group statistics as before, now grouped by the quantile bins
# of data1 held in `grouping`.
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,100.0,2.281504,0.10774,-2.398468
B,100.0,3.275179,-0.117852,-2.818199
C,100.0,2.888282,-0.054843,-2.4989
D,100.0,2.060446,0.123455,-3.422634
E,100.0,1.844163,-0.057588,-2.569686
F,100.0,2.606474,0.136986,-2.718105
G,100.0,2.730142,-0.096189,-2.527637
H,100.0,1.978674,-0.035566,-2.675974
I,100.0,2.458013,-0.045963,-3.022102
J,100.0,2.299309,0.02063,-3.065804


In [None]:
# 示例：用特定分组的值填充缺失值