# 数据分组运算

In [2]:
import pandas as pd
import numpy as np

In [13]:
# 分组运算后保持shape
dict_obj = {'key1' : ['a', 'b', 'a', 'b', 
                      'a', 'b', 'a', 'a'],
            'key2' : ['one', 'one', 'two', 'three',
                      'two', 'two', 'one', 'three'],
            'data1': np.random.randint(1, 10, 8),
            'data2': np.random.randint(1, 10, 8)}
df_obj = pd.DataFrame(dict_obj)
df_obj

Unnamed: 0,data1,data2,key1,key2
0,6,9,a,one
1,8,6,b,one
2,3,2,a,two
3,8,6,b,three
4,1,1,a,two
5,2,9,b,two
6,4,1,a,one
7,7,4,a,three


In [7]:
# 按key1分组后，计算data1，data2的统计信息并附加到原始表格中
k1_sum = df_obj.groupby('key1').sum().add_prefix('sum_')
k1_sum

Unnamed: 0_level_0,sum_data1,sum_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,21,15
b,18,10


In [8]:
# 方法1，使用merge
pd.merge(df_obj, k1_sum, left_on='key1', right_index=True)

Unnamed: 0,data1,data2,key1,key2,sum_data1,sum_data2
0,3,4,a,one,21,15
2,1,4,a,two,21,15
4,8,2,a,two,21,15
6,3,2,a,one,21,15
7,6,3,a,three,21,15
1,9,4,b,one,18,10
3,6,2,b,three,18,10
5,3,4,b,two,18,10


* transform方法

In [14]:
# 方法2，使用transform
k1_sum_tf = df_obj.groupby('key1').transform(np.sum).add_prefix('sum_')
df_obj[k1_sum_tf.columns] = k1_sum_tf
df_obj

Unnamed: 0,data1,data2,key1,key2,sum_data1,sum_data2,sum_key2
0,6,9,a,one,6,9,one
1,8,6,b,one,8,6,one
2,3,2,a,two,3,2,two
3,8,6,b,three,8,6,three
4,1,1,a,two,1,1,two
5,2,9,b,two,2,9,two
6,4,1,a,one,4,1,one
7,7,4,a,three,7,4,three


In [15]:
# 自定义函数传入transform
def diff_mean(s):
    """
        返回数据与均值的差值
    """
    return s - s.mean()

df_obj.groupby('key1').transform(diff_mean)

Unnamed: 0,data1,data2,sum_data1,sum_data2
0,1.8,5.6,1.8,5.6
1,2.0,-1.0,2.0,-1.0
2,-1.2,-1.4,-1.2,-1.4
3,2.0,-1.0,2.0,-1.0
4,-3.2,-2.4,-3.2,-2.4
5,-4.0,2.0,-4.0,2.0
6,-0.2,-2.4,-0.2,-2.4
7,2.8,0.6,2.8,0.6


In [2]:
dataset_path = './starcraft.csv'
df_data = pd.read_csv(dataset_path, usecols=['LeagueIndex', 'Age', 'HoursPerWeek', 
                                             'TotalHours', 'APM'])

* apply

In [5]:
def top_n(df, n=3, column='APM'):
    """
        返回每个分组按 column 的 top n 数据
    """
    return df.sort_values(by=column)[-n:]

df_data.groupby('LeagueIndex').apply(top_n)

Unnamed: 0_level_0,Unnamed: 1_level_0,LeagueIndex,Age,HoursPerWeek,TotalHours,APM
LeagueIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1753,1,20.0,28.0,100.0,139.6362
1,2246,1,27.0,8.0,250.0,141.6282
1,2214,1,20.0,12.0,730.0,172.953
2,1520,2,29.0,6.0,250.0,151.647
2,3229,2,16.0,24.0,110.0,156.738
2,3062,2,20.0,6.0,100.0,179.625
3,2883,3,16.0,8.0,800.0,208.95
3,484,3,19.0,42.0,450.0,220.0692
3,1557,3,22.0,6.0,200.0,226.6554
4,2637,4,23.0,24.0,650.0,227.2272


* 禁止分组 group_keys=False

In [6]:
df_data.groupby('LeagueIndex', group_keys=False).apply(top_n)

Unnamed: 0,LeagueIndex,Age,HoursPerWeek,TotalHours,APM
1753,1,20.0,28.0,100.0,139.6362
2246,1,27.0,8.0,250.0,141.6282
2214,1,20.0,12.0,730.0,172.953
1520,2,29.0,6.0,250.0,151.647
3229,2,16.0,24.0,110.0,156.738
3062,2,20.0,6.0,100.0,179.625
2883,3,16.0,8.0,800.0,208.95
484,3,19.0,42.0,450.0,220.0692
1557,3,22.0,6.0,200.0,226.6554
2637,4,23.0,24.0,650.0,227.2272
