# 数据聚合与分组运算

In [1]:
import pandas as pd
import numpy as np

## 1、groupby函数

In [25]:
# Small demo frame used by every cell below: two label columns (key1, key2)
# and two integer value columns (data1, data2). The column order is kept as
# key1, data1, key2, data2 because it determines the printed layout.
df = pd.DataFrame(
    {
        'key1': ['a', 'b', 'c', 'a', 'b'],
        'data1': [1, 2, 3, 4, 5],
        'key2': ['a1', 'a1', 'c1', 'd1', 'a1'],
        'data2': [1, 2, 3, 5, 4],
    }
)
print(df)

  key1  data1 key2  data2
0    a      1   a1      1
1    b      2   a1      2
2    c      3   c1      3
3    a      4   d1      5
4    b      5   a1      4


In [18]:
# Group the data1 values by the labels found in key1.
# The resulting groups are:
#   a -> 1, 4
#   b -> 2, 5
#   c -> 3
dg1 = df['data1'].groupby(by=df['key1'])

# First the per-group frequency of every distinct data1 value, then the
# descriptive statistics (count, mean, std, min, quartiles, max) per group.
print(dg1.value_counts(), dg1.describe(), sep='\n')

key1  data1
a     1        1
      4        1
b     2        1
      5        1
c     3        1
Name: data1, dtype: int64
      count  mean      std  min   25%  50%   75%  max
key1                                                 
a       2.0   2.5  2.12132  1.0  1.75  2.5  3.25  4.0
b       2.0   3.5  2.12132  2.0  2.75  3.5  4.25  5.0
c       1.0   3.0      NaN  3.0  3.00  3.0  3.00  3.0


In [21]:
# Group the data1 values by the (key1, key2) label pairs, which yields a
# two-level group index.
dg2 = df['data1'].groupby(
    by=[df['key1'], df['key2']],
)

# First the per-group frequency of every distinct data1 value, then the
# descriptive statistics of each (key1, key2) group.
print(dg2.value_counts(), dg2.describe(), sep='\n')

key1  key2  data1
a     a1    1        1
      d1    4        1
b     a1    2        1
            5        1
c     c1    3        1
Name: data1, dtype: int64
           count  mean      std  min   25%  50%   75%  max
key1 key2                                                 
a    a1      1.0   1.0      NaN  1.0  1.00  1.0  1.00  1.0
     d1      1.0   4.0      NaN  4.0  4.00  4.0  4.00  4.0
b    a1      2.0   3.5  2.12132  2.0  2.75  3.5  4.25  5.0
c    c1      1.0   3.0      NaN  3.0  3.00  3.0  3.00  3.0


In [28]:
# Group the whole frame by key1.
dg3 = df.groupby('key1')

# Per-group column sums (not counts). numeric_only=True restricts the sum to
# the numeric columns data1/data2; without it, pandas >= 2.0 also "sums" the
# string column key2 by concatenating its values.
print(dg3.sum(numeric_only=True))

# Number of rows in each group.
print(dg3.size())

      data1  data2
key1              
a         5      6
b         7      6
c         3      3
key1
a    2
b    2
c    1
dtype: int64


In [34]:
# Iterating over a groupby yields (group label, sub-frame) pairs.
# Walk the groups twice: once keyed by key1 alone, then by (key1, key2),
# printing the label, a separator, the rows of the group and a divider.
for keys in ('key1', ['key1', 'key2']):
    for name, group in df.groupby(keys):
        print(name, "====", group, '---------------', sep='\n')

a
====
  key1  data1 key2  data2
0    a      1   a1      1
3    a      4   d1      5
---------------
b
====
  key1  data1 key2  data2
1    b      2   a1      2
4    b      5   a1      4
---------------
c
====
  key1  data1 key2  data2
2    c      3   c1      3
---------------
('a', 'a1')
====
  key1  data1 key2  data2
0    a      1   a1      1
---------------
('a', 'd1')
====
  key1  data1 key2  data2
3    a      4   d1      5
---------------
('b', 'a1')
====
  key1  data1 key2  data2
1    b      2   a1      2
4    b      5   a1      4
---------------
('c', 'c1')
====
  key1  data1 key2  data2
2    c      3   c1      3
---------------


## 2、数据聚合
**即对已分组好的组进行函数处理**

In [64]:
# Built-in aggregations can be called directly on the grouped frame dg3,
# for example (left commented out on purpose):
#   print(dg3.sum())   # per-group sum
#   print(dg3.max())   # per-group maximum
#   print(dg3.min())   # per-group minimum


# Custom aggregation function
def max_min(arr):
    """Return the spread of ``arr``: its largest value minus its smallest.

    ``arr`` only needs to provide ``.max()`` and ``.min()`` whose results
    support subtraction (e.g. a numeric pandas Series or NumPy array).
    """
    largest = arr.max()
    smallest = arr.min()
    return largest - smallest
    
# Aggregate only the numeric columns. max_min subtracts values, which is not
# defined for the string column key2; older pandas silently dropped such
# columns, while pandas >= 2.0 raises a TypeError instead.
numeric_groups = dg3[['data1', 'data2']]

# Custom aggregation: one spread (max - min) per group and column.
print(numeric_groups.agg(max_min))

# Several aggregations at once: built-ins by name plus the custom function.
print(numeric_groups.agg(['sum', 'max', max_min]))

      data1  data2
key1              
a         3      4
b         3      2
c         0      0
     data1             data2            
       sum max max_min   sum max max_min
key1                                    
a        5   4       3     6   5       4
b        7   5       3     6   4       2
c        3   3       0     3   3       0


In [81]:
# Number of rows in each key1 group.
print(dg1.size())

# Broadcast every group's sum back onto the original rows, so the result is
# aligned with df's index. The aggregation is passed by name: handing pandas
# the Python builtin `sum` is deprecated (FutureWarning since pandas 2.1).
data1_group_sum = dg1.transform('sum')
print(data1_group_sum)

# Largest data1 value per group (same result as applying the builtin max to
# each group, without going through the deprecated builtin-callable path).
print(dg1.max())

# Place the group sums next to the original columns.
# NOTE: the Series is still named 'data1', so the combined frame has two
# columns with that label; rename it (e.g. .rename('data1_sum')) if the new
# column has to be selected by name afterwards.
print(pd.concat([df, data1_group_sum], axis=1))

key1
a    2
b    2
c    1
Name: data1, dtype: int64
0    5
1    7
2    3
3    5
4    7
Name: data1, dtype: int64
key1
a    4
b    5
c    3
Name: data1, dtype: int64
  key1  data1 key2  data2  data1
0    a      1   a1      1      5
1    b      2   a1      2      7
2    c      3   c1      3      3
3    a      4   d1      5      5
4    b      5   a1      4      7
