In [1]:
import numpy as np
import pandas as pd

In [37]:
# Build a small demo frame: 3 dates x 2 codes (`No`) x 2 times = 12 rows,
# plus two integer indicator columns (HJ89, VT56).
# The generator is seeded so the random columns -- and every result derived
# from them further down -- are the same on each Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

data = pd.DataFrame({
    'Date': ['08-24'] * 4 + ['08-25'] * 4 + ['08-26'] * 4,
    'Time': ['09:00', '12:00'] * 6,
    'No': (['SC00178'] * 2 + ['SC00181'] * 2) * 3,
    'HJ89': rng.integers(0, 20, 12),   # 12 ints drawn from [0, 20)
    'VT56': rng.integers(10, 20, 12),  # 12 ints drawn from [10, 20)
})
data

Unnamed: 0,Date,Time,No,HJ89,VT56
0,08-24,09:00,SC00178,7,18
1,08-24,12:00,SC00178,15,16
2,08-24,09:00,SC00181,2,11
3,08-24,12:00,SC00181,8,18
4,08-25,09:00,SC00178,4,18
5,08-25,12:00,SC00178,15,11
6,08-25,09:00,SC00181,9,11
7,08-25,12:00,SC00181,8,18
8,08-26,09:00,SC00178,16,17
9,08-26,12:00,SC00178,7,11


### 分组

In [38]:
# `by` names the column(s) whose values define the groups.
# The result is a DataFrameGroupBy object: displaying it shows only its repr,
# not the grouped rows. It is iterable, and each step of the iteration yields
# a (group key, DataFrame-of-that-group) pair.
agg = data.groupby("Date")
agg

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x137dd9340>

In [39]:
# Walk the groups: every iteration unpacks one (group key, sub-DataFrame) pair.
for group_key, group_frame in agg:
    print("[name]", group_key, "<type>", type(group_key))
    print("[frame]", type(group_frame))
    print("<value>\n", group_frame)

[name] 08-24 <type> <class 'str'>
[frame] <class 'pandas.core.frame.DataFrame'>
<value>
     Date   Time       No  HJ89  VT56
0  08-24  09:00  SC00178     7    18
1  08-24  12:00  SC00178    15    16
2  08-24  09:00  SC00181     2    11
3  08-24  12:00  SC00181     8    18
[name] 08-25 <type> <class 'str'>
[frame] <class 'pandas.core.frame.DataFrame'>
<value>
     Date   Time       No  HJ89  VT56
4  08-25  09:00  SC00178     4    18
5  08-25  12:00  SC00178    15    11
6  08-25  09:00  SC00181     9    11
7  08-25  12:00  SC00181     8    18
[name] 08-26 <type> <class 'str'>
[frame] <class 'pandas.core.frame.DataFrame'>
<value>
      Date   Time       No  HJ89  VT56
8   08-26  09:00  SC00178    16    17
9   08-26  12:00  SC00178     7    11
10  08-26  09:00  SC00181     5    15
11  08-26  12:00  SC00181    16    14


In [40]:
# Materialise the grouped object as a dict:
# key -> the group's key, value -> that group's block of rows (a DataFrame).
entries = {group_key: group_frame for group_key, group_frame in agg}
entries["08-24"]

Unnamed: 0,Date,Time,No,HJ89,VT56
0,08-24,09:00,SC00178,7,18
1,08-24,12:00,SC00178,15,16
2,08-24,09:00,SC00181,2,11
3,08-24,12:00,SC00181,8,18


### 聚合

In [41]:
# Mean of the HJ89 indicator within each `No` group.
# Approach 1: group the frame first, then pick the column.
# NOTE: picking the column with a single label yields a Series.
(
    data
    .groupby("No")["HJ89"]
    .mean()
)

No
SC00178    10.666667
SC00181     8.000000
Name: HJ89, dtype: float64

In [49]:
# NOTE: picking the column with a list of labels yields a DataFrame instead.
(
    data
    .groupby("No")[["HJ89"]]
    .mean()
)

Unnamed: 0_level_0,HJ89
No,Unnamed: 1_level_1
SC00178,10.666667
SC00181,8.0


In [42]:
# Mean of the HJ89 indicator within each `No` group.
# Approach 2: pick the column first, then group that Series by the values of
# the `No` column.
(
    data["HJ89"]
    .groupby(data["No"])
    .mean()
)

No
SC00178    10.666667
SC00181     8.000000
Name: HJ89, dtype: float64

In [43]:
# Mean of the HJ89 indicator within each `No` group, returned as a flat table.
# as_index=False keeps the group key `No` as an ordinary column and gives the
# result a default integer row index; with the default as_index=True the group
# key becomes the row index instead.
(
    data
    .groupby("No", as_index=False)["HJ89"]
    .mean()
)

Unnamed: 0,No,HJ89
0,SC00178,10.666667
1,SC00181,8.0


In [44]:
# Min-max spread of the HJ89 indicator for each stock (one group per `No`).
# This is done with a user-defined aggregation function.
def diff(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` only needs to provide ``.max()`` and ``.min()`` (for example a
    pandas Series or a NumPy array).
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

# Apply the custom `diff` aggregation to HJ89 within each `No` group;
# as_index=False keeps `No` as a regular column in the result.
(
    data
    .groupby("No", as_index=False)["HJ89"]
    .agg(diff)
)

Unnamed: 0,No,HJ89
0,SC00178,12
1,SC00181,14


In [55]:
# Mean of every indicator column (HJ89 and VT56) within each (Date, No) group.
# Passing a list to `by` groups on several keys at once; as_index=False keeps
# both keys as regular columns in the result.
(
    data
    .groupby(["Date", "No"], as_index=False)[["HJ89", "VT56"]]
    .mean()
)

Unnamed: 0,Date,No,HJ89,VT56
0,08-24,SC00178,11.0,17.0
1,08-24,SC00181,5.0,14.5
2,08-25,SC00178,9.5,14.5
3,08-25,SC00181,8.5,14.5
4,08-26,SC00178,11.5,14.0
5,08-26,SC00181,10.5,14.5
