In [1]:
# -*- encoding: utf-8 -*-
import numpy as np
import pandas as pd

# Mock data: six users indexed by name, with deliberately messy values
# (missing entries, trailing whitespace, a blank city).
index = pd.Index(["Tom", "Bob", "Mary", "James", "Andy", "Alice"], name="name")
data = dict(
    age=[18, 30, np.nan, 40, np.nan, 30],
    city=["Bei Jing ", "Shang Hai ", "Guang Zhou", "Shen Zhen", np.nan, " "],
    sex=[None, "male", "female", "male", np.nan, "unknown"],
    birth=["2000-02-10", "1988-10-17", None, "1978-08-08", np.nan, "1988-10-17"],
)
# Build the frame, then parse the birth strings into datetimes
# (missing birth values show up as NaT in the printed frame).
user_info = pd.DataFrame(data, index=index).assign(
    birth=lambda frame: pd.to_datetime(frame["birth"])
)
print(user_info)
print("*" * 60)


        age        city      sex      birth
name                                       
Tom    18.0   Bei Jing      None 2000-02-10
Bob    30.0  Shang Hai      male 1988-10-17
Mary    NaN  Guang Zhou   female        NaT
James  40.0   Shen Zhen     male 1978-08-08
Andy    NaN         NaN      NaN        NaT
Alice  30.0              unknown 1988-10-17
************************************************************


In [2]:
# Group-and-aggregate: further processing of the already-filtered data.
# Grouping: group by a column when it contains repeated values;
# do not group by the index.
g1 = user_info.groupby(by="sex")
print(g1.groups)


{'female': ['Mary'], 'male': ['Bob', 'James'], 'unknown': ['Alice']}


In [3]:
# Group by two keys at once: every distinct (sex, age) pair forms a group.
g2 = user_info.groupby(by=["sex", "age"])
print(g2.groups)


{('female', nan): ['Mary'], ('male', 30.0): ['Bob'], ('male', 40.0): ['James'], ('unknown', 30.0): ['Alice'], (nan, 18.0): ['Tom'], (nan, nan): ['Andy']}


In [4]:
# sort=True: the group keys are listed in sorted order.
g3 = user_info.groupby(by=["sex"], sort=True)
print(g3.groups)


{'female': ['Mary'], 'male': ['Bob', 'James'], 'unknown': ['Alice']}


In [5]:
# sort=False: the group keys keep the order in which they first appear.
g4 = user_info.groupby(by=["sex"], sort=False)
print(g4.groups)


{'male': ['Bob', 'James'], 'female': ['Mary'], 'unknown': ['Alice']}


In [6]:
print("*" * 60)
# Iterate over the groups: n is the group key, g holds that group's rows.
for n, g in g4:
    # One call with a newline separator prints the key, then the rows.
    print(n, g, sep="\n")


************************************************************
male
        age        city   sex      birth
name                                    
Bob    30.0  Shang Hai   male 1988-10-17
James  40.0   Shen Zhen  male 1978-08-08
female
      age        city     sex birth
name                               
Mary  NaN  Guang Zhou  female   NaT
unknown
        age city      sex      birth
name                                
Alice  30.0       unknown 1988-10-17


In [7]:
# Aggregation: we group in order to compute statistics, and statistics
# require an aggregation step.
# Here len measures the length of each group's "age" values, i.e. it is a
# per-group row count (a row with a missing age is still counted).
df2 = g4["age"].aggregate(len)
print(df2)


sex
male       2.0
female     1.0
unknown    1.0
Name: age, dtype: float64


In [8]:
# Apply the same len aggregation at three levels:
#   df3 - every column, grouped by sex
#   df4 - the "age" column only, grouped by (sex, age)
#   df5 - every column, grouped by (sex, age)
df3 = g4.aggregate(len)
df4 = g2["age"].aggregate(len)
df5 = g2.aggregate(len)
for result in (df3, df4, df5):
    print(result)


         age  city  birth
sex                      
male     2.0     2      2
female   1.0     1      1
unknown  1.0     1      1
sex      age 
male     30.0    1.0
         40.0    1.0
unknown  30.0    1.0
Name: age, dtype: float64
              city  birth
sex     age              
male    30.0     1      1
        40.0     1      1
unknown 30.0     1      1


In [9]:
# Turn the multi-level index into ordinary columns so the data is easier
# to read.
# Option 1: reset the index of the already-aggregated result.
flattened = df5.reset_index()
print(flattened)
# Option 2: group with as_index=False so the keys stay as regular columns.
g5 = user_info.groupby(by=["sex", "age"], as_index=False)
df6 = g5.aggregate(len)
print(df6)


       sex   age  city  birth
0     male  30.0     1      1
1     male  40.0     1      1
2  unknown  30.0     1      1
       sex   age  city  birth
0     male  30.0     1      1
1     male  40.0     1      1
2  unknown  30.0     1      1
