In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the sample employee table (columns: company, salary, age) from a CSV
# file resolved relative to the current working directory.
data = pd.read_csv(filepath_or_buffer='groupby.csv')

In [3]:
# Display the loaded table as the cell's output (10 rows: company, salary, age).
data

Unnamed: 0,company,salary,age
0,A,13,40
1,A,49,18
2,C,15,16
3,B,40,49
4,A,48,40
5,A,12,27
6,B,17,18
7,B,5,39
8,B,37,19
9,B,15,18


In [9]:
# groupby 可以理解为，按照哪些字段进行分组

In [10]:
#  以 company 为分组
#  会返回一个 DataFrameGroupBy 对象

In [4]:
# Split the table by the 'company' column. Nothing is aggregated yet; the
# result is a lazy DataFrameGroupBy handle that later cells inspect.
g = data.groupby(by='company')

In [5]:
# Echo the grouped object: it renders as a DataFrameGroupBy repr, not as rows.
g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x10daba850>

In [14]:
# 可以通过 list() 查看 DataFrameGroupBy 对象的内部情况
# 按 company 进行 groupby 后，会得到 A、B、C 三个分组
# 总结来说，groupby 的过程就是将原有的 DataFrame 按照 groupby 的字段（这里是 company），划分为若干个分组 DataFrame；list(g) 的每个元素是一个 (分组名, 分组 DataFrame) 元组

In [6]:
# Materialise the groups: iterating a DataFrameGroupBy yields one
# (group key, sub-DataFrame) pair per distinct company.
[(company, members) for company, members in g]

[('A',
    company  salary  age
  0       A      13   40
  1       A      49   18
  4       A      48   40
  5       A      12   27),
 ('B',
    company  salary  age
  3       B      40   49
  6       B      17   18
  7       B       5   39
  8       B      37   19
  9       B      15   18),
 ('C',
    company  salary  age
  2       C      15   16)]

In [None]:
# 通过 get_group() 方法，我们可以查看某一个分组的数据情况

In [10]:
# Pull out only the rows belonging to company 'B' as a regular DataFrame.
g.get_group(name='B')

Unnamed: 0,company,salary,age
3,B,40,49
6,B,17,18
7,B,5,39
8,B,37,19
9,B,15,18


In [16]:
# 通过groupby 分组后，可以通过 agg 函数，进行聚合操作

In [17]:
# min    最小值
# max    最大值
# sum    求和
# mean   均值
# median 中位数
# count  计数
# nunique 计算去重后个数(类似SQL count distinct)

In [18]:
# 计算不同公司员工的平均年龄和平均薪水

In [11]:
# Show the raw table again as a reference point for the aggregations.
data

Unnamed: 0,company,salary,age
0,A,13,40
1,A,49,18
2,C,15,16
3,B,40,49
4,A,48,40
5,A,12,27
6,B,17,18
7,B,5,39
8,B,37,19
9,B,15,18


In [12]:
# Average every non-key column (salary, age) within each company.
(
    data
    .groupby(by='company')
    .agg('mean')
)

Unnamed: 0_level_0,salary,age
company,Unnamed: 1_level_1,Unnamed: 2_level_1
A,30.5,31.25
B,22.8,28.6
C,15.0,16.0


In [20]:
# 指定列，进行操作
# 计算不同公司员工的平均年龄以及薪水的中位数

In [13]:
# Per company: mean of age and median of salary.
# The mapping passed to agg() is {column name: aggregation to apply}.
(
    data
    .groupby(by='company')
    .agg({
        'age': 'mean',
        'salary': 'median',
    })
)

Unnamed: 0_level_0,age,salary
company,Unnamed: 1_level_1,Unnamed: 2_level_1
A,31.25,30.5
B,28.6,17.0
C,16.0,15.0


In [22]:
# 计算不同公司员工的平均年龄，以及薪水的中位数、均值和计数（salary 没有缺失值时，count 即该公司的人数）

In [22]:
# Per company: mean age, plus median / mean / non-null count of salary.
# Giving a list of aggregations for a column produces a two-level column
# header (column name on top, aggregation name underneath).
(
    data
    .groupby(by='company')
    .agg({
        'age': 'mean',
        'salary': ['median', 'mean', 'count'],
    })
)

Unnamed: 0_level_0,age,salary,salary,salary
Unnamed: 0_level_1,mean,median,mean,count
company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,31.25,30.5,30.5,4
B,28.6,17.0,22.8,5
C,16.0,15.0,15.0,1


In [15]:
# 扩展
#  count   计数,count不包含NaN值
#  size    计数时包含NaN值
#  nunique 计算去重后个数(类似SQL count distinct)

In [23]:
# Small demo frame with two missing Val entries (rows 3 and 4), used to
# contrast the count / size / nunique aggregations.
df = pd.DataFrame(
    dict(
        Name=["Alice", "Bob", "Mallory", "Mallory", "Bob", "Mallory"],
        City=["Seattle", "Seattle", "Portland", "Seattle", "Seattle", "Portland"],
        Val=[4, 3, 3, np.nan, np.nan, 4],
    )
)

In [24]:
# Display the demo frame; rows 3 and 4 were built with a NaN in Val.
df

Unnamed: 0,Name,City,Val
0,Alice,Seattle,4.0
1,Bob,Seattle,3.0
2,Mallory,Portland,3.0
3,Mallory,Seattle,
4,Bob,Seattle,
5,Mallory,Portland,4.0


In [27]:
# Per city: 'count' ignores NaN in Val while 'size' counts every row, so the
# two differ wherever Val is missing; 'nunique' counts distinct names.
(
    df
    .groupby(by='City')
    .agg({
        'Val': ['count', 'size'],
        'Name': 'nunique',
    })
)

Unnamed: 0_level_0,Val,Val,Name
Unnamed: 0_level_1,count,size,nunique
City,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Portland,2,2,1
Seattle,2,4,3


In [26]:
# Show the demo frame once more for comparison with the per-city counts.
df

Unnamed: 0,Name,City,Val
0,Alice,Seattle,4.0
1,Bob,Seattle,3.0
2,Mallory,Portland,3.0
3,Mallory,Seattle,
4,Bob,Seattle,
5,Mallory,Portland,4.0
