In [2]:
# 引入模块
# -*- coding:utf-8 -*-

# 常用包的函数
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from numpy.random import randn
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

# Fix incorrect rendering of Chinese characters in matplotlib figures:
# use SimHei as the sans-serif font.
mpl.rcParams['font.sans-serif'] = ['SimHei']
# Draw the minus sign as an ASCII hyphen instead of the Unicode minus
# (presumably because SimHei lacks that glyph -- TODO confirm).
mpl.rcParams['axes.unicode_minus'] = False

%matplotlib inline

# Display precision. The fully qualified key is required: the bare pattern
# 'precision' also matches 'styler.format.precision' in pandas >= 1.4 and
# raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.precision', 6)
# Render every float with exactly two decimals when displayed.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Allow up to 100 rows before the display is truncated.
pd.options.display.max_rows = 100


In [None]:
# pandas的分组运算
# 根据一个或者多个键拆分pandas对象
# 计算分组摘要统计，如计数、平均值、标准差，或用户自定义函数
# 对DataFrame的列应用各种函数
# 应用组内转换和运算，如规格化、线性回归、排名或选取子集
# 计算透视表或交叉表
# 执行分位数分析以及其他分组分析

In [4]:
# Sample frame used throughout: two categorical key columns and two columns
# of random draws (values differ on every run because no seed is set).
df = DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': randn(5),
        'data2': randn(5),
    }
)
df

Unnamed: 0,data1,data2,key1,key2
0,-0.75,-0.96,a,one
1,1.38,-1.78,a,two
2,0.15,1.15,b,one
3,0.51,-0.54,b,two
4,0.24,-0.22,a,one


In [6]:
# Group by key1 in order to compute the mean of the data1 column:
# select data1, then call groupby with the key1 column as the grouping key.
grouped = df['data1'].groupby(by=df['key1'])
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x0000028BED074390>

In [8]:
# grouped is a GroupBy object here.
# Nothing has actually been computed yet; it only holds intermediate data
# about the group key df['key1'], i.e. it already has everything needed to
# apply an operation to each group.
grouped.mean()

key1
a   0.29
b   0.33
Name: data1, dtype: float64

In [10]:
# Group data1 by two keys at once; the result is a Series with a
# two-level (key1, key2) index.
means = (
    df['data1']
    .groupby([df['key1'], df['key2']])
    .mean()
)
means

key1  key2
a     one    -0.26
      two     1.38
b     one     0.15
      two     0.51
Name: data1, dtype: float64

In [11]:
# Pivot the innermost index level (key2) into columns; level=-1 is the default.
means.unstack(level=-1)

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.26,1.38
b,0.15,0.51


In [12]:
# Redisplay the sample frame for reference.
df

Unnamed: 0,data1,data2,key1,key2
0,-0.75,-0.96,a,one
1,1.38,-1.78,a,two
2,0.15,1.15,b,one
3,0.51,-0.54,b,two
4,0.24,-0.22,a,one


In [15]:
# Iterating over a GroupBy yields (group name, sub-frame) pairs.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

a
   data1  data2 key1 key2
0  -0.75  -0.96    a  one
1   1.38  -1.78    a  two
4   0.24  -0.22    a  one
b
   data1  data2 key1 key2
2   0.15   1.15    b  one
3   0.51  -0.54    b  two


In [16]:
# Redisplay the sample frame for reference.
df

Unnamed: 0,data1,data2,key1,key2
0,-0.75,-0.96,a,one
1,1.38,-1.78,a,two
2,0.15,1.15,b,one
3,0.51,-0.54,b,two
4,0.24,-0.22,a,one


In [18]:
# With multiple keys, the first element of each yielded pair is itself a
# tuple of the key values.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print('{} {}'.format(k1, k2))
    print(group)

a one
   data1  data2 key1 key2
0  -0.75  -0.96    a  one
4   0.24  -0.22    a  one
a two
   data1  data2 key1 key2
1   1.38  -1.78    a  two
b one
   data1  data2 key1 key2
2   0.15   1.15    b  one
b two
   data1  data2 key1 key2
3   0.51  -0.54    b  two


In [19]:
# Collect the per-group slices of the data into a dict keyed by group name.
pieces = {key: frame for key, frame in df.groupby('key1')}
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,0.15,1.15,b,one
3,0.51,-0.54,b,two


In [21]:
# groupby groups along axis=0 by default; it can also be set up to group
# along any other axis. The column dtypes shown here serve as the grouping
# key in the next cell.
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [24]:
# Group the columns (rather than the rows) by their dtype.
# DataFrame.groupby(..., axis=1) is deprecated since pandas 2.1, so transpose
# instead: the column labels become the index of df.T, and df.dtypes (indexed
# by column label) aligns with it as the grouping key.
# Note: grouped is therefore a GroupBy over the transposed frame.
grouped = df.T.groupby(df.dtypes)
# Transpose each piece back to the original orientation. The round trip
# through .T leaves mixed-dtype data as object, so infer_objects() restores
# the numeric dtypes.
{dtype: piece.T.infer_objects() for dtype, piece in grouped}

{dtype('float64'):    data1  data2
 0  -0.75  -0.96
 1   1.38  -1.78
 2   0.15   1.15
 3   0.51  -0.54
 4   0.24  -0.22, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

In [26]:
# Selecting a column or a subset of columns:
# indexing a GroupBy object created from a DataFrame with a column name
# (or a list of column names) restricts the aggregation to those columns, i.e.
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
# are syntactic sugar for
df['data1'].groupby(df['key1'])
# List selection gives a DataFrameGroupBy over data2, mirroring the
# [['data2']] form above.
df[['data2']].groupby(df['key1'])

<pandas.core.groupby.DataFrameGroupBy object at 0x0000028BF0B81B70>

In [29]:
# On large datasets you may only want to aggregate some of the columns.
# Here only the mean of data2 is computed; selecting with the list
# [['data2']] returns the result as a DataFrame.
(
    df
    .groupby(['key1', 'key2'])[['data2']]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.59
a,two,-1.78
b,one,1.15
b,two,-0.54


In [4]:
# Two-level column index: outer level 'city', inner level 'tenor'.
columns = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],
        [1, 3, 5, 1, 3],
    ],
    names=['city', 'tenor'],
)
columns

MultiIndex(levels=[['JP', 'US'], [1, 3, 5]],
           labels=[[1, 1, 1, 0, 0], [0, 1, 2, 0, 1]],
           names=['city', 'tenor'])

In [6]:
# 4x5 frame of random draws laid out under the hierarchical column index.
hier_df = DataFrame(data=randn(4, 5), columns=columns)
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.19,-0.04,-0.24,-1.54,-0.8
1,1.41,-1.88,0.94,-0.95,0.27
2,-0.46,-0.91,1.23,-0.07,-0.49
3,-1.33,-2.34,0.59,0.26,0.7


In [7]:
# count 分组中非NA值的数量
# sum 非NA值的和
# mean 非NA值的平均值
# median 非NA值的算术中位数
# std var 标准差和方差
# min max 非NA值的最小值和最大值
# prod 非NA值的乘积
# first last 第一个和最后一个非NA的值

In [8]:
# 层次化索引数据集最方便的地方就在于它能够根据索引级别进行数据聚合