In [1]:
import pandas as pd
import numpy as np

In [2]:
# pivot_table is the pandas counterpart of a PivotTable in Excel.

In [2]:
# Load the sales-funnel workbook (first sheet by default) into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_excel(io='sales-funnel.xlsx')

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,Account,Name,Rep,Manager,Product,Quantity,Price,Status
0,714466,Trantow-Barrows,Craig Booker,Debra Henley,CPU,1,30000,presented
1,714466,Trantow-Barrows,Craig Booker,Debra Henley,Software,1,10000,presented
2,714466,Trantow-Barrows,Craig Booker,Debra Henley,Maintenance,2,5000,pending
3,737550,"Fritsch, Russel and Anderson",Craig Booker,Debra Henley,CPU,1,35000,declined
4,146832,Kiehn-Spinka,Daniel Hilton,Debra Henley,CPU,2,65000,won


In [7]:
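# When building a pivot table, the easiest approach is to go step by step: add one item at a time and check each result to verify that you are getting what you expect.
# The simplest pivot table needs at least one grouping key (index or columns); here we start with index.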

In [None]:
# Manager

In [4]:
# Group by Manager and take the mean (the default aggfunc) of the numeric columns.
# The value columns are listed explicitly: without `values`, pandas tries to
# average every remaining column, and pandas >= 2.0 raises TypeError on the
# string columns (Name, Rep, Product, Status) instead of silently dropping them.
# NOTE(review): Account looks like an identifier, so its mean is not meaningful;
# it is kept only so the result matches the three columns shown in the output.
df.pivot_table(index='Manager', values=['Account', 'Price', 'Quantity'])

Unnamed: 0_level_0,Account,Price,Quantity
Manager,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Debra Henley,513112.222222,26111.111111,1.444444
Fred Anderson,405039.0,35875.0,2.125


In [12]:
# Manager Rep

In [5]:
# Two-level row index: Manager, then Rep; default aggregation is the mean.
# The value columns are listed explicitly: without `values`, pandas tries to
# average every remaining column, and pandas >= 2.0 raises TypeError on the
# string columns (Name, Product, Status) instead of silently dropping them.
# NOTE(review): Account looks like an identifier, so its mean is not meaningful;
# it is kept only so the result matches the three columns shown in the output.
df.pivot_table(index=['Manager', 'Rep'], values=['Account', 'Price', 'Quantity'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Account,Price,Quantity
Manager,Rep,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Debra Henley,Craig Booker,720237.0,20000.0,1.25
Debra Henley,Daniel Hilton,194874.0,38333.333333,1.666667
Debra Henley,John Smith,576220.0,20000.0,1.5
Fred Anderson,Cedric Moss,196016.5,27500.0,1.25
Fred Anderson,Wendy Yule,614061.5,44250.0,3.0


In [14]:
# 指定 value

In [6]:
# Restrict the aggregation to a single value column: the mean Price
# for every (Manager, Rep) pair.
pd.pivot_table(
    df,
    index=['Manager', 'Rep'],
    values='Price',
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price
Manager,Rep,Unnamed: 2_level_1
Debra Henley,Craig Booker,20000.0
Debra Henley,Daniel Hilton,38333.333333
Debra Henley,John Smith,20000.0
Fred Anderson,Cedric Moss,27500.0
Fred Anderson,Wendy Yule,44250.0


In [17]:
# Change how the values are aggregated
# aggfunc accepts a list of functions, e.g. sum, count (len) and mean

In [9]:
# Several aggregations at once: total, row count and mean of Price per (Manager, Rep).
# 'sum' and 'mean' are passed as string aliases rather than np.sum / np.mean:
# handing NumPy callables to a groupby-backed aggregation is deprecated
# (FutureWarning in pandas >= 2.1). The aliases give the same numbers and the
# same 'sum' / 'mean' column labels. `len` is kept so the count column is
# still labelled 'len'.
df.pivot_table(
    index=['Manager', 'Rep'],
    values='Price',
    aggfunc=['sum', len, 'mean'],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,len,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,Price,Price,Price
Manager,Rep,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Debra Henley,Craig Booker,80000,4,20000.0
Debra Henley,Daniel Hilton,115000,3,38333.333333
Debra Henley,John Smith,40000,2,20000.0
Fred Anderson,Cedric Moss,110000,4,27500.0
Fred Anderson,Wendy Yule,177000,4,44250.0


In [24]:
# 显示相关产品销售情况
# columns

In [11]:
# Break the Price total down by Product: one column per product under a 'sum' level.
# (Manager, Rep) pairs with no sales of a product show NaN.
# The string alias 'sum' replaces np.sum, whose use as an aggfunc is deprecated
# (FutureWarning in pandas >= 2.1); the result and the 'sum' label are unchanged.
df.pivot_table(
    index=['Manager', 'Rep'],
    columns='Product',
    values='Price',
    aggfunc=['sum'],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,sum,sum,sum
Unnamed: 0_level_1,Product,CPU,Maintenance,Monitor,Software
Manager,Rep,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Debra Henley,Craig Booker,65000.0,5000.0,,10000.0
Debra Henley,Daniel Hilton,105000.0,,,10000.0
Debra Henley,John Smith,35000.0,5000.0,,
Fred Anderson,Cedric Moss,95000.0,5000.0,,10000.0
Fred Anderson,Wendy Yule,165000.0,7000.0,5000.0,


In [28]:
# 对缺失值，填充为0
# fill_value

In [12]:
# Same breakdown by Product, with missing combinations filled with 0 instead of NaN.
# The string alias 'sum' replaces np.sum, whose use as an aggfunc is deprecated
# (FutureWarning in pandas >= 2.1); the result and the 'sum' label are unchanged.
df.pivot_table(
    index=['Manager', 'Rep'],
    columns='Product',
    values='Price',
    aggfunc=['sum'],
    fill_value=0,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,sum,sum,sum
Unnamed: 0_level_1,Product,CPU,Maintenance,Monitor,Software
Manager,Rep,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Debra Henley,Craig Booker,65000,5000,0,10000
Debra Henley,Daniel Hilton,105000,0,0,10000
Debra Henley,John Smith,35000,5000,0,0
Fred Anderson,Cedric Moss,95000,5000,0,10000
Fred Anderson,Wendy Yule,165000,7000,5000,0


In [30]:
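# Add row and column totals
# margins=True appends an "All" row and an "All" column
# The totals are computed with the same function(s) passed as aggfunc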

In [15]:
# Add an "All" row and an "All" column holding the totals; the margins are
# computed with the same aggregation ('sum') as the body of the table.
# The string alias 'sum' replaces np.sum, whose use as an aggfunc is deprecated
# (FutureWarning in pandas >= 2.1); the result and the 'sum' label are unchanged.
df.pivot_table(
    index=['Manager', 'Rep'],
    columns='Product',
    values='Price',
    aggfunc=['sum'],
    fill_value=0,
    margins=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,sum,sum,sum,sum
Unnamed: 0_level_1,Product,CPU,Maintenance,Monitor,Software,All
Manager,Rep,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Debra Henley,Craig Booker,65000,5000,0,10000,80000
Debra Henley,Daniel Hilton,105000,0,0,10000,115000
Debra Henley,John Smith,35000,5000,0,0,40000
Fred Anderson,Cedric Moss,95000,5000,0,10000,110000
Fred Anderson,Wendy Yule,165000,7000,5000,0,177000
All,,465000,22000,5000,30000,522000


In [34]:
# Going further
# aggfunc also accepts a dict that maps each value column to its own aggregation

In [19]:
# Per-column aggregation via a dict: mean of Price and total of Quantity,
# broken down by Product, with missing combinations filled with 0.
# String aliases replace np.mean / np.sum, whose use as aggfuncs is deprecated
# (FutureWarning in pandas >= 2.1); the computed values are the same.
df.pivot_table(
    index=['Manager', 'Rep'],
    columns='Product',
    values=['Price', 'Quantity'],
    aggfunc={'Price': 'mean', 'Quantity': 'sum'},
    fill_value=0,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Price,Price,Price,Quantity,Quantity,Quantity,Quantity
Unnamed: 0_level_1,Product,CPU,Maintenance,Monitor,Software,CPU,Maintenance,Monitor,Software
Manager,Rep,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Debra Henley,Craig Booker,32500,5000,0,10000,2,2,0,1
Debra Henley,Daniel Hilton,52500,0,0,10000,4,0,0,1
Debra Henley,John Smith,35000,5000,0,0,1,2,0,0
Fred Anderson,Cedric Moss,47500,5000,0,10000,3,1,0,1
Fred Anderson,Wendy Yule,82500,7000,5000,0,7,3,2,0
