In [1]:
import numpy as np
import pandas as pd
import random

In [102]:
# 查看Pandas版本号
pd.__version__

'0.23.4'

### Pandas三个基本数据结构

#### Series
1 核心：series相比于ndarray，是一个自带索引index的数组 → 一维数组 + 对应索引

2 series和dict相比，series更像一个有顺序的字典

In [103]:
lst = [0.25, 0.5, 0.75, 1]
data = pd.Series(lst)
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [104]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [105]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [106]:
data[1]

0.5

In [107]:
data[1:3]

1    0.50
2    0.75
dtype: float64

In [108]:
# Numpy数组通过隐式定义的整数索引获取数值，而Pandas的Series对象用一种显式定义的索引与数值关联。
lst = [0.25, 0.5, 0.75, 1]
ind = list('abcd')
data = pd.Series(lst,index=ind)
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [109]:
data['b']

0.5

In [110]:
data[['c','d','a']]

c    0.75
d    1.00
a    0.25
dtype: float64

In [111]:
data[data>0.5]

c    0.75
d    1.00
dtype: float64

In [112]:
data*100

a     25.0
b     50.0
c     75.0
d    100.0
dtype: float64

In [113]:
np.exp(data)

a    1.284025
b    1.648721
c    2.117000
d    2.718282
dtype: float64

In [114]:
#也可以使用不连续或不按顺序的索引
lst = [0.25, 0.5, 0.75, 1]
ind = [2,5,3,7]
data = pd.Series(lst,index=ind)
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [115]:
data[5]

0.5

In [116]:
# 用字典创建Series,其索引会默认按照顺序排列
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)

In [117]:
population['California']

38332521

In [118]:
population['California':'Illinois']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [119]:
# 创建Series，pd.Series(data, index = index)

In [120]:
# data 为Numpy数组
S1 = pd.Series(np.array([2,4,6]))
S1

0    2
1    4
2    6
dtype: int32

In [121]:
# data 为列表
S2 = pd.Series([2,4,6])
S2

0    2
1    4
2    6
dtype: int64

In [122]:
# data为标量
S3 = pd.Series(5, index = [100,200,300])
S3

100    5
200    5
300    5
dtype: int64

In [123]:
# data为字典,Series只保留显式定义的键值对
data = {2:'a', 1:'b', 3:'c'}
S4 = pd.Series(data, index = [3,2])
S4

3    c
2    a
dtype: object

In [124]:
# pandas中使用isnull与notnull函数检查缺失数据
S4['v'] = None
pd.isnull(S4)

3    False
2    False
v     True
dtype: bool

In [125]:
pd.notnull(S4)

3     True
2     True
v    False
dtype: bool

In [126]:
# Series对象和其索引都有name属性
S4.name = 'nikola'
S4.index.name = 'cardle'
print(S4)
S4.rename('HAHA') 

cardle
3       c
2       a
v    None
Name: nikola, dtype: object


cardle
3       c
2       a
v    None
Name: HAHA, dtype: object

In [127]:
# Series的索引可以通过按位置赋值的方式进行改变
S4

cardle
3       c
2       a
v    None
Name: nikola, dtype: object

In [128]:
S4.index = list('cmy')
S4.index.name = 'cardle'
S4

cardle
c       c
m       a
y    None
Name: nikola, dtype: object

In [129]:
S4['y'] = 100
S4['y']

100

In [4]:
#通过给index赋值修改索引
a = pd.Series(list('abcd'))

In [6]:
a.index = list('mxyz')
a

m    a
x    b
y    c
z    d
dtype: object

#### DataFrame

In [130]:
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [131]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [132]:
states = pd.DataFrame({'population': population,
                       'area': area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [133]:
states.head(2)

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662


In [134]:
states.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [135]:
states.loc['New York']

population    19651127
area            141297
Name: New York, dtype: int64

In [136]:
states['debt'] = 100
states

Unnamed: 0,population,area,debt
California,38332521,423967,100
Texas,26448193,695662,100
New York,19651127,141297,100
Florida,19552860,170312,100
Illinois,12882135,149995,100


In [137]:
states['debt'] = np.arange(5)
states

Unnamed: 0,population,area,debt
California,38332521,423967,0
Texas,26448193,695662,1
New York,19651127,141297,2
Florida,19552860,170312,3
Illinois,12882135,149995,4


In [138]:
states['debt'] = pd.Series(np.arange(2),index = ['New York','Florida'])
states

Unnamed: 0,population,area,debt
California,38332521,423967,
Texas,26448193,695662,
New York,19651127,141297,0.0
Florida,19552860,170312,1.0
Illinois,12882135,149995,


In [139]:
del states['debt']
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [140]:
# DataFrame属性
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [141]:
states.columns

Index(['population', 'area'], dtype='object')

In [142]:
states.values

array([[38332521,   423967],
       [26448193,   695662],
       [19651127,   141297],
       [19552860,   170312],
       [12882135,   149995]], dtype=int64)

In [143]:
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [144]:
# 创建DataFrame对象
# 1>通过二维数组创建
obj = pd.DataFrame([[1,2],[2,3],[3,4]],
            columns = ['foo', 'bar'],
            index = list('abc'))
obj

Unnamed: 0,foo,bar
a,1,2
b,2,3
c,3,4


In [145]:
# 2>由numpy创建的结构化数组创建
pd.DataFrame(np.random.random(12).reshape(4,3),index = list('abcd'),columns = list('xyz'))

Unnamed: 0,x,y,z
a,0.898038,0.545221,0.307739
b,0.418202,0.666636,0.56584
c,0.482091,0.008555,0.595637
d,0.477229,0.250868,0.031358


In [146]:
# 3> 利用单个Series对象创建DataFrame
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [147]:
# 4> 通过Series对象字典创建
pdDataFrame({'population': population,
              'area': area})

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [148]:
# 5> 通过单一字典创建，但此种方式必须指定显示指定行索引，否则会报错
data4 = {'a':1,'b':2}
data4
pd.DataFrame(data4,index = ['x'])

Unnamed: 0,a,b
x,1,2


In [149]:
# 6>通过嵌套字典创建DataFrame,字典键作为列，内部字典键作为行索引
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
pd.DataFrame(pop)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [150]:
# 7> 通过数组，列表和元组组成的字典创建
data1 = {'a':[1,2,3],  'b':[3,4,5], 'c':[5,6,7]}
pd.DataFrame(data1)

Unnamed: 0,a,b,c
0,1,3,5
1,2,4,6
2,3,5,7


In [151]:
data2 ={'one':np.random.rand(3),'two':np.random.rand(3)}
pd.DataFrame(data2)

Unnamed: 0,one,two
0,0.344309,0.03567
1,0.750323,0.52669
2,0.136667,0.23727


In [152]:
# 8>由单一列表创建
pd.DataFrame(list('abcde'),index=list('xyzmn'),columns=['column'])

Unnamed: 0,column
x,a
y,b
z,c
m,d
n,e


In [153]:
# 9> 由列表或元组构成的列表创建
data3 = [[1,2,3], [3,4,5],[5,6,7]]
pd.DataFrame(data3, index = list('abc'), columns = list('xyz'))

Unnamed: 0,x,y,z
a,1,2,3
b,3,4,5
c,5,6,7


In [154]:
# 10> 通过字典构成的列表创建
data = [{'a':i, 'b':2**i} for i in range(3)]
print(data)
print('-----------------')
pd.DataFrame(data)

[{'a': 0, 'b': 1}, {'a': 1, 'b': 2}, {'a': 2, 'b': 4}]
-----------------


Unnamed: 0,a,b
0,0,1
1,1,2
2,2,4


In [155]:
pd.DataFrame([{'a':1,"b":2},{'b':3, 'd':4}])

Unnamed: 0,a,b,d
0,1.0,2,
1,,3,4.0


#### Index对象

In [156]:
# 可以将Index看作是一个不可变数组或有序集合,它与Numpy数组之间的不同在于Index对象的索引是不可变的
ind = pd.Index([3,3,4,7,11])
ind

Int64Index([3, 3, 4, 7, 11], dtype='int64')

In [157]:
ind[1]

3

In [158]:
ind[::2]

Int64Index([3, 4, 11], dtype='int64')

In [159]:
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [16]:
# 重建索引
data = random.sample(range(1,100),4)
index = list('abcd')
s1 = pd.Series(data, index =index)
s1

a    83
b    95
c    51
d    87
dtype: int64

In [161]:
s2 = s1.reindex(list('abcde'))
s2

a    30.0
b    70.0
c    59.0
d    37.0
e     NaN
dtype: float64

In [162]:
# 缺失值向前填充method，向后填充为bfill
s3 = s1.reindex(list('abcdef'),method='ffill')
s3

a    30
b    70
c    59
d    37
e    37
f    37
dtype: int64

In [163]:
df = pd.DataFrame(np.arange(9).reshape(3,3),
                 index = list('acd'),
                 columns = list('xyz'))
df

Unnamed: 0,x,y,z
a,0,1,2
c,3,4,5
d,6,7,8


In [164]:
df1 = df.reindex(index = list('abcd'),columns = ['x','y','m'])
df1

Unnamed: 0,x,y,m
a,0.0,1.0,
b,,,
c,3.0,4.0,
d,6.0,7.0,


In [165]:
# 更为简洁的标签索引方法
p = list('abd')
q = list('xym')
df1.loc[p,q]

Unnamed: 0,x,y,m
a,0.0,1.0,
b,,,
d,6.0,7.0,


In [166]:
# 轴向上删除条目
print(df1)
print(df1.drop(['a','b']))
print(df1.drop(['m'],axis=1))

     x    y   m
a  0.0  1.0 NaN
b  NaN  NaN NaN
c  3.0  4.0 NaN
d  6.0  7.0 NaN
     x    y   m
c  3.0  4.0 NaN
d  6.0  7.0 NaN
     x    y
a  0.0  1.0
b  NaN  NaN
c  3.0  4.0
d  6.0  7.0


### 数据取值与选择

In [26]:
data = [0.25, 0.5, 0.75,1]
index = list('abcd')
s = pd.Series(data, index = index)
s

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [168]:
s['b']

0.5

In [169]:
'b' in s

True

In [170]:
s.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [171]:
s.values

array([0.25, 0.5 , 0.75, 1.  ])

In [172]:
list(s.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [173]:
# Series通过索引赋值拓展
s['e'] =100
s

a      0.25
b      0.50
c      0.75
d      1.00
e    100.00
dtype: float64

In [174]:
# Series显示切片
s['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [175]:
# Series隐式切片
s[:3]

a    0.25
b    0.50
c    0.75
dtype: float64

In [176]:
# 掩码
s[(s>=0.5) & (s<1)]

b    0.50
c    0.75
dtype: float64

In [177]:
# 花式索引
s[['a','c']]

a    0.25
c    0.75
dtype: float64

In [178]:
# 索引器loc,iloc和ix
data = list('abc')
index = [1,3,5]
s = pd.Series(data,index = index)
s

1    a
3    b
5    c
dtype: object

In [179]:
# 显示索引,避免混淆引入索引器
s[5]

'c'

In [180]:
# 切片隐式索引
s[1:3]

3    b
5    c
dtype: object

In [181]:
# loc显示索引
s.loc[5]

'c'

In [182]:
s.loc[3:5]

3    b
5    c
dtype: object

In [183]:
#iloc隐式索引
s.iloc[1]

'b'

In [184]:
s.iloc[1:3]

3    b
5    c
dtype: object

In [185]:
# DataFrame数据选择方法
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [186]:
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [187]:
data.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [188]:
data['density']=data['pop']/data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [189]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [190]:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [191]:
data.iloc[:3,:2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [192]:
data.loc[:'Illinois',:'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [193]:
data.loc[data.density>100,['pop','density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


### Pandas数值运算方法

#### 保留索引

In [194]:
rng = np.random.RandomState(1)
ser = rng.randint(0,100,4)
ser

array([37, 12, 72,  9])

In [195]:
df = pd.DataFrame(rng.randint(0,10,(3,4)),columns = list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,5,0,0,1
1,7,6,9,2
2,4,5,2,4


In [196]:
np.exp(ser)

array([1.17191424e+16, 1.62754791e+05, 1.85867175e+31, 8.10308393e+03])

In [197]:
np.exp(df)

Unnamed: 0,A,B,C,D
0,148.413159,1.0,1.0,2.718282
1,1096.633158,403.428793,8103.083928,7.389056
2,54.59815,148.413159,7.389056,54.59815


#### 索引对齐

In [198]:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                  'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127}, name='population')

In [199]:
area

Alaska        1723337
Texas          695662
California     423967
Name: area, dtype: int64

In [200]:
population

California    38332521
Texas         26448193
New York      19651127
Name: population, dtype: int64

In [201]:
area/population

Alaska             NaN
California    0.011060
New York           NaN
Texas         0.026303
dtype: float64

In [202]:
area.index|population.index

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [203]:
A = pd.Series([2,4,6])
B = pd.Series([1,3,5],index = [1,2,3])
print(A)
print(B)

0    2
1    4
2    6
dtype: int64
1    1
2    3
3    5
dtype: int64


In [204]:
A+B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [205]:
A.add(B, fill_value= 0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [206]:
A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                 columns=list('AB'))
A

Unnamed: 0,A,B
0,11,10
1,14,18


In [207]:
B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                 columns=list('BAC'))
B

Unnamed: 0,B,A,C
0,4,7,7
1,9,1,7
2,0,6,9


In [208]:
A+B

Unnamed: 0,A,B,C
0,18.0,14.0,
1,15.0,27.0,
2,,,


In [209]:
fill = A.stack()
fill.mean()

13.25

In [210]:
A.add(B, fill_value = fill.mean())

Unnamed: 0,A,B,C
0,18.0,14.0,20.25
1,15.0,27.0,20.25
2,19.25,13.25,22.25


In [211]:
A = rng.randint(10, size=(3, 4))
A

array([[9, 7, 6, 9],
       [1, 0, 1, 8],
       [8, 3, 9, 8]])

In [212]:
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,-8,-7,-5,-1
2,-1,-4,3,-1


In [213]:
df.subtract(df['R'], axis=0) #axis=0指广播沿着X轴进行

Unnamed: 0,Q,R,S,T
0,2,0,-1,2
1,1,0,1,8
2,5,0,6,5


In [214]:
half = df.iloc[0,::2]
half

Q    9
S    6
Name: 0, dtype: int32

In [215]:
# 只会沿着显式的指定的值广播
print(df)
df-half

   Q  R  S  T
0  9  7  6  9
1  1  0  1  8
2  8  3  9  8


Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,-8.0,,-5.0,
2,-1.0,,3.0,


#### 函数的映射

In [216]:
df = pd.DataFrame(np.random.randn(4,3), columns=list('xyz'), index=list('abcd') )
df

Unnamed: 0,x,y,z
a,-1.097632,1.628977,-0.126475
b,-0.069048,2.014971,1.506189
c,-0.485492,-0.865264,-0.080881
d,-0.79569,-0.143022,-0.830975


In [217]:
np.abs(df)

Unnamed: 0,x,y,z
a,1.097632,1.628977,0.126475
b,0.069048,2.014971,1.506189
c,0.485492,0.865264,0.080881
d,0.79569,0.143022,0.830975


In [218]:
f = lambda x:x.max()-x.min()
df.apply(f)

x    1.028585
y    2.880235
z    2.337164
dtype: float64

In [219]:
df.apply(f, axis=1)

a    2.726609
b    2.084019
c    0.784383
d    0.687953
dtype: float64

In [220]:
def f(x):
    return pd.Series([x.min(),x.max()], index=['min','max'])

In [221]:
df.apply(f)

Unnamed: 0,x,y,z
min,-1.097632,-0.865264,-0.830975
max,-0.069048,2.014971,1.506189


In [222]:
format = lambda m : '%.2f' % m
df.applymap(format)

Unnamed: 0,x,y,z
a,-1.1,1.63,-0.13
b,-0.07,2.01,1.51
c,-0.49,-0.87,-0.08
d,-0.8,-0.14,-0.83


In [223]:
# 修改单列为百分比格式
f1 = lambda x: '%.2f%%' % (x*100)

In [224]:
df['x'] = df['x'].apply(f1)
df

Unnamed: 0,x,y,z
a,-109.76%,1.628977,-0.126475
b,-6.90%,2.014971,1.506189
c,-48.55%,-0.865264,-0.080881
d,-79.57%,-0.143022,-0.830975


In [225]:
df['y'].map(f1)

a    162.90%
b    201.50%
c    -86.53%
d    -14.30%
Name: y, dtype: object

#### 排序与排名

In [226]:
# Series建立过程会默认按照索引排序
obj = pd.Series(range(4), index = list('dbac'))
obj

d    0
b    1
a    2
c    3
dtype: int64

In [227]:
# Series按照值排序
obj.sort_values(ascending = False)

c    3
a    2
b    1
d    0
dtype: int64

In [228]:
frame = pd.DataFrame(np.arange(8).reshape((2,4)), index = ['three','one'], columns = list('dbca'))
frame

Unnamed: 0,d,b,c,a
three,0,1,2,3
one,4,5,6,7


In [229]:
#DF按照索引排序，默认axis=0
frame.sort_index()

Unnamed: 0,d,b,c,a
one,4,5,6,7
three,0,1,2,3


In [230]:
#DF按照索引排序（先按照列索引降序，然后按照行索引升序）
frame.sort_index(axis='columns',ascending = False).sort_index(ascending = True)

Unnamed: 0,d,c,b,a
one,4,6,5,7
three,0,2,1,3


In [231]:
# 排序过程缺失值的位置
obj = pd.Series([4, np.NaN, 7 ,np.nan ,-3, 2])
obj

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

In [232]:
# NaN默认排序在最后
obj.sort_values(na_position='first')

1    NaN
3    NaN
4   -3.0
5    2.0
0    4.0
2    7.0
dtype: float64

In [233]:
# NaN 指定排序在最前
obj.sort_values(na_position='last')

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [234]:
# DF排序指定‘by'参数
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [235]:
frame.sort_values(by=['a','b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [236]:
# DF实现按照行排序，按照列排序，按照值排序
frame.sort_index().sort_index(axis = 1).sort_values(by='a')

Unnamed: 0,a,b
0,0,4
2,0,-3
1,1,7
3,1,2


In [237]:
# Series排名
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [238]:
#默认排名方式为method=‘average'即相同排名的情况取排名值得平均值
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [239]:
obj.rank(method = 'first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [240]:
obj.rank(method = 'dense')

0    5.0
1    1.0
2    5.0
3    4.0
4    3.0
5    2.0
6    4.0
dtype: float64

In [241]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [242]:
frame.rank(axis = 1,ascending = False)

Unnamed: 0,b,a,c
0,1.0,2.0,3.0
1,1.0,3.0,2.0
2,3.0,2.0,1.0
3,1.0,2.0,3.0


#### 函数重复值标签的轴索引

In [243]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [244]:
#检查索引值是否唯一
obj.index.unique()

Index(['a', 'b', 'c'], dtype='object')

In [245]:
df = pd.DataFrame(np.random.randn(4,3),index = list('aabb'))
df

Unnamed: 0,0,1,2
a,0.045552,0.851084,0.657508
a,0.219123,-0.915801,1.852851
b,0.063889,0.404446,-0.51811
b,-0.866617,-0.64249,-0.615988


In [246]:
df.loc['b']

Unnamed: 0,0,1,2
b,0.063889,0.404446,-0.51811
b,-0.866617,-0.64249,-0.615988


#### 描述性统计的概述与计算

In [247]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [248]:
# 默认返回列上加和的Series
df.sum()

one    9.25
two   -5.80
dtype: float64

In [249]:
# Na值会默认被排除在计算当中
df.sum(axis = 1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [250]:
# 默认返回单列中的最大最小值的索引标签号
df.idxmax()

one    b
two    d
dtype: object

In [251]:
df.idxmin()

one    d
two    b
dtype: object

In [252]:
# 一次性产生多个汇总,针对数值型数据
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [253]:
# 针对非数值型数据
obj = pd.Series(list('aabc')*4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [254]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

#### 相关性和协方差

In [255]:
import pandas_datareader.data as web

In [256]:
all_data = {ticker:web.get_data_yahoo(ticker) for ticker in ['AAPL','IBM','MSFT','GOOG']}

ConnectionError: HTTPSConnectionPool(host='finance.yahoo.com', port=443): Max retries exceeded with url: /quote/AAPL/history?period1=1262289600&period2=1556049599&interval=1d&frequency=1d&filter=history (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x000001F19C447160>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。'))

In [None]:
all_data

In [None]:
price = pd.DataFrame({ticker:data['Adj Close'] for ticker,data in all_data.items()})
price.head()

In [None]:
volume = pd.DataFrame({ticker:data['Volume'] for ticker, data in all_data.items()})
volume.head()

In [None]:
df form(x):
    return x

In [None]:
# df.pct_change()即后一个值减去前一个值，再除以前一个值，相同与日环比
returns = price.pct_change()#.applymap(lambda x: '%.2f%%' % (x*100))
returns.tail()

In [None]:
# 计算两只股票的相关性
returns['MSFT'].corr(returns['IBM'])

In [None]:
# 计算两个股票的协方差
returns['MSFT'].cov(returns['IBM'])

In [None]:
# 更为简洁的表达
returns.MSFT.corr(returns.IBM)

In [None]:
# DF直接计算两两的相关性与协方差
returns.corr()

In [None]:
returns.cov()

In [None]:
# 返回行或列与另外一个序列DF的相关性
returns.corrwith(returns.IBM)

#### 唯一值计数和成员属性

In [None]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj

In [None]:
obj.unique()

In [None]:
obj.value_counts()

In [None]:
pd.value_counts(obj.values, sort=False)

In [None]:
mask = obj.isin(['b', 'c'])
mask

In [None]:
obj[mask]

In [None]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

In [258]:
# 获取多个相关列的直方图数据
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [259]:
result = data.apply(pd.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


### 第7章：数据清洗与准备

#### 处理缺失值

In [4]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
string_data.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [266]:
string_data[0] =None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [267]:
pd.isnull(string_data)

0     True
1    False
2     True
3    False
dtype: bool

#### 过滤缺失值

In [8]:
from numpy import nan as NA

In [9]:
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
# 只要含有na值就删除
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [283]:
# 只删除全部为na值的行
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [285]:
data[3]=NA
data

Unnamed: 0,0,1,2,3
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [286]:
#在列方向上删除NAN,how默认值为any即删除所有含有nan的行
data.dropna(how='all',axis=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
df = pd.DataFrame(np.random.randn(7,3))
df

Unnamed: 0,0,1,2
0,0.619787,-1.029487,0.456126
1,-0.214287,0.043178,-0.20394
2,0.139635,1.060607,-0.555656
3,-0.736977,1.493798,0.463952
4,1.48983,-0.178036,1.219982
5,-0.283586,0.157924,1.132628
6,0.188258,0.302137,-0.715227


In [17]:
df.iloc[:4,1] = NA
df

Unnamed: 0,0,1,2
0,0.619787,,0.456126
1,-0.214287,,-0.20394
2,0.139635,,-0.555656
3,-0.736977,,0.463952
4,1.48983,-0.178036,1.219982
5,-0.283586,0.157924,1.132628
6,0.188258,0.302137,-0.715227


In [18]:
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,0.619787,,
1,-0.214287,,
2,0.139635,,-0.555656
3,-0.736977,,0.463952
4,1.48983,-0.178036,1.219982
5,-0.283586,0.157924,1.132628
6,0.188258,0.302137,-0.715227


In [19]:
df.dropna()

Unnamed: 0,0,1,2
4,1.48983,-0.178036,1.219982
5,-0.283586,0.157924,1.132628
6,0.188258,0.302137,-0.715227


In [20]:
# 保留含有两个非空值的行
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.139635,,-0.555656
3,-0.736977,,0.463952
4,1.48983,-0.178036,1.219982
5,-0.283586,0.157924,1.132628
6,0.188258,0.302137,-0.715227


#### 补全缺失值

In [21]:
df

Unnamed: 0,0,1,2
0,0.619787,,
1,-0.214287,,
2,0.139635,,-0.555656
3,-0.736977,,0.463952
4,1.48983,-0.178036,1.219982
5,-0.283586,0.157924,1.132628
6,0.188258,0.302137,-0.715227


In [303]:
# 执行所有na填充为指定值
df.fillna(value = 0)

Unnamed: 0,0,1,2
0,-1.048073,0.0,0.0
1,0.905735,0.0,0.0
2,-2.223779,0.0,0.772754
3,0.009582,0.0,0.404117
4,-0.242436,0.56651,-0.808025
5,-0.414826,-0.711173,0.360844
6,-1.271862,-0.166739,0.630906


In [304]:
# 利用字典的形式指定列填充对应值
df.fillna({1:0.5,2:3})

Unnamed: 0,0,1,2
0,-1.048073,0.5,3.0
1,0.905735,0.5,3.0
2,-2.223779,0.5,0.772754
3,0.009582,0.5,0.404117
4,-0.242436,0.56651,-0.808025
5,-0.414826,-0.711173,0.360844
6,-1.271862,-0.166739,0.630906


In [307]:
# 利用插值法填充na,limit用于向前或向后填充时最大的填充范围
df.fillna(method='bfill',limit=2)

Unnamed: 0,0,1,2
0,-1.048073,,0.772754
1,0.905735,,0.772754
2,-2.223779,0.56651,0.772754
3,0.009582,0.56651,0.404117
4,-0.242436,0.56651,-0.808025
5,-0.414826,-0.711173,0.360844
6,-1.271862,-0.166739,0.630906


In [308]:
data = pd.Series([1,NA,3,5,NA,7])
data

0    1.0
1    NaN
2    3.0
3    5.0
4    NaN
5    7.0
dtype: float64

In [309]:
# 利用平均值补全Na值
data.fillna(data.mean())

0    1.0
1    4.0
2    3.0
3    5.0
4    4.0
5    7.0
dtype: float64

#### 数据转化

In [54]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [48]:
# 判定Series是否重复

In [49]:
data = pd.Series([1,2,3,3,1])
data

0    1
1    2
2    3
3    3
4    1
dtype: int64

In [50]:
data.duplicated()

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [51]:
# 确认行与行之间是否存在重复值的情况，但按照顺序重复行中的第一行并不标记为重复
data.duplicated()

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [52]:
data.drop_duplicates()

0    1
1    2
2    3
dtype: int64

In [55]:
data['v1']=range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [60]:
# subset参数确认判定列，即以指定列在列方向上的重复值返回布尔值
data.duplicated(['k1'])

0    False
1    False
2     True
3     True
4     True
5     True
6     True
dtype: bool

In [57]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [58]:
data.drop_duplicates(['k1','k2'],keep='first')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5


In [319]:
#duplicated和drop_duplicates默认都时保留第一个观测到的值，可选：first，last，False
data.drop_duplicates(['k1','k2'],keep=False)

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4


In [321]:
data.drop_duplicates(['k1','k2'],keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


#### 使用函数和映射进行数据转换

In [322]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [323]:
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}

In [324]:
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [325]:
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [326]:
# 等价代码
data['animal'] = data['food'].map(lambda x:meat_to_animal[x.lower()])
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


#### 替代值

In [61]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [62]:
# 替代一个值
data.replace(-999,NA)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [329]:
# 一次替代多个值
data.replace([-999,-1000],NA)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [330]:
# 将不同的值替换为不同的值
data.replace({-999:'a',-1000:'b'})

0    1
1    a
2    2
3    a
4    b
5    3
dtype: object

In [336]:
# 也可以以列表的方式替换,第一个列表为要替换的值，第二个列表为替换后的值
data.replace([-999,-1000],['a','b'])

0    1
1    a
2    2
3    a
4    b
5    3
dtype: object

#### 重命名轴索引

In [338]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [339]:
transform = lambda x:x[:4].upper()

In [341]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [343]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [356]:
data.rename(index = str.title,columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [357]:
# 变更行列名称应用
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [352]:
# 尝试解决DF中单列的转化，利用map应用lambda进行
df = pd.DataFrame(np.random.randn(4,3),index=['one','two','three','four'])
df

Unnamed: 0,0,1,2
one,-1.789471,1.125341,0.199546
two,-1.894278,1.259244,-1.410265
three,0.175478,0.231638,0.159104
four,-0.407458,-0.711151,-1.767895


In [353]:
df[0]=['asddf','asdijflk','opiyqwe','lkj']
df

Unnamed: 0,0,1,2
one,asddf,1.125341,0.199546
two,asdijflk,1.259244,-1.410265
three,opiyqwe,0.231638,0.159104
four,lkj,-0.711151,-1.767895


In [354]:
df[0]=df[0].map(lambda x:x[:7].upper())
df

Unnamed: 0,0,1,2
one,ASDDF,1.125341,0.199546
two,ASDIJFL,1.259244,-1.410265
three,OPIYQWE,0.231638,0.159104
four,LKJ,-0.711151,-1.767895


In [360]:
df.rename(index={'one':'a','two':'b','three':'c',"four":'d'},columns={0:'x',1:'y',2:'z'},inplace=True)
df

Unnamed: 0,x,y,z
a,ASDDF,1.125341,0.199546
b,ASDIJFL,1.259244,-1.410265
c,OPIYQWE,0.231638,0.159104
d,LKJ,-0.711151,-1.767895


#### 离散化和分箱

In [63]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18,25,35,60,100]
labels = list('ABCD')

In [64]:
# 将数据分段归属，默认左开右边区间，可以修改right参数改变为左闭右开
cats = pd.cut(ages,bins,right=False) # right=True为默认左开右闭
cats

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [372]:
cats.codes

array([0, 0, 1, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [373]:
cats.categories

IntervalIndex([[18, 25), [25, 35), [35, 60), [60, 100)]
              closed='left',
              dtype='interval[int64]')

In [374]:
pd.value_counts(cats,ascending=False)

[25, 35)     4
[18, 25)     4
[35, 60)     3
[60, 100)    1
dtype: int64

In [65]:
# 为分段进行命名
cats = pd.cut(ages,bins,right=False,labels=labels) 
pd.value_counts(cats).sort_index()

A    4
B    4
C    3
D    1
dtype: int64

In [76]:
# 传入整数的个箱数，替代自定义的边界值
# precision将十进制精度限制为两位
data = np.random.rand(20)
c1 = pd.cut(data,bins=4,precision=2,labels=list('abcd'),retbins=True)
data.sort()
data

array([6.46608840e-06, 1.20037757e-02, 3.76931762e-02, 6.85049728e-02,
       1.07035524e-01, 1.16586288e-01, 1.89322900e-01, 4.07529697e-01,
       5.54141005e-01, 6.59353801e-01, 6.69056619e-01, 6.75059535e-01,
       6.87524311e-01, 6.96350568e-01, 7.03651235e-01, 7.30724255e-01,
       8.34387107e-01, 8.36237345e-01, 9.09028488e-01, 9.86962476e-01])

In [71]:
# 传入整数的个箱数，替代自定义的边界值
# precision将十进制精度限制为两位
data = np.random.rand(20)
c1 = pd.cut(data,bins=4,precision=2,retbins=False,include_lowest=True)
c1

[(0.11, 0.33], (0.11, 0.33], (0.54, 0.75], (0.75, 0.97], (0.54, 0.75], ..., (0.11, 0.33], (0.75, 0.97], (0.54, 0.75], (0.75, 0.97], (0.75, 0.97]]
Length: 20
Categories (4, interval[float64]): [(0.11, 0.33] < (0.33, 0.54] < (0.54, 0.75] < (0.75, 0.97]]

In [72]:
# 传入整数的个箱数，替代自定义的边界值
# precision将十进制精度限制为两位
data = np.random.rand(20)
c1 = pd.cut(data,bins=4,precision=2,retbins=False,include_lowest=False)
c1

[(0.0028, 0.25], (0.73, 0.97], (0.0028, 0.25], (0.0028, 0.25], (0.49, 0.73], ..., (0.49, 0.73], (0.0028, 0.25], (0.25, 0.49], (0.73, 0.97], (0.0028, 0.25]]
Length: 20
Categories (4, interval[float64]): [(0.0028, 0.25] < (0.25, 0.49] < (0.49, 0.73] < (0.73, 0.97]]

In [67]:
pd.value_counts(c1)

a    6
d    5
b    5
c    4
dtype: int64

In [394]:
#qcut基于样本分位数进行分箱
data = np.random.randn(1000)
cats = pd.qcut(data,4,labels=list('ABCD'),precision=1)
cats

[B, D, B, B, B, ..., A, B, A, A, B]
Length: 1000
Categories (4, object): [A < B < C < D]

In [396]:
pd.value_counts(cats).sort_index()

A    250
B    250
C    250
D    250
dtype: int64

In [398]:
#qcut参数可以直接传入分位数
cats = pd.qcut(data,[0.1,0.3,0.5,0.9,1],precision=1)
cats

[(-0.6, -0.02], (1.3, 2.9], (-0.6, -0.02], (-0.6, -0.02], (-1.4000000000000001, -0.6], ..., NaN, (-0.6, -0.02], NaN, (-1.4000000000000001, -0.6], (-0.6, -0.02]]
Length: 1000
Categories (4, interval[float64]): [(-1.4000000000000001, -0.6] < (-0.6, -0.02] < (-0.02, 1.3] < (1.3, 2.9]]

#### 检测和过滤异常值

In [91]:
data = pd.DataFrame(np.random.randn(1000,4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.096428,-0.02416,0.019188,-0.009492
std,1.016385,1.018289,1.036855,0.995981
min,-2.859235,-3.250887,-3.055285,-3.273383
25%,-0.801991,-0.715054,-0.709916,-0.664535
50%,-0.112315,-0.005763,0.026759,-0.038066
75%,0.589389,0.733668,0.691885,0.64771
max,3.499846,3.112271,3.309386,3.232722


In [78]:
# 找出一列中绝对值大于三的值
col = data[2]
col[np.abs(col)>3]

821   -3.558985
972   -3.127898
Name: 2, dtype: float64

In [79]:
a = np.abs(data)
a[a[2]>3]

Unnamed: 0,0,1,2,3
821,0.57798,0.62422,3.558985,0.91694
972,1.921863,0.653172,3.127898,1.057685


In [87]:
pd.value_counts((np.abs(data) > 3).any(axis=1))

False    991
True       9
dtype: int64

In [80]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
98,0.461837,3.522187,-1.304365,1.275297
114,-0.309053,-4.635216,0.258907,-0.556353
404,3.326457,0.937037,0.038374,-0.784079
701,-1.289208,-1.991523,-0.649372,3.084233
702,0.883743,-1.369807,-0.211309,3.055515
821,0.57798,0.62422,-3.558985,0.91694
846,-0.420008,-1.828525,0.705385,3.368355
856,1.394688,-3.220477,-0.086766,-1.328374
972,1.921863,-0.653172,-3.127898,1.057685


In [94]:
np.sign(data)*3

Unnamed: 0,0,1,2,3
0,3.0,3.0,3.0,-3.0
1,-3.0,-3.0,3.0,-3.0
2,-3.0,3.0,-3.0,3.0
3,-3.0,3.0,-3.0,-3.0
4,3.0,3.0,3.0,3.0
5,-3.0,3.0,3.0,3.0
6,3.0,3.0,-3.0,3.0
7,-3.0,3.0,3.0,3.0
8,3.0,3.0,3.0,-3.0
9,3.0,-3.0,3.0,3.0


In [441]:
data[np.abs(data)>3]

Unnamed: 0,0,1,2,3
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
5,,,,
6,,,,
7,,,,
8,,,,
9,,,,


In [97]:
data[np.abs(data)>3] = np.sign(data)*3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.101001,-0.024022,0.018934,-0.009417
std,1.006191,1.00928,1.027994,0.976758
min,-2.859235,-2.905874,-2.806546,-2.917564
25%,-0.801991,-0.715054,-0.709916,-0.664535
50%,-0.112315,-0.005763,0.026759,-0.038066
75%,0.589389,0.733668,0.691885,0.64771
max,2.890304,2.737949,2.871775,2.882762


#### 置换和随机抽样

In [99]:
np.random.permutation(10)

array([6, 3, 7, 5, 8, 1, 9, 0, 4, 2])

In [98]:
#numpy.random.permution对DF,Se进行置换（随机重排序）
df = pd.DataFrame(np.arange(20).reshape(5,4))
sampler = np.random.permutation(5) #[ˌpɜːmjuˈteɪʃn]
sampler

array([3, 4, 1, 2, 0])

In [103]:
# 按照sampler的顺序在行索引上排列
df.take(sampler)

Unnamed: 0,0,1,2,3
3,12,13,14,15
4,16,17,18,19
1,4,5,6,7
2,8,9,10,11
0,0,1,2,3


In [414]:
# 选择随机子集
df.sample(n=3)

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
1,4,5,6,7


In [107]:
# replace为True时n可以大于实际元素个数
choices = pd.Series([5,7,-1,6,4])
draws = choices.sample(n=10, replace=True)
draws

0    5
4    4
3    6
0    5
1    7
4    4
2   -1
1    7
2   -1
2   -1
dtype: int64

#### 计算指标/虚拟变量

In [109]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [110]:
# 类似于数据透视表中，行为索引，列为不重复的指定列值，值为唯一的交叉计数个数
pd.get_dummies(df['key'])  #[ˈdʌmiz]原型

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [111]:
#对DF使用len函数，返回的时DF的行数
len(pd.get_dummies(df['key']))

6

In [450]:
dummies = pd.get_dummies(df['key'], prefix = 'key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [454]:
#df[['data1']]为DF，而df['data1']时Series
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [112]:
# 以下为理解extend与append，extend添加元素，append添加整体
a = "A|B|C".split("|")

In [113]:
a.extend(('1'))
a

['A', 'B', 'C', '1']

In [114]:
a.append(['1'])
a

['A', 'B', 'C', '1', ['1']]

In [475]:
np.random.seed(12345)
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [478]:
bins=np.linspace(0,1,6)
bins

array([0. , 0.2, 0.4, 0.6, 0.8, 1. ])

In [479]:
pd.cut(values,bins)

[(0.8, 1.0], (0.2, 0.4], (0.0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.4, 0.6], (0.8, 1.0], (0.6, 0.8], (0.6, 0.8], (0.6, 0.8]]
Categories (5, interval[float64]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]

In [480]:
pd.get_dummies(pd.cut(values,bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


In [133]:
a = pd.Series([5,2,3,4,5])
for i,j in a.items():
    print (i,j)

0 5
1 2
2 3
3 4
4 5


#### 字符串操作

In [2]:
val = 'a,b,  guido'

In [4]:
val.split(',')

['a', 'b', '  guido']

In [8]:
a = [x.strip() for x in val.split(',')]

In [9]:
f,s,t = [x.strip() for x in val.split(',')]
f+'::'+s+'::'+t

'a::b::guido'

In [11]:
'::'.join(a)

'a::b::guido'

#### 正则表达式

In [12]:
import re

In [13]:
text = 'foo    bar\t baz   \tqux'
re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

In [14]:
#自行编译,形成一个可复用的正则表达式
regex = re.compile('\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

#### Pandas中的向量化字符串函数

### 第八章：数据规整：连接|联合|重塑

#### 分层索引

In [140]:
data = pd.Series(np.random.randn(9),
                 index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                        [1, 2, 3, 1, 3, 1, 2, 2, 3]])
data

a  1    1.114055
   2    0.323292
   3   -1.008647
b  1   -0.997351
   3    0.183910
c  1   -0.992652
   2    0.171583
d  2    1.014932
   3   -0.039178
dtype: float64

In [17]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])

In [18]:
data['b']

1   -1.529284
3   -1.120928
dtype: float64

In [19]:
data['b':'c']

b  1   -1.529284
   3   -1.120928
c  1   -0.470564
   2   -0.383729
dtype: float64

In [20]:
data.loc[['b','d']]

b  1   -1.529284
   3   -1.120928
d  2   -0.005378
   3   -0.826033
dtype: float64

In [141]:
data.loc[:,2]

a    0.323292
c    0.171583
d    1.014932
dtype: float64

In [22]:
data.unstack()

Unnamed: 0,1,2,3
a,0.404981,-0.725702,1.277812
b,-1.529284,,-1.120928
c,-0.470564,-0.383729,
d,,-0.005378,-0.826033


In [23]:
data.unstack().stack()

a  1    0.404981
   2   -0.725702
   3    1.277812
b  1   -1.529284
   3   -1.120928
c  1   -0.470564
   2   -0.383729
d  2   -0.005378
   3   -0.826033
dtype: float64

In [2]:
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                     columns=[['Ohio', 'Ohio', 'Colorado'],
                              ['Green', 'Red', 'Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [175]:
frame.loc['a']

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,0,1,2
2,3,4,5


In [167]:
frame.loc[['a','b']]

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [168]:
frame.loc['a':'b']

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [169]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [170]:
frame[['Ohio','Colorado']]

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [174]:
#列方向的切片索引无值输出,理解为pands默认此行为为获取行值
frame['Ohio':'Colorado']

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2


In [177]:
frame.loc['a']['Ohio']

color,Green,Red
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,1
2,3,4


In [178]:
frame.loc['a':'b']['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [179]:
frame.loc[['a','b']]['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [180]:
frame.loc['a',1]['Ohio','Red']

1

In [181]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [182]:
frame.loc['a',1]['Ohio']

color
Green    0
Red      1
Name: (a, 1), dtype: int32

In [164]:
# iloc函数只取行列最内层索引
frame.iloc[3,1]

10

In [143]:
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state','color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [35]:
frame.index.name

In [36]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [38]:
pd.MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']], names=['state', 'color'])

MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
           labels=[[1, 1, 0], [0, 1, 0]],
           names=['state', 'color'])

##### 重排序和层级排序

In [39]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [40]:
frame.swaplevel('key1','key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [44]:
frame.sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [46]:
#索引安好字典顺序从最外层开始排序，数据选择性能会更好
frame.swaplevel(0,1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


##### 按照层级进行汇总统计

In [47]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [51]:
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [52]:
frame.sum(level='color',axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


##### 使用DataFrame的列进行索引

In [12]:
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
                      'c': ['one', 'one', 'one', 'two', 'two',
                            'two', 'two'],
                      'd': [0, 1, 2, 0, 1, 2, 3]})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [13]:
frame2 = frame.set_index(['c','d'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [18]:
frame2 = frame.set_index(['c','d'],verify_integrity=True)
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [17]:
frame2 = frame.set_index(['c','d'],append=True)
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,a,b
Unnamed: 0_level_1,c,d,Unnamed: 3_level_1,Unnamed: 4_level_1
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [55]:
frame2 = frame.set_index(['c','d'],drop=False)
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [58]:
# reset_index是set_index的反操作，分层索引的索引层级会被移动到列中
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [25]:
frame2.reset_index(level=0,drop=False,col_level=0,col_fill=1)

Unnamed: 0_level_0,c,a,b
d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,one,0,7
1,one,1,6
2,one,2,5
0,two,3,4
1,two,4,3
2,two,5,2
3,two,6,1


#### 联合与合并数据集

##### 数据库风格的DataFrame连接

In [61]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                    'data2': range(3)})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [60]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [62]:
# 一对多连接
pd.merge(df1,df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [63]:
# 相当于inner内连接,列名有相同的情况下，on参数会默认连接，但最好进行显示索引
pd.merge(df1,df2,on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [64]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                    'data2': range(3)})

In [65]:
df3

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [66]:
df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [67]:
pd.merge(df3,df4,left_on='lkey',right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [68]:
pd.merge(df1,df2,how='outer')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


In [69]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                    'data2': range(5)})

In [70]:
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [71]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [72]:
#d 多对对连接的行是笛卡尔积
pd.merge(df1,df2,how='left',on='key')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [73]:
pd.merge(df1,df2,how='inner')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,5,1
5,b,5,3
6,a,2,0
7,a,2,2
8,a,4,0
9,a,4,2


In [74]:
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                     'key2': ['one', 'two', 'one'],
                     'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                      'key2': ['one', 'one', 'one', 'two'],
                      'rval': [4, 5, 6, 7]})

In [75]:
left

Unnamed: 0,key1,key2,lval
0,foo,one,1
1,foo,two,2
2,bar,one,3


In [76]:
right

Unnamed: 0,key1,key2,rval
0,foo,one,4
1,foo,one,5
2,bar,one,6
3,bar,two,7


In [82]:
pd.merge(left,right,on=['key1','key2'],how='outer',indicator=True)

Unnamed: 0,key1,key2,lval,rval,_merge
0,foo,one,1.0,4.0,both
1,foo,one,1.0,5.0,both
2,foo,two,2.0,,left_only
3,bar,one,3.0,6.0,both
4,bar,two,,7.0,right_only


In [78]:
# 重复列默认增加后缀_x,_y
pd.merge(left,right,on='key1')

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [79]:
# 重复列后缀指定
pd.merge(left,right,on='key1',suffixes=['_A','_B'])

Unnamed: 0,key1,key2_A,lval,key2_B,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


##### 根据索引合并

In [26]:
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                      'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

In [27]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [28]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [29]:
pd.merge(left1,right1,left_on='key',right_index=True,how='inner')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [30]:
# 多重索引下的连接
lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio',
                               'Nevada', 'Nevada'],
                      'key2': [2000, 2001, 2002, 2001, 2002],
                      'data': np.arange(5.)})
righth = pd.DataFrame(np.arange(12).reshape((6, 2)),
                      index=[['Nevada', 'Nevada', 'Ohio', 'Ohio',
                              'Ohio', 'Ohio'],
                             [2001, 2000, 2000, 2000, 2001, 2002]],
                      columns=['event1', 'event2'])

In [31]:
lefth

Unnamed: 0,key1,key2,data
0,Ohio,2000,0.0
1,Ohio,2001,1.0
2,Ohio,2002,2.0
3,Nevada,2001,3.0
4,Nevada,2002,4.0


In [32]:
righth

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [33]:
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True,how='outer')

Unnamed: 0,key1,key2,data,event1,event2
0,Ohio,2000,0.0,4.0,5.0
0,Ohio,2000,0.0,6.0,7.0
1,Ohio,2001,1.0,8.0,9.0
2,Ohio,2002,2.0,10.0,11.0
3,Nevada,2001,3.0,0.0,1.0
4,Nevada,2002,4.0,,
4,Nevada,2000,,2.0,3.0


In [34]:
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index=['a', 'c', 'e'],
                     columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                      index=['b', 'c', 'd', 'e'],
                      columns=['Missouri', 'Alabama'])

In [35]:
left2

Unnamed: 0,Ohio,Nevada
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0


In [36]:
right2

Unnamed: 0,Missouri,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0


In [37]:
pd.merge(left2,right2,left_index= True, right_index=True,how='outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [38]:
# join方法默认在index上进行连接，默认左连接，可以一次连接多个df
left2.join(right2,how='outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [39]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [40]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [49]:
# on参数指定为left1的连接键，而right的连接键默认为index，right1.join(left1,on='key',how='outer')会报错
left1.join(right1,on='key',how='outer',lsuffix="A_",rsuffix='B_')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


In [42]:
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                       index=['a', 'c', 'e', 'f'],
                       columns=['New York', 'Oregon'])
another

Unnamed: 0,New York,Oregon
a,7.0,8.0
c,9.0,10.0
e,11.0,12.0
f,16.0,17.0


In [43]:
left2

Unnamed: 0,Ohio,Nevada
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0


In [44]:
right2

Unnamed: 0,Missouri,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0


In [45]:
left2.join([right2,another])

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0


In [47]:
left2.join([right2,another],how='outer',sort=True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
b,,,7.0,8.0,,
c,3.0,4.0,9.0,10.0,9.0,10.0
d,,,11.0,12.0,,
e,5.0,6.0,13.0,14.0,11.0,12.0
f,,,,,16.0,17.0


##### 沿轴向连接

In [51]:
# 数组横向连接
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [52]:
np.concatenate([arr, arr], axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [53]:
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

In [54]:
# 默认axis=0跨行进行连接
pd.concat([s1,s2,s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [55]:
#沿着列方向合并将形成一个DF，此时必须给出sort参数，否则会提出警告
pd.concat([s1,s2,s3],axis=1,sort=True)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [56]:
# concat默认执行外连接，也可以指定参数join='inner'
s4 = pd.concat([s1,s3])
s4

a    0
b    1
f    5
g    6
dtype: int64

In [57]:
pd.concat([s1,s4], axis=1,sort=True)
s4

a    0
b    1
f    5
g    6
dtype: int64

In [58]:
pd.concat([s1,s4], axis=1,sort=True,join='inner')

Unnamed: 0,0,1
a,0,0
b,1,1


In [60]:
# 指定连接其它轴向的轴
pd.concat([s1,s4], axis=1,join_axes=[list('acbef')])

Unnamed: 0,0,1
a,0.0,0.0
c,,
b,1.0,1.0
e,,
f,,5.0


In [61]:
# 指axis=1时，keys为横向合并后的来源列名
pd.concat([s1,s4], axis=1,join_axes=[list('acbef')],keys=['x','y'])

Unnamed: 0,x,y
a,0.0,0.0
c,,
b,1.0,1.0
e,,
f,,5.0


In [160]:
# 在连接轴上创建一个多重索引
result = pd.concat([s1,s1,s3],keys=['one','two','three'])
result

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

In [163]:
result.unstack()

Unnamed: 0,a,b,f,g
one,0.0,1.0,,
two,0.0,1.0,,
three,,,5.0,6.0


In [165]:
# 沿着轴向axis=1连接Series的时候，keys则成为DataFrame的列头：
pd.concat([s1,s2,s3],axis=1,keys=['one','two','three'],sort=True)

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [166]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                   columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                   columns=['three', 'four'])

In [167]:
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [168]:
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [170]:
pd.concat([df1,df2],axis=1, keys=['level1','level2'],sort=True)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [173]:
pd.concat({'level1':df1,'level2':df2}, axis=1,sort=True)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [174]:
pd.concat({'level1':df1,'level2':df2}, axis=1,sort=True,names=['upper','lower'])

upper,level1,level1,level2,level2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [175]:
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])

In [176]:
df1

Unnamed: 0,a,b,c,d
0,-1.580141,0.09911,-0.0861,1.555551
1,0.398668,1.32878,-0.694596,0.339487
2,-0.117966,-0.24793,-0.080612,-0.520547


In [177]:
df2

Unnamed: 0,b,d,a
0,-0.350806,-0.722759,1.500297
1,-0.671514,1.109691,1.525642


In [188]:
#sort参数在后续pandas版本就必须给定，sort=True即按照未合并的轴方向进行排序
#ignore_index参数重置索引值
pd.concat([df1,df2],sort=True,ignore_index = True)

Unnamed: 0,a,b,c,d
0,-1.580141,0.09911,-0.0861,1.555551
1,0.398668,1.32878,-0.694596,0.339487
2,-0.117966,-0.24793,-0.080612,-0.520547
3,1.500297,-0.350806,,-0.722759
4,1.525642,-0.671514,,1.109691


##### 联合重叠数据

In [218]:
a=pd.Series([np.nan,2.5,np.nan,3.5,4.5,np.nan],index=['f','e','d','c','b','a'])
b=pd.Series([1,np.nan,3,4,5,np.nan],index=['f','e','d','c','b','a'])

In [219]:
a

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [220]:
b

f    1.0
e    NaN
d    3.0
c    4.0
b    5.0
a    NaN
dtype: float64

In [221]:
pd.isnull(a)

f     True
e    False
d     True
c    False
b    False
a     True
dtype: bool

In [222]:
# pd.isnull(a)元素为真，则对应取b中对应索引的值反之则取a中的值
np.where(pd.isnull(a),b,a)

array([1. , 2.5, 3. , 3.5, 4.5, nan])

In [223]:
# 用a的值填补b的缺失值
b.combine_first(a)

f    1.0
e    2.5
d    3.0
c    4.0
b    5.0
a    NaN
dtype: float64

In [224]:
df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
                    'b': [np.nan, 2., np.nan, 6.],
                    'c': range(2, 18, 4)})
df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
                    'b': [np.nan, 3., 4., 6., 8.]})

In [225]:
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [226]:
df2

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


In [227]:
# 先用df2的数据填充df1中的缺失值，然后返回合并后的数据
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


##### 重塑和透视

In [229]:
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index=pd.Index(['Ohio', 'Colorado'], name='state'),
                    columns=pd.Index(['one', 'two', 'three'], name='number'))
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [232]:
result = data.stack()
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [233]:
result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [234]:
# 可以传入层级序号或名称来拆分一个不同的层级
result.unstack(0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [235]:
result.unstack('state')

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [236]:
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])

In [237]:
s1

a    0
b    1
c    2
d    3
dtype: int64

In [238]:
s2

c    4
d    5
e    6
dtype: int64

In [239]:
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [240]:
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


In [242]:
# 默认情况下，堆叠会过滤除缺失值，因此堆叠拆堆的操作是可逆的
data2.unstack().stack()

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

In [243]:
# 改变dropna参数可以不滤除na值
data2.unstack().stack(dropna=False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

In [244]:
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [245]:
df = pd.DataFrame({'left': result, 'right': result + 5},
                  columns=pd.Index(['left', 'right'], name='side'))
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [246]:
df.unstack('state')

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [247]:
df.unstack('state').stack('side')

Unnamed: 0_level_0,state,Colorado,Ohio
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7


##### 将长透视为宽

##### 将宽透视为长

In [248]:
df = pd.DataFrame({'key': ['foo', 'bar', 'baz'],
                   'A': [1, 2, 3],
                   'B': [4, 5, 6],
                   'C': [7, 8, 9]})
df

Unnamed: 0,key,A,B,C
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


In [250]:
melted =pd.melt(df,['key'])
melted

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9


In [254]:
# 用pivot方法可以重塑回原先的布局
reshaped = melted.pivot('key','variable','value')
a = reshaped.reset_index()
a

variable,key,A,B,C
0,bar,2,5,8
1,baz,3,6,9
2,foo,1,4,7


In [257]:
#重命名列名的方式即rename下传递一个字典
a.rename(columns={'A':'a'})

variable,key,a,B,C
0,bar,2,5,8
1,baz,3,6,9
2,foo,1,4,7


In [258]:
# 指定列的子集作为值列,排除C
pd.melt(df,id_vars=['key'], value_vars=['A','B'])

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6


In [259]:
# 不一定非得传入分组指标
pd.melt(df,value_vars = ['A','B','C'])

Unnamed: 0,variable,value
0,A,1
1,A,2
2,A,3
3,B,4
4,B,5
5,B,6
6,C,7
7,C,8
8,C,9


In [260]:
pd.melt(df,value_vars=['key','A','B'])

Unnamed: 0,variable,value
0,key,foo
1,key,bar
2,key,baz
3,A,1
4,A,2
5,A,3
6,B,4
7,B,5
8,B,6


### 第十章：数据聚合与分组操作

#### Grpupby机制

In [2]:
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,2.180807,-0.555634
1,a,two,1.24469,0.668352
2,b,one,-1.381476,-0.514265
3,b,two,0.256585,-0.446614
4,a,one,-0.636666,2.350606


In [3]:
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000001FDDE076390>

In [4]:
grouped.mean()

key1
a    0.929610
b   -0.562445
Name: data1, dtype: float64

In [5]:
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
means

key1  key2
a     one     0.772070
      two     1.244690
b     one    -1.381476
      two     0.256585
Name: data1, dtype: float64

In [6]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.77207,1.24469
b,-1.381476,0.256585


In [7]:
# groupby的另外的实现方法
df.groupby(df['key1'])['data1'].sum()

key1
a    2.788831
b   -1.124891
Name: data1, dtype: float64

In [8]:
df.groupby(df['key1']).sum()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.788831,2.463324
b,-1.124891,-0.96088


In [9]:
# 分组键可以是正确长度的任何数组
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

In [10]:
states

array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'], dtype='<U10')

In [11]:
years

array([2005, 2005, 2006, 2005, 2006])

In [12]:
df['data1'].groupby([states,years]).mean()

California  2005    1.244690
            2006   -1.381476
Ohio        2005    1.218696
            2006   -0.636666
Name: data1, dtype: float64

In [30]:
df.groupby([states,years])['data1'].mean()

California  2005    1.244690
            2006   -1.381476
Ohio        2005    1.218696
            2006   -0.636666
Name: data1, dtype: float64

In [13]:
# 分组信息为df列名时，分组依据可以直接传递列名，而不必用Series
df.groupby('key1').mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.92961,0.821108
b,-0.562445,-0.48044


In [14]:
# 和上面的代码书写方式的结果一样
df.groupby(df['key1']).mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.92961,0.821108
b,-0.562445,-0.48044


In [15]:
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.77207,0.897486
a,two,1.24469,0.668352
b,one,-1.381476,-0.514265
b,two,0.256585,-0.446614


In [16]:
#gruopby有size方法
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

##### 遍历各分组

In [17]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,2.180807,-0.555634
1,a,two,1.24469,0.668352
2,b,one,-1.381476,-0.514265
3,b,two,0.256585,-0.446614
4,a,one,-0.636666,2.350606


In [18]:
for name,group in df.groupby('key1'):
    print(name)
    print(group)

a
  key1 key2     data1     data2
0    a  one  2.180807 -0.555634
1    a  two  1.244690  0.668352
4    a  one -0.636666  2.350606
b
  key1 key2     data1     data2
2    b  one -1.381476 -0.514265
3    b  two  0.256585 -0.446614


In [19]:
{name:group for name,group in df.groupby('key1')}

{'a':   key1 key2     data1     data2
 0    a  one  2.180807 -0.555634
 1    a  two  1.244690  0.668352
 4    a  one -0.636666  2.350606, 'b':   key1 key2     data1     data2
 2    b  one -1.381476 -0.514265
 3    b  two  0.256585 -0.446614}

In [20]:
for (k1,k2), gruop in df.groupby(['key1','key2']):
    print((k1,k2))
    print(group)

('a', 'one')
  key1 key2     data1     data2
2    b  one -1.381476 -0.514265
3    b  two  0.256585 -0.446614
('a', 'two')
  key1 key2     data1     data2
2    b  one -1.381476 -0.514265
3    b  two  0.256585 -0.446614
('b', 'one')
  key1 key2     data1     data2
2    b  one -1.381476 -0.514265
3    b  two  0.256585 -0.446614
('b', 'two')
  key1 key2     data1     data2
2    b  one -1.381476 -0.514265
3    b  two  0.256585 -0.446614


In [21]:
list(df.groupby('key1'))

[('a',   key1 key2     data1     data2
  0    a  one  2.180807 -0.555634
  1    a  two  1.244690  0.668352
  4    a  one -0.636666  2.350606), ('b',   key1 key2     data1     data2
  2    b  one -1.381476 -0.514265
  3    b  two  0.256585 -0.446614)]

In [22]:
pieces = dict(list(df.groupby('key1')))

In [23]:
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,-1.381476,-0.514265
3,b,two,0.256585,-0.446614


In [24]:
# 默认情况groupby在axis=0上进行分组，也可以在其它任何轴上进行分组
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [25]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,2.180807,-0.555634
1,a,two,1.24469,0.668352
2,b,one,-1.381476,-0.514265
3,b,two,0.256585,-0.446614
4,a,one,-0.636666,2.350606


In [26]:
grouped = df.groupby(df.dtypes,axis=1)
for dtype,group in grouped:
    print(dtype)
    print(group)

float64
      data1     data2
0  2.180807 -0.555634
1  1.244690  0.668352
2 -1.381476 -0.514265
3  0.256585 -0.446614
4 -0.636666  2.350606
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


In [27]:
df.groupby(df.dtypes,axis=1).count()

Unnamed: 0,float64,object
0,2,2
1,2,2
2,2,2
3,2,2
4,2,2


##### 选择一列或所有列的子集

In [28]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,2.180807,-0.555634
1,a,two,1.24469,0.668352
2,b,one,-1.381476,-0.514265
3,b,two,0.256585,-0.446614
4,a,one,-0.636666,2.350606


In [29]:
# 返回对象为Series
df.groupby('key1')['data1'].mean()

key1
a    0.929610
b   -0.562445
Name: data1, dtype: float64

In [48]:
#返回对象为DataFrame
df.groupby('key1')[['data1']].mean()

Unnamed: 0_level_0,data1
key1,Unnamed: 1_level_1
a,0.014127
b,0.390131


In [52]:
df.groupby(['key1','key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.412355
a,two,-2.760524
b,one,-0.236567
b,two,-1.354302


##### 使用字典和Series分组

In [54]:
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people

Unnamed: 0,a,b,c,d,e
Joe,0.285673,0.21181,0.338814,-0.063257,-1.343643
Steve,-0.985936,-0.836619,-1.243825,-0.412692,1.20162
Wes,-0.159725,0.677647,-0.320539,-0.235212,0.269852
Jim,-0.722042,0.641873,0.784822,1.53467,0.762213
Travis,-0.446598,0.368999,0.66449,-1.303829,-0.03427


In [59]:
people.iloc[2:3,1:3] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.285673,0.21181,0.338814,-0.063257,-1.343643
Steve,-0.985936,-0.836619,-1.243825,-0.412692,1.20162
Wes,-0.159725,,,-0.235212,0.269852
Jim,-0.722042,0.641873,0.784822,1.53467,0.762213
Travis,-0.446598,0.368999,0.66449,-1.303829,-0.03427


In [61]:
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f' : 'orange'}
mapping

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

In [63]:
# 使用字典进行分组
people.groupby(mapping,axis=1).sum()

Unnamed: 0,blue,red
Joe,0.275556,-0.84616
Steve,-1.656517,-0.620935
Wes,-0.235212,0.110127
Jim,2.319492,0.682044
Travis,-0.63934,-0.111869


In [65]:
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [67]:
# 使用Series进行分组
people.groupby(map_series,axis='columns').sum()

Unnamed: 0,blue,red
Joe,0.275556,-0.84616
Steve,-1.656517,-0.620935
Wes,-0.235212,0.110127
Jim,2.319492,0.682044
Travis,-0.63934,-0.111869


##### 使用函数进行分组

In [68]:
people

Unnamed: 0,a,b,c,d,e
Joe,0.285673,0.21181,0.338814,-0.063257,-1.343643
Steve,-0.985936,-0.836619,-1.243825,-0.412692,1.20162
Wes,-0.159725,,,-0.235212,0.269852
Jim,-0.722042,0.641873,0.784822,1.53467,0.762213
Travis,-0.446598,0.368999,0.66449,-1.303829,-0.03427


In [70]:
# index应用len()函数，以len(index)的值作为分组依据
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.596094,0.853683,1.123636,1.2362,-0.311578
5,-0.985936,-0.836619,-1.243825,-0.412692,1.20162
6,-0.446598,0.368999,0.66449,-1.303829,-0.03427


In [71]:
# 函数与数组、字典或Series进行混合转化为分组依据
key_list = ['one','one','one','two','two']
people.groupby([len,key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.159725,0.21181,0.338814,-0.235212,-1.343643
3,two,-0.722042,0.641873,0.784822,1.53467,0.762213
5,one,-0.985936,-0.836619,-1.243825,-0.412692,1.20162
6,two,-0.446598,0.368999,0.66449,-1.303829,-0.03427


##### 根据索引层级分组

In [81]:
columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],names=['cty','tenor'])
df = pd.DataFrame(np.random.randn(4,5),columns=columns)
df.sort_index(axis=1,level=[0,1],ascending=False,inplace=True)
df

cty,US,US,US,JP,JP
tenor,5,3,1,3,1
0,0.393139,-0.935321,-0.875319,0.312859,1.259176
1,-0.952083,-0.508794,1.708362,1.249345,0.614443
2,0.917711,-0.881646,0.194056,-0.720822,-0.684005
3,1.323312,1.062389,0.594634,0.272704,0.130195


In [82]:
# groupby不一定要给出，by参数，可以只当level参数作为分组依据
df.groupby(level = 'cty',axis=1).count().sort_index(axis=1,ascending=False)

cty,US,JP
0,3,2
1,3,2
2,3,2
3,3,2


#### 数据聚合

In [84]:
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,2.744477,-2.278373
1,a,two,-1.168187,-0.368195
2,b,one,-0.174413,-0.930111
3,b,two,1.394453,-0.848898
4,a,one,0.100286,-0.138711


In [86]:
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)

key1
a    2.215639
b    1.237567
Name: data1, dtype: float64

In [None]:
# 使用自定义的聚合函数，需要将函数传递给aggregate或agg方法

In [78]:
def pkp(arr):
    return arr.max()-arr.min()
grouped.aggregate(pkp)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,1,2.0,0.067349,10.29
Fri,Yes,3,3.73,0.159925,34.42
Sat,No,3,8.0,0.235193,41.08
Sat,Yes,4,9.0,0.290095,47.74
Sun,No,4,4.99,0.193226,39.4
Sun,Yes,3,5.0,0.644685,38.1
Thur,No,5,5.45,0.19335,33.68
Thur,Yes,2,3.0,0.15124,32.77


In [88]:
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,0.558859,1.996234,-1.168187,-0.53395,0.100286,1.422381,2.744477,3.0,-0.928427,1.174705,-2.278373,-1.323284,-0.368195,-0.253453,-0.138711
b,2.0,0.61002,1.109356,-0.174413,0.217804,0.61002,1.002237,1.394453,2.0,-0.889505,0.057427,-0.930111,-0.909808,-0.889505,-0.869201,-0.848898


##### 逐列及多函数应用

In [32]:
# 文件名中含有中文会报错，指定engine参数为python可以解决
tips = pd.read_csv(r'C:\Users\Nikola\Documents\我的数据源\10 利用Python进行数据分析 第二版\examples\tips.csv',
                   engine='python')
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [73]:
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips[:6]

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624


In [74]:
# agg函数与.mean实现同样的功能
grouped = tips.groupby(['day','smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [79]:
# 多列验证
grouped = tips.groupby(['day','smoker'])
grouped_pct = grouped['tip_pct','tip']
grouped_pct.agg('mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,2.8125
Fri,Yes,0.174783,2.714
Sat,No,0.158048,3.102889
Sat,Yes,0.147906,2.875476
Sun,No,0.160113,3.167895
Sun,Yes,0.18725,3.516842
Thur,No,0.160298,2.673778
Thur,Yes,0.163863,3.03


In [80]:
grouped_pct.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,2.8125
Fri,Yes,0.174783,2.714
Sat,No,0.158048,3.102889
Sat,Yes,0.147906,2.875476
Sun,No,0.160113,3.167895
Sun,Yes,0.18725,3.516842
Thur,No,0.160298,2.673778
Thur,Yes,0.163863,3.03


In [81]:
grouped_pct.agg(['mean','std',pkp])

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip,tip,tip
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,pkp,mean,std,pkp
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,0.15165,0.028123,0.067349,2.8125,0.898494,2.0
Fri,Yes,0.174783,0.051293,0.159925,2.714,1.077668,3.73
Sat,No,0.158048,0.039767,0.235193,3.102889,1.642088,8.0
Sat,Yes,0.147906,0.061375,0.290095,2.875476,1.63058,9.0
Sun,No,0.160113,0.042347,0.193226,3.167895,1.224785,4.99
Sun,Yes,0.18725,0.154134,0.644685,3.516842,1.261151,5.0
Thur,No,0.160298,0.038774,0.19335,2.673778,1.282964,5.45
Thur,Yes,0.163863,0.039389,0.15124,3.03,1.113491,3.0


In [82]:
grouped_pct.agg([('foo','mean'),('bar',np.std)])

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip,tip
Unnamed: 0_level_1,Unnamed: 1_level_1,foo,bar,foo,bar
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.028123,2.8125,0.898494
Fri,Yes,0.174783,0.051293,2.714,1.077668
Sat,No,0.158048,0.039767,3.102889,1.642088
Sat,Yes,0.147906,0.061375,2.875476,1.63058
Sun,No,0.160113,0.042347,3.167895,1.224785
Sun,Yes,0.18725,0.154134,3.516842,1.261151
Thur,No,0.160298,0.038774,2.673778,1.282964
Thur,Yes,0.163863,0.039389,3.03,1.113491


In [146]:
func = ['count','mean','max']
res = grouped['tip_pct','total_bill'].agg(func)
res

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [147]:
res['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,4,0.15165,0.187735
Fri,Yes,15,0.174783,0.26348
Sat,No,45,0.158048,0.29199
Sat,Yes,42,0.147906,0.325733
Sun,No,57,0.160113,0.252672
Sun,Yes,19,0.18725,0.710345
Thur,No,45,0.160298,0.266312
Thur,Yes,17,0.163863,0.241255


In [148]:
func_tup = [('D','mean'),('A',np.var)]
grouped['tip_pct','total_bill'].agg(func_tup)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,D,A,D,A
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.000791,18.42,25.596333
Fri,Yes,0.174783,0.002631,16.813333,82.562438
Sat,No,0.158048,0.001581,19.661778,79.908965
Sat,Yes,0.147906,0.003767,21.276667,101.387535
Sun,No,0.160113,0.001793,20.506667,66.09998
Sun,Yes,0.18725,0.023757,24.12,109.046044
Thur,No,0.160298,0.001503,17.113111,59.625081
Thur,Yes,0.163863,0.001551,19.190588,69.808518


In [83]:
grouped.agg({'tip':np.max,'size':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [85]:
grouped.agg({'tip':np.max,'size':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,3.5,9
Fri,Yes,4.73,31
Sat,No,9.0,115
Sat,Yes,10.0,104
Sun,No,6.0,167
Sun,Yes,6.5,49
Thur,No,6.7,112
Thur,Yes,5.0,40


In [150]:
grouped.agg({'tip_pct':['min','max','mean','std'],'size':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,0.120385,0.187735,0.15165,0.028123,9
Fri,Yes,0.103555,0.26348,0.174783,0.051293,31
Sat,No,0.056797,0.29199,0.158048,0.039767,115
Sat,Yes,0.035638,0.325733,0.147906,0.061375,104
Sun,No,0.059447,0.252672,0.160113,0.042347,167
Sun,Yes,0.06566,0.710345,0.18725,0.154134,49
Thur,No,0.072961,0.266312,0.160298,0.038774,112
Thur,Yes,0.090014,0.241255,0.163863,0.039389,40


##### 返回不含行索引的聚合数据

In [87]:
tips.groupby(['day','smoker'],as_index=False).mean()

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


In [88]:
tips.groupby(['day','smoker'],as_index=True).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,size,tip_pct
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,18.42,2.8125,2.25,0.15165
Fri,Yes,16.813333,2.714,2.066667,0.174783
Sat,No,19.661778,3.102889,2.555556,0.158048
Sat,Yes,21.276667,2.875476,2.47619,0.147906
Sun,No,20.506667,3.167895,2.929825,0.160113
Sun,Yes,24.12,3.516842,2.578947,0.18725
Thur,No,17.113111,2.673778,2.488889,0.160298
Thur,Yes,19.190588,3.03,2.352941,0.163863


#### 应用：拆分-应用-联合

In [152]:
# 定义函数选除tips最高的n条记录
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [104]:
def top(df, n=5, columns='tip_pct'):
    return df.sort_values(by = columns,ascending=False)[:n]

In [154]:
top(tips,6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
232,11.61,3.39,No,Sat,Dinner,2,0.29199
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


In [155]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


In [156]:
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [157]:
tips.groupby(['smoker','day']).apply(top,n=1,columns='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


In [95]:
res = tips.groupby('smoker')['tip_pct'].describe()
res

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [102]:
# 原本应该使用res.stack()，用res.unstack（）结果竟然一样，区别仅仅在于index层级不一样，暂未理解。
res.unstack()

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [99]:
res.unstack().unstack()

smoker,No,Yes
count,151.0,93.0
mean,0.159328,0.163196
std,0.03991,0.085119
min,0.056797,0.035638
25%,0.136906,0.106771
50%,0.155625,0.153846
75%,0.185014,0.195059
max,0.29199,0.710345


In [97]:
res.stack()

smoker       
No      count    151.000000
        mean       0.159328
        std        0.039910
        min        0.056797
        25%        0.136906
        50%        0.155625
        75%        0.185014
        max        0.291990
Yes     count     93.000000
        mean       0.163196
        std        0.085119
        min        0.035638
        25%        0.106771
        50%        0.153846
        75%        0.195059
        max        0.710345
dtype: float64

In [101]:
res.stack().unstack()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [98]:
res.stack().swaplevel(0,1)

       smoker
count  No        151.000000
mean   No          0.159328
std    No          0.039910
min    No          0.056797
25%    No          0.136906
50%    No          0.155625
75%    No          0.185014
max    No          0.291990
count  Yes        93.000000
mean   Yes         0.163196
std    Yes         0.085119
min    Yes         0.035638
25%    Yes         0.106771
50%    Yes         0.153846
75%    Yes         0.195059
max    Yes         0.710345
dtype: float64

##### 压缩分组键

In [33]:
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [106]:
# as_index参数与group_keys参数的区别
# as_index=False，不以分组依据列为索引，但仍然会分配一个默认层索引
# gourp_keys=Fasle,不以分组依据列为索引，不分配层级索引
tips.groupby('smoker',group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
232,11.61,3.39,No,Sat,Dinner,2,0.29199
149,7.51,2.0,No,Thur,Lunch,2,0.266312
51,10.29,2.6,No,Sun,Dinner,2,0.252672
185,20.69,5.0,No,Sun,Dinner,5,0.241663
88,24.71,5.85,No,Thur,Lunch,2,0.236746
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


In [114]:
tips.groupby(['smoker','day'],group_keys=False).apply(top,n=1)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
223,15.98,3.0,No,Fri,Lunch,3,0.187735
232,11.61,3.39,No,Sat,Dinner,2,0.29199
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
93,16.32,4.3,Yes,Fri,Dinner,2,0.26348
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
194,16.58,4.0,Yes,Thur,Lunch,2,0.241255


In [None]:
tips.reset_index()

In [121]:
tips.groupby(['smoker','day'],as_index=False).apply(top,n=1)

Unnamed: 0,Unnamed: 1,total_bill,tip,smoker,day,time,size,tip_pct
0,223,15.98,3.0,No,Fri,Lunch,3,0.187735
1,232,11.61,3.39,No,Sat,Dinner,2,0.29199
2,51,10.29,2.6,No,Sun,Dinner,2,0.252672
3,149,7.51,2.0,No,Thur,Lunch,2,0.266312
4,93,16.32,4.3,Yes,Fri,Dinner,2,0.26348
5,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
6,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
7,194,16.58,4.0,Yes,Thur,Lunch,2,0.241255


In [176]:
tips.groupby('smoker',as_index=False).apply(top)

Unnamed: 0,Unnamed: 1,total_bill,tip,smoker,day,time,size,tip_pct
0,232,11.61,3.39,No,Sat,Dinner,2,0.29199
0,149,7.51,2.0,No,Thur,Lunch,2,0.266312
0,51,10.29,2.6,No,Sun,Dinner,2,0.252672
0,185,20.69,5.0,No,Sun,Dinner,5,0.241663
0,88,24.71,5.85,No,Thur,Lunch,2,0.236746
1,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
1,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
1,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
1,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
1,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


##### 分位数与桶分析

In [34]:
frame = pd.DataFrame({'data1': np.random.randn(1000),
                      'data2': np.random.randn(1000)})
frame.head()

Unnamed: 0,data1,data2
0,-0.518979,-0.709516
1,-0.652682,1.710255
2,-0.040112,0.579846
3,-1.450378,0.031716
4,0.946261,2.32098


In [35]:
quartiles = pd.cut(frame.data1,4)
quartiles[:10]

0      (-1.307, 0.25]
1      (-1.307, 0.25]
2      (-1.307, 0.25]
3    (-2.871, -1.307]
4       (0.25, 1.808]
5      (-1.307, 0.25]
6      (-1.307, 0.25]
7       (0.25, 1.808]
8       (0.25, 1.808]
9      (-1.307, 0.25]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.871, -1.307] < (-1.307, 0.25] < (0.25, 1.808] < (1.808, 3.365]]

In [36]:
def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}

In [37]:
frame.data2.groupby(quartiles).apply(get_stats)

data1                  
(-2.871, -1.307]  count     83.000000
                  max        2.093198
                  mean      -0.049837
                  min       -2.234662
(-1.307, 0.25]    count    519.000000
                  max        2.814211
                  mean      -0.006646
                  min       -2.642790
(0.25, 1.808]     count    370.000000
                  max        3.174568
                  mean      -0.028683
                  min       -3.224330
(1.808, 3.365]    count     28.000000
                  max        1.658234
                  mean       0.043685
                  min       -1.654807
Name: data2, dtype: float64

In [39]:
frame.groupby(quartiles).apply(get_stats)

data1
(-2.871, -1.307]    {'min': [-2.8644776008329336, -2.2346615511304...
(-1.307, 0.25]      {'min': [-1.306713759968799, -2.64278961896123...
(0.25, 1.808]       {'min': [0.2504379024732472, -3.22433049708836...
(1.808, 3.365]      {'min': [1.810383238476587, -1.654807066922478...
dtype: object

In [38]:
frame.data2.groupby(quartiles).apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-2.871, -1.307]",83.0,2.093198,-0.049837,-2.234662
"(-1.307, 0.25]",519.0,2.814211,-0.006646,-2.64279
"(0.25, 1.808]",370.0,3.174568,-0.028683,-3.22433
"(1.808, 3.365]",28.0,1.658234,0.043685,-1.654807


In [131]:
pd.qcut(frame.data1,10,labels=False).head()

0    9
1    5
2    0
3    0
4    7
Name: data1, dtype: int64

In [134]:
# labels=False则分组名称为下标
pd.qcut(frame.data1,10,labels=False).head()

0    9
1    5
2    0
3    0
4    7
Name: data1, dtype: int64

In [201]:
# 将数据按照分位数分为十等份
grouping = pd.qcut(frame.data1,10,labels=False)
grouped = frame.data2.groupby(grouping)
df = grouped.apply(get_stats).unstack()
df.index.name = 'index'
df

Unnamed: 0_level_0,count,max,mean,min
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100.0,2.48387,0.027031,-2.526632
1,100.0,1.989814,-0.071915,-2.413105
2,100.0,2.313852,-0.005281,-2.475986
3,100.0,2.384615,0.170234,-3.062812
4,100.0,2.747837,0.111811,-2.285785
5,100.0,3.306669,-0.013441,-2.077016
6,100.0,2.409544,0.020417,-2.97965
7,100.0,2.402699,-0.12084,-2.728393
8,100.0,2.230271,0.117274,-2.670971
9,100.0,2.451224,-0.07293,-3.223824


##### 使用分组值填充缺失值

In [203]:
s = pd.Series(np.random.randn(6))
s[::2]=np.nan
s

0         NaN
1   -0.075213
2         NaN
3   -0.971939
4         NaN
5    0.758960
dtype: float64

In [204]:
s.fillna(s.mean())

0   -0.096064
1   -0.075213
2   -0.096064
3   -0.971939
4   -0.096064
5    0.758960
dtype: float64

In [205]:
states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = pd.Series(np.random.randn(8), index=states)
data

Ohio         -0.306424
New York      2.860585
Vermont      -0.728584
Florida      -1.092832
Oregon       -0.188064
Nevada        0.282928
California    0.308772
Idaho        -2.848124
dtype: float64

In [208]:
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

Ohio         -0.306424
New York      2.860585
Vermont            NaN
Florida      -1.092832
Oregon       -0.188064
Nevada             NaN
California    0.308772
Idaho              NaN
dtype: float64

In [209]:
data.groupby(group_key).mean()

East    0.487110
West    0.060354
dtype: float64

In [210]:
fill_mean = lambda g:g.fillna(g.mean())

In [212]:
# groupby将数据分为两个组，然后各自应用apply()
data.groupby(group_key).apply(fill_mean)

Ohio         -0.306424
New York      2.860585
Vermont       0.487110
Florida      -1.092832
Oregon       -0.188064
Nevada        0.060354
California    0.308772
Idaho         0.060354
dtype: float64

In [218]:
# 每一个groupbu分组，各分组单元都有一个name属性
fill_values = {'East':0.5,'West':-1}
fill_func = lambda g:g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)

Ohio         -0.306424
New York      2.860585
Vermont       0.500000
Florida      -1.092832
Oregon       -0.188064
Nevada       -1.000000
California    0.308772
Idaho        -1.000000
dtype: float64

##### 随机采样与排列

In [283]:
suits = ['♥', '♠', '♣', '♦']
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ['A'] + list(range(2, 11)) + ['J', 'Q', 'K']
cards=[]

In [284]:
for num in base_names:
    cards.extend(suit + str(num)  for suit in suits)
deck = pd.Series(card_val, index = cards)

In [285]:
deck[:13]

♥A     1
♠A     2
♣A     3
♦A     4
♥2     5
♠2     6
♣2     7
♦2     8
♥3     9
♠3    10
♣3    10
♦3    10
♥4    10
dtype: int64

In [286]:
# 定义一个函数，从牌中随机去除n张，默认n=5
def draw(deck, n=5):
    return deck.sample(n)

In [287]:
draw(deck,n=13)

♣A      3
♦J      5
♥8      3
♠8      4
♠3     10
♥3      9
♠A      2
♣5      6
♦10     1
♥7     10
♥Q      6
♦8      6
♣6     10
dtype: int64

In [288]:
get_suit = lambda card:card[0]

In [290]:
# index应用len()函数，以len(index)的值作为分组依据
deck.groupby(get_suit).apply(draw, n=2)

♠  ♠A      2
   ♠4      1
♣  ♣4      2
   ♣10    10
♥  ♥J      2
   ♥K     10
♦  ♦8      6
   ♦A      4
dtype: int64

##### 分组加权平均和相关性

In [291]:
df = pd.DataFrame({'category': ['a', 'a', 'a', 'a',
                                'b', 'b', 'b', 'b'],
                   'data': np.random.randn(8),
                   'weights': np.random.rand(8)})
df

Unnamed: 0,category,data,weights
0,a,-0.385377,0.324234
1,a,-0.532384,0.796975
2,a,-0.30347,0.126171
3,a,-2.664157,0.161055
4,b,-0.425186,0.667423
5,b,0.040205,0.629283
6,b,0.298977,0.16902
7,b,1.123701,0.567381


In [292]:
close_px = pd.read_csv(r'C:\Users\Nikola\Documents\我的数据源\10 利用Python进行数据分析 第二版\examples\stock_px_2.csv',
                       engine='python',
                       parse_dates=True, 
                       index_col=0)
close_px.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
AAPL    2214 non-null float64
MSFT    2214 non-null float64
XOM     2214 non-null float64
SPX     2214 non-null float64
dtypes: float64(4)
memory usage: 86.5 KB


In [294]:
close_px[-4:]

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [295]:
close_px.tail()

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-10,388.81,26.94,76.28,1194.89
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [296]:
close_px.head()

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003-01-02,7.4,21.11,29.22,909.03
2003-01-03,7.45,21.14,29.24,908.59
2003-01-06,7.45,21.52,29.96,929.01
2003-01-07,7.43,21.93,28.95,922.93
2003-01-08,7.28,21.31,28.83,909.93


In [297]:
spx_corr = lambda x: x.corrwith(x['SPX'])

In [299]:
rets = close_px.pct_change().dropna()
rets.head()

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003-01-03,0.006757,0.001421,0.000684,-0.000484
2003-01-06,0.0,0.017975,0.024624,0.022474
2003-01-07,-0.002685,0.019052,-0.033712,-0.006545
2003-01-08,-0.020188,-0.028272,-0.004145,-0.014086
2003-01-09,0.008242,0.029094,0.021159,0.019386


In [300]:
get_year = lambda x:x.year

In [301]:
rets.groupby(get_year).apply(spx_corr)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [303]:
# 上面步骤的简化代码形式
close_px.pct_change().dropna().groupby(lambda x:x.year).apply(lambda x:x.corrwith(x['SPX']))

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


##### 逐组线性回归

#### 数据透视表与交叉表

In [304]:
# 文件名中含有中文会报错，指定engine参数为python可以解决
tips = pd.read_csv(r'C:\Users\Nikola\Documents\我的数据源\10 利用Python进行数据分析 第二版\examples\tips.csv',
                   engine='python')
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [313]:
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips = tips.iloc[:,[5,1,6,0,2,3,4]]
tips.head()

Unnamed: 0,day,tip,time,size,tip_pct,total_bill,smoker
0,Sun,1.01,Dinner,2,0.059447,16.99,No
1,Sun,1.66,Dinner,3,0.160542,10.34,No
2,Sun,3.5,Dinner,3,0.166587,21.01,No
3,Sun,3.31,Dinner,2,0.13978,23.68,No
4,Sun,3.61,Dinner,4,0.146808,24.59,No


In [314]:
# 数据透视表默认对值进行均值处理
tips.pivot_table(index = ['day','smoker'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,2.25,2.8125,0.15165,18.42
Fri,Yes,2.066667,2.714,0.174783,16.813333
Sat,No,2.555556,3.102889,0.158048,19.661778
Sat,Yes,2.47619,2.875476,0.147906,21.276667
Sun,No,2.929825,3.167895,0.160113,20.506667
Sun,Yes,2.578947,3.516842,0.18725,24.12
Thur,No,2.488889,2.673778,0.160298,17.113111
Thur,Yes,2.352941,3.03,0.163863,19.190588


In [315]:
tips.pivot_table(['tip_pct','size'],index = ['time','day'],columns='smoker')

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dinner,Fri,2.0,2.222222,0.139622,0.165347
Dinner,Sat,2.555556,2.47619,0.158048,0.147906
Dinner,Sun,2.929825,2.578947,0.160113,0.18725
Dinner,Thur,2.0,,0.159744,
Lunch,Fri,3.0,1.833333,0.187735,0.188937
Lunch,Thur,2.5,2.352941,0.160311,0.163863


In [316]:
#添加总计
tips.pivot_table(['tip_pct','size'],index = ['time','day'],columns='smoker',margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,2.0,2.222222,2.166667,0.139622,0.165347,0.158916
Dinner,Sat,2.555556,2.47619,2.517241,0.158048,0.147906,0.153152
Dinner,Sun,2.929825,2.578947,2.842105,0.160113,0.18725,0.166897
Dinner,Thur,2.0,,2.0,0.159744,,0.159744
Lunch,Fri,3.0,1.833333,2.0,0.187735,0.188937,0.188765
Lunch,Thur,2.5,2.352941,2.459016,0.160311,0.163863,0.161301
All,,2.668874,2.408602,2.569672,0.159328,0.163196,0.160803


In [317]:
tips.pivot_table('tip_pct',index=['time','smoker'],columns='day',aggfunc=len,margins=True)

Unnamed: 0_level_0,day,Fri,Sat,Sun,Thur,All
time,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dinner,No,3.0,45.0,57.0,1.0,106.0
Dinner,Yes,9.0,42.0,19.0,,70.0
Lunch,No,1.0,,,44.0,45.0
Lunch,Yes,6.0,,,17.0,23.0
All,,19.0,87.0,76.0,62.0,244.0


In [319]:
tips.pivot_table('tip_pct',index=['time','size','smoker'],columns='day',aggfunc=len,margins=True,fill_value=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,day,Fri,Sat,Sun,Thur,All
time,size,smoker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Dinner,1.0,No,0,1,0,0,1.0
Dinner,1.0,Yes,0,1,0,0,1.0
Dinner,2.0,No,3,25,27,1,56.0
Dinner,2.0,Yes,8,28,12,0,48.0
Dinner,3.0,No,0,12,11,0,23.0
Dinner,3.0,Yes,0,6,4,0,10.0
Dinner,4.0,No,0,7,16,0,23.0
Dinner,4.0,Yes,1,6,2,0,9.0
Dinner,5.0,No,0,0,2,0,2.0
Dinner,5.0,Yes,0,1,1,0,2.0


##### 交叉表

In [320]:
# 交叉表计算分组中的频率
from io import StringIO
data = """\
Sample  Nationality  Handedness
1   USA  Right-handed
2   Japan    Left-handed
3   USA  Right-handed
4   Japan    Right-handed
5   Japan    Left-handed
6   Japan    Right-handed
7   USA  Right-handed
8   USA  Left-handed
9   Japan    Right-handed
10  USA  Right-handed"""
data = pd.read_table(StringIO(data), sep='\s+')

In [325]:
data.rename(columns={'Nationality':'N','Handedness':'H'},inplace=True)
data

Unnamed: 0,Sample,N,H
0,1,USA,Right-handed
1,2,Japan,Left-handed
2,3,USA,Right-handed
3,4,Japan,Right-handed
4,5,Japan,Left-handed
5,6,Japan,Right-handed
6,7,USA,Right-handed
7,8,USA,Left-handed
8,9,Japan,Right-handed
9,10,USA,Right-handed


In [328]:
# 交叉表实现
pd.crosstab(index=data.N,columns=data.H,margins=True,margins_name='计数')

H,Left-handed,Right-handed,计数
N,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Japan,2,3,5
USA,1,4,5
计数,3,7,10


In [332]:
# 数据透视表实现
data.pivot_table(index=['N'],columns=['H'],aggfunc='count',margins=True)

Unnamed: 0_level_0,Sample,Sample,Sample
H,Left-handed,Right-handed,All
N,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Japan,2,3,5
USA,1,4,5
All,3,7,10


In [333]:
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)

Unnamed: 0_level_0,smoker,No,Yes,All
time,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dinner,Fri,3,9,12
Dinner,Sat,45,42,87
Dinner,Sun,57,19,76
Dinner,Thur,1,0,1
Lunch,Fri,1,6,7
Lunch,Thur,44,17,61
All,,151,93,244
