In [44]:
"""pandas是基于numpy构建的，
让以numpy为中心的应用变得更加简单
"""
import numpy as np
from pandas import Series,DataFrame
import pandas as pd
import pandas_datareader.data as web
"""
一、pandas的数据结构介绍
两种主要的数据结构：Series,DataFrame
"""
"""(1)Series"""
# Series 是一种类似于一维数组的对象，
# 它由一组数据(各种numpy数据类型)
# 以及一组与之相关的数据标签（即索引）组成。
# 仅由一组数据即可产生最简单的Series
obj = Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
# Series的字符串表现形式为：索引在左，值在右
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [6]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [8]:
# 通常，希望创建的Series带有一个可以对各个数据点进行标记的索引：
obj2 = Series([4,7,-5,3],index=['d','b','a','c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [9]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [10]:
# 索引取值
obj2['a']

-5

In [11]:
obj2[['c','a','d']]

c    3
a   -5
d    4
dtype: int64

In [12]:
# NumPy数组运算都会保留索引和值之间的链接
obj2[obj2>0]

d    4
b    7
c    3
dtype: int64

In [13]:
obj2*2

d     8
b    14
a   -10
c     6
dtype: int64

In [17]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [18]:
# 将Series看成是一个定长的有序字典，
# 可用在许多原本需要字典参数的函数中：
'b' in obj2

True

In [19]:
'e' in obj2

False

In [22]:
# 通过python字典创建Series,Series的索引就是原字典的键
sdata = {'Ohio':35000,'Texas':71000,'Oregon':16000,'Utah':5000}
obj3 = Series(sdata)
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [25]:
# sdata中跟state索引相匹配的那个3个值会被找出对应，
# “California”找不到所对应的sdata,其结果为NaN(not a number)
states = ['California','Ohio','Oregon','Texas']
obj4 = Series(sdata,index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [26]:
# isnull ,notnull 检测缺失数据
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [27]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [28]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [30]:
# 自动对齐不同索引的数据
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [31]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [32]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [34]:
# Series对象本身及其索引都有一个name属性
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [35]:
# Series的索引可以通过赋值的方式就地修改：
obj.index = ['Bob','Steve','Jeff','Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

In [40]:
"""(2)DataFrame"""
# DataFrame是一个表格型数据结构，含有一组有序的列
# 每列可以是不同的值类型
# DataFrame既有行索引也有列索引
# 可以被看做Series组成的字典（共用同一个索引）
# DataFrame中的数据以一个和多个二维快存放的

# 创建DataFrame,最常用的一种是直接传入
# 一个由等长列表或numpy数组组成的字典：
# 结果DataFrame会自动加上索引，且全部列会被有序排列：
data = {'state':['Ohio','Ohio','Ohio','Nevada','Nevada'],
        'year':[2000,2001,2002,2001,2002],
        'pop':[1.5,1.7,3.6,2.4,2.9]}
frame = DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [41]:
# 指定列序列
DataFrame(data,columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [46]:
frame2 = DataFrame(data,columns=['year','state','pop','debt'], \
                   index=['one','two','three','four','five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [47]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [48]:
# 通过类似字典标记的方式或属性的方式，可以将DataFrame的列获取为一个Series
# 警告：索引返回视图，Series的Copy即可显式地复制列
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [49]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [50]:
# 返回的Series拥有原DataFrame相同的索引，且其name属性也被设置好了
# 行也可 用索引字段ix
frame2.ix['three']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [52]:
# 列可以通过赋值的方法进行修改
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [53]:
# 使用列或数组，长度必须匹配
frame2['debt'] = np.arange(5.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0


In [54]:
# 用Series赋值，指定索引
val = Series([-1.2,-1.5,-1.7],index=['two','four','five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [58]:
# 为不存在的列赋值会创建新列，关键字del用于删除列
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


In [59]:
del frame2['eastern']
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [64]:
# 另一种常见的数据形式是嵌套字典：
# 外层键作为列，内层键作为行索引
# 内层键会被合并、排序以形成最终索引
pop = {'Nevada':{2001:2.4,2002:2.9},'Ohio':{2000:1.5,2001:1.7,2002:3.6}}
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [65]:
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [75]:
# 显式指定索引
DataFrame(pop,index=[2001,2002,2003]) # error

AttributeError: 'list' object has no attribute 'astype'

In [77]:
# 有Series组成的字典创建
pdata = {'Ohio':frame3['Ohio'][:-1],
         'Nevada':frame3['Nevada'][:2]}
DataFrame(pdata)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


In [78]:
# 设置DataFrame的index和columns的name属性
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [79]:
# 与Series一样，values属性也会以二维ndarray的形式返回DataFrame的数据
frame3.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

In [82]:
# 若各列数据类型不同，选能兼容所有的类型
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

In [83]:
"""（3）索引对象"""
# pandas的索引对象负责管理轴标签和其他元数据（比如轴名称）
# 构建Series或DataFrame,所用到的任何数组或其他序列的标签会被转换成一个Index
obj = Series(range(3),index=['a','b','c'])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [84]:
index[1:]

Index(['b', 'c'], dtype='object')

In [85]:
# Index对象不可修改
index[1]='d'

TypeError: Index does not support mutable operations

In [86]:
# 不可修改性，使Index对象在多个数据结构之间安全共享
index = pd.Index(np.arange(3))
obj2 = Series([1.5,-2.5,0],index = index)
obj2.index is index

True

In [87]:
# 除长得像数组，Index的功能也类似一个固定大小的集合
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [88]:
'Ohio' in frame3.columns

True

In [89]:
2003 in frame3.index

False

In [90]:
"""
二、基本功能
"""
"""(1)重新索引"""
# reindex,创建一个适应新索引的新对象
obj = Series([4.5,7.2,-5.3,3.6],index=['d','b','a','c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [91]:
obj2 = obj.reindex(['a','b','c','d','e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [92]:
obj.reindex(['a','b','c','d','e'],fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [4]:
# 对于时间序列这样的有序数据，重新索引时可能需要做一些插值处理
# ffill前向填充
obj3 = Series(['blue','purple','yellow'],index=[0,2,4])
obj3.reindex(range(6),method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [13]:
# reindex默认重新索引行
frame = DataFrame(np.arange(9).reshape((3,3)),index=['a','c','d'],
                  columns=['Ohio','Texas','California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [10]:
frame2 = frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [11]:
# 重新索引列
states = ['Texas','Utah','California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [17]:
# 同时对行列重新索引，而插值只能按行应用（即轴0）
frame.reindex(index=['a','b','c','d'],method='ffill',
              columns=states)

ValueError: index must be monotonic increasing or decreasing

In [18]:
# 利用ix的标签索引功能，重新索引任务可以变得更简洁
frame.ix[['a','b','c','d'],states]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [20]:
"""（2）丢弃指定轴上的项"""
obj = Series(np.arange(5.),index=['a','b','c','d','e'])
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [21]:
obj.drop(['d','c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [22]:
# DataFrame,可以删除任意轴上的索引值：
data = DataFrame(np.arange(16).reshape((4,4)),
                 index=['Ohio','Colorado','Utah','New york'],
                 columns=['one','two','three','four'])
data.drop(['Colorado','Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New york,12,13,14,15


In [23]:
data.drop('two',axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New york,12,14,15


In [24]:
data.drop(['two','four'],axis=1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New york,12,14


In [25]:
"""(3)索引、选取和过滤"""
obj = Series(np.arange(4.),index=['a','b','c','d'])
obj['b']

1.0

In [26]:
obj[1]

1.0

In [27]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [29]:
obj[['b','a','d']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [30]:
obj[[1,3]]

b    1.0
d    3.0
dtype: float64

In [31]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

In [32]:
# 与python切片不同，其末端包含（inclusive）
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [34]:
# 赋值
obj['b':'c'] = 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [35]:
# DataFrame的索引 返回一个或多个列
data = DataFrame(np.arange(16).reshape((4,4)),
                 index=['Ohio','Colorado','Utah','NewYork'],
                 columns=['one','two','three','four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
NewYork,12,13,14,15


In [36]:
data['two']

Ohio         1
Colorado     5
Utah         9
NewYork     13
Name: two, dtype: int32

In [37]:
data[['three','one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
NewYork,14,12


In [38]:
# 切片或布尔型数组选取行：
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [39]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
NewYork,12,13,14,15


In [40]:
data<5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
NewYork,False,False,False,False


In [42]:
data[data<5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
NewYork,12,13,14,15


In [43]:
# DataFrame行上进行标签索引
data.ix['Colorado',['two','three']]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


two      5
three    6
Name: Colorado, dtype: int32

In [44]:
data.ix[['Colorado','Utah'],[3,0,1]]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [46]:
data.ix[2]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [47]:
data.ix[:'Utah','two']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [51]:
data.ix[data.three > 5, :3]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
NewYork,12,13,14


In [52]:
"""(4)算术运算和数据对齐"""
# 对不同的索引对象进行算术运算，在将对象相加是
# 如果存在不同的索引对，则结果的索引就是该索引对的并集
s1 = Series([7.3,-2.5,3.4,1.5],index=['a','c','d','e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1],index=['a','c','e','f','g'])
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [53]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [54]:
s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [58]:
# 对于DataFrame,对齐操作会同时发生在行和列上
df1 = DataFrame(np.arange(9.).reshape(3,3),columns=list('bcd'),
                index=['Ohio','Texas','Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),
                index=['Utah','Ohio','Texas','Oregon'])
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [59]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [60]:
df1+df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [61]:
"""(5)在算术方法中填充值"""
df1 = DataFrame(np.arange(12.).reshape((3,4)),columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4,5)),columns=list('abcde'))
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [62]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [63]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [64]:
# add
df1.add(df2,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [65]:
# 重新索引指定填充值
df1.reindex(columns=df2.columns,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [66]:
"""(6)DataFrame和Series之间的运算"""
# 计算一个二维数组与其某行之间的差：
arr = np.arange(12.).reshape((3,4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [67]:
arr[0]

array([0., 1., 2., 3.])

In [68]:
arr-arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [69]:
# DataFrame和Series之间的运算
frame = DataFrame(np.arange(12.).reshape((4,3)),
                  columns=list('bde'),
                  index=['Utah','Ohio','Texas','Oregon'])

In [70]:
series = frame.ix[0]
frame

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [71]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [72]:
frame-series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [73]:
# 索引找不到，两对象就被重新索引，形成并集
series2 = Series(range(3),index=['b','e','f'])
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [74]:
# 若希望匹配行且在列上广播，则必须使用算术运算方法
series3 = frame['d']
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [75]:
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [76]:
frame.sub(series3,axis=0)

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [79]:
"""(7)函数的应用和映射"""
# numpy的ufuncs(元素级数组方法)也可用于操作pandas对象
frame = DataFrame(np.random.randn(4,3),columns=list('bde'),
                  index=['Utah','Ohio','Texas','Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.004748,0.465226,-0.549632
Ohio,-0.472662,0.452174,-2.898224
Texas,-0.295086,0.706358,0.200455
Oregon,-0.047091,-0.129746,0.909672


In [80]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.004748,0.465226,0.549632
Ohio,0.472662,0.452174,2.898224
Texas,0.295086,0.706358,0.200455
Oregon,0.047091,0.129746,0.909672


In [86]:
# 将函数应用到各行各列所形成的一维数组上
# apply
f = lambda x: x.max() - x.min()
frame.apply(f)

b    0.477410
d    0.836104
e    3.807896
dtype: float64

In [87]:
def f(x):
    return Series([x.min(),x.max()],index=['min','max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.472662,-0.129746,-2.898224
max,0.004748,0.706358,0.909672


In [88]:
# 元素级的python函数也可，假如想的到frame中各个浮点值的格式化字符串
# 使用applymap
format = lambda x: '%.2f' % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,0.0,0.47,-0.55
Ohio,-0.47,0.45,-2.9
Texas,-0.3,0.71,0.2
Oregon,-0.05,-0.13,0.91


In [89]:
# 之所以叫applymap,是因为Series有一个用于应用元素级函数的map方法
frame['e'].map(format)

Utah      -0.55
Ohio      -2.90
Texas      0.20
Oregon     0.91
Name: e, dtype: object

In [90]:
"""(8)排序和排名"""
# sort_index
obj = Series(range(4),index=['d','a','b','c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [92]:
# DataFrame任意轴排序
frame = DataFrame(np.arange(8).reshape((2,4)),
                 index=['three','one'],
                columns=['d','a','b','c'])
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [93]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [94]:
# 默认升序，也可降序
frame.sort_index(axis=1,ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [96]:
# Series按值排序
obj = Series([4,7,-3,2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [97]:
# 排序时，任何缺失值默认都会被放到Series的末尾
obj = Series([4,np.nan,7,np.nan,-3,2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [98]:
# 根据一个列或多个列中的值进行排序
frame = DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
frame

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [99]:
frame.sort_index(by='b')

  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [100]:
frame.sort_index(by=['a','b'])

  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [101]:
# 排名（ranking）跟排序密切相关，且会增设一个排名值
# 默认情况下，rank是通过“为各组分配一个平均排名”
# 的方式破坏平级关系
obj = Series([7,-5,7,4,2,0,4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [102]:
# 根据值在原数据中出现的顺序给出排名
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [103]:
# 降序排名
obj.rank(ascending=False,method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [104]:
# 行或列上计算排名
frame = DataFrame({'b':[4.3,7,-3,2],'a':[0,1,0,1],
                   'c':[-2,5,8,-2.5]})
frame

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


In [105]:
frame.rank(axis=1)

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


In [106]:
"""(9)带有重复值的轴索引"""
# 带有重复索引值的Series：
obj = Series(range(5),index=['a','a','b','b','c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [108]:
obj.index.is_unique

False

In [109]:
# 带有重复值的索引
obj['a']

a    0
a    1
dtype: int64

In [110]:
obj['c']

4

In [112]:
# 对DataFrame的行进行索引
df = DataFrame(np.random.randn(4,3),index=['a','a','b','b'])
df

Unnamed: 0,0,1,2
a,-0.529491,-0.183668,-0.425132
a,-0.717415,-1.581239,-0.395498
b,0.91505,1.03614,1.016174
b,-0.585446,-0.035595,1.491086


In [113]:
df.ix['b']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1,2
b,0.91505,1.03614,1.016174
b,-0.585446,-0.035595,1.491086


In [7]:
"""
三、汇总和计算描述统计
"""
# 约简方法
df = DataFrame([[1.4,np.nan],[7.1,-4.5],
               [np.nan,np.nan],[0.75,-1.3]],
               index=['a','b','c','d'],
               columns=['one','two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [8]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [9]:
df.sum(axis=1)# NA值自动排除，除非整个（行或列）均NA

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [10]:
df.mean(axis=1,skipna=False) # skipna禁用该功能

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [11]:
# 间接统计型
df.idxmax()

one    b
two    d
dtype: object

In [12]:
# 累计型
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [13]:
# describe一次性产生多个汇总统计
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [14]:
# describe 对于非数值型数据
obj = Series(['a','a','b','c']*4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

['a',
 'a',
 'b',
 'c',
 'a',
 'a',
 'b',
 'c',
 'a',
 'a',
 'b',
 'c',
 'a',
 'a',
 'b',
 'c']

In [20]:
"""(1)相关系数与协方差"""
# 有些统计是通过参数对计算
# 数据来自Yahoo！Finance的股票价格和成交量
all_data = {}
for ticker in ['AAPL','IBM','MSFT','GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker,'1/1/2000','1/1/2010')
price = DataFrame({tic:data['Adj Close'] 
                      for tic,data in all_data.items()})
volume = DataFrame({tic: data['Volume']
                    for tic,data in all_data.items()})
# 计算价格的百分数变化
returns =price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-12-24,0.034339,0.011117,0.004385,0.002587
2009-12-28,0.012295,0.007098,0.013326,0.005484
2009-12-29,-0.011862,-0.005571,-0.003477,0.007058
2009-12-30,0.012147,0.005376,0.005461,-0.013698
2009-12-31,-0.0043,-0.004416,-0.012597,-0.015504


In [21]:
# Series的corr方法用于计算两个Series中的
# 重叠的，非NA、按索引对齐的值的相关系数
# cov计算协方差
returns.MSFT.corr(returns.IBM)

0.4925370649472438

In [22]:
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.470676,0.412392,0.422852
GOOG,0.470676,1.0,0.390688,0.438313
IBM,0.412392,0.390688,1.0,0.492537
MSFT,0.422852,0.438313,0.492537,1.0


In [23]:
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.00103,0.000303,0.000254,0.000309
GOOG,0.000303,0.00058,0.000142,0.000204
IBM,0.000254,0.000142,0.000369,0.000216
MSFT,0.000309,0.000204,0.000216,0.000519


In [25]:
# DataFrame的corrwith方法，计算其列或行跟另一个
# Series或DataFrame之间的相关系数
# 传入一个Series将会返回一个相关系数值Series
# （针对各列计算）
returns.corrwith(returns.IBM)

AAPL    0.412392
GOOG    0.390688
IBM     1.000000
MSFT    0.492537
dtype: float64

In [28]:
# 传入一个DataFrame则会计算按列名配对的相关系数
# 计算百分比变化与成交量的相关系数
# 传入axis=1即可按行计算
returns.corrwith(volume)

AAPL   -0.057664
GOOG    0.062647
IBM    -0.006592
MSFT   -0.016101
dtype: float64

In [36]:
"""(2)唯一值、值计数以及成员资格"""
obj = Series(list('cadaabbcc'))
# 唯一值
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [38]:
# 值计数，各值出现频率,频率降序
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [39]:
# value_counts可用于数组或序列
pd.value_counts(obj.values,sort=False)

c    3
d    1
a    3
b    2
dtype: int64

In [40]:
# 成员资格方法，isin,判断矢量化集合的成员资格
# 用于选取Series和DataFrame列中数据的子集
mask = obj.isin(['b','c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [41]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [42]:
# DataFrame中多个相关列的一张柱状图
data = DataFrame({'Qu1':[1,3,4,3,4],
                  'Qu2':[2,3,1,2,3],
                  'Qu3':[1,5,2,4,4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [45]:
result = data.apply(pd.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [46]:
"""
四、处理缺失数据
"""
# pandas使用浮点值NaN（Not a Number）
# 表示浮点和非浮点数组中的缺失数据
# 只是一个便于被检测出来的标记而已
string_data = Series(['aardvark','artichoke',np.nan,'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [47]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [48]:
# python 内置的none值也会被当做NA处理
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [49]:
"""(1)滤除缺失数据"""
# 对于一个Series,dropna 返回一个仅含有非空数据和索引值的Series
from numpy import nan as NA 
data = Series([1,NA,3.5,NA,7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [50]:
# 也可通过布尔型索引达到这个目的
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [52]:
# 对于DataFrame,可能丢弃全NA或含有NA的行或列
# dropna默认丢弃任何含有缺失值的行
data = DataFrame([[1.,6.5,3.],[1.,NA,NA],
                  [NA,NA,NA],[NA,5.6,3.]])
cleaned = data.dropna()
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,5.6,3.0


In [53]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [54]:
# 只丢弃全为NA的行
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,5.6,3.0


In [55]:
# 只丢弃全为NA的列
data[4]=NA 
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,5.6,3.0,


In [56]:
data.dropna(axis=1,how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,5.6,3.0


In [57]:
# 涉及时间序列数据，只想留下一部分观测数据
df = DataFrame(np.random.randn(7,3))
df.ix[:4,1] = NA
df.ix[:2,2] = NA 
df

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


Unnamed: 0,0,1,2
0,1.266273,,
1,2.547312,,
2,-0.901888,,
3,1.741982,,0.064924
4,-1.321606,,-2.399387
5,0.551889,0.538242,1.068007
6,0.842513,0.85734,-1.432963


In [62]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
3,1.741982,,0.064924
4,-1.321606,,-2.399387
5,0.551889,0.538242,1.068007
6,0.842513,0.85734,-1.432963


In [63]:
"""(2)填充缺失数据"""
# 通过一个常数调用fillna,则替换为那个常数值
df.fillna(0)

Unnamed: 0,0,1,2
0,1.266273,0.0,0.0
1,2.547312,0.0,0.0
2,-0.901888,0.0,0.0
3,1.741982,0.0,0.064924
4,-1.321606,0.0,-2.399387
5,0.551889,0.538242,1.068007
6,0.842513,0.85734,-1.432963


In [66]:
# 通过字典调用，对不同列填充不同值
df.fillna({1:0.5,2:-1})

Unnamed: 0,0,1,2
0,1.266273,0.5,-1.0
1,2.547312,0.5,-1.0
2,-0.901888,0.5,-1.0
3,1.741982,0.5,0.064924
4,-1.321606,0.5,-2.399387
5,0.551889,0.538242,1.068007
6,0.842513,0.85734,-1.432963


In [67]:
# fillna默认返回新对象，也可对现有对象进行就地修改
# 总是返回被填充对象的引用
_ = df.fillna(0,inplace=True)
df

Unnamed: 0,0,1,2
0,1.266273,0.0,0.0
1,2.547312,0.0,0.0
2,-0.901888,0.0,0.0
3,1.741982,0.0,0.064924
4,-1.321606,0.0,-2.399387
5,0.551889,0.538242,1.068007
6,0.842513,0.85734,-1.432963


In [69]:
# 对reindex的那些插值方法也可用于fillna
df = DataFrame(np.random.randn(6,3))
df.ix[2:,1]= NA 
df.ix[4:,2]= NA 
df

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


Unnamed: 0,0,1,2
0,-0.536163,0.971933,1.738847
1,1.267965,-0.679354,0.113268
2,-0.707861,,-0.718741
3,-0.492671,,1.953501
4,-1.051064,,
5,1.603846,,


In [70]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,-0.536163,0.971933,1.738847
1,1.267965,-0.679354,0.113268
2,-0.707861,-0.679354,-0.718741
3,-0.492671,-0.679354,1.953501
4,-1.051064,-0.679354,1.953501
5,1.603846,-0.679354,1.953501


In [72]:
df.fillna(method='ffill',limit=2)

Unnamed: 0,0,1,2
0,-0.536163,0.971933,1.738847
1,1.267965,-0.679354,0.113268
2,-0.707861,-0.679354,-0.718741
3,-0.492671,-0.679354,1.953501
4,-1.051064,,1.953501
5,1.603846,,1.953501


In [73]:
# 传入Series的平均值或中位数
data = Series([1.,NA,3.5,NA,7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [5]:
"""
五、层次化索引
"""
# 能在一个轴上拥有多个（两个以上）索引级别
# 以低维度形式处理高维度数据
data = Series(np.random.randn(10),
              index=[list('aaabbbccdd'),
                     [1,2,3,1,2,3,1,2,2,3]])
data

a  1    0.908731
   2    1.828313
   3   -0.400103
b  1   -0.668591
   2   -1.125470
   3    0.998739
c  1    0.031842
   2    0.241535
d  2    0.006847
   3   -0.637979
dtype: float64

In [6]:
# 带有MultiIndex索引的Series的格式化输出形式
# 索引之间的“间隔”表示“直接使用上面的标签”
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [7]:
# 层次化索引对象，选取数据子集
data['b']

1   -0.668591
2   -1.125470
3    0.998739
dtype: float64

In [8]:
data['b':'c']

b  1   -0.668591
   2   -1.125470
   3    0.998739
c  1    0.031842
   2    0.241535
dtype: float64

In [9]:
data.ix[['b','d']]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


b  1   -0.668591
   2   -1.125470
   3    0.998739
d  2    0.006847
   3   -0.637979
dtype: float64

In [10]:
# "内层"进行选取
data[:,2]

a    1.828313
b   -1.125470
c    0.241535
d    0.006847
dtype: float64

In [11]:
# 层次化索引在数据重塑和基于分组的操作（如透视表生成）中扮演着重要角色
# 通过其unstack方法被重新安排到一个DataFrame中
data.unstack()

Unnamed: 0,1,2,3
a,0.908731,1.828313,-0.400103
b,-0.668591,-1.12547,0.998739
c,0.031842,0.241535,
d,,0.006847,-0.637979


In [12]:
# unstack的逆运算是stack
data.unstack().stack()

a  1    0.908731
   2    1.828313
   3   -0.400103
b  1   -0.668591
   2   -1.125470
   3    0.998739
c  1    0.031842
   2    0.241535
d  2    0.006847
   3   -0.637979
dtype: float64

In [13]:
# 对于DataFrame,每条轴都可以有分层索引
frame = DataFrame(np.arange(12).reshape((4,3)),
                  index=[list('aabb'),[1,2,1,2]],
                  columns=[['Ohio','Ohio','Colorado'],
                           ['Green','Red','Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [14]:
# 各层都可以有名字（可以是字符串，也可以是别的python对象）
# 如果指定了名称，他们就会显示在控制台输出中
frame.index.names = ['key1','key2']
frame.columns.names = ['state','color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [15]:
# 有了分部的列索引，因此可以轻松选取列分组
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [18]:
# 可以单独创建MultiIndex然后复用
pd.MultiIndex.from_arrays([['Ohio','Ohio','Colorado'],
                         ['Green','Red','Green']],
                        names=['state','color'])

MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
           labels=[[1, 1, 0], [0, 1, 0]],
           names=['state', 'color'])

In [19]:
"""(1)重排分级顺序"""
# 有时需要调整某条轴上各级别的顺序
# 或根据指定级别上的值对数据进行排序
# swaplevel接收两个级别编号或名称，
# 并返回一个互换了级别的新对象（数据不变）
frame.swaplevel('key1','key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [20]:
# sortlevel根据单个级别中的值对数据进行排序（稳定的）
# 结果有序
frame.sortlevel(1)

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [23]:
frame.swaplevel(0,1).sortlevel(0)

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [24]:
"""(2)根据级别汇总统计"""
# 对DataFrame和Series的描述和汇总统计都有一个level选项
# 用于指定在某条轴上求和的级别
# 利用了pandas的groupby功能
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [25]:
frame.sum(level='color',axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [26]:
"""(3)使用DataFrame的列"""
# 将DataFrame的一个或多个列当做行索引来用
# 或将行索引变成列
frame = DataFrame({'a':range(7),'b':range(7,0,-1),
                   'c':['one','one','one','two','two','two','two'],
                   'd':[0,1,2,0,1,2,3]})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [27]:
# set_index会将其一个或多个列转换为行索引
# 并创建一个新的DataFrame
frame2 = frame.set_index(['c','d'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [28]:
# 默认列移除，亦可保留
frame.set_index(['c','d'],drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [29]:
# reset_index的功能跟set_index刚好相反
# 层次化索引的级别会被转移到列里面
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [30]:
"""
六、其他有关pandas的话题
"""
"""(1)整数索引"""
# 跟内置python数据结构在索引语义上有些不同
ser = Series(np.arange(3.))
ser[-1] # 整数索引bug

KeyError: -1

In [31]:
# 对于非整数索引，则无歧义
ser2 = Series(np.arange(3.),index=['a','b','c'])
ser2[-1]

2.0

In [32]:
# 为保持良好的一致性，若轴索引含有索引器
# 那么根据整数进行数据选取的操作将总是面向标签
# 包括用ix切片
ser.ix[:1]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


0    0.0
1    1.0
dtype: float64

In [37]:
# 若需要可靠的，不考虑索引类型的，基于位置的索引
# 使用Series的iget_value和DataFrame的irow和icol方法
ser3 = Series(range(3),index=[-5,1,3])
ser3.iat[2] # iget_value弃用

2

In [43]:
frame = DataFrame(np.arange(6).reshape(3,2),index=[2,0,1])
frame.iloc[0] # irow弃用

0    0
1    1
Name: 2, dtype: int32

In [50]:
"""(2)面板数据 error未完成"""
# pandas有一个Panel数据结构，可看作三维版的DataFrame
# 由DataFrame对象组成的字典或一个三维ndarray来创建
pdata = pd.Panel(dict((stk,web.get_data_yahoo(stk,'1/1/2009','1/6/2012'))\
                      for stk in ['AAPL','GOOG','MSFT','DELL']))
# Panel中的每一项都是一个DataFrame
pdata

KeyError: 'Date'

In [73]:
for stk in ['AAPL','GOOG','MSFT','DELL']:
    pdata = {stk:web.get_data_yahoo(stk,'1/1/2009','1/6/2012')}
#dict((stk,web.get_data_yahoo(stk,'1/1/2009','1/6/2012'))
      #for stk in ['AAPL','GOOG','MSFT','DELL'])

KeyError: 'Date'

ValueError: dictionary update sequence element #0 has length 4; 2 is required

{'a': 1, 'b': 1}