# pandas基础练习与示例

In [52]:
#导入pandas模块，约定俗成使用别名pd
import pandas as pd

In [53]:
#导入Pandas的两种数据结构Series和DataFrame，前者是带索引标签的一维数组，后者是带有索引标签和列标签的二维数组
from pandas import Series, DataFrame

In [54]:
import numpy as np
np.random.seed(12345)

PREVIOUS_MAX_ROWS = pd.options.display.max_rows
#设置notebook一个cell的显示行数
pd.options.display.max_rows = 20
np.set_printoptions(precision=4, suppress=True) #suppress:If True, always print floating point numbers using fixed point  notation

## 介绍pandas的数据结构

### Series数据结构，有翻译为系列

In [55]:
#从列表构建Series，缺省索引是从0开始的整数序列
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [62]:
#Series包括values属性和index索引属性
print(obj.values)
print(obj.index)  # 返回RangeIndex对象，类似range(4)

[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)


In [63]:
#从列表构建Series，指定索引标签
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [65]:
#以索引标签访问Series,思考比较与numpy数组的访问方式
obj2['a']

-5

In [66]:
obj2['d'] = 6
obj2[['c', 'a', 'd']]

c    3
a   -5
d    6
dtype: int64

In [67]:
#与numpy的数组算数运算类似，针对Series的算数运算都是针对其每一个元素
print(obj2[obj2 > 0])
print(obj2 * 2)
np.exp(obj2)

d    6
b    7
c    3
dtype: int64
d    12
b    14
a   -10
c     6
dtype: int64


d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [68]:
#判断元素是否包含在Series对象中
'b' in obj2
'e' in obj2

False

In [7]:
#从字典构建Series对象
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [61]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [11]:
#判断Series中的每个元素是否为空，返回对应的Series结果
pd.isnull(obj4)
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [12]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [70]:
#Series对象之间进行算数运算，是基于对齐机制，即行和列标签一致的对应元素进行运算，没有对应的则为NaN
obj3
obj4
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [14]:
#Series对象及其索引可以通过name属性命名
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [15]:
obj
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### 数据框DataFrame

In [9]:
#从字典构建DataFrame，字典的key对应列名，缺省使用从0开始的系列整数作为索引
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)

In [62]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [68]:
frame["b"][0:2]

0    4.3
1    7.0
Name: b, dtype: float64

In [72]:
#显示头尾，无参数则缺省显示5行
frame.head()
frame.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [20]:
#旧版本的ix属性已经在新版本中不能使用，可分别使用loc（标签名称）和iloc（索引整数值）属性检索
#frame.ix[1,1]

AttributeError: 'DataFrame' object has no attribute 'ix'

In [73]:
print(frame.iloc[1,1])
frame.loc[2,['pop','year']]

2001


pop      3.6
year    2002
Name: 2, dtype: object

In [74]:
#创建DataFrame时指定列名，如原始数据无列名则直接使用指定列名，如果有则调整指定顺序,没有完全对应的列名则对应列元素值设置为NaN
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [72]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four',
                             'five', 'six'])
print(frame2)
frame2.columns

       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN


Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [73]:
# A DataFrame column can be retrieved as a Series in two ways:
# dict-style brackets with the column name (frame2['state']) or attribute access (frame2.year).
print(frame2['state'])
frame2.year

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object


one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [77]:
# loc selects by index/column label, iloc selects by integer position.
# NOTE: code that uses .ix targets an old pandas release (.ix was deprecated in pandas 0.20
# and removed in 1.0); this depends on the pandas version, not on Python 2 vs. Python 3.
frame2.loc['three']


year       2002
state      Ohio
pop         3.6
debt          2
eastern    True
Name: three, dtype: object

In [78]:
frame2['debt'] = 16.5
print(frame2)
frame2['debt'] = np.arange(6.)
frame2

       year   state  pop  debt  eastern
one    2000    Ohio  1.5  16.5     True
two    2001    Ohio  1.7  16.5     True
three  2002    Ohio  3.6  16.5     True
four   2001  Nevada  2.4  16.5    False
five   2002  Nevada  2.9  16.5    False
six    2003  Nevada  3.2  16.5    False


Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,0.0,True
two,2001,Ohio,1.7,1.0,True
three,2002,Ohio,3.6,2.0,True
four,2001,Nevada,2.4,3.0,False
five,2002,Nevada,2.9,4.0,False
six,2003,Nevada,3.2,5.0,False


In [79]:
#使用Series对象对DataFrame对象的列赋值，需要索引对齐
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [81]:
#赋值DataFrame对象的列如果不存在，直接创建
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [83]:
#删除整列
del frame2['eastern']
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [84]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [85]:
#由字典创建DF对象，如果字典有嵌套，则内层嵌套的key作为索引
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [84]:
#转置操作，行列互换
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [86]:
pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [87]:
#DF的切片与构建。切片方式之一：DF['列名'][start:end]
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [88]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [90]:
frame3.values
frame3.index
frame3.columns

Index(['Nevada', 'Ohio'], dtype='object', name='state')

In [32]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

### Index对象：注意不能修改

In [86]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index
index[1:]

Index(['b', 'c'], dtype='object')

index[1] = 'd'  # TypeError

In [33]:
labels = pd.Index(np.arange(3))
labels
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2
obj2.index is labels

True

In [34]:
frame3
print(frame3.columns)   #DF的columns，其实也是Index对象
'Ohio' in frame3.columns
2003 in frame3.index

Index(['Nevada', 'Ohio'], dtype='object')


False

In [35]:
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

## 重要的函数

### Reindexing重建索引

In [88]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [89]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [90]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3
#重建索引时，如果指定索引在原对象的索引中不存在，则可指定填充方法，如前向填充ffill
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [91]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
frame
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [11]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [15]:
#注意：reindex返回重建索引后的结果，原对象并没有改变
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [14]:
# One way to use loc for row/column selection: df.loc[row_labels, column_labels].
# NOTE: recent pandas versions require every listed label to exist
# (use reindex when labels may be missing).
frame.loc[["a","c","d"],["Texas","California"]]

Unnamed: 0,Texas,California
a,1,2
c,4,5
d,7,8


### 从一个轴删除数据

In [96]:
# Build obj here so the cell is self-contained; otherwise obj is still the
# 4-element Series from the reindexing section and the output shown for this
# cell cannot be reproduced.
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
print(obj)
# drop(labels) returns a new object with the given index labels removed and
# leaves obj untouched; with inplace=True it modifies obj itself and returns nothing.
new_obj = obj.drop('c')
print(new_obj, obj)
obj.drop(['d', 'c'])

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64 a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64


a    0.0
b    1.0
e    4.0
dtype: float64

In [100]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [101]:
#df对象的drop函数，缺省对应行，即索引标签，axis=0。注意：drop函数返回修改后的结果，缺省不对原对象进行修改，参数inplace指是否修改原对象
data.drop(['Colorado', 'Ohio'],inplace=True)

In [102]:
data

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [104]:
#指定axis=1或axis='columns'，表示删除整列，返回删除后的结果
data.drop('two', axis=1)
data.drop(['four'], axis='columns')

Unnamed: 0,one,two,three
Utah,8,9,10
New York,12,13,14


In [105]:
data

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [100]:
a=obj.drop('c')
a

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [101]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [102]:
#设置inplace=True，表示对原对象进行修改
obj.drop('c', inplace=True)
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

### 索引，选择与过滤

In [103]:
# Selecting from a Series: [] with strings looks up index labels, while integer
# positions should go through .iloc. Relying on obj[1] / obj[[1, 3]] to fall back
# to positional lookup on a non-integer index is deprecated in recent pandas.
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj)
obj['b']                      # single label
print(obj.iloc[1])            # single integer position
obj[2:4]                      # integer slice: positional, end excluded
print(obj[['b', 'a', 'd']])   # list of labels
obj.iloc[[1, 3]]              # list of integer positions
obj[obj < 2]                  # boolean mask

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
1.0
b    1.0
a    0.0
d    3.0
dtype: float64


a    0.0
b    1.0
dtype: float64

In [104]:
#[索引起始:索引结束]的Series对象检索方式
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [None]:
obj['b':'c'] = 5
obj

In [105]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
print(data)
data['two']
data[['three', 'one']]

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [107]:
#[索引起始号:索引结束号]的DataFrame对象检索方式
print(data[:2])
data[data['three'] > 5]

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7


Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [108]:
data['three'] > 5

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool

In [None]:
#布尔序列检索方式
data < 5
data[data < 5] = 0
data

#### 使用loc和iloc函数进行选择

In [109]:
#loc函数检索格式：[索引标签名称序列，列标签名称序列]，使用形式不是括号，不像函数，像数组使用方式
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [110]:
#iloc函数检索格式：[索引编号序列，列编号序列]
data.iloc[2, [3, 0, 1]]
data.iloc[2]
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


In [None]:
data.loc[:'Utah', 'two']
data.iloc[:, :3][data.three > 5]

### 整数索引方式

ser = pd.Series(np.arange(3.))
ser
ser[-1]

In [None]:
ser = pd.Series(np.arange(3.))

In [None]:
ser

In [None]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
# -1 refers to the last element. Use .iloc for positional access: treating an
# integer key in ser2[-1] as a position on a non-integer index is deprecated in recent pandas.
ser2.iloc[-1]

In [None]:
#索引区间":"方法，缺省不写，前面不写的缺省从0开始，后面不写的指到结尾
ser[:1]
ser.loc[:1]
ser.iloc[:1]

### 数学运算与数据对齐

In [17]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=['a', 'c', 'e', 'f', 'g'])
print(s1)
s2

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64


a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [18]:
#Series对象之间算术运算，索引一致的对应元素进行相应运算，无对应的则设置为NaN
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [19]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df1)
df2

            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colorado  6.0  7.0  8.0


Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [20]:
#DataFrame对象之间算术运算，索引和列都一致的对应元素进行相应运算，无对应的则设置为NaN
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [21]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
df1
df2
df1 - df2

Unnamed: 0,A,B
0,,
1,,


#### 带填充值的数学运算方法

In [22]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))
df2.loc[1, 'b'] = np.nan
df1
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [23]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [49]:
#设置参数fill_value，索引和列没有对应的元素会用该值填充替换
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [24]:
print(1 / df1)
print(df1)
df1.rdiv(1)  # rdiv is division with the operands reversed: df1.rdiv(1) computes 1 / df1, the same as the first line

       a         b         c         d
0    inf  1.000000  0.500000  0.333333
1  0.250  0.200000  0.166667  0.142857
2  0.125  0.111111  0.100000  0.090909
     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0


Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [25]:
#重建索引时，原df对象如果没有对应索引或列，也可以指定填充值
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


#### DataFrame 和 Series对象之间的操作,注意维不同时的广播机制与numpy的多维数组类似

In [26]:
arr = np.arange(12.).reshape((3, 4))
print(arr)
print(arr[0])
arr - arr[0]

[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]
[0. 1. 2. 3.]


array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [27]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]
print(frame)
print(series)

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64


In [114]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [28]:
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [29]:
series3 = frame['d']
frame
series3
frame.sub(series3, axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


### 函数应用与映射Mapping

In [115]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
np.abs(frame)

               b         d         e
Utah   -0.204708  0.478943 -0.519439
Ohio   -0.555730  1.965781  1.393406
Texas   0.092908  0.281746  0.769023
Oregon  1.246435  1.007189 -1.296221


Unnamed: 0,b,d,e
Utah,0.204708,0.478943,0.519439
Ohio,0.55573,1.965781,1.393406
Texas,0.092908,0.281746,0.769023
Oregon,1.246435,1.007189,1.296221


In [116]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` only needs ``max()`` and ``min()`` methods; with ``DataFrame.apply``
    it receives one column (or one row) as a Series per call.
    """
    return x.max() - x.min()
print(frame)
frame.apply(f)

               b         d         e
Utah   -0.204708  0.478943 -0.519439
Ohio   -0.555730  1.965781  1.393406
Texas   0.092908  0.281746  0.769023
Oregon  1.246435  1.007189 -1.296221


b    1.802165
d    1.684034
e    2.689627
dtype: float64

In [117]:
#axis参数指定函数应用在什么轴
frame.apply(f, axis='columns')

Utah      0.998382
Ohio      2.521511
Texas     0.676115
Oregon    2.542656
dtype: float64

In [None]:
def f(x):
    """Return the minimum and maximum of ``x`` as a Series labelled 'min' and 'max'."""
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])
frame.apply(f)

In [None]:
def format(x):  # NOTE: shadows the built-in format(); the name is kept because the following cells call it
    """Render the number ``x`` as a string with two decimal places."""
    return '%.2f' % x
frame.applymap(format)

In [None]:
frame['e'].map(format)

### 排序与排名Sorting，Ranking

In [30]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])

obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [32]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
print("排序前",frame)

排序前        d  a  b  c
three  0  1  2  3
one    4  5  6  7


In [33]:
# sort_index() with no arguments sorts by the row index (axis=0); axis=1 sorts by the
# column labels instead. Only the order changes, the element values are untouched.
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [34]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [55]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [35]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()   #对元素值进行排序，返回排序结果，但原对象排序不变

2   -3
3    2
0    4
1    7
dtype: int64

In [57]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()   #注意空值的排序顺序

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [36]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
frame.sort_values(by='b')  #对于多列df，指定列，类似SQL的order by

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [37]:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [38]:
#有关元素排名的详细介绍，参见：https://blog.csdn.net/hahaha66888/article/details/85051437
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
print(obj)
#根据元素值的大小，进行排名，如果有重复值，则排名的值有小数点（取均值）
obj.rank()

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64


0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [39]:
#根据元素值的大小，进行排名，如果有重复值并且指定method="first"参数，则排名值不再进行平均，先出现的排名靠前
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [40]:
# 降序排名，method='max'  使用整个分组的最大排名
print(obj.rank(ascending=False))  #注意降序排名的对应元素排序值
obj.rank(ascending=False, method='max')  #根据降序排名顺序，取整为最大值

0    1.5
1    7.0
2    1.5
3    3.5
4    5.0
5    6.0
6    3.5
dtype: float64


0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [42]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [41]:
print(frame.rank())
frame.rank(axis='columns')

     b    a
0  3.0  1.5
1  4.0  3.5
2  1.0  1.5
3  2.0  3.5


Unnamed: 0,b,a
0,2.0,1.0
1,2.0,1.0
2,1.0,2.0
3,2.0,1.0


### 有重复标签的轴索引

In [65]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [66]:
obj.index.is_unique

False

In [67]:
print(obj['a'])
obj['c']

a    0
a    1
dtype: int64


4

In [68]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df
df.loc['b']

Unnamed: 0,0,1,2
b,0.092908,0.281746,0.769023
b,1.246435,1.007189,-1.296221


## 合计与计算描述性统计

In [106]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [51]:
df.sum()  #求和，缺省是按列求和，忽略NaN值

one    9.25
two   -5.80
dtype: float64

In [52]:
df.sum(axis='columns')  #按行求和

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [107]:
df.mean(axis='columns', skipna=True)  # row-wise mean; with skipna=True NaN values are ignored, so a row is NaN only when all its values are NaN (row 'c')

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [54]:
df.idxmax()  # for each column, returns the index label of the row holding the maximum value

one    b
two    d
dtype: object

In [55]:
df.cumsum()  # cumulative sum down each column, keeping the running totals; NaN entries are skipped in the total but remain NaN in the result

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [108]:
df.describe()  #描述性统计，注意数值型与字符型的结果差别

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [77]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

### 相关性与协方差Correlation，Covariance

conda install pandas-datareader

In [109]:
price = pd.read_pickle('yahoo_price.pkl')
volume = pd.read_pickle('yahoo_volume.pkl')

import pandas_datareader.data as web
all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

price = pd.DataFrame({ticker: data['Adj Close']
                     for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker: data['Volume']
                      for ticker, data in all_data.items()})

In [110]:
print(price,volume)

                  AAPL        GOOG         IBM       MSFT
Date                                                     
2010-01-04   27.990226  313.062468  113.304536  25.884104
2010-01-05   28.038618  311.683844  111.935822  25.892466
2010-01-06   27.592626  303.826685  111.208683  25.733566
2010-01-07   27.541619  296.753749  110.823732  25.465944
2010-01-08   27.724725  300.709808  111.935822  25.641571
...                ...         ...         ...        ...
2016-10-17  117.550003  779.960022  154.770004  57.220001
2016-10-18  117.470001  795.260010  150.720001  57.660000
2016-10-19  117.120003  801.500000  151.259995  57.529999
2016-10-20  117.059998  796.969971  151.520004  57.250000
2016-10-21  116.599998  799.369995  149.630005  59.660000

[1714 rows x 4 columns]                  AAPL      GOOG       IBM      MSFT
Date                                               
2010-01-04  123432400   3927000   6155300  38409100
2010-01-05  150476200   6031900   6841400  49749600
2010-01-06  1

In [111]:
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,-0.00068,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.00769
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867
2016-10-21,-0.00393,0.003011,-0.012474,0.042096


In [83]:
#特征之间（对应列）的相关性和协方差
print(returns['MSFT'].corr(returns['IBM']))
returns['MSFT'].cov(returns['IBM'])

0.4997636114415114


8.870655479703546e-05

In [84]:
returns.MSFT.corr(returns.IBM)

0.4997636114415114

In [86]:
#整个DataFrame对象的列之间的相关性和协方差
print(returns.corr())
returns.cov()

          AAPL      GOOG       IBM      MSFT
AAPL  1.000000  0.407919  0.386817  0.389695
GOOG  0.407919  1.000000  0.405099  0.465919
IBM   0.386817  0.405099  1.000000  0.499764
MSFT  0.389695  0.465919  0.499764  1.000000


Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000277,0.000107,7.8e-05,9.5e-05
GOOG,0.000107,0.000251,7.8e-05,0.000108
IBM,7.8e-05,7.8e-05,0.000146,8.9e-05
MSFT,9.5e-05,0.000108,8.9e-05,0.000215


In [45]:
#df与某一个属性的两两之间的相关性
returns.corrwith(returns.IBM)

AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764
dtype: float64

In [46]:
returns.corrwith(volume)

AAPL   -0.075565
GOOG   -0.007067
IBM    -0.204849
MSFT   -0.092950
dtype: float64

### 唯一值，值计数与成员

In [47]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [48]:
#获取不重复的元素列表
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [49]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [None]:
# Count occurrences of each distinct value without sorting by frequency.
# The top-level pd.value_counts() is deprecated in recent pandas, so wrap the
# raw array in a Series and call the method instead.
pd.Series(obj.values).value_counts(sort=False)

In [50]:
obj
mask = obj.isin(['b', 'c'])
mask
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [51]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2], dtype=int64)

In [None]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
data

In [None]:
# Apply value_counts to every column: the result is indexed by the distinct values
# found in any column, with one count column per original column; values absent
# from a column come back as NaN and are filled with 0.
# pd.Series.value_counts replaces the deprecated top-level pd.value_counts.
result = data.apply(pd.Series.value_counts).fillna(0)
result