### 5、pandas入门

In [46]:
import pandas as pd
import numpy as np

#### 5.1 pandas 数据结构

##### Series
- 类似一维数据的对象
- 索引在左，值在右
- 选取Series方式
    - 单个索引['A']
    - 索引列表[['A','B']]
- Series运算
    - 保留索引的链接
- 可看做定长的有序字典，因为它是通过索引值映射到数据值
    - 可直接将字典变成Series对象
- 检测缺失数据isnull（）, notnull（）
- 根据运算的索引标签自动对齐数据
- Series本身和索引有name的属性
- 索引可以通过赋值的方式修改

In [3]:
obj = pd.Series([4,6,7,3])

In [4]:
obj

0    4
1    6
2    7
3    3
dtype: int64

In [5]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
obj2 = pd.Series([1,2,3,4], index=['a','b','c','d'])

In [7]:
obj2

a    1
b    2
c    3
d    4
dtype: int64

In [8]:
obj2.index

Index([u'a', u'b', u'c', u'd'], dtype='object')

In [10]:
obj2['b'], obj2[['b','a']]

(2, b    2
 a    1
 dtype: int64)

In [11]:
obj2 * 2

a    2
b    4
c    6
d    8
dtype: int64

In [13]:
'a' in obj2

True

In [14]:
dict_data = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
pd.Series(dict_data)

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [18]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(dict_data, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [19]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [20]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [21]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [23]:
obj2 + obj4

California   NaN
Ohio         NaN
Oregon       NaN
Texas        NaN
a            NaN
b            NaN
c            NaN
d            NaN
dtype: float64

In [24]:
obj4.name = 'status'

In [25]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: status, dtype: float64

##### DataFrame
- 表格型数据结构
- 行列索引
- 以二维结构保持数据
- 列获取
    - df['status']
    - df.status
    - df[['status','other']]（返回Dataframe格式）
- 行获取
    - df.loc['three']
    - df.iloc[1]（根据索引）
- 删除列 del df['status]
- 转置属性T
- values 属性会以二维ndarray的形式返回

In [29]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada',
'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


In [32]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop','debt'],
                      index=['one','tow','three','four','five','six'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
tow,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [33]:
frame2.T

Unnamed: 0,one,tow,three,four,five,six
year,2000,2001,2002,2001,2002,2003
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
pop,1.5,1.7,3.6,2.4,2.9,3.2
debt,,,,,,


<img src='images/5_1.png'>

##### 索引对象
- 负责管理轴标签和其他元数据(比如轴名称等)
- 不能直接对索引对象进行修改

In [34]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index

In [36]:
index[1] = 'fff'

TypeError: Index does not support mutable operations

<img src='images/5_2.png'>

#### 5.2 基本功能

##### 重新索引
- df.reindex()
    - 默认参数labels，替换的列表
        - 某个索引不存在则引入缺失值
    - 可选参数method，缺失值替换的方法
    - 可选参数columns，修改列的索引

In [38]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a',
'c'])

In [40]:
obj.reindex(['a', 'b', 'c', 'd', 'e'])

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [43]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2,
4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [44]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [47]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas','California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [48]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


<img src='images/5_3.png'>

##### 丢弃指定轴上的项
- df.drop()
    - 传入可为str / list
    - 默认axis=0 即为行，列则为1或者columns
    - 默认inplace=False 返回的是一个在指定轴上删 除了指定值的新对象，True则直接替换并不返回

In [66]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd',
'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [67]:
new_obj = obj.drop('c')

In [68]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [69]:
new_obj2 = obj.drop('c', inplace=True)

In [70]:
new_obj2, obj

(None, a    0.0
 b    1.0
 d    3.0
 e    4.0
 dtype: float64)

In [71]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'NewYork'],                     
                    columns=['one', 'two', 'three','four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
NewYork,12,13,14,15


In [72]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
NewYork,12,13,14,15


In [73]:
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
NewYork,12,14


In [74]:
obj.drop('e', inplace=True)
obj

a    0.0
b    1.0
d    3.0
dtype: float64

##### 索引、选取和过滤
- Series
    - 索引值
        - 整数【1】
        - 字符【’aa‘】
        - 切片（整数、字符）【1:2】、【’a‘:'b'】
        - 列表（整数、字符）【【’a‘,'b'】】,【【1，2，3】】
        - 布尔型【obj>3】
- DataFrame
    - 索引值
        - 字符【’aa‘】
        - 字符列表【【’a‘,'b'】】，返回DataFrame
        - 切片【:2】
        - 布尔型【df['threee']>4】

In [75]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [76]:
obj['b']

1.0

In [77]:
obj[1]

1.0

In [78]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [79]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [80]:
obj[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [81]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

In [82]:
obj['b':'d']

b    1.0
c    2.0
d    3.0
dtype: float64

In [84]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'NewYork'],
                    columns=['one', 'two', 'three','four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
NewYork,12,13,14,15


In [85]:
data['two']

Ohio         1
Colorado     5
Utah         9
NewYork     13
Name: two, dtype: int64

In [86]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
NewYork,14,12


In [87]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [88]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
NewYork,12,13,14,15


##### 用loc和iloc进行选取
- loc用名称，而iloc用索引值
- 参数【行,列】
- 可选取一行/列或者多行/列
    - 行列同时选取多行则放回DataFrame
- 切片

In [89]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [90]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

In [91]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [92]:
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


In [93]:
data.loc[:'Utah', 'two']

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int64

In [94]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
NewYork,12,13,14


<img src='images/5_4.png'>

##### 整数索引
- 索引含有整数，应使用标签loc / iloc

In [108]:
ser = pd.Series(np.arange(3.))
ser
# ser[-1] 报错

0    0.0
1    1.0
2    2.0
dtype: float64

In [103]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser2[-1]

2.0

In [105]:
ser[:1]

0    0.0
dtype: float64

In [106]:
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [107]:
ser.iloc[:1]

0    0.0
dtype: float64

##### 算术运算和数据对齐
- 缺省列或者行会以Nan填充

In [109]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd','e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],index=['a', 'c', 'e', 'f', 'g'])

In [110]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [112]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),index=['Utah', 'Ohio', 'Texas','Oregon'])

In [113]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [114]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})

In [115]:
df1 - df2

Unnamed: 0,A,B
0,,
1,,


##### 在算术方法中填充值
- add
    - df1.add(df2, fill_value=0)

<img src='images/5_5.png'>

In [116]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))

In [118]:
df2.loc[1, 'b'] = np.nan

In [119]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [120]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [121]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [122]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [123]:
1 / df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [124]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


##### DataFrame 和 Series 之间的运算
- 将 Series 的索引匹配到 DataFrame 的列，然后沿着行一直向下广播
- 索引中找不到，则参与运算的两 个对象就会被重新索引以形成并集
- 匹配行且在列上广播，则必须使用算术运算方法

In [126]:
arr = np.arange(12.).reshape((3, 4))
arr

array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [127]:
arr[0]

array([ 0.,  1.,  2.,  3.])

In [128]:
arr - arr[0]

array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

In [130]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                      columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas','Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [131]:
series = frame.iloc[0]
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [132]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [134]:
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [135]:
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [136]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [137]:
frame.sub(series3, axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


##### 函数应用和映射
- 函数应用，
    - 内置如sum,count,abs
    - 自行编写并使用apply方法应用
        - 默认在行上传播，设置axis='colunms'可在列上传播
        - 传递到 apply 的函数不是必须返回一个标量，还可以返回由多个值组成的 Series
    - applymap函数

In [139]:
frame = pd.DataFrame(np.random.randn(4, 3),columns=list('bde'), index=['Utah', 'Ohio', 'Texas','Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-1.336961,-2.554868,-0.853661
Ohio,1.584207,0.574794,1.019382
Texas,0.967176,-0.62725,1.320861
Oregon,0.956363,-0.130789,-0.606408


In [140]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.336961,2.554868,0.853661
Ohio,1.584207,0.574794,1.019382
Texas,0.967176,0.62725,1.320861
Oregon,0.956363,0.130789,0.606408


In [141]:
f = lambda x: x.max() - x.min()

In [142]:
frame.apply(f)

b    2.921169
d    3.129663
e    2.174523
dtype: float64

In [143]:
frame.apply(f, axis='columns')

Utah      1.701207
Ohio      1.009413
Texas     1.948111
Oregon    1.562771
dtype: float64

In [144]:
def f(x):
  return pd.Series([x.min(), x.max()], index=['min','max'])

In [145]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.336961,-2.554868,-0.853661
max,1.584207,0.574794,1.320861


In [146]:
frame.apply(f, axis='columns')

Unnamed: 0,min,max
Utah,-2.554868,-0.853661
Ohio,0.574794,1.584207
Texas,-0.62725,1.320861
Oregon,-0.606408,0.956363


In [152]:
format = lambda x: '%.2f' % x

In [153]:
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-1.34,-2.55,-0.85
Ohio,1.58,0.57,1.02
Texas,0.97,-0.63,1.32
Oregon,0.96,-0.13,-0.61


In [154]:
frame['e'].map(format)

Utah      -0.85
Ohio       1.02
Texas      1.32
Oregon    -0.61
Name: e, dtype: object

##### 排序和排名
- 排序
    - sort_index()
        - ascending=False默认升序
        - axis=0 默认排序行
    - sort_values()
        - 必要参数可为单个列名或者是由多个列组成的列表
            - Series 直接使用
            - DateFame 作为参数by
        - 缺失值默认都会被放到末尾
- 排名（有点看不懂）
    - rank函数
        - 参数method
        - 参数ascending=False
        - 参数axis='columns' 在行或者列上

<img src='images/5_6.png'>

In [157]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [159]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [160]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [161]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [162]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [163]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [164]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [165]:
frame.sort_values(by='b')

Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [166]:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [167]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [168]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [169]:
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [170]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


In [171]:
frame.rank(axis='columns')

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


##### 带有重复标签的轴索引
- 索引的 is_unique 属性可以告诉你它的值是否是唯一的
- 如果某个索引对应多个值，则返回一个 Series;而对应单个值的，则返回一个标量值

In [174]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [175]:
obj.index.is_unique

False

In [177]:
obj['a']

a    0
a    1
dtype: int64

In [178]:
obj['c']

4

In [180]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df

Unnamed: 0,0,1,2
a,0.629014,0.288453,-1.356049
a,0.49731,0.142522,-0.188853
b,-2.148222,-1.677703,0.780728
b,-2.59645,-1.090348,-0.001783


In [181]:
df.loc['b']

Unnamed: 0,0,1,2
b,-2.148222,-1.677703,0.780728
b,-2.59645,-1.090348,-0.001783


#### 5.3 汇总和计算描述统计

##### 简单统计
- sum函数
    - 参数axix
- mean函数
    - 参数axis='columns'
    - 参数skipna=False 是否忽略NA值
- idmax/idmin函数
    - 最小值或最大值的索引
- cumsum()函数
    - idmax/idmin返回的是间接统计,cumsum返回累计统计
- describe函数
    - 一次性产生多个汇总统计

<img src='images/5_7.png'>

<img src='images/5_8.png'>

In [182]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [183]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [184]:
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [185]:
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [186]:
df.idxmax()

one    b
two    d
dtype: object

In [187]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [188]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [189]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

##### 相关系数与协方差
- corr
- cov
- corrwith
    - 计算其列或行跟另一个Series 或 DataFrame 之间的相关系数
    - 传入一个 Series 将会返回一个相关系数值 Series (针对各列进行计算)

In [193]:
import pandas_datareader.data as web

In [194]:
all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

In [196]:
price = pd.DataFrame({ticker: data['Adj Close']
                     for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker: data['Volume']
                    for ticker, data in all_data.items()})

In [197]:
returns = price.pct_change()

In [198]:
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-02-25,0.007284,-0.000874,0.001508,0.005587
2019-02-26,0.000574,0.005165,0.001864,0.0069
2019-02-27,0.003098,0.000825,-0.003936,-0.001691
2019-02-28,-0.009836,0.003468,-0.007473,-0.001248
2019-03-01,0.010511,0.018814,0.007746,0.004463


In [199]:
returns['MSFT'].corr(returns['IBM'])

0.48668310978790741

In [200]:
returns['MSFT'].cov(returns['IBM'])

8.7347189057950152e-05

In [201]:
returns.MSFT.corr(returns.IBM)

0.48668310978790741

In [203]:
returns.corr(),returns.cov()

(          AAPL      GOOG       IBM      MSFT
 AAPL  1.000000  0.457481  0.372080  0.450247
 GOOG  0.457481  1.000000  0.408324  0.537512
 IBM   0.372080  0.408324  1.000000  0.486683
 MSFT  0.450247  0.537512  0.486683  1.000000,
           AAPL      GOOG       IBM      MSFT
 AAPL  0.000271  0.000116  0.000076  0.000108
 GOOG  0.000116  0.000238  0.000078  0.000121
 IBM   0.000076  0.000078  0.000153  0.000087
 MSFT  0.000108  0.000121  0.000087  0.000211)

In [204]:
returns.corrwith(returns.IBM)

AAPL    0.372080
GOOG    0.408324
IBM     1.000000
MSFT    0.486683
dtype: float64

In [205]:
returns.corrwith(volume)

AAPL   -0.059732
GOOG   -0.017377
IBM    -0.153277
MSFT   -0.089666
dtype: float64

##### 唯一值、值计数以及成员资格
- unique，它可以得到 Series 中的唯一值数组
- value_counts 用于计算一个 Series 中各值出现的频率
- isin 用于判断矢量化集合的成员资格，可用于过滤 Series 中或 DataFrame 列中数据的子集
- Index.get_indexer 给你一个索引数组，从可能包含重复值的数组到另一个不同值的数组

In [206]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [207]:
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [208]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [209]:
pd.value_counts(obj.values, sort=False)

a    3
c    3
b    2
d    1
dtype: int64

In [210]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [213]:
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [214]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [215]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2])

In [216]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [217]:
result = data.apply(pd.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
