In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Download the IMDB ratings sample table into a DataFrame. The source is a
# remote URL, so this cell needs network access to run.
data = pd.read_csv('http://bit.ly/imdbratings')

In [3]:
data.head(10)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
6,8.9,"The Good, the Bad and the Ugly",NOT RATED,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."
7,8.9,The Lord of the Rings: The Return of the King,PG-13,Adventure,201,"[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK..."
8,8.9,Schindler's List,R,Biography,195,"[u'Liam Neeson', u'Ralph Fiennes', u'Ben Kings..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."


# 数据结构
## Series

一维数组型对象,包含数据标签,称为索引(index),类似固定有限长的字典  
默认索引: 0 - N-1  
获取Series对象值和索引: obj.values, obj.index

### 创建

In [4]:
# With no index given, the Series is labelled with the default integers 0..N-1.
obj = pd.Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [6]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [25]:
# index= attaches one explicit label to each value, position by position.
obj2 = pd.Series(data=[4, 7, -5, 3], index=list('dbac'))
obj2

d    4
b    7
a   -5
c    3
dtype: int64

使用字典生成Series

In [27]:
sdata = {'宝马':18, '奔驰':19, '奥迪':16}

In [29]:
obj3 = pd.Series(sdata)

In [31]:
obj3

宝马    18
奔驰    19
奥迪    16
dtype: int64

### 索引

In [17]:
obj2['a']

-5

In [32]:
obj2[obj2>0]

d    4
b    7
c    3
dtype: int64

In [33]:
states = ['宝马','奔驰','保时捷']

In [36]:
obj4 = pd.Series(sdata, index=states)
obj4

宝马     18.0
奔驰     19.0
保时捷     NaN
dtype: float64

### 检查缺失

In [37]:
pd.isnull(obj4)

宝马     False
奔驰     False
保时捷     True
dtype: bool

In [43]:
pd.notnull(obj4)

宝马      True
奔驰      True
保时捷    False
dtype: bool

### 自动对齐索引

In [44]:
obj3

宝马    18
奔驰    19
奥迪    16
dtype: int64

In [45]:
obj4

宝马     18.0
奔驰     19.0
保时捷     NaN
dtype: float64

In [46]:
obj3 +obj4

保时捷     NaN
奔驰     38.0
奥迪      NaN
宝马     36.0
dtype: float64

### name属性

In [53]:
obj4.name = 'value'
obj4.index.name = 'car'
obj4 

car
宝马     18.0
奔驰     19.0
保时捷     NaN
Name: value, dtype: float64

### index按位置赋值进行改变

In [55]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [59]:
obj.index = ['a', 'b', 'c', 'd']
obj

a    4
b    7
c   -5
d    3
dtype: int64

### 运算

In [20]:
obj*2

0     8
1    14
2   -10
3     6
dtype: int64

In [21]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [22]:
'b' in obj2

True

In [24]:
'e' in obj2

False

### [ ]测试_标签对obj2索引a,c

In [14]:
# 注意此处['a','c']要加[],是字符串作为索引列表,用标签进行索引
obj2[['a','c']]

a   -5
c    3
dtype: int64

## DataFrame
表示矩阵的数据表,包含排序的列集合,每一列可以是不同的值类型.  
其中数据被存储为一个以上的二维块.

### [ ]构建

利用包含等长度列表或numpy数组的**字典**来形成  
转置 .T

注意:  
①字典的值必须是等长的列表或数组  ②各键值对之间要用逗号(,)分隔

In [69]:
# Each dict key becomes a column; all value lists have the same length.
# The 'value' and 'number' entries are strings here, not numbers.
data = dict(
    car=['宝马', '奔驰', '奥迪'],
    value=['17', '18', '16'],
    number=['1', '2', '3'],
)
frame = pd.DataFrame(data=data)
frame

Unnamed: 0,car,value,number
0,宝马,17,1
1,奔驰,18,2
2,奥迪,16,3


In [81]:
# columns= fixes the column order regardless of the dict's key order;
# index= supplies row labels in place of the default 0..N-1.
data = dict(
    car=['benchi', 'baoma', 'aodi'],
    number=[1, 2, 3],
    value=[17, 18, 16],
)
frame = pd.DataFrame(
    data,
    index=['one', 'two', 'three'],
    columns=['car', 'value', 'number'],
)
frame

Unnamed: 0,car,value,number
one,benchi,17,1
two,baoma,18,2
three,aodi,16,3


### 检索
可以按字典标记或属性检索为Series

In [82]:
frame['value']  # (列向)

one      17
two      18
three    16
Name: value, dtype: int64

In [83]:
frame.car

one      benchi
two       baoma
three      aodi
Name: car, dtype: object

通过位置或特殊属性loc进行索引

In [86]:
frame.loc['two']  # 按index进行索引(横向)

car       baoma
value        18
number        2
Name: two, dtype: object

### 更改列引用
要求列长度与赋值长度匹配, 或加入index参数  
如果赋值列不存在,将生成新列  
del方法移除已有列  

In [99]:
frame['number'] = [2,3,4]
frame

Unnamed: 0,car,value,number
one,benchi,17,2
two,baoma,18,3
three,aodi,16,4


In [100]:
del frame['number']
frame

Unnamed: 0,car,value
one,benchi,17
two,baoma,18
three,aodi,16


In [101]:
frame.columns

Index(['car', 'value'], dtype='object')

### 嵌套字典
传给DataFrame时会将外层字典的键作为列索引, 内部字典的键作为行索引

In [102]:
# Nested dict: each outer key (a car name) maps to an inner dict keyed by 19 and 20.
pop = dict(
    benchi={19: 2, 20: 1},
    baoma={19: 3, 20: 2},
    aodi={19: 4, 20: 3},
)

In [192]:
frame2 = pd.DataFrame(pop)
frame2

Unnamed: 0,benchi,baoma,aodi
19,2,3,4
20,1,2,3


### 自动对齐索引
* 默认缺失值方法
* 使用填充值的算术方法:df1.add(df2, fill_value=0)

In [206]:
# 3x3 frame filled row by row with 0..8, labelled on both axes.
df1 = pd.DataFrame(
    data=np.arange(9).reshape((3, 3)),
    index=['one', 'two', 'three'],
    columns=['car', 'value', 'number'],
)
df1

Unnamed: 0,car,value,number
one,0,1,2
two,3,4,5
three,6,7,8


In [203]:
# Same values and columns as a 0..8 3x3 grid, but the last row label is 'four'.
df2 = pd.DataFrame(
    data=np.arange(9).reshape((3, 3)),
    index=['one', 'two', 'four'],
    columns=['car', 'value', 'number'],
)
df2

Unnamed: 0,car,value,number
one,0,1,2
two,3,4,5
four,6,7,8


In [205]:
df1+df2

Unnamed: 0,car,value,number
four,,,
one,0.0,2.0,4.0
three,,,
two,6.0,8.0,10.0


使用填充值的算术方法

In [207]:
df1.add(df2, fill_value=0)

Unnamed: 0,car,value,number
four,6.0,7.0,8.0
one,0.0,2.0,4.0
three,6.0,7.0,8.0
two,6.0,8.0,10.0


### name属性

In [114]:
frame2.index.name = 'year'; frame2.columns.name = 'car'
frame2

car,benchi,baoma,aodi
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
19,2,3,4
20,1,2,3


### values属性

In [116]:
frame2.values

array([[2, 3, 4],
       [1, 2, 3]], dtype=int64)

In [118]:
frame.values

array([['benchi', 17],
       ['baoma', 18],
       ['aodi', 16]], dtype=object)

### DataFrame构造函数的有效输入
* 2D ndarray
* 各种字典 
* 各种列表

## 索引对象
### 不变性
* 索引对象不可变,不能修改  ( index[1]=1 错的)
* 使多种数据结构中分享索引对象更安全

In [123]:
# Keep a reference to the Series' Index object so it can be inspected on its own.
obj = pd.Series(data=range(3), index=list('abc'))
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [124]:
index[1:]

Index(['b', 'c'], dtype='object')

In [125]:
labels = pd.Index(np.arange(3))
labels 

Int64Index([0, 1, 2], dtype='int64')

In [127]:
obj2 = pd.Series([1.5,-2.5,0], index=labels)
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [129]:
obj2.index is labels

True

In [132]:
frame2

car,benchi,baoma,aodi
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
19,2,3,4
20,1,2,3


In [134]:
frame2.columns

Index(['benchi', 'baoma', 'aodi'], dtype='object', name='car')

In [137]:
'baoma' in frame2.columns

True

In [139]:
dup_labels = pd.Index(['foo','foo','foo2'])
dup_labels

Index(['foo', 'foo', 'foo2'], dtype='object')

### 索引对象的方法和属性

方法 | 描述
:-: | :-:
append | 将额外的索引对象粘贴到原索引后,生成新索引
difference | 计算两个索引的差集
intersection | 计算两个索引的交集
union | 计算两个索引的并集
isin | 计算表示每个值是否在传值容器中的布尔数组
delete | 将位置i处的元素删除,生成新索引
drop | 根据传参删除指定索引值,生成新索引
insert | 在位置i处插入元素,生成新索引
is_monotonic | 如果索引序列递增则返回True
is_unique | 如果索引序列唯一则返回True
unique | 计算索引的唯一值序列

# 基本功能

## 重建索引

.reindex方法的参数  
index, method, fill_value, limit, tolerance, level,copy

In [144]:
# Float Series with string labels in a deliberately non-alphabetical order.
obj = pd.Series(data=[4.5, 7.2, -5.3, 3.6], index=list('dbac'))
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [147]:
obj2 = obj.reindex(['d','b','a','c','e'])
obj2


d    4.5
b    7.2
a   -5.3
c    3.6
e    NaN
dtype: float64

顺序序列: ffill方法

In [149]:
# Only the even labels 0, 2, 4, 6 are present, leaving gaps at the odd positions.
obj3 = pd.Series(data=[4.5, 7.2, -5.3, 3.6], index=list(range(0, 8, 2)))
obj3

0    4.5
2    7.2
4   -5.3
6    3.6
dtype: float64

In [150]:
obj3.reindex(range(6),method='ffill')

0    4.5
1    4.5
2    7.2
3    7.2
4   -5.3
5   -5.3
dtype: float64

## 轴向上删除条目
data.drop()

In [155]:
# Rebuild the small three-row frame with the default integer row labels 0, 1, 2.
data = dict(
    car=['宝马', '奔驰', '奥迪'],
    value=['17', '18', '16'],
    number=['1', '2', '3'],
)
frame = pd.DataFrame(data=data)
frame

Unnamed: 0,car,value,number
0,宝马,17,1
1,奔驰,18,2
2,奥迪,16,3


In [154]:
frame.drop(1)

Unnamed: 0,car,value,number
0,宝马,17,1
2,奥迪,16,3


## 索引,选择与过滤

**DataFrame 索引选项**

类型|描述 
 ----  | ----  
df[val] |选择单列或多列;特殊情况:布尔数组(过滤行)、切片(按行切片)或布尔DataFrame(按条件设值)
df.loc[val]|根据标签选择单行或多行
df.loc[:,val]|根据标签选择单列或多列
df.loc[val1,val2]|根据标签同时选择行和列的一部分
df.iloc[where]|根据整数位置选择单行或多行
df.iloc[:,where]|根据整数位置选择单列或多列
df.iloc[where_i,where_j]|根据整数位置同时选择行和列的一部分
df.at[label_i,label_j]|根据行列标签选择,单个标量值
df.iat[i,j]|根据行列整数位置选择,单个标量值
reindex方法|通过标签选择行或列
get_value,set_value方法|根据行和列的标签获取或设置单个值(新版pandas中已移除,请改用at/iat)



In [157]:
obj = frame['value']
obj

0    17
1    18
2    16
Name: value, dtype: object

|  表头   | 表头  |
|  ----  | ----  |
| 单元格  | 单元格 |
| 单元格  | 单元格 |

In [158]:
obj[0:2]

0    17
1    18
Name: value, dtype: object

In [160]:
obj = pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [162]:
obj[0:2] = 2

In [163]:
obj

0    2
1    2
2   -5
3    3
dtype: int64

In [175]:
# 4x4 frame holding 0..15 row by row; both axes get default integer labels.
data = pd.DataFrame(data=np.arange(16).reshape((4, 4)))
data

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


索引

In [168]:
data[1]

0     1
1     5
2     9
3    13
Name: 1, dtype: int32

In [176]:
data[:2]

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7


选择

In [177]:
data > 5

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,False,True,True
2,True,True,True,True
3,True,True,True,True


In [178]:
data[data < 5] 

Unnamed: 0,0,1,2,3
0,0.0,1.0,2.0,3.0
1,4.0,,,
2,,,,
3,,,,


过滤

In [180]:
data[data < 5] = 0
data

Unnamed: 0,0,1,2,3
0,0,0,0,0
1,0,5,6,7
2,8,9,10,11
3,12,13,14,15


使用loc和iloc选择数据  (标签索引,单行多列) 
loc:按轴标签选择  iloc:按整数位置选择

In [185]:
data.loc[1,[1,3]]

1    5
3    7
Name: 1, dtype: int32

In [187]:
data.iloc[1,[1,3]]

1    5
3    7
Name: 1, dtype: int32

In [191]:
data.loc[1]

0    0
1    5
2    6
3    7
Name: 1, dtype: int32

## DataFrame 和 Series 间的操作
广播机制: 计算 arr - arr[0] 时,数组的每一行都减去了第一行 arr[0]  

In [209]:
arr = np.arange(12).reshape((3,4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [211]:
arr[0]

array([0, 1, 2, 3])

In [213]:
arr - arr[0]

array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])

In [215]:
# 4x3 frame of 0..11; `series` is its first row, labelled by the frame's columns.
frame = pd.DataFrame(
    data=np.arange(12).reshape((4, 3)),
    index=['b', 'd', 'c', 'a'],
    columns=['b', 'd', 'e'],
)
series = frame.iloc[0]
frame

Unnamed: 0,b,d,e
b,0,1,2
d,3,4,5
c,6,7,8
a,9,10,11


In [216]:
series

b    0
d    1
e    2
Name: b, dtype: int32

In [217]:
frame - series

Unnamed: 0,b,d,e
b,0,0,0
d,3,3,3
c,6,6,6
a,9,9,9


## 函数应用和映射

1. numpy的通用函数(逐元素数组方法)对pandas也有用
2. 大部分常用数组统计方法(sum, mean等)
3. DataFrame的apply方法:  
    frame.apply(f): 函数f会对frame中的每一列调用一次,结果是一个以frame的列名作为索引的Series  
    frame.apply(f, axis='columns'): 每一行调用一次
4. Series使用逐元素的Python函数 -- map方法
5. frame使用逐元素的Python函数 -- **applymap方法**


In [218]:
frame

Unnamed: 0,b,d,e
b,0,1,2
d,3,4,5
c,6,7,8
a,9,10,11


In [219]:
np.exp(frame)

Unnamed: 0,b,d,e
b,1.0,2.718282,7.389056
d,20.085537,54.59815,148.413159
c,403.428793,1096.633158,2980.957987
a,8103.083928,22026.465795,59874.141715


In [220]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()
frame.apply(f)

b    9
d    9
e    9
dtype: int64

传递给apply的函数也可以返回带有多个值的Series

In [222]:
def f(x):
    """Summarise ``x`` as a two-element Series labelled 'min' and 'max'."""
    lowest = x.min()
    highest = x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,0,1,2
max,9,10,11


逐元素的Python函数也可以使用 

**applymap方法**

In [226]:
def formate(x):
    """Render the number ``x`` as a string with exactly two decimal places."""
    return '%.2f' % x
frame.applymap(formate)

Unnamed: 0,b,d,e
b,0.0,1.0,2.0
d,3.0,4.0,5.0
c,6.0,7.0,8.0
a,9.0,10.0,11.0


map方法

In [227]:
frame['e'].map(formate)

b     2.00
d     5.00
c     8.00
a    11.00
Name: e, dtype: object

## 排序和排名

sort_index方法: 返回一个新的、排序好的对象  
sort_values方法:根据Series的值排序,DataFrame时使用参数by  
排名中的平级关系打破方法:  


In [233]:
# Labels are out of alphabetical order so that sort_index has something to do.
obj = pd.Series(data=range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [241]:
# 2x4 frame of 0..7 whose row and column labels are deliberately unsorted.
# The column labels must be an actual sequence: list('dabc') -> ['d', 'a', 'b', 'c']
# (passing the bare builtin `list` is not a valid set of labels).
frame = pd.DataFrame(np.arange(8).reshape(2,4),
                    index=['three','one'],
                    columns=list('dabc'))
# sort_index returns a new frame ordered by row label; `frame` itself is unchanged.
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [242]:
frame.loc['three','b']=7
frame

Unnamed: 0,d,a,b,c
three,0,1,7,3
one,4,5,6,7


In [244]:
frame.sort_values(by='b')

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,7,3


# 描述性统计的概念与方法

Pandas 内建了处理缺失值的方法  
    归约方法:.idxmax, .idxmin, .sum 等,其可选参数:axis, skipna(排除缺失值), level(缩减分层等级)  
    累积型方法:df.cumsum()等,  
    其他方法:df.describe()(产生汇总统计),  

In [250]:
# Small float frame with missing values: row 'c' is entirely NaN and
# row 'a' is missing its 'two' entry.
df = pd.DataFrame(
    data=[
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [251]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [253]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [255]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


## 相关性协方差

## 唯一值,计数和成员属性