In [1]:
# 引入模块
# -*- coding:utf-8 -*-

# 常用包的函数
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from numpy.random import randn
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

# 解决显示汉字不正确问题
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

%matplotlib inline

pd.set_option('precision', 6) #设置精度
pd.set_option('display.float_format', lambda x: '%.2f' % x) 
pd.options.display.max_rows = 100
df1 = DataFrame({'key':['b','b','a','c','a','a','b'],'data1':range(7)})
df2 = DataFrame({'key':['a','b','d'],'data2':range(3)})

In [3]:
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [4]:
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,d


In [5]:
# 若没指明用哪个列进行连接，merege会将重叠列当做键
pd.merge(df1,df2)

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


In [6]:
pd.merge(df1,df2,on='key')

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


In [7]:
# 若两个对象的列名不同，则可以分别进行指定
df3 = DataFrame({'lkey':['b','b','a','c','a','a','b'],'data1':range(7)})
df4 = DataFrame({'rkey':['a','b','d'],'data2':range(3)})
pd.merge(df3,df4,left_on='lkey',right_on='rkey')

Unnamed: 0,data1,lkey,data2,rkey
0,0,b,1,b
1,1,b,1,b
2,6,b,1,b
3,2,a,0,a
4,4,a,0,a
5,5,a,0,a


In [8]:
# 默认情况下，上述做的事“inner”连接，结果中的键为交集
# 其他方式还有“left”、“right”和“outer”
# 外连接求取的是键的并集，组合了左连接和右连接的效果
pd.merge(df1,df2,how='outer')

Unnamed: 0,data1,key,data2
0,0.0,b,1.0
1,1.0,b,1.0
2,6.0,b,1.0
3,2.0,a,0.0
4,4.0,a,0.0
5,5.0,a,0.0
6,3.0,c,
7,,d,2.0


In [9]:
df1 = DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
df2 = DataFrame({'key':['a','b','a','b','d'],'data1':range(5)})
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [10]:
df2

Unnamed: 0,data1,key
0,0,a
1,1,b
2,2,a
3,3,b
4,4,d


In [11]:
pd.merge(df1,df2,how='left',on='key')
# 多对多连接产生的是行的笛卡尔积。

Unnamed: 0,data1_x,key,data1_y
0,0,b,1.0
1,0,b,3.0
2,1,b,1.0
3,1,b,3.0
4,2,a,0.0
5,2,a,2.0
6,3,c,
7,4,a,0.0
8,4,a,2.0
9,5,b,1.0


In [12]:
left = DataFrame({'key1':['foo','foo','bar'],'key2':['one','two','one'],'lval':[1,2,3]})
left

Unnamed: 0,key1,key2,lval
0,foo,one,1
1,foo,two,2
2,bar,one,3


In [13]:
right = DataFrame({'key1':['foo','foo','bar','bar'],'key2':['one','one','one','two'],'rval':[4,5,6,7]})
right

Unnamed: 0,key1,key2,rval
0,foo,one,4
1,foo,one,5
2,bar,one,6
3,bar,two,7


In [14]:
pd.merge(left,right,on=['key1','key2'],how='outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [18]:
# merge有一个更实用的suffixes选项，用于附加到两个DataFrame对象的重叠的列名上
pd.merge(left,right,on='key1',suffixes=('_left','_right'),sort='False')

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,bar,one,3,one,6
1,bar,one,3,two,7
2,foo,one,1,one,4
3,foo,one,1,one,5
4,foo,two,2,one,4
5,foo,two,2,one,5


In [27]:
left1 = DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})
right1 = DataFrame({'group_val':[3.5,7]},index=['a','b'])
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [24]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [28]:
pd.merge(left1,right1,left_on='key',right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [30]:
pd.merge(left1,right1,left_on='key',right_index=True,how='outer')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


In [33]:
aa = np.arange(12).reshape((6,2))
aa

array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11]])

In [35]:
left = DataFrame({'key1':['Ohio','Ohio','Ohio','Nevada','Nevada'],'key2':[2000,2001,2002,2001,2002],'data':np.arange(5)})
left

Unnamed: 0,data,key1,key2
0,0,Ohio,2000
1,1,Ohio,2001
2,2,Ohio,2002
3,3,Nevada,2001
4,4,Nevada,2002


In [37]:
right = DataFrame(np.arange(12).reshape((6,2)),index=[['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],[2000,2000,2000,2000,2001,2002]],columns=['event1','event2'])
right

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2000,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [39]:
pd.merge(left,right,left_on=['key1','key2'],right_index=True)

Unnamed: 0,data,key1,key2,event1,event2
0,0,Ohio,2000,4,5
0,0,Ohio,2000,6,7
1,1,Ohio,2001,8,9
2,2,Ohio,2002,10,11


In [41]:
# 使用合并双方的索引
left2 = DataFrame([[1,2],[3,4],[5,6]],index=['a','c','e'],columns=['Ohio','Nevada'])
left2

Unnamed: 0,Ohio,Nevada
a,1,2
c,3,4
e,5,6


In [43]:
right2 = DataFrame([[7,8],[9,10],[11,12],[13,14]],index=['b','c','d','e'],columns=['Missouri','Alabama'])
right2

Unnamed: 0,Missouri,Alabama
b,7,8
c,9,10
d,11,12
e,13,14


In [45]:
pd.merge(left2,right2,how='outer',left_index=True,right_index=True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [49]:
# DataFrame提供join方法，该方法能够方便地实现按索引合并
# 还能用于合并多个带有相同或相似缩影的DataFrame对象，不管他们之间有没有重叠的列
left2.join(right2,how='outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [50]:
# 还支持参数DataFrame的索引跟调用者DataFrame的某个列之间的连接
left1.join(right1,on='key')

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


In [55]:
# 可以向join传入一组DataFrame
another = DataFrame([[7,8],[9,10],[11,12],[16,17]],index=['a','c','e','f'],columns=['New York','Oregon'])
left2.join([right2,another])

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1,2,,,7,8
c,3,4,9.0,10.0,9,10
e,5,6,13.0,14.0,11,12


In [57]:
left2.join([right2,another],how='outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
b,,,7.0,8.0,,
c,3.0,4.0,9.0,10.0,9.0,10.0
d,,,11.0,12.0,,
e,5.0,6.0,13.0,14.0,11.0,12.0
f,,,,,16.0,17.0


In [59]:
# 另一种数据合并运算也被成为连接concatenation, 绑定binding, 或者堆叠stacking
# Numpy有一个用于合并原始Numpy数组的concatenation的函数
arr = np.arange(12).reshape(3,4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [61]:
np.concatenate([arr,arr],axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [65]:
# 对于panda对象，需要考虑如下问题：
# 1. 如果各对象其他轴上的索引不同，那些轴是应该做并集还是交集？
# 2. 结果对象中的分组需要各不相同？
# 3. 用于连接的轴重要么
# pandas的concat函数提供了一种能够解决这些问题的可靠方式
s1 = Series([0,1],index=['a','b'])
s2 = Series([2,3,4],index=['c','d','e'])
s3 = Series([5,6],index=['f','g'])
pd.concat([s1,s2,s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [67]:
# concat默认是在axis=0上产生一个新的Series，如果传入的axis=1,则变成一个DataFrame
pd.concat([s1,s2,s3],axis=1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [70]:
# 在这种情况下，另外一条轴上没有重叠，从索引的有序并集上就可以看出来
# 若传入join='inner'即可得到它们的交集
s4 = pd.concat([s1 * 5, s3])
s4

a    0
b    5
f    5
g    6
dtype: int64

In [72]:
pd.concat([s1,s4],axis=1)

Unnamed: 0,0,1
a,0.0,0
b,1.0,5
f,,5
g,,6


In [74]:
pd.concat([s1,s4],axis=1,join='inner')

Unnamed: 0,0,1
a,0,0
b,1,5


In [76]:
# 可以通过join_axes指定要在其他轴上使用的索引
pd.concat([s1,s4],axis=1,join_axes=[['a','c','b','e']])

Unnamed: 0,0,1
a,0.0,0.0
c,,
b,1.0,5.0
e,,


In [78]:
# 在连接轴上创建一个层次化索引
result = pd.concat([s1,s1,s3],keys=['one','two','three'])
result

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

In [80]:
result.unstack()

Unnamed: 0,a,b,f,g
one,0.0,1.0,,
two,0.0,1.0,,
three,,,5.0,6.0


In [82]:
# 若沿着axis=1对Series进行合并，则keys就会成为DataFrame的列头
pd.concat([s1,s2,s3],axis=1,keys=['one','two','three'])

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [85]:
# DataFrame逻辑
df1 = DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],columns=['one','two'])
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [87]:
df2 = DataFrame(5 + np.arange(4).reshape(2,2),index=['a','c'],columns=['three','four'])
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [89]:
pd.concat([df1,df2],axis=1,keys=['level1','level2'])

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [94]:
# 若传入参数是一个字典，则字典的键会被当做keys选项的值
pd.concat({'level1':df1,'level2':df2},axis=1)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [97]:
# 创建管理层次化索引
pd.concat([df1,df2],axis=1,keys=['level1','level2'],names=['upper','lower'])

upper,level1,level1,level2,level2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [100]:
df1 = DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
df1

Unnamed: 0,a,b,c,d
0,0.45,-2.73,1.75,0.48
1,-1.51,0.02,0.41,1.77
2,-0.51,0.22,-0.91,0.11


df2 = DataFrame(np.random.randn(2,3),columns=['b','d','a'])
df2

In [110]:
pd.concat([df1,df2],ignore_index=True)

Unnamed: 0,a,b,c,d
0,0.45,-2.73,1.75,0.48
1,-1.51,0.02,0.41,1.77
2,-0.51,0.22,-0.91,0.11
3,-1.41,-1.82,,-0.29
4,-0.54,-1.29,,0.29


In [112]:
## 合并重叠数据
# 我们使用numpy的where函数，用于表达一种矢量化的if-else
a = Series([np.nan,2.5,np.nan,3.5,4.5,np.nan],index=['f','e','d','c','b','a'])
a

f    nan
e   2.50
d    nan
c   3.50
b   4.50
a    nan
dtype: float64

In [114]:
b = Series(np.arange(len(a),dtype=np.float64),index=['f','e','d','c','b','a'])

b

f   0.00
e   1.00
d   2.00
c   3.00
b   4.00
a   5.00
dtype: float64

In [133]:
np.where(pd.isnull(a),b,a)

array([ 0. ,  2.5,  2. ,  3.5,  4.5,  nan])

In [125]:
# 层次化的DataFrame数据的重排任务提供了一种具有良好一致性的方式
# stack: 将数据的列旋转为行
# unstack: 将数据的行旋转为列
data = DataFrame(np.arange(6).reshape(2,3),index=pd.Index(['Ohio','Colorado'],name='state'),columns=pd.Index(['one','two','three'],name='number'))
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [128]:
result = data.stack()
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [130]:
result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [134]:
# 默认情况下，unstack操作的是最内层。传入分层级别的编号或者名称可对其他级别进行操作
result.unstack(0)
result.unstack('state')

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [136]:
# 数据转换
# 利用函数或映射进行数据转换
data = DataFrame({'food':['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon','pastrami','honey ham','nova lox'],'ounces':[4,3,12,6,7.5,8,3,5,6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [143]:
meat_to_animal = {
    'bacon':'pig',
    'pulled pork':'pig',
    'pastrami':'cow',
    'corned beef':'cow',
    'honey ham':'pig',
    'nova lox':'salmon'
}
# Seris的map方法可以接受一个函数或者含有银蛇关系的字典型对象
# map这里有点类似反射？
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [145]:
# 我们可以传入一个能够完成全部这些工作的函数
# 使用map是一种实现元素级转换以及其他数据清理工作的便捷方式
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [148]:
# 利用fillna方法填充缺失数据可以看做值替换的一种特殊情况
# replace提供了一种实现该功能的更简单、更灵活的方式
data = Series([1,-999,2,-999,-1000,3])
data

0       1
1    -999
2       2
3    -999
4   -1000
5       3
dtype: int64

In [150]:
data.replace(-999,np.nan)

0       1.00
1        nan
2       2.00
3        nan
4   -1000.00
5       3.00
dtype: float64

In [152]:
data.replace([-999,-1000],np.nan)

0   1.00
1    nan
2   2.00
3    nan
4    nan
5   3.00
dtype: float64

In [153]:
# 若希望对不同的值进行不同的替换，则传入一个由替换关系组成的列表
data.replace([-999,-1000],[np.nan,0])

0   1.00
1    nan
2   2.00
3    nan
4   0.00
5   3.00
dtype: float64

In [154]:
# 传入的参数可以使字典
data.replace({-999:np.nan,-1000:0})

0   1.00
1    nan
2   2.00
3    nan
4   0.00
5   3.00
dtype: float64

In [155]:
# 重名明轴索引
data = DataFrame(np.arange(12).reshape((3,4)),index=['Ohio','Colorado','New York'],columns=['one','two','three','four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [156]:
data.index.map(str.upper)

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

In [158]:
data.index = data.index.map(str.upper)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [160]:
# 如果想要创建数据集的转换版（而非修改原始数据），可采用rename
data.rename(index=str.title,columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [161]:
# rename可以结合字典类型对象实现对部分轴标签的更新
data.rename(index={'OHIO':'INDIANA'},columns={'three':'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [162]:
# 若想修改某个数据集合，则需要传入inplace=True
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [164]:
data.rename(index={'OHIO':'INDIANA'},inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [165]:
## 离散化和面元bin划分
ages = [20,22,25,27,21,23,37,31,61,45,41,32]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins)
# cats包含了ages序列元素对应的桶
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [166]:
# pandas返回的是一个特殊的Categorical对象
# 可将其看成一组表示面元名称的字符串
# 表示不同分类名称的levels数组以及一个为年龄数据进行标号的labels属性
cats.labels

  after removing the cwd from sys.path.


array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [170]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [172]:
# 跟“区间”的数学符号一样，圆括号表示开端，方括号表示闭端（包括）。
# 哪边是闭端可以通过right=False进行修改
pd.cut(ages,bins,right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [174]:
# 可以设置面元名称，将labels选项设置为一个列表或数组即可
group_names = ['Youth','YouthAdult','MiddleAged','Senior']
pd.cut(ages,bins,labels=group_names)

[Youth, Youth, Youth, YouthAdult, Youth, ..., YouthAdult, Senior, MiddleAged, MiddleAged, YouthAdult]
Length: 12
Categories (4, object): [MiddleAged < Senior < Youth < YouthAdult]

In [177]:
# 如果向cut传入的是面元的数量而不是确切的面元边界，则它会根据数据的最小值和最大值计算登场面元
# 如下我们将一些均匀分布的数据切分成四组
data = randn(20)
pd.cut(data,4,precision=2)

[(-0.81, 0.33], (-0.81, 0.33], (-0.81, 0.33], (1.48, 2.63], (-1.96, -0.81], ..., (1.48, 2.63], (-0.81, 0.33], (0.33, 1.48], (0.33, 1.48], (0.33, 1.48]]
Length: 20
Categories (4, interval[float64]): [(-1.96, -0.81] < (-0.81, 0.33] < (0.33, 1.48] < (1.48, 2.63]]

In [178]:
# qcut是一个类似于cut的函数，它可以根据样本分位数对数据进行面元划分
# 根据数据的分布情况，cut可能无法使各个面元中含有相同数量的数据点，
# 而qcut由于使用的是样本分位数，因此可以得到大小基本相等的面元
data = randn(1000)
cats = pd.qcut(data,4)
cats

[(-3.828, -0.643], (-3.828, -0.643], (0.0405, 0.665], (0.0405, 0.665], (-0.643, 0.0405], ..., (-3.828, -0.643], (0.0405, 0.665], (0.665, 2.987], (0.665, 2.987], (-3.828, -0.643]]
Length: 1000
Categories (4, interval[float64]): [(-3.828, -0.643] < (-0.643, 0.0405] < (0.0405, 0.665] < (0.665, 2.987]]

In [179]:
pd.value_counts(cats)

(0.665, 2.987]      250
(0.0405, 0.665]     250
(-0.643, 0.0405]    250
(-3.828, -0.643]    250
dtype: int64

In [180]:
# 同cut一样，也可以设置自定义的分位数（0到1之间的数值，包含端点）
pd.qcut(data,[0,0.1,0.5,0.9,1])

[(-1.209, 0.0405], (-1.209, 0.0405], (0.0405, 1.262], (0.0405, 1.262], (-1.209, 0.0405], ..., (-1.209, 0.0405], (0.0405, 1.262], (0.0405, 1.262], (0.0405, 1.262], (-3.828, -1.209]]
Length: 1000
Categories (4, interval[float64]): [(-3.828, -1.209] < (-1.209, 0.0405] < (0.0405, 1.262] < (1.262, 2.987]]

In [196]:
# 检查和过滤异常值
# 异常值outlier的过滤或变化运算在很大程度上其实就是数组运算
np.random.seed(12345)
data = DataFrame(randn(1000,4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.07,0.07,0.03,-0.0
std,1.0,0.99,1.01,1.0
min,-3.43,-3.55,-3.18,-3.75
25%,-0.77,-0.59,-0.64,-0.64
50%,-0.12,0.1,0.0,-0.01
75%,0.62,0.78,0.68,0.65
max,3.37,2.65,3.26,3.93


In [197]:
# 某列中绝对值大于3的值
col = data[3]
col[np.abs(col) > 3]

97     3.93
305   -3.40
400   -3.75
Name: 3, dtype: float64

In [198]:
# 选出全部含有绝对值大于3的行
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
5,-0.54,0.48,3.25,-1.02
97,-0.77,0.55,0.11,3.93
102,-0.66,-0.57,3.18,0.96
305,-2.32,0.46,-0.03,-3.4
324,0.05,1.95,3.26,0.96
400,0.15,0.51,-0.2,-3.75
499,-0.29,-0.24,-3.06,1.92
523,-3.43,-0.3,-0.44,-0.87
586,0.28,1.18,-3.18,1.37
808,-0.36,-3.55,1.55,-2.19


In [202]:
# The sign function returns -1 if x < 0, 0 if x==0, 1 if x > 0. 
# nan is returned for nan inputs.
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe() 

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.07,0.07,0.03,-0.0
std,1.0,0.99,1.0,0.99
min,-3.0,-3.0,-3.0,-3.0
25%,-0.77,-0.59,-0.64,-0.64
50%,-0.12,0.1,0.0,-0.01
75%,0.62,0.78,0.68,0.65
max,3.0,2.65,3.0,3.0


In [204]:
# 排列和随机采样
# 利用numpy.random.permutation函数可以实现对Series和DataFrame的列的排列工作
# 通过需要排列的轴的长度调用permutation, 可产生一个表示新顺序的整数数组
df = DataFrame(np.arange(5 * 4).reshape(5,4))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [205]:
sampler = np.random.permutation(5)
sampler

array([1, 0, 2, 3, 4])

In [206]:
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [209]:
sampler = np.random.permutation(len(df))
sampler

array([1, 0, 4, 3, 2])

In [210]:
df.take(sampler[:3])

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
4,16,17,18,19


In [212]:
df = DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
df

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [213]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [216]:
dummies = pd.get_dummies(df['key'],prefix='key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [218]:
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [None]:
# 字符串操作

In [219]:
# regex
import re
text = 'foo bar\t baz \tqux'
# \s+用来描述一个或多个空白符
re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

In [220]:
# 我们可以用re.compile自己编译regex得到一个可重用的regex对象
regex = re.compile('\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [221]:
# 若要得到匹配regex的所有模式，则可以使用findll
# 若对多个字符串使用同一条正则，建议通过re.compile创建regex对象
# 节省cpu时间
regex.findall(text)

[' ', '\t ', ' \t']

In [None]:
# match和search与findall类似
# findall返回字符串中所有的匹配项
# search则只返回第一个匹配项
# match更加严格，只匹配字符串的首部
# sub将匹配的模式替换成指定字符串

In [222]:
# pandas中矢量化的字符串函数
# 字符串规整化操作
data = {'Dave':'dave@google.com','Steve':'steve@gmail.com','Rob':'rob@gmail.com','Wes':np.nan}
data

{'Dave': 'dave@google.com',
 'Rob': 'rob@gmail.com',
 'Steve': 'steve@gmail.com',
 'Wes': nan}

In [232]:
data = Series(data)
data

Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object

In [233]:
data.isnull()

Dave     False
Rob      False
Steve    False
Wes       True
dtype: bool

In [234]:
# 通过data.map，所有字符串和正则表达式方法都能被应用于各个值
# 若存在NAN则会报错
# 为解决这个问题，Series有一些能够跳过NA值的字符串操作方法
# 通过Series的str属性即可访问这些方法
data.str.contains('gmail')

Dave     False
Rob       True
Steve     True
Wes        NaN
dtype: object

In [235]:
# 这里也可以使用正则表达式，还可以加上任意re选项
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\\\.([A-Z]{2,4})'

In [236]:
data.str.findall(pattern, flags=re.IGNORECASE)

Dave      []
Rob       []
Steve     []
Wes      NaN
dtype: object