# 数据清洗与准备

## 7.1、处理缺失值

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_rows=10

In [4]:
string_data=pd.Series(['aardvark','artichoke',np.nan,'avocado'])
print(string_data)

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object


In [5]:
string_data[0]=None
print(string_data)

0         None
1    artichoke
2          NaN
3      avocado
dtype: object


In [6]:
print(string_data.isnull())        #可以发现，None和NaN都被认为是NA，not available

0     True
1    False
2     True
3    False
dtype: bool


### 1.1、过滤缺失值

In [3]:
#利用dropna来过滤缺失值
from numpy import nan as NA
data=pd.Series([1,NA,3.5,NA,7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [4]:
data[data.notnull()]    #与上面等价

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
#对于DataFrame，dropna默认情况下会删除包含缺失值的行
data=pd.DataFrame([[1,6.5,3],[1,NA,NA],
                  [NA,NA,NA],[NA,6.5,3]])
data1=data.dropna()
print(data)
print(data1)

#利用参数how可以实现仅删除所有值都是NA的行
print(data.dropna(how='all'))

#利用参数axis可以实现删除NA存在的列
print(data.dropna(axis=1,how='all'))

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
     0    1    2
0  1.0  6.5  3.0
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


In [13]:
#利用参数thresh实现保留一定数量的观察值的行
data2=pd.DataFrame(np.random.randn(7,3))
data2.iloc[:4,1]=NA
data2.iloc[:2,2]=NA
print(data2.dropna(thresh=2))
print(data2.dropna(thresh=1))

          0         1         2
2 -0.671067       NaN  1.284127
3 -0.594093       NaN -2.444752
4  0.514992 -0.601770 -0.837686
5  0.028874 -0.900232 -1.162911
6 -0.774774  0.501525  0.433749
          0         1         2
0  0.233087       NaN       NaN
1 -0.902660       NaN       NaN
2 -0.671067       NaN  1.284127
3 -0.594093       NaN -2.444752
4  0.514992 -0.601770 -0.837686
5  0.028874 -0.900232 -1.162911
6 -0.774774  0.501525  0.433749


### 1.2、补全缺失值

In [18]:
#利用fillna来替代缺失值
'''
不能用布尔值索引将缺失值补全
data[data.isnull()]=5  此时由于内部数据类型不统一，因此不能用bool值数组赋值
'''
data2=pd.DataFrame(np.random.randn(7,3))
data2.iloc[:4,1]=NA
data2.iloc[:2,2]=NA
print(data2.fillna(0))

#调用fillna时使用字典，可以使得不同列可以设定不同的填充值
print(data2.fillna({1:0.5,2:0}))

#调用fillna时，利用参数inplace可以使得修改原数据
_=data2.fillna(0,inplace=True)
print(data2)

          0         1         2
0 -0.804293  0.000000  0.000000
1 -1.220486  0.000000  0.000000
2  0.100599  0.000000 -0.103862
3 -1.122888  0.000000 -0.231285
4 -1.647301 -0.043402 -2.505836
5 -0.831211 -1.375719  1.157523
6  2.953561  0.996073  0.656459
          0         1         2
0 -0.804293  0.500000  0.000000
1 -1.220486  0.500000  0.000000
2  0.100599  0.500000 -0.103862
3 -1.122888  0.500000 -0.231285
4 -1.647301 -0.043402 -2.505836
5 -0.831211 -1.375719  1.157523
6  2.953561  0.996073  0.656459
          0         1         2
0 -0.804293  0.000000  0.000000
1 -1.220486  0.000000  0.000000
2  0.100599  0.000000 -0.103862
3 -1.122888  0.000000 -0.231285
4 -1.647301 -0.043402 -2.505836
5 -0.831211 -1.375719  1.157523
6  2.953561  0.996073  0.656459


In [19]:
#fillna拥有很多参数可用
#利用method参数可用选择前向填充或后向填充
data3=pd.DataFrame(np.random.randn(6,3))
data3.iloc[2:,1]=NA
data3.iloc[4:,2]=NA
print(data3)
print(data3.fillna(method='ffill'))              #后向填充
print(data3.fillna(method='ffill',limit=2))      #后向填充，并设置填充长度最大为2

          0         1         2
0  0.264246  0.243948 -0.362505
1 -0.387553 -0.619872 -1.531149
2 -1.321191       NaN -0.318601
3  1.069199       NaN  0.290524
4  0.226386       NaN       NaN
5 -0.829300       NaN       NaN
          0         1         2
0  0.264246  0.243948 -0.362505
1 -0.387553 -0.619872 -1.531149
2 -1.321191 -0.619872 -0.318601
3  1.069199 -0.619872  0.290524
4  0.226386 -0.619872  0.290524
5 -0.829300 -0.619872  0.290524
          0         1         2
0  0.264246  0.243948 -0.362505
1 -0.387553 -0.619872 -1.531149
2 -1.321191 -0.619872 -0.318601
3  1.069199 -0.619872  0.290524
4  0.226386       NaN  0.290524
5 -0.829300       NaN  0.290524


## 2、数据转换

### 2.1、删除重复值

In [20]:
#DataFrame中会出现重复行
data=pd.DataFrame({'k1':['one','two']*3+['two'],
                  'k2':[1,1,2,3,3,4,4]})
print(data)

    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4


In [22]:
print(data.duplicated())      #返回某行是否有重复行

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool


In [23]:
print(data.drop_duplicates())   #返回没有重复行的内容

    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4


In [24]:
#可用根据某一列的内容去除重复值
print(data.drop_duplicates(['k1']))      #根据'k1'列，删去有重复的行

    k1  k2
0  one   1
1  two   1


In [28]:
#keep参数可设置保留重复行的最后一行
data['va']=range(7)
print(data)
print(data.drop_duplicates(['k1','k2'],keep='last'))

    k1  k2  va
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
5  two   4   5
6  two   4   6
    k1  k2  va
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
6  two   4   6


### 1.2.2、使用函数或映射进行数据转换

In [32]:
data=pd.DataFrame({'food':['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon','pastrami','honey ham','nova lox'],
                  'ounces':[4,3,12,6,7.5,8,3,5,6]})
print(data)

meat_to_animal={
    'bacon':'pig',
    'pulled pork':'cow',
    'pastrami':'cow',
    'corned beef':'cow',
    'honey ham':'pig',
    'nova lox':'salmon'
}
lower_food=data.food.str.lower()
data['animal']=lower_food.map(meat_to_animal)          #Series的map中可以是函数，也可以是包含映射关系的字典型对象
print(data)


data['animal']=data['food'].map(lambda x: meat_to_animal[x.lower()])
print(data)
'''
两个实现效果接近
'''

          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     Pastrami     6.0
4  corned beef     7.5
5        Bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     cow
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     cow
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon


### 2.3、替代值

In [36]:
#fillna是替换的特殊情况，替换缺失值。replace提供了一种更加灵活的替换
data=pd.Series([1,-999,2,-999,-1000,3])
print(data.replace(-999,NA))
print(data.replace([-999,-1000],NA))                #可以将多个值进行替换
print(data.replace([-999,-1000],[NA,500]))          #可以将不同值替换成不同值
print(data.replace({-999:433,-1000:344}))           #可以用字典

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
0      1.0
1      NaN
2      2.0
3      NaN
4    500.0
5      3.0
dtype: float64
0      1
1    433
2      2
3    433
4    344
5      3
dtype: int64


### 2.4、重命名轴索引

In [41]:
#轴标签也可以像Series中的值一样进行类似的使用函数或映射的转换
data=pd.DataFrame(np.arange(12).reshape((3,4)),
                 index=['Ohio','Colorado','New York'],
                 columns=['one','two','three','four'])
print(data.index.map(lambda x:x[:4].upper()))
data.index=data.index.map(lambda x:x[:4].upper())
print(data)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')
      one  two  three  four
OHIO    0    1      2     3
COLO    4    5      6     7
NEW     8    9     10    11


In [44]:
#可以使用rename改变轴上标签
data=pd.DataFrame(np.arange(12).reshape((3,4)),
                 index=['Ohio','Colorado','New York'],
                 columns=['one','two','three','four'])
print(data.rename(index=str.title,columns=str.upper))
print(data.rename(index={'Ohio':'INDIANA'},
                 columns={'three':'peekaboo'}))              #可以传入字典，进行定向转换
data.rename(index={'Ohio':'INDIANA'},inplace=True)           #传入inplace参数可以修改原数据
print(data)

          ONE  TWO  THREE  FOUR
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11
          one  two  peekaboo  four
INDIANA     0    1         2     3
Colorado    4    5         6     7
New York    8    9        10    11
          one  two  three  four
INDIANA     0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11


### 2.5、离散化和分箱

In [16]:
#连续值的离散化，或者分成几个部分，可以使用cut函数
ages=[18,20,22,25,27,21,23,37,31,61,45,41,32]
bins=[18,25,35,60,100]
cats=pd.cut(ages,bins)
print(cats)
'''
返回的是ages中的每一个值所处的分区
bins中即所要划分的分区：(18,25],(25,35],(35,60],(65,100].这些范围含左不含右，因此第一个18给出的分区为NaN。可以把这些分区叫做箱
'''

[NaN, (18.0, 25.0], (18.0, 25.0], (18.0, 25.0], (25.0, 35.0], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 13
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]


'\n返回的是ages中的每一个值所处的分区\nbins中即所要划分的分区：(18,25],(25,35],(35,60],(65,100].这些范围含左不含右，因此第一个18给出的分区为NaN。可以把这些分区叫做箱\n'

In [3]:
cats.codes    #返回的是ages中每个数据对应的分区的数据标签。注意，返回的是数组

array([-1,  0,  0,  0,  1,  0,  0,  2,  1,  3,  2,  2,  1], dtype=int8)

In [4]:
cats.categories     #返回的是不同分区的名称

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [7]:
cats.categories[0]

Interval(18, 25, closed='right')

In [5]:
pd.value_counts(cats)    #计算cats中箱的数量并排列

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

In [10]:
#可以通过right参数调节分区的边界开闭，right默认为True，左开右闭
pd.cut(ages,[18,26,36,61,100],right=False)          

[[18, 26), [18, 26), [18, 26), [18, 26), [26, 36), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 13
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [17]:
#还可以通过设置labels选项，传递一个列表或数组来设置自定义的箱名
group_names=['Youth','YoungAdult','MiddleAged','Senior']
pd.cut(ages,bins,labels=group_names)

[NaN, 'Youth', 'Youth', 'Youth', 'YoungAdult', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 13
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [4]:
#除了传入箱边分区还可以传入整数个的箱
data=np.random.rand(20)         #均匀分布
print(pd.cut(data,4,precision=2))    #将data分成四部分，precision指定了分区边界的十进制精度为2位。自动给出的分区是等长的，边界根据data的最大最小值给出
print(data,min(data),max(data))

[(-1.98, -0.62], (2.08, 3.44], (-0.62, 0.73], (-0.62, 0.73], (-1.98, -0.62], ..., (0.73, 2.08], (-1.98, -0.62], (-0.62, 0.73], (-0.62, 0.73], (-1.98, -0.62]]
Length: 20
Categories (4, interval[float64, right]): [(-1.98, -0.62] < (-0.62, 0.73] < (0.73, 2.08] < (2.08, 3.44]]
[-1.97082915  3.43579144  0.55422694  0.01176623 -1.27719074 -0.60493126
  0.17574637  1.14899933 -0.09247077 -0.5327763   1.50868728 -0.99442682
 -0.35579301  0.27527793  0.17049097  0.73420122 -1.0019167   0.31048453
  0.40812471 -1.26688964] -1.9708291537307907 3.435791436607815


In [5]:
'''
与cut不同的是，qcut基于样本分位数进行分箱，也即使得每个分区的数据点数量相同
'''
data=np.random.randn(1000)       #正态分布
print(pd.qcut(data,4))                               #将数据点自动分成四段，每段数据点数目相同
print(pd.qcut(data,4).value_counts())
print(pd.qcut(data,[0,0.1,0.5,0.9,1]))               #传入自定义的分位数，类似一根长度为1的绳子，以一定的比例进行分区。里面应该为0到1之间的数据
print(pd.qcut(data,[0,0.1,0.5,0.9,1]).value_counts())

[(-0.0371, 0.681], (0.681, 3.017], (-0.0371, 0.681], (-0.74, -0.0371], (-0.0371, 0.681], ..., (-2.7609999999999997, -0.74], (0.681, 3.017], (-2.7609999999999997, -0.74], (-0.74, -0.0371], (-0.74, -0.0371]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.7609999999999997, -0.74] < (-0.74, -0.0371] < (-0.0371, 0.681] < (0.681, 3.017]]
(-2.7609999999999997, -0.74]    250
(-0.74, -0.0371]                250
(-0.0371, 0.681]                250
(0.681, 3.017]                  250
Name: count, dtype: int64
[(-0.0371, 1.253], (1.253, 3.017], (-0.0371, 1.253], (-1.339, -0.0371], (-0.0371, 1.253], ..., (-2.7609999999999997, -1.339], (-0.0371, 1.253], (-1.339, -0.0371], (-1.339, -0.0371], (-1.339, -0.0371]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.7609999999999997, -1.339] < (-1.339, -0.0371] < (-0.0371, 1.253] < (1.253, 3.017]]
(-2.7609999999999997, -1.339]    100
(-1.339, -0.0371]                400
(-0.0371, 1.253]                 400
(1.253, 3.017]              

### 2.6、检测和过滤异常值

In [60]:
#利用any函数或布尔值数据进行数组操作
data=pd.DataFrame(np.random.randn(10,4))
data.loc[10,:]=[-5,2,9,10]                     #用loc可以在末尾再加一行
print(data[(np.abs(data)>3).any(axis=1)])      #DataFrame的any方法默认对于0轴进行，通过axis参数可以调整至1轴

#np.sign()函数根据数据的正负产生-1或1
print(np.abs(data)>3)
data[np.abs(data)>3]=np.sign(data)*3
print(data)


           0         1         2          3
5  -1.346957 -3.519253  0.004481  -1.191391
6   2.154696 -3.322489  1.227508  -0.387363
10 -5.000000  2.000000  9.000000  10.000000
        0      1      2      3
0   False  False  False  False
1   False  False  False  False
2   False  False  False  False
3   False  False  False  False
4   False  False  False  False
..    ...    ...    ...    ...
6   False   True  False  False
7   False  False  False  False
8   False  False  False  False
9   False  False  False  False
10   True  False   True   True

[11 rows x 4 columns]
           0         1         2         3
0  -0.761561 -1.055773  1.220106 -0.362545
1   0.943996  1.709941  0.365669  0.783350
2  -2.389311 -0.151833  0.981117  1.056260
3   0.319538 -2.993372 -1.418250  0.674584
4   1.163803 -1.257646  0.418202  0.747591
..       ...       ...       ...       ...
6   2.154696 -3.000000  1.227508 -0.387363
7   1.423827  0.295763 -1.172923  1.164377
8  -2.006947  1.005924 -0.818215  0.725667

### 2.7、置换和随机抽样

In [2]:
#numpy.random.permutation可以根据长度返回一个表示新顺序的整数数组
data=pd.DataFrame(np.arange(5*4).reshape((5,4)),
                 index=['a','b','c','d','e'])
sampler=np.random.permutation(5)
print(sampler)
print(data)

print(data.take(sampler))
print(data.iloc[sampler])       #data.take()与data.iloc[]对于行或列的索引等价，不同点在于iloc还可以进行值索引，take只能对行或列索引。
                                #take里面为整数数组
sampler=np.random.permutation(4)
print(data.take(sampler,axis=1))      #利用axis进行列的重新排序
'''
通过产生的sampler便可以对DataFrame的行或列进行随机重新排序
'''

[4 0 3 1 2]
    0   1   2   3
a   0   1   2   3
b   4   5   6   7
c   8   9  10  11
d  12  13  14  15
e  16  17  18  19
    0   1   2   3
e  16  17  18  19
a   0   1   2   3
d  12  13  14  15
b   4   5   6   7
c   8   9  10  11
    0   1   2   3
e  16  17  18  19
a   0   1   2   3
d  12  13  14  15
b   4   5   6   7
c   8   9  10  11
    0   2   1   3
a   0   2   1   3
b   4   6   5   7
c   8  10   9  11
d  12  14  13  15
e  16  18  17  19


'\n通过产生的sampler便可以对DataFrame的行或列进行随机重新排序\n'

In [9]:
#可以通过sample方法来选出一个不含有替代值的随机子集（即不允许有重复选择）
data.sample(n=2)

Unnamed: 0,0,1,2,3
b,4,5,6,7
e,16,17,18,19


In [10]:
#也可以生成一个带有替代值的样本
choices=pd.Series([5,7,-1,6,4])
print(choices.sample(n=10,replace=True))          #传入replace=True参数来实现

2   -1
1    7
4    4
3    6
0    5
3    6
4    4
1    7
1    7
3    6
dtype: int64


### 2.8、计算指标/虚拟变量

In [17]:
#如果一个DataFrame中的一列有k个不同的值，则通过get_dummies可以得到一个k列的矩阵来表示某一行是否有哪个值
data=pd.DataFrame({'key':['b','b','a','c','a','b'],
                   'data1':range(6)})
print(pd.get_dummies(data['key']))

dummies=pd.get_dummies(data['key'],prefix='key')   #利用prefix参数加上前缀
data_with_dummies=data[['key']].join(dummies)     
print(data_with_dummies)

       a      b      c
0  False   True  False
1  False   True  False
2   True  False  False
3  False  False   True
4   True  False  False
5  False   True  False
  key  key_a  key_b  key_c
0   b  False   True  False
1   b  False   True  False
2   a   True  False  False
3   c  False  False   True
4   a   True  False  False
5   b  False   True  False


In [5]:
'''
一个更复杂的处理例子
'''
mnames=['movie_id','title','genres']
movies=pd.read_table('pydata-book/datasets/movielens/movies.dat',sep='::',
                     header=None,names=mnames,engine='python')
print(movies[:10])

all_genres=[]
for x in movies['genres']:
    all_genres.extend(x.split('|'))
genres=pd.unique(all_genres)
print('所有的流派:\n',genres)

#构建一个指标DataFrame
zero_matrix=np.zeros((len(movies),len(genres)))
dummies=pd.DataFrame(zero_matrix,columns=genres)
for i,gen in enumerate(movies['genres']):
    io=dummies.columns.get_indexer(gen.split('|'))    #选出每一个电影对应流派在dummies哪一列
    dummies.iloc[i,io]=1
movies_with=movies.join(dummies.add_prefix('Genre_'))
print('\n\n',movies_with.iloc[0])

   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy
5         6                         Heat (1995)         Action|Crime|Thriller
6         7                      Sabrina (1995)                Comedy|Romance
7         8                 Tom and Huck (1995)          Adventure|Children's
8         9                 Sudden Death (1995)                        Action
9        10                    GoldenEye (1995)     Action|Adventure|Thriller
所有的流派:
 ['Animation' "Children's" 'Comedy' 'Adventure' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Sci-Fi' 'Documentary

In [6]:
'''
还可以将get_dummies与cut结合使用
'''
np.random.seed(12345)
value=np.random.rand(10)
bins=[0,0.2,0.4,0.6,0.8,1]
pd.get_dummies(pd.cut(value,bins))                 #实现了value中的值是显示分布在哪个区间的指标

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,False,False,False,False,True
1,False,True,False,False,False
2,True,False,False,False,False
3,False,True,False,False,False
4,False,False,True,False,False
5,False,False,True,False,False
6,False,False,False,False,True
7,False,False,False,True,False
8,False,False,False,True,False
9,False,False,False,True,False


## 3、字符串操作

### 3.1、字符串对象方法

In [7]:
'''
1、split
2、strip
3、+
4、join
5、in
6、index
7、find                          find与index的区别，find找不到字符时返回-1，index则直接抛出异常
8、count
9、replace
'''
val='a,b,  guido'
print(val.split(','))
print(val.strip())

pieces=[x.strip() for x in val.split(',')]
first,second,third = pieces
print(first+'::'+second+'::'+third)
print('::'.join(pieces))

print('guido' in val)
print(val.index(','))
print(val.find(';'))

print(val.count(','))
print(val.replace(',',':::'))

['a', 'b', '  guido']
a,b,  guido
a::b::guido
a::b::guido
True
1
-1
2
a:::b:::  guido


### 3.2、正则表达式

In [4]:
#正则表达式一般用re库实现，有三个主题：模式匹配、替代、拆分。
import re

text="foo     bar\t  baz   \tqux"
re.split('\s+',text)          #'\s+'表示一个或多个空白字符。首先匹配空白字符，之后传入split方法进行运行

['foo', 'bar', 'baz', 'qux']

In [10]:
#也可以制作一个可复用的正则表达式，使用re.compile函数
regex=re.compile('\s+')          #将模式编译进正则表达式regex中
print(regex.split(text)) 
print(regex.findall(text))       #findall是找到所有匹配的模式

['foo', 'bar', 'baz', 'qux']
['     ', '\t  ', '   \t']


In [42]:
text='''Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
1@1.cppp
'''
pattern=r'[A-Z0-9.%+-]+@[A-Z0-9.-]+.[A-Z]{2,4}'
regex=re.compile(pattern,flags=re.IGNORECASE)
print(regex.findall(text))           #寻找所有符合的模式
m=regex.search(text)
print(text[m.start():m.end()])       #search方法只返回文本中第一个匹配到的模式的地址，即开始和结束的位置
print(regex.match(text))             #match方法只在字符串起始位置处进行匹配，如果没有就返回None
print(regex.sub('REDACTED',text))    #sub方法用特定字符串替换找到的模式，返回一个新字符串

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com', '1@1.cppp']
dave@google.com
None
Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED
REDACTED



In [48]:
pattern=r'([A-Z0-9.%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'      #用小括号将要匹配的模式包起来，分成几个分组
regex=re.compile(pattern,flags=re.IGNORECASE)
print(regex.findall('1@1.com'))
print(regex.findall(text))                                  #就可以得到匹配的模式分组，为元组格式
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3',text))    #sub也可以访问每个匹配对象中的分组   \1表示第一个匹配分组

[('1', '1', 'com')]
[('dave', 'google', 'com'), ('steve', 'gmail', 'com'), ('rob', 'gmail', 'com'), ('ryan', 'yahoo', 'com'), ('1', '1', 'cppp')]
Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com
Username: 1, Domain: 1, Suffix: cppp



### 3.3、pandas中的向量化字符串函数

In [10]:
'''
Series中有str属性以提供各种字符串的方法函数，如
1、data.str.contains()
2、data.str.findall()
3、data.str.match()
4、data.str.get()
5、data.str[]

(注意，DataFrame中没有str属性)
'''
data={'Dave':'dave@google.com','Steve':'steve@gmail.com',
      'Rob':'rob@gmail.com','Wes':np.nan}
data=pd.Series(data)
print(data)

print(data.str.contains('gmail'))       #检查Series中每个属性是否包含'gmail
pattern=r'([A-Z0-9.%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
print(data.str.findall(pattern,flags=re.IGNORECASE))

print(data.str.match(pattern,flags=re.IGNORECASE))

print(data.str.get(1))                  #取每一行的第一个字符
print(data.str[1])
print(data.str[:5])                     #还可以切片

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object
Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object
Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object
Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object
Dave       a
Steve      t
Rob        o
Wes      NaN
dtype: object
Dave       a
Steve      t
Rob        o
Wes      NaN
dtype: object
Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object


In [13]:
data=pd.DataFrame({'a':['asdfg','sdfgh'],'b':['qwert','werty'],'c':['zxcvb','xcvbn']})
print(data)
print(data.str[0])

       a      b      c
0  asdfg  qwert  zxcvb
1  sdfgh  werty  xcvbn


AttributeError: 'DataFrame' object has no attribute 'str'