pandas data cleaning and preparation

In [1]:
import pandas as pd
import numpy as np

# 处理缺失数据
> <font size=4>pandas 中缺失值的表示 : _**NaN, None**_ </font>

## _**<font color=skyblue><font color=orange>obj</font>.isnull( ), <font color=orange>obj</font>.isna( )</font>**_

In [2]:
# A Series of strings with one np.nan entry to run the null checks against.
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
# Evaluating a tuple shows the Series and its boolean null mask side by side.
string_data, string_data.isnull()

(0     aardvark
 1    artichoke
 2          NaN
 3      avocado
 dtype: object,
 0    False
 1    False
 2     True
 3    False
 dtype: bool)

In [3]:
# Overwrite the first element with None; isnull() reports None as missing as well as NaN.
string_data[0] = None
string_data, string_data.isnull()

(0         None
 1    artichoke
 2          NaN
 3      avocado
 dtype: object,
 0     True
 1    False
 2     True
 3    False
 dtype: bool)

## _**<font color=skyblue><font color=orange>obj</font>.notnull( ), <font color=orange>obj</font>.notna( )</font>**_

In [4]:
string_data.notna()

0    False
1     True
2    False
3     True
dtype: bool

## _**<font color=skyblue><font color=orange>series</font>.dropna( ), <font color=orange>frame</font>.dropna( <font color=orange>axis</font>, <font color=orange>how</font>, <font color=orange>thresh</font> )</font>**_
> <font size=3> _series.dropna( )_ 等价于 _series[ series.notna( ) ]_ </font>

In [5]:
string_data, string_data.dropna()

(0         None
 1    artichoke
 2          NaN
 3      avocado
 dtype: object,
 1    artichoke
 3      avocado
 dtype: object)

In [6]:
string_data[string_data.notna()]

1    artichoke
3      avocado
dtype: object

### _**<font color=orange>how</font>**_ : 过滤缺失值的方式

In [7]:
# Four rows for the dropna demos: no NaN, partly NaN, all NaN, partly NaN.
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan], 
                     [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [8]:
data.dropna(how='any') # drop every row that contains at least one missing value

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [9]:
data.dropna(how='all') # drop only the rows in which every value is missing

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


### _**<font color=orange>thresh = n</font>**_ : 过滤缺失值时, 对应行或列的剩下的非缺失值的个数大于等于n

In [10]:
# 7x7 frame of standard-normal draws.
df = pd.DataFrame(np.random.randn(7, 7))
# Blank out a "staircase": column k gets NaN in its first (7 - k) rows,
# so row 0 is entirely NaN and row 6 is missing only column 0.
for col in range(7):
    df.iloc[:7 - col, col] = np.nan
df

Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,,0.548996
2,,,,,,-0.448875,-0.028554
3,,,,,1.973543,-0.927488,-0.622286
4,,,,1.329444,-0.430934,-0.957348,1.737727
5,,,-0.497778,-0.801027,-0.13891,-0.599382,0.839175
6,,-0.638612,1.437967,0.108482,0.436201,-0.003544,-0.953735


In [11]:
df.dropna(thresh=1) 
# Kept rows have at least one non-missing value, i.e. only all-NaN rows are dropped.

Unnamed: 0,0,1,2,3,4,5,6
1,,,,,,,0.548996
2,,,,,,-0.448875,-0.028554
3,,,,,1.973543,-0.927488,-0.622286
4,,,,1.329444,-0.430934,-0.957348,1.737727
5,,,-0.497778,-0.801027,-0.13891,-0.599382,0.839175
6,,-0.638612,1.437967,0.108482,0.436201,-0.003544,-0.953735


In [12]:
df.dropna(thresh=3) # keep only the rows that have at least 3 non-missing values

Unnamed: 0,0,1,2,3,4,5,6
3,,,,,1.973543,-0.927488,-0.622286
4,,,,1.329444,-0.430934,-0.957348,1.737727
5,,,-0.497778,-0.801027,-0.13891,-0.599382,0.839175
6,,-0.638612,1.437967,0.108482,0.436201,-0.003544,-0.953735


## _**<font color=skyblue><font color=orange>series</font>.fillna( <font color=orange>value</font>, <font color=orange>method</font> ), <font color=orange>frame</font>.fillna( <font color=orange>value</font>, <font color=orange>method</font>, <font color=orange>axis</font> )</font>**_

In [13]:
string_data.fillna(-9999)

0        -9999
1    artichoke
2        -9999
3      avocado
dtype: object

### _**<font color=orange>value = dict</font>**_ : 通过传递字典到 fillna 可以实现对不同的列填充不同的值

In [14]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,,,,,,,
1,,,,,,,0.548996
2,,,,,,-0.448875,-0.028554
3,,,,,1.973543,-0.927488,-0.622286
4,,,,1.329444,-0.430934,-0.957348,1.737727
5,,,-0.497778,-0.801027,-0.13891,-0.599382,0.839175
6,,-0.638612,1.437967,0.108482,0.436201,-0.003544,-0.953735


In [15]:
df.fillna( {1: 0.5, 2: 0} )

Unnamed: 0,0,1,2,3,4,5,6
0,,0.5,0.0,,,,
1,,0.5,0.0,,,,0.548996
2,,0.5,0.0,,,-0.448875,-0.028554
3,,0.5,0.0,,1.973543,-0.927488,-0.622286
4,,0.5,0.0,1.329444,-0.430934,-0.957348,1.737727
5,,0.5,-0.497778,-0.801027,-0.13891,-0.599382,0.839175
6,,-0.638612,1.437967,0.108482,0.436201,-0.003544,-0.953735


### _**<font color=orange>method</font>**_ : 填充方式 , _**<font color=orange>limit</font>**_ : 限制填充个数

In [16]:
# 6x3 frame of standard-normal draws with gaps punched into columns 1 and 2.
df = pd.DataFrame(np.random.randn(6, 3))
# Label-based slices are inclusive at both ends: rows 1..4 of column 1,
# rows 2..3 of column 2.
df.loc[1:4, 1] = np.nan
df.loc[2:3, 2] = np.nan
df

Unnamed: 0,0,1,2
0,-0.291262,0.200877,2.589814
1,-0.337112,,0.186565
2,-2.075923,,
3,-0.109867,,
4,0.807568,,-0.173979
5,0.059223,0.818621,0.74286


<font size=2> _**<font color=orange>method = 'ffill'</font>**_ : 用前一个非缺失值去填充该缺失值</font>

In [17]:
# fillna(method='ffill') is deprecated since pandas 2.1; DataFrame.ffill is the
# direct equivalent. It propagates the last valid value downwards and fills at
# most `limit` consecutive NaNs in each gap.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.291262,0.200877,2.589814
1,-0.337112,0.200877,0.186565
2,-2.075923,0.200877,0.186565
3,-0.109867,,0.186565
4,0.807568,,-0.173979
5,0.059223,0.818621,0.74286


<font size=2> _**<font color=orange>method = 'bfill'</font>**_ : 用后一个非缺失值去填充该缺失值</font>

In [18]:
# fillna(method='bfill') is deprecated since pandas 2.1; DataFrame.bfill is the
# direct equivalent. It propagates the next valid value upwards and fills at
# most `limit` consecutive NaNs in each gap.
df.bfill(limit=2)

Unnamed: 0,0,1,2
0,-0.291262,0.200877,2.589814
1,-0.337112,,0.186565
2,-2.075923,,-0.173979
3,-0.109867,0.818621,-0.173979
4,0.807568,0.818621,-0.173979
5,0.059223,0.818621,0.74286


# 处理重复数据

## _**<font color=skyblue><font color=orange>obj</font>.duplicated( <font color=orange>subset</font>, <font color=orange>keep</font>  ), <font color=orange>obj</font>.drop_duplicates( <font color=orange>subset</font>, <font color=orange>keep</font> )</font>**_ : 检测 / 移除重复数据

In [19]:
# Seven rows; rows 5 and 6 are both ('two', 3), so the last row is an exact duplicate.
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'], 
                     'k2': [1, 1, 2, 2, 3, 3, 3]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,2
4,one,3
5,two,3
6,two,3


In [20]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [21]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,2
4,one,3
5,two,3


### _**<font color=orange>subset</font>**_ : 指定部分列（参数名为 subset, 取列标签或列标签列表）进行重复项判断/过滤

In [22]:
data.duplicated(['k1'])

0    False
1    False
2     True
3     True
4     True
5     True
6     True
dtype: bool

In [23]:
data.drop_duplicates(['k2'])

Unnamed: 0,k1,k2
0,one,1
2,one,2
4,one,3


### _**<font color=orange>keep</font>**_ : keep = 'first' / keep = 'last' ， 保留项

In [24]:
data.duplicated(['k1'], keep='last')

0     True
1     True
2     True
3     True
4    False
5     True
6    False
dtype: bool

In [25]:
data.drop_duplicates(['k2'], keep='last')

Unnamed: 0,k1,k2
1,two,1
3,two,2
6,two,3


# 数据映射与替换

## _**<font color=skyblue><font color=orange>series</font>.map( <font color=orange>arg</font> )</font>**_ : 利用函数或字典进行数据映射

In [26]:
# Note the inconsistent capitalisation in 'food' ('Bacon' / 'bacon', 'Pastrami' / 'pastrami').
data = pd.DataFrame({'food': ['Bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham','nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,Bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


<font size=3>映射数据</font>

In [27]:
meat_to_animal = {'bacon': 'pig',
                  'pulled pork': 'pig',
                  'pastrami': 'cow',
                  'corned beef': 'cow',
                  'honey ham': 'pig',
                  'nova lox': 'salmon'}
meat_to_animal

{'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'}

### _**<font color=skyblue><font color=orange>series</font>.map( <font color=orange>dict</font> )</font>**_

In [28]:
# The keys of meat_to_animal are all lower-case, so lower-case the food names
# first and then look each one up through Series.map.
data['animal1'] = data['food'].str.lower().map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal1
0,Bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


### _**<font color=skyblue><font color=orange>series</font>.map( <font color=orange>func</font> )</font>**_

In [29]:
# Same mapping expressed as a function: lower-case each name, then index the dict.
# Direct indexing raises KeyError for a name that is not a key of meat_to_animal.
data['animal2'] = data['food'].map( lambda x: meat_to_animal[x.lower()] )
data

Unnamed: 0,food,ounces,animal1,animal2
0,Bacon,4.0,pig,pig
1,pulled pork,3.0,pig,pig
2,bacon,12.0,pig,pig
3,Pastrami,6.0,cow,cow
4,corned beef,7.5,cow,cow
5,Bacon,8.0,pig,pig
6,pastrami,3.0,cow,cow
7,honey ham,5.0,pig,pig
8,nova lox,6.0,salmon,salmon


## _**<font color=skyblue><font color=orange>obj</font>.replace( <font color=orange>to_replace</font>, <font color=orange>value</font>  )</font>**_ : 数据替换

In [30]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [31]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [32]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

## _**<font color=skyblue><font color=orange>obj</font>.rename( <font color=orange>index</font>, <font color=orange>columns</font>, <font color=orange>inplace</font>  )</font>**_ : 索引重命名

In [33]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


### _**<font color=skyblue><font color=orange>index</font>.map( <font color=orange>func</font>  )</font>**_ : 用 Index 对象的 map 方法修改轴标签

In [34]:
# Relabel the rows in place: first four characters of each label, upper-cased.
# 'New York'[:4] is 'New ', so that label keeps a trailing space ('NEW ').
data.index = data.index.map( lambda x: x[:4].upper() )
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### _**<font color=skyblue><font color=orange>frame</font>.rename( <font color=orange>index</font>, <font color=orange>columns</font>  )</font>**_ : rename 方法（返回新对象, 不修改原数据）

In [35]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


<font size=3>rename 可以结合字典型对象实现对部分轴标签的更新 </font>

In [36]:
data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


# 数据划分

## _**<font color=skyblue>pd.cut( <font color=orange>x</font>, <font color=orange>bins</font>, <font color=orange>right</font>, <font color=orange>labels</font>  )</font>**_ : 划分面元（ binning ）
> <font size=3> _**<font color=orange>x</font>**_ : _The input array to be binned; must be 1-D_ </font>  
> <font size=3> _**<font color=orange>bins</font>**_ : 面元, 可以是确切的面元边界, 也可以是面元数量</font>  
> <font size=3> _**<font color=orange>right</font>**_ : _True : ( , ] ; False : [ , )_ </font>  
> <font size=3> _**<font color=orange>labels</font>**_ : 设置面元的名称 </font>

###  _**<font color=orange>bins = list</font>**_

In [37]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# Explicit bin edges: 18-25, 25-35, 35-60, 60-100.
bins = [18, 25, 35, 60, 100]
# right=False makes every bin closed on the left and open on the right, e.g. [18, 25).
cats = pd.cut(ages, bins, right=False)
cats

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [38]:
cats.categories # the bins (intervals) the values were sorted into

IntervalIndex([[18, 25), [25, 35), [35, 60), [60, 100)], dtype='interval[int64, left]')

In [39]:
cats.codes 

array([0, 0, 1, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [40]:
# The top-level pd.value_counts() is deprecated since pandas 2.1; wrapping the
# Categorical in a Series and calling .value_counts() is the documented
# replacement and yields the same per-bin counts, most frequent first.
pd.Series(cats).value_counts()

[18, 25)     4
[25, 35)     4
[35, 60)     3
[60, 100)    1
dtype: int64

In [41]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

###  _**<font color=orange>bins = n</font>**_ : 根据样本的 **<font color=ff8888>最小值和最大值</font>** 计算等长的面元

In [42]:
# 50 random integers from 0 to 10 (the upper bound 11 is exclusive).
data = np.random.randint(low=0, high=11, size=50)
# An integer `bins` asks pd.cut for that many equal-width bins over the data's range.
pd.cut(data, bins=5)

[(-0.01, 2.0], (-0.01, 2.0], (-0.01, 2.0], (4.0, 6.0], (-0.01, 2.0], ..., (6.0, 8.0], (8.0, 10.0], (-0.01, 2.0], (4.0, 6.0], (-0.01, 2.0]]
Length: 50
Categories (5, interval[float64, right]): [(-0.01, 2.0] < (2.0, 4.0] < (4.0, 6.0] < (6.0, 8.0] < (8.0, 10.0]]

## _**<font color=skyblue>pd.qcut( <font color=orange>x</font>, <font color=orange>q</font>, <font color=orange>labels</font>  )</font>**_ : 根据分位数划分面元（ quantile binning ）
> <font size=3> _**<font color=orange>x</font>**_ : _The input array to be binned; must be 1-D_ </font>  
> <font size=3> _**<font color=orange>q</font>**_ : _Number of quantiles,_ 分位数 </font>  
> <font size=3> _**<font color=orange>labels</font>**_ : 设置面元的名称 </font>

###  _**<font color=orange>q = n</font>**_ : 根据样本的 **<font color=ff8888>分位数</font>** 对数据进行面元划分

In [43]:
# 1000 random integers from 0 to 100 (the upper bound 101 is exclusive).
data = np.random.randint(low=0, high=101, size=1000)
# q=4 cuts at the sample quartiles, so the four bins hold roughly equal counts.
cats = pd.qcut(data, q=4)
cats

[(50.5, 76.0], (26.0, 50.5], (-0.001, 26.0], (50.5, 76.0], (26.0, 50.5], ..., (-0.001, 26.0], (26.0, 50.5], (76.0, 100.0], (26.0, 50.5], (26.0, 50.5]]
Length: 1000
Categories (4, interval[float64, right]): [(-0.001, 26.0] < (26.0, 50.5] < (50.5, 76.0] < (76.0, 100.0]]

In [44]:
# The top-level pd.value_counts() is deprecated since pandas 2.1; wrapping the
# Categorical in a Series and calling .value_counts() is the documented
# replacement and yields the same per-bin counts, most frequent first.
pd.Series(cats).value_counts()

(50.5, 76.0]      260
(-0.001, 26.0]    255
(26.0, 50.5]      245
(76.0, 100.0]     240
dtype: int64

###  _**<font color=orange>q = list</font>**_ : 传递自定义的分位数（0到1之间的数值, 包含端点）

In [45]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(50.5, 91.0], (11.0, 50.5], (11.0, 50.5], (50.5, 91.0], (11.0, 50.5], ..., (11.0, 50.5], (11.0, 50.5], (50.5, 91.0], (11.0, 50.5], (11.0, 50.5]]
Length: 1000
Categories (4, interval[float64, right]): [(-0.001, 11.0] < (11.0, 50.5] < (50.5, 91.0] < (91.0, 100.0]]

# 随机采样

## _**<font color=skyblue>np.random.permutation( <font color=orange>x</font> )</font>**_

In [46]:
# 5x4 frame holding 0..19 in row-major order.
df = pd.DataFrame(np.arange(20).reshape(5, 4))
# A random ordering of the row positions 0..4.
sampler = np.random.permutation(len(df))
# Positional indexing with the permuted positions returns the rows shuffled.
df.iloc[sampler]

Unnamed: 0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
2,8,9,10,11
0,0,1,2,3
1,4,5,6,7


## _**<font color=skyblue><font color=orange>obj</font>.sample( <font color=orange>n</font> )</font>**_

In [47]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df.sample(n=5)

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
0,0,1,2,3
1,4,5,6,7
4,16,17,18,19


# 将 <font color=ff8888>分类变量</font> 转换为 <font color=ff8888>向量变量</font>

## _**<font color=skyblue>pd.get_dummies( <font color=orange>series</font>, <font color=orange>prefix</font> )</font>**_

In [48]:
# Input for get_dummies: a categorical 'key' column and a numeric 'data' column.
df = pd.DataFrame({
    'key': list('bbacab'),
    'data': range(6),
})
df

Unnamed: 0,key,data
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [49]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


### _**<font color=orange>prefix</font>**_ : 给指标 DataFrame 的列加上一个前缀

In [50]:
pd.get_dummies(df['key'], prefix='key')

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [51]:
pd.get_dummies(df['key'], prefix='key').join(df['data'])

Unnamed: 0,key_a,key_b,key_c,data
0,0,1,0,0
1,0,1,0,1
2,1,0,0,2
3,0,0,1,3
4,1,0,0,4
5,0,1,0,5


# 字符串的处理方法

## _**<font color=skyblue><font color=orange>str</font>.split( <font color=orange>sep</font>, <font color=orange>maxsplit</font> )</font>**_ : 根据 sep 拆分字符串, str → list

In [52]:
val = ' a ,b, guido '
val.split(',')

[' a ', 'b', ' guido ']

## _**<font color=skyblue><font color=orange>str</font>.strip( )</font>**_ : 去除字符串两边空白符（包括换行符）

<font size=4>_**<font color=skyblue><font color=orange>str</font>.lstrip( ), <font color=orange>str</font>.rstrip( )</font>**_ : 去除字符串 左或右 的空白符（包括换行符）</font>

In [53]:
[x.strip() for x in val.split(',')]

['a', 'b', 'guido']

## _**<font color=skyblue><font color=orange>sep</font>.join( <font color=orange>list</font> )</font>**_ : 用分隔符 sep 将 list 中的字符串连接起来, list → str

In [54]:
'::'.join( val.split(',') )

' a ::b:: guido '

## _**<font color=skyblue><font color=orange>str</font>.index( <font color=orange>sep</font> ), <font color=orange>str</font>.find( <font color=orange>sep</font> )</font>**_ : 返回 sep 在 str 中第一次出现的位置
> <font size=3>区别 : 如果 sep 在 str 中不存在, _str.find_ 返回 -1 , _str.index_ 会引发 ValueError 异常</font>

In [55]:
val.index(',')

3

In [56]:
val.find(':')

-1

## _**<font color=skyblue><font color=orange>str</font>.rfind( <font color=orange>sep</font> )</font>**_ : 返回 sep 在 str 中最后一次出现的位置

In [57]:
val.rfind(',')

5

## _**<font color=skyblue><font color=orange>str</font>.count( <font color=orange>sep</font> )</font>**_ : 返回 sep 在 str 中出现的次数

In [58]:
val.count(',')

2

## _**<font color=skyblue><font color=orange>str</font>.replace( <font color=orange>old</font>, <font color=orange>new</font> )</font>**_ : 替换

In [59]:
val.replace(',', '::')

' a ::b:: guido '

## _**<font color=skyblue><font color=orange>str</font>.endswith( <font color=orange>sep</font> ), <font color=orange>str</font>.startswith( <font color=orange>sep</font> )</font>**_ : 判断 str 是否以 sep 结尾或开始

In [60]:
val.strip().endswith('a')

False

In [61]:
val.strip().startswith('a')

True

## _**<font color=skyblue><font color=orange>str</font>.lower( ), <font color=orange>str</font>.upper( ), <font color=orange>str</font>.title( )</font>**_ : 控制大小写

In [62]:
val.title(), val.upper(), val.lower()

(' A ,B, Guido ', ' A ,B, GUIDO ', ' a ,b, guido ')

# 正则表达式

In [63]:
import re

## _**<font color=skyblue>re.split( <font color=orange>pattern</font>, <font color=orange>str</font> )</font>**_ : 根据正则表达式 pattern 拆分字符串（ 适用于 str 中分隔符数量不定的情况 ）

In [64]:
text = "foo   bar\t baz \tqux"
# Raw string: in a plain literal '\s' is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning from Python 3.12).
# r'\s+' matches a run of one or more whitespace characters.
re.split(r'\s+', text)

['foo', 'bar', 'baz', 'qux']

## _**<font color=skyblue>re.compile( <font color=orange>pattern</font> )</font>**_ : 根据 pattern 返回一个正则表达式类（ regex ）的对象

In [65]:
# Compile the pattern once so it can be reused. The raw string keeps '\s' from
# being read as an (invalid) string escape sequence.
regex = re.compile(r'\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

<font size=4> _**<font color=skyblue><font color=orange>regex</font>.split( <font color=orange>str</font> )</font>**_ : 根据 regex 拆分字符串</font>

## _**<font color=skyblue>re.findall( <font color=orange>pattern</font>, <font color=orange>str</font> ), <font color=orange>regex</font>.findall( <font color=orange>str</font> )</font>**_ : 返回字符串中的正则表达式匹配项

In [66]:
regex.findall(text)

['   ', '\t ', ' \t']

In [67]:
text = """WANG wangbj27@mail2.sysu.edu.cn
Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# \.[A-Z]{2,4} : the match must end with a literal '.' followed by 2 to 4 letters
# re.IGNORECASE lets the upper-case character classes match lower-case text too.
regex = re.compile(pattern, flags=re.IGNORECASE)
regex.findall(text)

['wangbj27@mail2.sysu.edu.cn',
 'dave@google.com',
 'steve@gmail.com',
 'rob@gmail.com',
 'ryan@yahoo.com']

## _**<font color=skyblue>re.finditer( <font color=orange>pattern</font>, <font color=orange>str</font> ), <font color=orange>regex</font>.finditer( <font color=orange>str</font> )</font>**_ : 以迭代器的形式返回字符串中的正则表达式匹配项

In [68]:
for x in regex.finditer(text):
    print(x.group())

wangbj27@mail2.sysu.edu.cn
dave@google.com
steve@gmail.com
rob@gmail.com
ryan@yahoo.com


## _**<font color=skyblue>re.sub( <font color=orange>pattern</font>, <font color=orange>repl</font>, <font color=orange>str</font> ), <font color=orange>regex</font>.sub( <font color=orange>repl</font>, <font color=orange>str</font> )</font>**_ : 替换字符串中的正则表达式匹配项

In [69]:
print(regex.sub(repl='E-mail', string=text))

WANG E-mail
Dave E-mail
Steve E-mail
Rob E-mail
Ryan E-mail


## 正则表达式的分组模式

### pattern :  r' **<font color=ff8888>(</font>** [A-Z0-9._%+-]+ **<font color=ff8888>)</font>** @ **<font color=ff8888>(</font>** [A-Z0-9.-]+ **<font color=ff8888>)</font>** \. **<font color=ff8888>(</font>** [A-Z]{2,4} **<font color=ff8888>)</font>** '

In [70]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
regex.findall(text)

[('wangbj27', 'mail2.sysu.edu', 'cn'),
 ('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

### sub 还能通过 **<font color=ff8888>\1</font>**、**<font color=ff8888>\2</font>** 之类的特殊符号访问各匹配项中的分组, 符号 **<font color=ff8888>\1</font>** 对应第一个匹配的组

In [71]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

WANG Username: wangbj27, Domain: mail2.sysu.edu, Suffix: cn
Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com


# pandas 的矢量化字符串方法 _obj.str.func( ... )_  
> <font size=4>将字符串的方法应用于 series 的各个单元里去</font>

## _**<font color=skyblue><font color=orange>series</font>.str.contains( <font color=orange>pattern</font> )</font>**_ : 检查各元素是否含有 pattern（默认按正则表达式匹配）, 缺失值返回 NaN

In [72]:
# Name -> e-mail address; 'Wes' has no address, so its value is missing.
data = pd.Series({
    'Dave': 'dave@google.com',
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan,
})
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [73]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

## _**<font color=skyblue><font color=orange>series</font>.str.findall( <font color=orange>pattern</font>, <font color=orange>flags</font> )</font>**_ 

In [74]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

## _**<font color=skyblue><font color=orange>series</font>.str.match( <font color=orange>pattern</font>, <font color=orange>flags</font> )</font>**_ 

In [75]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
data.str.match(pattern, flags=re.IGNORECASE)

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

## _**<font color=skyblue><font color=orange>series</font>.str.get( <font color=orange>i</font> ), <font color=orange>series</font>.str.slice( <font color=orange>start</font>, <font color=orange>stop</font> ), <font color=orange>series</font>.str[ <font color=orange>start</font> : <font color=orange>stop</font> ]</font>**_ : 切片

In [76]:
data.str.get(0)

Dave       d
Steve      s
Rob        r
Wes      NaN
dtype: object

In [77]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

## _**<font color=skyblue><font color=orange>series</font>.str.len( )</font>**_ 

In [78]:
data.str.len()

Dave     15.0
Steve    15.0
Rob      13.0
Wes       NaN
dtype: float64

## _**<font color=skyblue><font color=orange>series1</font>.str.cat( <font color=orange>series2</font>, <font color=orange>sep</font> )</font>**_ : 根据索引实现元素级字符串连接

In [79]:
# A Series whose values repeat its own index labels.
name = pd.Series({label: label for label in ['Dave', 'Steve', 'Rob', 'Wes']})
# Element-wise concatenation aligned on the index, joined by '----';
# a missing value on either side gives NaN for that row.
name.str.cat(others=data, sep='----')

Dave      Dave----dave@google.com
Steve    Steve----steve@gmail.com
Rob          Rob----rob@gmail.com
Wes                           NaN
dtype: object

<font size=4>_**<font color=skyblue><font color=orange>series</font>.str.len( )</font>**_ </font>  
<font size=4> _**<font color=skyblue><font color=orange>series</font>.str.lower( ), <font color=orange>series</font>.str.upper( ), <font color=orange>series</font>.str.title( )</font>**_ </font>   
<font size=4> _**<font color=skyblue><font color=orange>series</font>.str.strip( ), <font color=orange>series</font>.str.lstrip( ), <font color=orange>series</font>.str.rstrip( ) </font>**_ : 去除两边/左/右的空格</font>   

<font size=4> _**<font color=skyblue><font color=orange>series</font>.str.endswith( <font color=orange>sep</font> ), <font color=orange>series</font>.str.startswith( <font color=orange>sep</font> )</font>**_ </font>  
<font size=4> _**<font color=skyblue><font color=orange>series</font>.str.find( <font color=orange>sep</font> ), <font color=orange>series</font>.str.rfind( <font color=orange>sep</font> )</font>**_ : 返回 sep 在字符串中的位置</font>   
<font size=4> _**<font color=skyblue><font color=orange>series</font>.str.count( <font color=orange>sep</font> )</font>**_ : 计数 sep 在字符串中出现的次数</font>   
<font size=4> _**<font color=skyblue><font color=orange>series</font>.str.split( <font color=orange>sep</font> )</font>**_ : 根据分隔符 sep 对字符串进行划分, str→list</font>   
<font size=4> _**<font color=skyblue><font color=orange>series</font>.str.join( <font color=orange>sep</font> )</font>**_ : 利用分隔符 sep 将 list 连接起来, list→str</font>   
<font size=4> _**<font color=skyblue><font color=orange>series</font>.str.replace( <font color=orange>old</font>, <font color=orange>new</font> )</font>**_ : 替换</font>      

<font size=4> _**<font color=skyblue><font color=orange>series1</font>.str.cat( <font color=orange>series2</font>, <font color=orange>sep</font> )</font>**_ : 根据索引实现元素级字符串连接</font>  

<font size=4> _**<font color=skyblue>... ...</font>**_ </font>  