In [11]:
import pandas as pd
import numpy as np
import re #Regex

**String Object Methods**

In [2]:
val = 'a,b,guido'
val.split(',')

['a', 'b', 'guido']

In [3]:
pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

In [4]:
first,second,third = pieces
first+'::'+second+'::'+third

'a::b::guido'

In [5]:
'::'.join(pieces)

'a::b::guido'

In [6]:
'guido' in val

True

In [7]:
val.index(',')

1

In [8]:
val.find("L")

-1

In [9]:
val.count(',')

2

In [10]:
val.replace(',','::')

'a::b::guido'

![Table 7-3](7_3.png)

**Regular Expression**

In [12]:
text = "foo    bar\t barz \tqux"
re.split('\s+',text)

['foo', 'bar', 'barz', 'qux']

In [14]:
regex = re.compile('\s+')
regex.split(text)

['foo', 'bar', 'barz', 'qux']

In [16]:
regex.findall(text)#Patterns

['    ', '\t ', ' \t']

In [32]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}'
regex = re.compile(pattern) #Add flags=re.IGNORECASE as a parameter to make regex case insensitive
#regex = re.compile(pattern,flags=re.IGNORECASE)

In [33]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [35]:
m = regex.search(text)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [36]:
text[m.start():m.end()]

'dave@google.com'

In [37]:
print(regex.match(text))

None


In [39]:
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



![](7_4.png)

**Vectorized String Functions in Pandas**

In [40]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
.....: 'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [41]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [42]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [45]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags = re.IGNORECASE)
m = regex.match('dave@google.com')
m.groups()
data.str.findall(pattern)

Dave      []
Steve     []
Rob       []
Wes      NaN
dtype: object

![Table 7-5](7_5.png)