### Environment

In [5]:
import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

### Generate dummy variables

In [8]:
df=pd.DataFrame({'key':['b','b','a','c','a','b'],
                'data1':range(6)})
df  #generates data

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [10]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [11]:
dummies = pd.get_dummies(df['key'], prefix='key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [12]:
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


### String object manipulation

In [19]:
val='202D Wilson Rd, Pullman, Washington'
val.split(',')

['202D Wilson Rd', ' Pullman', ' Washington']

In [20]:
pieces=[x.strip() for x in val.split(',')]
pieces

['202D Wilson Rd', 'Pullman', 'Washington']

In [23]:
first, second, third=pieces
'Street:'+first+' City:'+second+' State:'+third

'Street:202D Wilson Rd City:Pullman State:Washington'

In [30]:
'::'.join(pieces)

'202D Wilson Rd::Pullman::Washington'

In [31]:
val

'202D Wilson Rd, Pullman, Washington'

In [25]:
'Washington' in val

True

In [26]:
val.index(',')

14

In [35]:
val.find('Wilson') # Wilson starts at 6th character

5

In [36]:
val.count(',')

2

In [37]:
val.replace(',','_')

'202D Wilson Rd_ Pullman_ Washington'

In [39]:
val.replace(',','')

'202D Wilson Rd Pullman Washington'

### Regular Expressions in the string

In [43]:
import re
text = "apple    ball\t cat  \tdog"
re.split('\s+', text)

['apple', 'ball', 'cat', 'dog']

In [47]:
regex=re.compile('\s+') #another way
regex.split(text)

['apple', 'ball', 'cat', 'dog']

In [48]:
regex.findall(text) #regular text expressions

['    ', '\t ', '  \t']

In [55]:
text = """Dave dave@msn.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)
m=regex.search(text)
m

<re.Match object; span=(5, 17), match='dave@msn.com'>

In [56]:
text[m.start():m.end()]

'dave@msn.com'

In [57]:
print(regex.match(text))

None


In [58]:
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [61]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()

('wesm', 'bright', 'net')

In [62]:
regex.findall(text)

[('dave', 'msn', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [63]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: msn, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



### Vectorized string functions using pandas

In [64]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [65]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [66]:
pattern
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [69]:
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [71]:
matches.str.get(1)
matches.str[0]

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [73]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object