# <span style='color:IndianRed'> Python Feature Engineering with Pandas.  </span>

## <span style='color:darkcyan'> Python Regular Expressions.</span>
A **regular expression** is a sequence of characters that specifies a search pattern in text.

In [1]:
# Import Libraries.
import numpy as np
import pandas as pd

In [2]:
# Read the Titanic training CSV into a DataFrame and show a preview of it.
csv_path = 'titanic_train.csv'
df = pd.read_csv(csv_path)
display(df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### <span style='color:royalblue'> Basics of Regular Expressions.</span>

In [4]:
# Importing Regular Expressions Library.
import re

#### <span style='color:darkGoldenRod'> Searching for a String.</span>

In [5]:
# Search for the literal text "William" anywhere in the sample name.
expression = "William"
name_search = re.compile(expression).search("Allen, Mr. William Henry")
print(name_search)

<re.Match object; span=(11, 18), match='William'>


In [6]:
# Index 0 of a match object is the full text matched by the pattern.
print(name_search[0])

William


#### <span style='color:darkGoldenRod'> Searching for a String Based On Different Casing.</span>

In [7]:
 expression = "[Ww]illiam"
name_search = re.search(expression, "Allen, Mr. William Henry")
print(name_search)

<re.Match object; span=(11, 18), match='William'>


In [8]:
# Index 0 of a match object is the full text matched by the pattern.
print(name_search[0])

William


#### <span style='color:darkGoldenRod'> Searching for a String Regardless of Casing.</span>

In [9]:
# The inline flag (?i) makes the whole pattern case-insensitive.
expression = '(?i)william'
sample_name = "Allen, Mr. William Henry"
name_search = re.search(expression, sample_name)
print(name_search)

<re.Match object; span=(11, 18), match='William'>


In [10]:
# Index 0 of a match object is the full text matched by the pattern.
print(name_search[0])

William


#### <span style='color:darkGoldenRod'> Searching for the Beginning or End of String.</span>

In [11]:
# Case: end of a string. The "$" anchor pins the match to the end of the text.
sample_name = "Allen, Mr. William Henry"
expression = "Henry$"
name_search = re.search(expression, sample_name)
print(name_search)

<re.Match object; span=(19, 24), match='Henry'>


In [12]:
# Index 0 of a match object is the full text matched by the pattern.
print(name_search[0])

Henry


In [13]:
# Case: beginning of a string. The "^" anchor pins the match to the start of the text.
expression = "^Allen"
name_search = re.search(pattern=expression, string="Allen, Mr. William Henry")
print(name_search)

<re.Match object; span=(0, 5), match='Allen'>


In [14]:
# Index 0 of a match object is the full text matched by the pattern.
print(name_search[0])

Allen


#### <span style='color:darkGoldenRod'> Searching for a Pattern.</span>

In [15]:
# One or more letters followed by a literal period. The pattern is a raw string
# so that "\." reaches the regex engine unchanged instead of being parsed as an
# (invalid) Python string escape.
expression = r"([A-Za-z]+)\."
name_search = re.search(expression, "Allen, Mr. William Henry")
print(name_search)
print(name_search.group(0))  # whole match, including the period
print(name_search.group(1))  # first capture group: the letters only

<re.Match object; span=(7, 10), match='Mr.'>
Mr.
Mr


In [16]:
# Two consecutive "letters + period" tokens separated by a space, each captured
# in its own group. Raw string: "\." must not be parsed as a Python escape.
expression = r"([A-Za-z]+)\. ([A-Za-z]+)\."
titles_search = re.search(expression, "Allen, Dr. Mr. William Henry")
print(titles_search.group(0))  # whole match
print(titles_search.group(1))  # first captured token
print(titles_search.group(2))  # second captured token

Dr. Mr.
Dr
Mr


#### <span style='color:darkGoldenRod'> Extracting Based on Patterns.</span>

In [17]:
# findall returns the captured group of every non-overlapping match.
# Raw string: "\." must not be parsed as a Python escape.
titles_search = re.findall(r'([A-Za-z]+)\.', "Mrs. Mr. Ms. Dr.")
print(titles_search)

['Mrs', 'Mr', 'Ms', 'Dr']


### <span style='color:royalblue'> Regular Expressions using Pandas.</span>

#### <span style='color:darkGoldenRod'> Applying the count() function.</span>

In [18]:
# Count, for every name, how often a pattern anchored at the start of the
# string matches; print the total first and then the per-row counts.
expression = "^Braund.+Owen"
match_counts = df['Name'].str.count(expression)
print(match_counts.sum())
print(match_counts)

1
0      1
1      0
2      0
3      0
4      0
      ..
886    0
887    0
888    0
889    0
890    0
Name: Name, Length: 891, dtype: int64


In [19]:
# Keep only the rows whose name matches the pattern exactly once.
matches_once = df['Name'].str.count(expression) == 1
display(df[matches_once])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [20]:
# Count occurrences of "William" in each name, then total them over the column.
expression = 'William'
william_counts = df['Name'].str.count(expression)
print(william_counts)
print(william_counts.sum())

0      0
1      0
2      0
3      0
4      1
      ..
886    0
887    0
888    0
889    0
890    0
Name: Name, Length: 891, dtype: int64
70


In [21]:
# Display the rows whose name contains the pattern more than once.
repeated = df['Name'].str.count(expression).gt(1)
display(df[repeated])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
348,349,1,3,"Coutts, Master. William Loch ""William""",male,3.0,1,1,C.A. 37671,15.9,,S


#### <span style='color:darkGoldenRod'> Applying the extract() function.</span>

In [22]:
# Extract an expression and create a new column.
# The capture group grabs the run of letters that follows a space and is
# terminated by a literal period. A raw string keeps "\." from being parsed as
# an (invalid) Python string escape, and expand=False returns a Series for the
# single group rather than a one-column DataFrame.
expression = r' ([A-Za-z]+)\.'
df['Title'] = df['Name'].str.extract(expression, expand=False)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,Rev
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Miss
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Miss
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Mr


In [23]:
# str.extract can also be used to split one column into several.
cabin = df['Cabin']

# Cabin_Type: a single leading letter in the range A-G.
expression = "^([A-G])"
df["Cabin_Type"] = cabin.str.extract(expression)

# Cabin_Number: everything after the first character.
expression = "^.{1}(.*)"
df["Cabin_Number"] = cabin.str.extract(expression)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Cabin_Type,Cabin_Number
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr,,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,C,85
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss,,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs,C,123
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,Rev,,
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Miss,B,42
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Miss,,
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Mr,C,148


#### <span style='color:darkGoldenRod'> Applying the contains() and match() functions.</span>

In [24]:
# Baseline with str.count: total number of matches, then the rows with exactly one.
expression = "William"
per_name = df["Name"].str.count(expression)
print(per_name.sum())
display(df[per_name == 1])

70


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Cabin_Type,Cabin_Number
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr,,
12,13,0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8.0500,,S,Mr,,
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S,Mr,,
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5000,A6,S,Mr,A,6
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C,Mrs,B,78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
802,803,1,1,"Carter, Master. William Thornton II",male,11.0,1,2,113760,120.0000,B96 B98,S,Master,B,96 B98
810,811,0,3,"Alexander, Mr. William",male,26.0,0,0,3474,7.8875,,S,Mr,,
864,865,0,2,"Gill, Mr. John William",male,24.0,0,0,233866,13.0000,,S,Mr,,
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S,Mrs,,


In [25]:
# str.contains yields a boolean mask: True where the pattern occurs anywhere in the name.
expression = "William"
has_pattern = df["Name"].str.contains(expression)
df.loc[has_pattern]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Cabin_Type,Cabin_Number
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr,,
12,13,0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8.0500,,S,Mr,,
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S,Mr,,
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5000,A6,S,Mr,A,6
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C,Mrs,B,78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
802,803,1,1,"Carter, Master. William Thornton II",male,11.0,1,2,113760,120.0000,B96 B98,S,Master,B,96 B98
810,811,0,3,"Alexander, Mr. William",male,26.0,0,0,3474,7.8875,,S,Mr,,
864,865,0,2,"Gill, Mr. John William",male,24.0,0,0,233866,13.0000,,S,Mr,,
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S,Mrs,,


In [26]:
# Anchoring with "^" restricts both the count and the filter to names that
# begin with the pattern.
expression = '^William'
name_col = df["Name"]
print(name_col.str.count(expression).sum())
df[name_col.str.contains(expression)]

5


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Cabin_Type,Cabin_Number
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S,Mr,,
155,156,0,1,"Williams, Mr. Charles Duane",male,51.0,0,1,PC 17597,61.3792,,C,Mr,,
304,305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S,Mr,,
351,352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",male,,0,0,113510,35.0,C128,S,Mr,C,128.0
735,736,0,3,"Williams, Mr. Leslie",male,28.5,0,0,54636,16.1,,S,Mr,,


In [27]:
# str.match only tests from the start of each string, so this unanchored
# pattern selects names that begin with "William".
expression = 'William'
starts_with_pattern = df['Name'].str.match(expression)
df[starts_with_pattern]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Cabin_Type,Cabin_Number
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S,Mr,,
155,156,0,1,"Williams, Mr. Charles Duane",male,51.0,0,1,PC 17597,61.3792,,C,Mr,,
304,305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S,Mr,,
351,352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",male,,0,0,113510,35.0,C128,S,Mr,C,128.0
735,736,0,3,"Williams, Mr. Leslie",male,28.5,0,0,54636,16.1,,S,Mr,,


#### <span style='color:darkGoldenRod'> Applying the replace() function.</span>

In [28]:
# Fold the alternative spellings into 'Miss'. These are plain-text
# substitutions, so regex=False is passed explicitly: the default value of
# `regex` differs between pandas 1.x and 2.x, and the next cell already states
# its choice explicitly.
# NOTE(review): 'Mme' is mapped to 'Miss' here; confirm that this is intended
# rather than 'Mrs'.
for old_title in ('Mlle', 'Ms', 'Mme'):
    df['Title'] = df['Title'].str.replace(old_title, 'Miss', regex=False)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Cabin_Type,Cabin_Number
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr,,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,C,85
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss,,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs,C,123
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,Rev,,
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Miss,B,42
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Miss,,
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Mr,C,148


In [29]:
# Rewrite the same abbreviations inside the full name. The pattern requires a
# leading space and a literal trailing period, so only the title token is
# touched. It is a raw string so that "\." is not parsed as an (invalid) Python
# string escape; the three alternatives are mutually exclusive, so one
# alternation gives the same result as three successive replacements.
# NOTE(review): 'Mme' is mapped to 'Miss' here; confirm that this is intended
# rather than 'Mrs'.
df['Name'] = df['Name'].str.replace(r' (?:Mlle|Ms|Mme)\.', ' Miss.', regex=True)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Cabin_Type,Cabin_Number
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr,,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,C,85
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss,,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs,C,123
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,Rev,,
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Miss,B,42
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Miss,,
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Mr,C,148


#### <span style='color:darkGoldenRod'> Applying the findall() function.</span>

In [30]:
# Find every word containing a double "l". The pattern has two capture groups,
# so each hit is returned as a (whole word, 'll') tuple.
expression = '([A-Za-z]*(ll)[a-z]*)'
names = df['Name']
names.str.findall(expression)

0                                []
1                                []
2                                []
3                  [(Futrelle, ll)]
4      [(Allen, ll), (William, ll)]
                   ...             
886                              []
887                              []
888                              []
889                  [(Howell, ll)]
890                              []
Name: Name, Length: 891, dtype: object

In [31]:
# Keep only the names that produced at least one hit: filter(len, ...) drops
# every entry whose length is zero.
list(filter(len, df['Name'].str.findall(expression)))

[[('Futrelle', 'll')],
 [('Allen', 'll'), ('William', 'll')],
 [('Bonnell', 'll')],
 [('William', 'll')],
 [('Williams', 'll')],
 [('William', 'll')],
 [('Ellen', 'll'), ('Nellie', 'll')],
 [('William', 'll')],
 [('William', 'll')],
 [('William', 'll')],
 [('Driscoll', 'll')],
 [('William', 'll')],
 [('Ramell', 'll')],
 [('Lillian', 'll')],
 [('Caldwell', 'll')],
 [('Dowdell', 'll')],
 [('Achille', 'll')],
 [('William', 'll')],
 [('Fuller', 'll')],
 [('William', 'll')],
 [('William', 'll')],
 [('Ellis', 'll')],
 [('Futrelle', 'll')],
 [('Nicholls', 'll')],
 [('Billiard', 'll')],
 [('Williams', 'll')],
 [('William', 'll')],
 [('Chibnall', 'll')],
 [('William', 'll')],
 [('Hallace', 'll')],
 [('William', 'll')],
 [('Walle', 'll')],
 [('Ellen', 'll')],
 [('Newell', 'll')],
 [('Mellors', 'll'), ('William', 'll')],
 [('Lovell', 'll'), ('Hall', 'll')],
 [('Wallach', 'll')],
 [('Lillian', 'll')],
 [('William', 'll')],
 [('Collyer', 'll')],
 [('Pengelly', 'll'), ('William', 'll')],
 [('William

#### <span style='color:darkGoldenRod'> Applying the split() function.</span>

In [38]:
# Split each name on one pattern with three capture groups:
# "<group 1>, <group 2>. <group 3>".
# The displayed result has five columns: the three captured groups (1-3) plus
# the text before (0) and after (4) the match, both empty in the rows shown.
# NOTE(review): the "." right after the second group is unescaped, so it matches
# any character -- presumably a literal period was intended; confirm.
expression = '([\'A-Za-z ()"//.-]+), ([A-Za-z]+). ([A-Za-z ()"//.-]*)'
df1 = df['Name'].str.split(expression, expand=True)
display(df1)

Unnamed: 0,0,1,2,3,4
0,,Braund,Mr,Owen Harris,
1,,Cumings,Mrs,John Bradley (Florence Briggs Thayer),
2,,Heikkinen,Miss,Laina,
3,,Futrelle,Mrs,Jacques Heath (Lily May Peel),
4,,Allen,Mr,William Henry,
...,...,...,...,...,...
886,,Montvila,Rev,Juozas,
887,,Graham,Miss,Margaret Edith,
888,,Johnston,Miss,"Catherine Helen ""Carrie""",
889,,Behr,Mr,Karl Howell,


In [39]:
# Remove the columns labelled 0 and 4, keeping columns 1-3.
# Reassigning (instead of inplace=True) and ignoring labels that are already
# gone keeps the cell safe to re-run without raising a KeyError.
df1 = df1.drop(columns=[0, 4], errors='ignore')
display(df1)

Unnamed: 0,1,2,3
0,Braund,Mr,Owen Harris
1,Cumings,Mrs,John Bradley (Florence Briggs Thayer)
2,Heikkinen,Miss,Laina
3,Futrelle,Mrs,Jacques Heath (Lily May Peel)
4,Allen,Mr,William Henry
...,...,...,...
886,Montvila,Rev,Juozas
887,Graham,Miss,Margaret Edith
888,Johnston,Miss,"Catherine Helen ""Carrie"""
889,Behr,Mr,Karl Howell


In [40]:
# Give the three remaining columns descriptive labels.
split_columns = ['Last Name', 'Title', 'First Name']
df1.columns = split_columns
display(df1)

Unnamed: 0,Last Name,Title,First Name
0,Braund,Mr,Owen Harris
1,Cumings,Mrs,John Bradley (Florence Briggs Thayer)
2,Heikkinen,Miss,Laina
3,Futrelle,Mrs,Jacques Heath (Lily May Peel)
4,Allen,Mr,William Henry
...,...,...,...
886,Montvila,Rev,Juozas
887,Graham,Miss,Margaret Edith
888,Johnston,Miss,"Catherine Helen ""Carrie"""
889,Behr,Mr,Karl Howell
