In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [3]:
df = pd.read_csv('E:\\Notes\\Statistics\\Feature Engineering\\Datasets\\train.csv', usecols=['Age','Pclass','SibSp','Parch','Survived'])
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0


In [4]:
df.isnull().sum()

Survived      0
Pclass        0
Age         177
SibSp         0
Parch         0
dtype: int64

In [5]:
df.dropna(inplace=True)

In [6]:
df.isnull().sum()

Survived    0
Pclass      0
Age         0
SibSp       0
Parch       0
dtype: int64

In [18]:
X = df.iloc[:,1:]
y = df.iloc[:,0]

In [19]:
np.mean(cross_val_score(LogisticRegression(),X,y,scoring='accuracy',cv=20))

np.float64(0.6933333333333332)

#### **Applying Feature Construction**

In [20]:
X['Family_size'] = X['SibSp'] + X['Parch'] + 1

In [21]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size
0,3,22.0,1,0,2
1,1,38.0,1,0,2
2,3,26.0,0,0,1
3,1,35.0,1,0,2
4,3,35.0,0,0,1


In [22]:
def myfunc(num):
    if num == 1:
        return 0  #Travelling Alone
    
    elif (num > 1 and num <= 4):
        return 1  #Travelling with small family
    
    else:
        return 2  #Travelling with Large Family

In [23]:
X['Family_type'] = X['Family_size'].apply(myfunc)

In [24]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size,Family_type
0,3,22.0,1,0,2,1
1,1,38.0,1,0,2,1
2,3,26.0,0,0,1,0
3,1,35.0,1,0,2,1
4,3,35.0,0,0,1,0


In [26]:
X.drop(columns=['SibSp','Parch','Family_size'], inplace=True)

In [27]:
X.head()

Unnamed: 0,Pclass,Age,Family_type
0,3,22.0,1
1,1,38.0,1
2,3,26.0,0
3,1,35.0,1
4,3,35.0,0


In [38]:
np.mean(cross_val_score(LogisticRegression(), X, y, cv=20, scoring='accuracy'))

np.float64(0.7003174603174602)

#### **Feature Splitting**

In [40]:
df2 = pd.read_csv('E:\\Notes\\Statistics\\Feature Engineering\\Datasets\\train.csv', usecols=['Age','Pclass','SibSp','Parch','Survived','Name'])
df2.head()

Unnamed: 0,Survived,Pclass,Name,Age,SibSp,Parch
0,0,3,"Braund, Mr. Owen Harris",22.0,1,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0
2,1,3,"Heikkinen, Miss. Laina",26.0,0,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0
4,0,3,"Allen, Mr. William Henry",35.0,0,0


In [41]:
df2['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

**Extracting Mr/Mrs from Name Column**

In [43]:
df2['Title'] = df2['Name'].str.split(',', expand=True)[1].str.split('.',expand=True)[0]
df2

Unnamed: 0,Survived,Pclass,Name,Age,SibSp,Parch,Title
0,0,3,"Braund, Mr. Owen Harris",22.0,1,0,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,Mrs
2,1,3,"Heikkinen, Miss. Laina",26.0,0,0,Miss
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,Mrs
4,0,3,"Allen, Mr. William Henry",35.0,0,0,Mr
...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",27.0,0,0,Rev
887,1,1,"Graham, Miss. Margaret Edith",19.0,0,0,Miss
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",,1,2,Miss
889,1,1,"Behr, Mr. Karl Howell",26.0,0,0,Mr


In [46]:
# Analyzing Survival Rate on the basis of Title
(df.groupby(df2['Title']).mean()['Survived']).sort_values()

Title
Capt            0.000000
Don             0.000000
Jonkheer        0.000000
Rev             0.000000
Mr              0.168342
Major           0.500000
Dr              0.500000
Col             0.500000
Master          0.583333
Miss            0.719178
Mrs             0.787037
Lady            1.000000
Mme             1.000000
Mlle            1.000000
Ms              1.000000
Sir             1.000000
the Countess    1.000000
Name: Survived, dtype: float64