#### Importing the Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

#### Importing the Dataset

In [2]:
# Read the full CSV, then keep only the columns used in this notebook
# (the four feature columns first, the 'Survived' label last).
dataset = pd.read_csv('Titanic.csv')
dataset = dataset[['Age','Pclass','SibSp','Parch','Survived']]
dataset.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [3]:
# Discard every row that has a missing value in any of the selected columns.
dataset = dataset.dropna()

In [4]:
# Features: the first four columns of `dataset` (Age, Pclass, SibSp, Parch).
# An explicit copy is taken because new columns are added to `x` further down;
# without it those assignments would be writes into a slice of `dataset`.
x = dataset.iloc[:, 0:4].copy()
# Target: the last column of `dataset` (Survived).
y = dataset.iloc[:, -1]

#### Checking the accuracy of model before Feature Construction

In [5]:
# Baseline: run 20-fold cross validation of a default Logistic Regression on the
# features (x) and labels (y), scoring each fold by accuracy, and average the fold scores
cross_val_score(LogisticRegression(), x, y, scoring='accuracy', cv=20).mean()

0.6933333333333332

#### Applying Feature Construction

In [6]:
# Family size = siblings/spouses ('SibSp') + parents/children ('Parch') + 1,
# so a value of 1 corresponds to a passenger with no relatives aboard.
# `assign` builds a new DataFrame instead of writing a column into the existing
# object, so this is safe whether or not `x` is a view of `dataset`.
x = x.assign(Family_size=x['SibSp'] + x['Parch'] + 1)
x.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size
0,22.0,3,1,0,2
1,38.0,1,1,0,2
2,26.0,3,0,0,1
3,35.0,1,1,0,2
4,35.0,3,0,0,1


In [7]:
def construction(num):
    """Convert a family-size count into a family-type code.

    Returns:
        0 when ``num`` equals 1 (alone),
        1 when ``num`` is greater than 1 and at most 4 (small family),
        2 for any other value (large family).
    """
    # A single person is encoded as "alone"
    if num == 1:
        return 0
    # Everything that is not in the small-family range falls through to "large"
    return 1 if 1 < num <= 4 else 2

In [8]:
# Encode each row's family size as a family-type code using `construction`
x['Family_type'] = x['Family_size'].map(construction)
x.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size,Family_type
0,22.0,3,1,0,2,1
1,38.0,1,1,0,2,1
2,26.0,3,0,0,1,0
3,35.0,1,1,0,2,1
4,35.0,3,0,0,1,0


In [9]:
# Remove the columns that 'Family_type' was built from, leaving Age, Pclass and Family_type
x = x.drop(columns=['SibSp','Parch','Family_size'])
x.head()

Unnamed: 0,Age,Pclass,Family_type
0,22.0,3,1
1,38.0,1,1
2,26.0,3,0
3,35.0,1,1
4,35.0,3,0


#### Checking the accuracy of model after Feature Construction

In [10]:
# Same evaluation as before feature construction: mean accuracy of a default
# Logistic Regression over 20 cross-validation folds, now on the updated features (x)
cross_val_score(LogisticRegression(), x, y, scoring='accuracy', cv=20).mean()

0.7003174603174602