# Import packages and Settings

In [2]:
import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from sklearnex import patch_sklearn

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
train_data = pd.read_csv('./Data/train.csv')
test_data = pd.read_csv('./Data/test.csv')

# Base Model

In [5]:
train_data['Sex'] = train_data['Sex'].map({'male':0, 'female':1})
test_data['Sex'] = test_data['Sex'].map({'male':0, 'female':1})

In [6]:
X_train = train_data.drop('Survived', axis=1).select_dtypes(include='number')
Y_train = train_data['Survived']
X_test = test_data.select_dtypes(include='number')
X_train.shape, Y_train.shape, X_test.shape

((891, 7), (891,), (418, 7))

In [18]:
model = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(model, X_train, Y_train)
scores.mean()

0.7464879794112107

In [7]:
model.fit(X_train, Y_train)
predictions = model.predict(X_test)

In [8]:
result = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions
})
result.to_csv('./Result/result_basic.csv', index=False)

# Improvements

## Fare

In [5]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [6]:
feature = 'Fare'
fill_value = -1
test_data_imp[feature] = test_data_imp[feature].fillna(value=fill_value)

In [7]:
test_data_imp[feature].isna().sum()

0

In [9]:
X_train = train_data_imp[['Pclass', 'SibSp', 'Parch', 'Sex', 'Fare']]
X_test = test_data_imp[['Pclass', 'SibSp', 'Parch', 'Sex', 'Fare']]
X_train.shape, X_test.shape

((891, 5), (418, 5))

In [10]:
model = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(model, X_train, Y_train)
scores.mean()

0.7946833218253719

## Age

In [8]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [9]:
feature = 'Age'
fill_value = -1
train_data_imp[feature] = train_data_imp[feature].fillna(value=fill_value)
test_data_imp[feature] = test_data_imp[feature].fillna(value=fill_value)

In [10]:
test_data_imp[feature].isna().sum()

0

In [11]:
X_train = train_data_imp[['Pclass', 'SibSp', 'Parch', 'Sex', 'Age']]
X_test = test_data_imp[['Pclass', 'SibSp', 'Parch', 'Sex', 'Age']]
X_train.shape, X_test.shape

((891, 5), (418, 5))

In [14]:
model = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(model, X_train, Y_train)
scores.mean()

0.7845395769254911

## Embarked

In [24]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [25]:
feature = 'Embarked'
fill_value = train_data[feature].mode()[0]
train_data_imp[feature] = train_data_imp[feature].fillna(value=fill_value)
test_data_imp[feature] = test_data_imp[feature].fillna(value=fill_value)

In [26]:
train_data_imp[feature].isna().sum()

0

In [27]:
encoder_onehot = OneHotEncoder(sparse_output=False)

train_data_onehot = encoder_onehot.fit_transform(pd.DataFrame(train_data_imp[feature]))
train_feature_name_onehot = encoder_onehot.get_feature_names_out()
train_data_onehot = pd.DataFrame(train_data_onehot, columns=train_feature_name_onehot, dtype=int)

test_data_onehot = encoder_onehot.transform(pd.DataFrame(test_data_imp[feature]))
test_data_onehot = pd.DataFrame(test_data_onehot, columns=train_feature_name_onehot, dtype=int)

train_data_onehot.head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [28]:
train_data_imp = pd.concat([train_data_imp, train_data_onehot], axis=1)
test_data_imp = pd.concat([test_data_imp, test_data_onehot], axis=1)
train_data_imp.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,0,0,1
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,0,1


In [30]:
X_train = train_data_imp[['Pclass', 'SibSp', 'Parch', 'Sex', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
X_test = test_data_imp[['Pclass', 'SibSp', 'Parch', 'Sex', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
X_train.shape, X_test.shape

((891, 7), (418, 7))

In [31]:
model = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(model, X_train, Y_train)
scores.mean()

0.7946268281965978

## Title

In [37]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [36]:
feature = 'Title'

In [38]:
train_data_imp[feature] = train_data_imp['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
train_data_imp[feature] = train_data_imp[feature].replace(['Mlle', 'Ms'], 'Miss')
train_data_imp[feature] = train_data_imp[feature].replace(['Mme', 'Dona'], 'Mrs')
train_data_imp[feature] = train_data_imp[feature].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer'], 'Rare')
pd.crosstab(train_data_imp[feature], train_data_imp['Sex'])

Sex,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Master,40,0
Miss,0,185
Mr,517,0
Mrs,0,126
Rare,20,3


In [39]:
test_data_imp[feature] = test_data_imp['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test_data_imp[feature] = test_data_imp[feature].replace(['Mlle', 'Ms'], 'Miss')
test_data_imp[feature] = test_data_imp[feature].replace(['Mme', 'Dona'], 'Mrs')
test_data_imp[feature] = test_data_imp[feature].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer'], 'Rare')
pd.crosstab(train_data_imp[feature], train_data_imp['Sex'])

Sex,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Master,40,0
Miss,0,185
Mr,517,0
Mrs,0,126
Rare,20,3


In [40]:
encoder_onehot = OneHotEncoder(sparse_output=False)

train_data_onehot = encoder_onehot.fit_transform(pd.DataFrame(train_data_imp[feature]))
train_feature_name_onehot = encoder_onehot.get_feature_names_out()
train_data_onehot = pd.DataFrame(train_data_onehot, columns=train_feature_name_onehot, dtype=int)

test_data_onehot = encoder_onehot.transform(pd.DataFrame(test_data_imp[feature]))
test_data_onehot = pd.DataFrame(test_data_onehot, columns=train_feature_name_onehot, dtype=int)

train_data_onehot.head()

Unnamed: 0,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,0,0,1,0,0
1,0,0,0,1,0
2,0,1,0,0,0
3,0,0,0,1,0
4,0,0,1,0,0


In [41]:
train_data_imp = pd.concat([train_data_imp, train_data_onehot], axis=1)
test_data_imp = pd.concat([test_data_imp, test_data_onehot], axis=1)
train_data_imp.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,Mr,0,0,1,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,0,0,0,1,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,0,1,0,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,Mrs,0,0,0,1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,Mr,0,0,1,0,0


In [42]:
columns_to_drop = ['PassengerId', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title']
X_train = train_data_imp.drop(['Survived'] + columns_to_drop, axis=1)

X_test = test_data_imp.drop(columns_to_drop, axis=1)
X_train.shape, X_test.shape

((891, 9), (418, 9))

In [43]:
model = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(model, X_train, Y_train)
scores.mean()

0.8081036971941498

## Family

In [7]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [8]:
train_data_imp['Family'] = train_data_imp['SibSp'] + train_data_imp['Parch'] + 1
test_data_imp['Family'] = test_data_imp['SibSp'] + test_data_imp['Parch'] + 1

In [9]:
columns_to_drop = ['PassengerId', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Embarked']
X_train = train_data_imp.drop(['Survived'] + columns_to_drop, axis=1)

X_test = test_data_imp.drop(columns_to_drop, axis=1)
X_train.shape, X_test.shape

((891, 5), (418, 5))

In [10]:
model = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(model, X_train, Y_train)
scores.mean()

0.7890214048082356

## Alone

In [11]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [12]:
train_data_imp['Family'] = train_data_imp['SibSp'] + train_data_imp['Parch'] + 1
test_data_imp['Family'] = test_data_imp['SibSp'] + test_data_imp['Parch'] + 1

In [13]:
train_data_imp['Alone'] = 0
train_data_imp.loc[train_data_imp['Family'] == 1, 'Alone'] = 1
test_data_imp['Alone'] = 0
test_data_imp.loc[test_data_imp['Family'] == 1, 'Alone'] = 1

In [15]:
columns_to_drop = ['PassengerId', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Family']
X_train = train_data_imp.drop(['Survived'] + columns_to_drop, axis=1)

X_test = test_data_imp.drop(columns_to_drop, axis=1)
X_train.shape, X_test.shape

((891, 5), (418, 5))

In [16]:
model = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(model, X_train, Y_train)
scores.mean()

0.7845395769254911

## FareBinned

In [44]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [45]:
feature = 'Fare'
fill_value = train_data[feature].mode()[0]
test_data_imp[feature] = test_data_imp[feature].fillna(value=fill_value)

In [46]:
test_data_imp[feature].isna().sum()

0

In [47]:
train_data_imp['FareBinned'], fare_bin = pd.cut(train_data_imp['Fare'], 5, labels=[0,1,2,3,4], retbins=True)
fare_bin

array([ -0.5123292, 102.46584  , 204.93168  , 307.39752  , 409.86336  ,
       512.3292   ])

In [48]:
test_data_imp['FareBinned'] = pd.cut(test_data_imp['Fare'], bins=fare_bin, labels=[0,1,2,3,4])
test_data_imp['FareBinned'].unique()

[0, 2, 1, 4]
Categories (5, int64): [0 < 1 < 2 < 3 < 4]

In [49]:
columns_to_drop = ['PassengerId', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Embarked']
X_train = train_data_imp.drop(['Survived'] + columns_to_drop, axis=1)

X_test = test_data_imp.drop(columns_to_drop, axis=1)
X_train.shape, X_test.shape

((891, 5), (418, 5))

In [50]:
model = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(model, X_train, Y_train)
scores.mean()

0.7856631724311092

## AgeBinned

In [53]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [54]:
feature = 'Age'
fill_value = train_data[feature].mode()[0]
train_data_imp[feature] = train_data_imp[feature].fillna(value=fill_value)
test_data_imp[feature] = test_data_imp[feature].fillna(value=fill_value)

In [55]:
train_data_imp['AgeBinned'], fare_bin = pd.cut(train_data_imp['Age'], 5, labels=[0,1,2,3,4], retbins=True)
fare_bin

array([ 0.34042, 16.336  , 32.252  , 48.168  , 64.084  , 80.     ])

In [56]:
test_data_imp['AgeBinned'] = pd.cut(train_data_imp['Age'], bins=fare_bin, labels=[0,1,2,3,4])
test_data_imp['AgeBinned'].unique()

[1, 2, 3, 0, 4]
Categories (5, int64): [0 < 1 < 2 < 3 < 4]

In [57]:
columns_to_drop = ['PassengerId', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Embarked']
X_train = train_data_imp.drop(['Survived'] + columns_to_drop, axis=1)

X_test = test_data_imp.drop(columns_to_drop, axis=1)
X_train.shape, X_test.shape

((891, 5), (418, 5))

In [58]:
model = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(model, X_train, Y_train)
scores.mean()

0.7912685958194714