# Import packages and Settings

In [1]:
import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from sklearnex import patch_sklearn

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
train_data = pd.read_csv('./Data/train.csv')
test_data = pd.read_csv('./Data/test.csv')

train_data['Sex'] = train_data['Sex'].map({'male':0, 'female':1})
test_data['Sex'] = test_data['Sex'].map({'male':0, 'female':1})

label = 'Survived'

# Functions

In [4]:
def PrepareData(columns_to_drop, label, train_data, test_data):
    X_train = train_data.drop([label] + columns_to_drop, axis=1)
    Y_train = train_data[label]
    X_test = test_data.drop(columns_to_drop, axis=1)
    print(X_train.shape, Y_train.shape, X_test.shape)
    return X_train, Y_train, X_test

In [5]:
def EvalTree(X_train, Y_train):
    model = DecisionTreeClassifier(random_state=0)
    scores = cross_val_score(model, X_train, Y_train)
    print(scores.mean())

# Base Model

In [13]:
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Age', 'Fare', 'Cabin', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data, test_data)

(891, 4) (891,) (418, 4)


In [14]:
EvalTree(X_train, Y_train)

0.7845395769254911


In [15]:
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, Y_train)
predictions = model.predict(X_test)

In [16]:
result = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions
})
result.to_csv('./Result/result_basic.csv', index=False)

# Improvements

## Fare

In [11]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [12]:
feature = 'Fare'
fill_value = -1
test_data_imp[feature] = test_data_imp[feature].fillna(value=fill_value)

In [17]:
columns_to_drop = ['PassengerId', 'Name', 'Age', 'Ticket', 'Cabin', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_imp, test_data_imp)

(891, 5) (891,) (418, 5)


In [18]:
EvalTree(X_train, Y_train)

0.7946833218253719


## Age

In [19]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [20]:
feature = 'Age'
fill_value = -1
train_data_imp[feature] = train_data_imp[feature].fillna(value=fill_value)
test_data_imp[feature] = test_data_imp[feature].fillna(value=fill_value)

In [25]:
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_imp, test_data_imp)

(891, 5) (891,) (418, 5)


In [26]:
EvalTree(X_train, Y_train)

0.8047705730964786


## Embarked

In [27]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [28]:
feature = 'Embarked'
fill_value = train_data[feature].mode()[0]
train_data_imp[feature] = train_data_imp[feature].fillna(value=fill_value)
test_data_imp[feature] = test_data_imp[feature].fillna(value=fill_value)

In [29]:
encoder_onehot = OneHotEncoder(sparse_output=False)

train_data_onehot = encoder_onehot.fit_transform(pd.DataFrame(train_data_imp[feature]))
train_feature_name_onehot = encoder_onehot.get_feature_names_out()
train_data_onehot = pd.DataFrame(train_data_onehot, columns=train_feature_name_onehot, dtype=int)

test_data_onehot = encoder_onehot.transform(pd.DataFrame(test_data_imp[feature]))
test_data_onehot = pd.DataFrame(test_data_onehot, columns=train_feature_name_onehot, dtype=int)

train_data_onehot.head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [30]:
train_data_imp = pd.concat([train_data_imp, train_data_onehot], axis=1)
test_data_imp = pd.concat([test_data_imp, test_data_onehot], axis=1)
train_data_imp.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,0,0,1
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0,0,1


In [32]:
columns_to_drop = ['PassengerId', 'Name', 'Age', 'Fare', 'Ticket', 'Cabin', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_imp, test_data_imp)

(891, 7) (891,) (418, 7)


In [33]:
EvalTree(X_train, Y_train)

0.7946268281965978


## Title

In [67]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [68]:
feature = 'Title'

In [69]:
train_data_imp[feature] = train_data_imp['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
train_data_imp[feature] = train_data_imp[feature].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
train_data_imp[feature] = train_data_imp[feature].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
pd.crosstab(train_data_imp[feature], train_data_imp['Sex'])

Sex,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Master,40,0
Miss,0,185
Mr,517,0
Mrs,0,126
Rare,20,3


In [71]:
test_data_imp[feature] = test_data_imp['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test_data_imp[feature] = test_data_imp[feature].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
test_data_imp[feature] = test_data_imp[feature].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
pd.crosstab(test_data_imp[feature], test_data_imp['Sex'])

Sex,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Master,21,0
Miss,0,79
Mr,240,0
Mrs,0,72
Rare,5,1


In [72]:
encoder_onehot = OneHotEncoder(sparse_output=False)

train_data_onehot = encoder_onehot.fit_transform(pd.DataFrame(train_data_imp[feature]))
train_feature_name_onehot = encoder_onehot.get_feature_names_out()
train_data_onehot = pd.DataFrame(train_data_onehot, columns=train_feature_name_onehot, dtype=int)

test_data_onehot = encoder_onehot.transform(pd.DataFrame(test_data_imp[feature]))
test_data_onehot = pd.DataFrame(test_data_onehot, columns=train_feature_name_onehot, dtype=int)

train_data_onehot.head()

Unnamed: 0,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,0,0,1,0,0
1,0,0,0,1,0
2,0,1,0,0,0
3,0,0,0,1,0
4,0,0,1,0,0


In [73]:
train_data_imp = pd.concat([train_data_imp, train_data_onehot], axis=1)
test_data_imp = pd.concat([test_data_imp, test_data_onehot], axis=1)
train_data_imp.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,Mr,0,0,1,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,0,0,0,1,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,0,1,0,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,Mrs,0,0,0,1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,Mr,0,0,1,0,0


In [74]:
columns_to_drop = ['PassengerId', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_imp, test_data_imp)

(891, 9) (891,) (418, 9)


In [75]:
EvalTree(X_train, Y_train)

0.8081036971941498


## Family

In [42]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [43]:
train_data_imp['Family'] = train_data_imp['SibSp'] + train_data_imp['Parch'] + 1
test_data_imp['Family'] = test_data_imp['SibSp'] + test_data_imp['Parch'] + 1

In [44]:
columns_to_drop = ['PassengerId', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_imp, test_data_imp)

(891, 5) (891,) (418, 5)


In [45]:
EvalTree(X_train, Y_train)

0.7890214048082356


## Alone

In [46]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [47]:
train_data_imp['Family'] = train_data_imp['SibSp'] + train_data_imp['Parch'] + 1
test_data_imp['Family'] = test_data_imp['SibSp'] + test_data_imp['Parch'] + 1

In [48]:
train_data_imp['Alone'] = 0
train_data_imp.loc[train_data_imp['Family'] == 1, 'Alone'] = 1
test_data_imp['Alone'] = 0
test_data_imp.loc[test_data_imp['Family'] == 1, 'Alone'] = 1

In [49]:
columns_to_drop = ['PassengerId', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Family']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_imp, test_data_imp)

(891, 5) (891,) (418, 5)


In [50]:
EvalTree(X_train, Y_train)

0.7845395769254911


## FareBinned

In [51]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [52]:
feature = 'Fare'
fill_value = train_data[feature].mode()[0]
test_data_imp[feature] = test_data_imp[feature].fillna(value=fill_value)

In [53]:
train_data_imp['FareBinned'], fare_bin = pd.cut(train_data_imp['Fare'], 5, labels=[0,1,2,3,4], retbins=True)
fare_bin

array([ -0.5123292, 102.46584  , 204.93168  , 307.39752  , 409.86336  ,
       512.3292   ])

In [54]:
test_data_imp['FareBinned'] = pd.cut(test_data_imp['Fare'], bins=fare_bin, labels=[0,1,2,3,4])
test_data_imp['FareBinned'].unique()

[0, 2, 1, 4]
Categories (5, int64): [0 < 1 < 2 < 3 < 4]

In [55]:
columns_to_drop = ['PassengerId', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_imp, test_data_imp)

(891, 5) (891,) (418, 5)


In [56]:
EvalTree(X_train, Y_train)

0.7856631724311092


## AgeBinned

In [57]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [58]:
feature = 'Age'
fill_value = train_data[feature].mode()[0]
train_data_imp[feature] = train_data_imp[feature].fillna(value=fill_value)
test_data_imp[feature] = test_data_imp[feature].fillna(value=fill_value)

In [59]:
train_data_imp['AgeBinned'], fare_bin = pd.cut(train_data_imp['Age'], 5, labels=[0,1,2,3,4], retbins=True)
fare_bin

array([ 0.34042, 16.336  , 32.252  , 48.168  , 64.084  , 80.     ])

In [60]:
test_data_imp['AgeBinned'] = pd.cut(train_data_imp['Age'], bins=fare_bin, labels=[0,1,2,3,4])
test_data_imp['AgeBinned'].unique()

[1, 2, 3, 0, 4]
Categories (5, int64): [0 < 1 < 2 < 3 < 4]

In [61]:
columns_to_drop = ['PassengerId', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_imp, test_data_imp)

(891, 5) (891,) (418, 5)


In [62]:
EvalTree(X_train, Y_train)

0.7912685958194714


## TicketNumber

In [63]:
train_data_imp = train_data.copy()
test_data_imp = test_data.copy()

In [64]:
def get_ticket_number(ticket):
    num = ticket.split(' ')[-1]
    if num.isdigit():
        return int(num)
    else:
        return -1

train_data_imp['TicketNumber'] = train_data_imp['Ticket'].apply(get_ticket_number)
test_data_imp['TicketNumber'] = test_data_imp['Ticket'].apply(get_ticket_number)

In [65]:
columns_to_drop = ['PassengerId', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_imp, test_data_imp)

(891, 5) (891,) (418, 5)


In [66]:
EvalTree(X_train, Y_train)

0.7677170296905405
