# Import packages and Settings

In [1]:
import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from sklearnex import patch_sklearn

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
train_data = pd.read_csv('./Data/train.csv')
test_data = pd.read_csv('./Data/test.csv')

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


# Preprocessing

## Missing Values

In [6]:
train_data_filled = train_data.copy()
test_data_filled = test_data.copy()

In [7]:
feature = 'Fare'
fill_value = -1
test_data_filled[feature] = test_data_filled[feature].fillna(value=fill_value)

In [10]:
feature = 'Age'
fill_value = -1
train_data_filled[feature] = train_data_filled[feature].fillna(value=fill_value)
test_data_filled[feature] = test_data_filled[feature].fillna(value=fill_value)

In [13]:
feature = 'Embarked'
fill_value = train_data_filled[feature].mode()
train_data_filled[feature] = train_data_filled[feature].fillna(value=fill_value)

## New Features

In [30]:
train_data_new = train_data_filled.copy()
test_data_new = test_data_filled.copy()

In [33]:
train_data_new['Title'] = train_data_new['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
train_data_new['Title'] = train_data_new['Title'].replace(['Mlle', 'Ms'], 'Miss')
train_data_new['Title'] = train_data_new['Title'].replace(['Mme', 'Dona'], 'Mrs')
train_data_new['Title'] = train_data_new['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer'], 'Rare')
pd.crosstab(train_data_new['Title'], train_data_new['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Master,0,40
Miss,185,0
Mr,0,517
Mrs,126,0
Rare,3,20


In [34]:
test_data_new['Title'] = test_data_new['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test_data_new['Title'] = test_data_new['Title'].replace(['Mlle', 'Ms'], 'Miss')
test_data_new['Title'] = test_data_new['Title'].replace(['Mme', 'Dona'], 'Mrs')
test_data_new['Title'] = test_data_new['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer'], 'Rare')
pd.crosstab(test_data_new['Title'], test_data_new['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Master,0,21
Miss,79,0
Mr,0,240
Mrs,73,0
Rare,0,5


## Categorical Features

In [35]:
train_data_transformed = train_data_new.copy()
test_data_transformed = test_data_new.copy()

In [36]:
train_data_transformed['Sex'] = train_data_transformed['Sex'].map({'male':0, 'female':1})
test_data_transformed['Sex'] = test_data_transformed['Sex'].map({'male':0, 'female':1})

In [37]:
encoder_onehot = OneHotEncoder(sparse_output=False)

train_data_onehot = encoder_onehot.fit_transform(train_data_transformed[['Embarked', 'Title']])
train_feature_name_onehot = encoder_onehot.get_feature_names_out()
train_data_onehot = pd.DataFrame(train_data_onehot, columns=train_feature_name_onehot, dtype=int)

test_data_onehot = encoder_onehot.transform(test_data_transformed[['Embarked', 'Title']])
test_data_onehot = pd.DataFrame(test_data_onehot, columns=train_feature_name_onehot, dtype=int)

train_data_onehot.head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,0,0,1,0,0,0,1,0,0
1,1,0,0,0,0,0,0,1,0
2,0,0,1,0,0,1,0,0,0
3,0,0,1,0,0,0,0,1,0
4,0,0,1,0,0,0,1,0,0


In [38]:
train_data_transformed = pd.concat([train_data_transformed, train_data_onehot], axis=1)
test_data_transformed = pd.concat([test_data_transformed, test_data_onehot], axis=1)
train_data_transformed.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Title,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,...,Mr,0,0,1,0,0,0,1,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,...,Mrs,1,0,0,0,0,0,0,1,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,...,Miss,0,0,1,0,0,1,0,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,...,Mrs,0,0,1,0,0,0,0,1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,...,Mr,0,0,1,0,0,0,1,0,0


# Training and Validation

In [27]:
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked', 'Title']

In [24]:
Y_train = train_data['Survived']

In [39]:
X_train = train_data_transformed.drop(['Survived'] + columns_to_drop, axis=1)

X_test = test_data_transformed.drop(columns_to_drop, axis=1)
X_train.shape, Y_train.shape, X_test.shape

((891, 15), (891,), (418, 15))

In [40]:
model = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(model, X_train, Y_train)
scores.mean()

0.7856443412215178