# Import packages and Settings

In [1]:
import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Patch scikit-learn BEFORE importing anything from it: the Intel extension
# documentation states that estimators must be imported after patch_sklearn()
# has run, otherwise the stock implementations are the ones that get bound.
from sklearnex import patch_sklearn

patch_sklearn()

from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
# Load the train and test splits from CSV files under ./Data.
train_data = pd.read_csv('./Data/train.csv')
test_data = pd.read_csv('./Data/test.csv')

# Recode Sex as an integer (male -> 0, female -> 1) in both splits.
# Series.map turns any value missing from the mapping into NaN.
train_data['Sex'] = train_data['Sex'].map({'male':0, 'female':1})
test_data['Sex'] = test_data['Sex'].map({'male':0, 'female':1})

# Name of the target column; passed to PrepareData by the experiments below.
label = 'Survived'

# Functions

In [4]:
def makeOneHot(column, train_data, test_data):
    """Replace a categorical column with one-hot indicator columns.

    The encoder is fitted on ``train_data[column]`` only and then applied to
    ``test_data[column]``, so both outputs get the same indicator columns
    (named by ``OneHotEncoder.get_feature_names_out``).

    Parameters
    ----------
    column : str
        Name of the column to encode; it must exist in both frames.
    train_data, test_data : pandas.DataFrame
        Input frames. Neither is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` copies with ``column`` removed and the integer
        indicator columns appended on the right.

    Notes
    -----
    The encoder is built with its default unknown-category handling, so a
    category that only occurs in ``test_data`` is presumably rejected by
    ``transform`` -- confirm against the installed scikit-learn version.
    """
    train_data_new = train_data.copy()
    test_data_new = test_data.copy()

    encoder_onehot = OneHotEncoder(sparse_output=False)

    # Fit on the training split only so the test split cannot influence the
    # set of categories.
    train_encoded = encoder_onehot.fit_transform(pd.DataFrame(train_data[column]))
    feature_names = encoder_onehot.get_feature_names_out()
    test_encoded = encoder_onehot.transform(pd.DataFrame(test_data[column]))

    # Carry the source index over to the indicator frames. pd.concat(axis=1)
    # aligns on index labels, so a fresh RangeIndex here would misalign rows
    # (and inject NaNs) whenever the input index is not exactly 0..n-1.
    train_data_onehot = pd.DataFrame(
        train_encoded, columns=feature_names, index=train_data.index, dtype=int
    )
    test_data_onehot = pd.DataFrame(
        test_encoded, columns=feature_names, index=test_data.index, dtype=int
    )

    train_data_new = pd.concat([train_data_new, train_data_onehot], axis=1)
    test_data_new = pd.concat([test_data_new, test_data_onehot], axis=1)

    # The raw categorical column is superseded by its indicators.
    train_data_new = train_data_new.drop(column, axis=1)
    test_data_new = test_data_new.drop(column, axis=1)

    return train_data_new, test_data_new

In [9]:
def PrepareData(columns_to_drop, label, train_data, test_data):
    """Split the frames into model inputs and the training target.

    Parameters
    ----------
    columns_to_drop : list of str
        Extra columns to exclude from the features of both splits.
    label : str
        Name of the target column in ``train_data``.
    train_data, test_data : pandas.DataFrame
        Input frames; neither is modified.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test)``: the training features, the training
        target column, and the test features.

    Notes
    -----
    ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` are always removed in
    addition to ``columns_to_drop``. Every listed column must be present,
    otherwise ``DataFrame.drop`` raises ``KeyError``.
    """
    excluded = columns_to_drop + ['PassengerId', 'Name', 'Ticket', 'Cabin']

    X_train = train_data.drop(columns=[label] + excluded)
    X_test = test_data.drop(columns=excluded)
    Y_train = train_data[label]

    return X_train, Y_train, X_test

In [6]:
def Eval(model, X_train, Y_train):
    """Return the mean cross-validation score of ``model``.

    ``cross_val_score`` is called with its default fold count and scoring, and
    the per-fold scores are averaged into a single number.
    """
    return cross_val_score(model, X_train, Y_train).mean()

# Base Model

In [11]:
# Baseline feature set: Age, Fare and Embarked are excluded, on top of the
# columns PrepareData always removes.
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data, test_data)

In [12]:
# Baseline score: mean cross-validation score of a seeded decision tree.
model = DecisionTreeClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7845395769254911


In [15]:
# Refit the baseline tree on the whole training set and predict the test split.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, Y_train)
predictions = model.predict(X_test)

In [16]:
# Pair each test PassengerId with its predicted label and write the table to CSV
# without the index column.
# NOTE(review): to_csv does not create missing directories; ./Result presumably
# already exists -- confirm before a fresh run.
result = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions
})
result.to_csv('./Result/result_basic.csv', index=False)

# Improvements

## Univariate

### Fare

In [13]:
def processFare(train_data, test_data):
    """Fill missing ``Fare`` values with the most frequent training fare.

    The fill value is the first mode of ``train_data['Fare']`` and is applied
    to both splits, so the test split never contributes to it. Filling the
    training split as well keeps this helper consistent with ``processAge``
    and ``processEmbarked``; it is a no-op when the training split has no
    missing fares.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing a ``Fare`` column. Neither is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` copies with missing ``Fare`` values filled.
    """
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Fare'
    # Learn the fill value from the training split only.
    fill_value = train_data_new[feature].mode()[0]
    train_data_new[feature] = train_data_new[feature].fillna(value=fill_value)
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    return train_data_new, test_data_new

In [14]:
# Fill missing Fare values (processFare) on copies of the raw frames.
train_data_new, test_data_new = processFare(train_data, test_data)

In [15]:
# Fare stays in as a feature; Age and Embarked are still excluded.
columns_to_drop = ['Age', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [16]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7946833218253719


### Age

In [17]:
def processAge(train_data, test_data):
    """Fill missing ``Age`` values in both splits with the training mode.

    The fill value is the first mode of ``train_data['Age']``. Both inputs are
    left untouched; filled copies are returned as ``(train, test)``.
    """
    feature = 'Age'
    # The fill value comes from the training split only.
    most_common = train_data[feature].mode()[0]

    filled_frames = []
    for frame in (train_data, test_data):
        filled = frame.copy()
        filled[feature] = filled[feature].fillna(value=most_common)
        filled_frames.append(filled)

    return tuple(filled_frames)

In [18]:
# Fill missing Age values (processAge) on copies of the raw frames.
train_data_new, test_data_new = processAge(train_data, test_data)

In [19]:
# Age stays in as a feature; Fare and Embarked are excluded.
columns_to_drop = ['Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [20]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.8159437574540205


### Embarked

In [21]:
def processEmbarked(train_data, test_data):
    """Fill missing ``Embarked`` values and one-hot encode the column.

    Missing entries in both splits are replaced by the first mode of the
    training column, then the column is swapped for indicator columns via
    ``makeOneHot``. Returns new ``(train, test)`` frames; the inputs are not
    modified.
    """
    feature = 'Embarked'
    # The fill value comes from the training split only.
    most_common = train_data[feature].mode()[0]

    train_filled = train_data.copy()
    train_filled[feature] = train_filled[feature].fillna(value=most_common)

    test_filled = test_data.copy()
    test_filled[feature] = test_filled[feature].fillna(value=most_common)

    train_encoded, test_encoded = makeOneHot(feature, train_filled, test_filled)
    return train_encoded, test_encoded

In [22]:
# Fill missing Embarked values and replace the column with one-hot indicators.
train_data_new, test_data_new = processEmbarked(train_data, test_data)

In [23]:
# The Embarked indicators stay in as features; Age and Fare are excluded.
columns_to_drop = ['Age', 'Fare']
# Pass the list once: concatenating it with itself only repeated every name.
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [24]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7946268281965978


### Title

In [25]:
def processTitle(train_data, test_data):
    """Derive a one-hot encoded ``Title`` feature from the ``Name`` column.

    The title is the run of letters that follows a space and precedes a period
    in each name. Spelling variants are folded into ``Miss``/``Mrs`` and a
    fixed list of uncommon titles is collapsed into ``Rare``. The resulting
    column is then replaced by indicator columns via ``makeOneHot``.

    Returns new ``(train, test)`` frames; the inputs are not modified and the
    ``Name`` column is kept.
    """
    feature = 'Title'
    variants = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    uncommon = ['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

    def extract_title(names):
        # Names without a match yield NaN here and pass through both replaces.
        titles = names.str.extract(r' ([A-Za-z]+)\.', expand=False)
        return titles.replace(variants).replace(uncommon, 'Rare')

    train_titled = train_data.copy()
    train_titled[feature] = extract_title(train_titled['Name'])

    test_titled = test_data.copy()
    test_titled[feature] = extract_title(test_titled['Name'])

    train_encoded, test_encoded = makeOneHot(feature, train_titled, test_titled)
    return train_encoded, test_encoded

In [26]:
# Add one-hot Title indicators extracted from Name (processTitle).
train_data_new, test_data_new = processTitle(train_data, test_data)

In [27]:
# Same exclusions as the baseline, so the Title indicators are the only
# addition.
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [28]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.8081036971941498


### Family

In [82]:
def processFamily(train_data, test_data):
    """Replace ``SibSp`` and ``Parch`` with a single ``Family`` column.

    ``Family`` is ``SibSp + Parch + 1``. Both source columns are removed from
    the returned ``(train, test)`` frames; the inputs are not modified.
    """
    def with_family(frame):
        family_size = frame['SibSp'] + frame['Parch'] + 1
        return frame.assign(Family=family_size).drop(columns=['SibSp', 'Parch'])

    return with_family(train_data), with_family(test_data)

In [83]:
# Collapse SibSp and Parch into a single Family column (processFamily).
train_data_new, test_data_new = processFamily(train_data, test_data)

In [84]:
# Same exclusions as the baseline; Family replaces SibSp/Parch.
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [85]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7879166405122089


### Alone

In [86]:
def processAlone(train_data, test_data):
    """Replace ``SibSp``/``Parch`` with a binary ``Alone`` flag.

    ``processFamily`` first derives ``Family``; ``Alone`` is 1 where
    ``Family == 1`` and 0 otherwise, after which ``Family`` is dropped.
    Returns new ``(train, test)`` frames.
    """
    def with_alone_flag(frame):
        alone = frame['Family'].map(lambda size: int(size == 1))
        return frame.assign(Alone=alone).drop(columns='Family')

    train_family, test_family = processFamily(train_data, test_data)
    return with_alone_flag(train_family), with_alone_flag(test_family)

In [87]:
# Replace SibSp/Parch with a binary Alone flag (processAlone).
train_data_new, test_data_new = processAlone(train_data, test_data)

In [88]:
# Same exclusions as the baseline; Alone replaces SibSp/Parch.
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [89]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7912937040989265


### FareBinned

In [37]:
def processFareBin(train_data, test_data):
    """Replace ``Fare`` with ``FareBinned``, a 5-level equal-width bin code.

    Missing fares in both splits are filled with the first mode of the
    training fares. Bin edges are learned from the (filled) training column
    with ``pd.cut`` and reused for the test split. For the test split the
    outermost edges are opened to -inf/+inf, so values below the training
    minimum or above the training maximum land in the first/last bin instead
    of becoming NaN.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing a ``Fare`` column. Neither is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` copies with ``Fare`` removed and a categorical
        ``FareBinned`` column (labels 0-4) appended.
    """
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Fare'
    new_feature = 'FareBinned'
    labels = [0, 1, 2, 3, 4]

    # Learn the fill value from the training split only and apply it to both
    # splits, so no NaN reaches pd.cut.
    fill_value = train_data[feature].mode()[0]
    train_data_new[feature] = train_data_new[feature].fillna(value=fill_value)
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    train_data_new[new_feature], bins = pd.cut(
        train_data_new[feature], len(labels), labels=labels, retbins=True
    )

    # The interior edges stay exactly as learned from train; only the outer
    # edges are widened so every test value falls into some bin.
    test_bins = bins.astype(float)
    test_bins[0], test_bins[-1] = float('-inf'), float('inf')
    test_data_new[new_feature] = pd.cut(test_data_new[feature], bins=test_bins, labels=labels)

    train_data_new = train_data_new.drop(feature, axis=1)
    test_data_new = test_data_new.drop(feature, axis=1)

    return train_data_new, test_data_new

In [38]:
# Replace Fare with its 5-level binned version (processFareBin).
train_data_new, test_data_new = processFareBin(train_data, test_data)

In [39]:
# Fare is already gone (replaced by FareBinned), so only Age and Embarked are
# listed here.
columns_to_drop = ['Age', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [40]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7856631724311092


### AgeBinned

In [41]:
def processAgeBin(train_data, test_data):
    """Replace ``Age`` with ``AgeBinned``, a 5-level equal-width bin code.

    Missing ages in both splits are filled with the first mode of the training
    ages. Bin edges are learned from the filled training column with
    ``pd.cut`` and reused for the test split. For the test split the outermost
    edges are opened to -inf/+inf, so ages below the training minimum or above
    the training maximum land in the first/last bin instead of becoming NaN.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing an ``Age`` column. Neither is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` copies with ``Age`` removed and a categorical
        ``AgeBinned`` column (labels 0-4) appended.
    """
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Age'
    new_feature = 'AgeBinned'
    labels = [0, 1, 2, 3, 4]

    # Learn the fill value from the training split only.
    fill_value = train_data[feature].mode()[0]
    train_data_new[feature] = train_data_new[feature].fillna(value=fill_value)
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    train_data_new[new_feature], bins = pd.cut(
        train_data_new[feature], len(labels), labels=labels, retbins=True
    )

    # The interior edges stay exactly as learned from train; only the outer
    # edges are widened so every test value falls into some bin.
    test_bins = bins.astype(float)
    test_bins[0], test_bins[-1] = float('-inf'), float('inf')
    test_data_new[new_feature] = pd.cut(test_data_new[feature], bins=test_bins, labels=labels)

    train_data_new = train_data_new.drop(feature, axis=1)
    test_data_new = test_data_new.drop(feature, axis=1)

    return train_data_new, test_data_new

In [42]:
# Replace Age with its 5-level binned version (processAgeBin).
train_data_new, test_data_new = processAgeBin(train_data, test_data)

In [43]:
# Age is already gone (replaced by AgeBinned), so only Fare and Embarked are
# listed here.
columns_to_drop = ['Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [44]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7912685958194714


### TicketNumber

In [45]:
def processTicketNumber(train_data, test_data):
    """Add a numeric ``TicketNumber`` column parsed from ``Ticket``.

    The number is the last space-separated token of the ticket string when
    that token consists only of digits; otherwise the value is -1. The
    ``Ticket`` column itself is kept. Returns new ``(train, test)`` frames;
    the inputs are not modified.
    """
    def trailing_number(ticket):
        last_token = ticket.rsplit(' ', 1)[-1]
        return int(last_token) if last_token.isdigit() else -1

    def with_ticket_number(frame):
        return frame.assign(TicketNumber=frame['Ticket'].apply(trailing_number))

    return with_ticket_number(train_data), with_ticket_number(test_data)

In [46]:
# Add a numeric TicketNumber column parsed from Ticket (processTicketNumber).
train_data_new, test_data_new = processTicketNumber(train_data, test_data)

In [47]:
# Same exclusions as the baseline. PrepareData still removes the raw Ticket
# column, so TicketNumber is the only addition.
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [48]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7677170296905405


## Multivariate

### Fare + Age

In [49]:
# Chain the two imputers: Fare first, then Age on the already-processed frames.
train_data_new, test_data_new = processFare(train_data, test_data)
train_data_new, test_data_new = processAge(train_data_new, test_data_new)

In [50]:
# Fare and Age both stay in as features; only Embarked is excluded.
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [51]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7755759211600026


### Fare + AgeBinned

In [52]:
# Fill Fare, then replace Age with AgeBinned on the already-processed frames.
train_data_new, test_data_new = processFare(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)

In [53]:
# Fare and AgeBinned stay in as features; only Embarked is excluded.
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [54]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7867993220764549


### FareBinned + Age

In [55]:
# Replace Fare with FareBinned, then fill Age on the already-processed frames.
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAge(train_data_new, test_data_new)

In [56]:
# FareBinned and Age stay in as features; only Embarked is excluded.
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [57]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.8181846713953927


### FareBinned + AgeBinned

In [58]:
# Bin both continuous columns: Fare -> FareBinned, then Age -> AgeBinned.
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)

In [59]:
# Both binned columns stay in as features; only Embarked is excluded.
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [60]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7912811499591992


### FareBinned + Embarked

In [64]:
# Replace Fare with FareBinned, then one-hot encode Embarked.
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)

In [65]:
# Fare and Embarked were already replaced by derived columns; only Age is left
# to exclude.
columns_to_drop = ['Age']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [66]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8036093151716779


### FareBinned + Title

In [70]:
# Replace Fare with FareBinned, then add the one-hot Title indicators.
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processTitle(train_data_new, test_data_new)

In [71]:
# Fare is already replaced by FareBinned; Age and Embarked are excluded.
columns_to_drop = ['Age', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [72]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.8114682066411399


### FareBinned + Family

In [93]:
# Replace Fare with FareBinned, then collapse SibSp/Parch into Family.
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processFamily(train_data_new, test_data_new)

In [94]:
# Fare is already replaced by FareBinned; Age and Embarked are excluded.
columns_to_drop = ['Age', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [95]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7890339589479631


### FareBinned + Alone

In [96]:
# Replace Fare with FareBinned, then replace SibSp/Parch with the Alone flag.
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)

In [97]:
# Fare is already replaced by FareBinned; Age and Embarked are excluded.
columns_to_drop = ['Age', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [98]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7946519364760529


### AgeBinned + Embarked

In [99]:
# Replace Age with AgeBinned, then one-hot encode Embarked.
train_data_new, test_data_new = processAgeBin(train_data, test_data)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)

In [100]:
# Age and Embarked were already replaced by derived columns; only Fare is left
# to exclude.
columns_to_drop = ['Fare']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [101]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7912748728893352


### AgeBinned + Title

In [102]:
# Replace Age with AgeBinned, then add the one-hot Title indicators.
train_data_new, test_data_new = processAgeBin(train_data, test_data)
train_data_new, test_data_new = processTitle(train_data_new, test_data_new)

In [103]:
# Age is already replaced by AgeBinned; Fare and Embarked are excluded.
columns_to_drop = ['Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [104]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7901889398028998


### AgeBinned + Family

In [105]:
# Replace Age with AgeBinned, then collapse SibSp/Parch into Family.
train_data_new, test_data_new = processAgeBin(train_data, test_data)
train_data_new, test_data_new = processFamily(train_data_new, test_data_new)

In [106]:
# Age is already replaced by AgeBinned; Fare and Embarked are excluded.
columns_to_drop = ['Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [107]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.8058565061829139


### AgeBinned + Alone

In [108]:
# Replace Age with AgeBinned, then replace SibSp/Parch with the Alone flag.
train_data_new, test_data_new = processAgeBin(train_data, test_data)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)

In [109]:
# Age is already replaced by AgeBinned; Fare and Embarked are excluded.
columns_to_drop = ['Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [110]:
# Mean cross-validation score of a seeded decision tree on this feature set.
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.8080911430544221
