# Import packages and Settings

In [1]:
import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from sklearnex import patch_sklearn

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
train_data = pd.read_csv('./Data/train.csv')
test_data = pd.read_csv('./Data/test.csv')

train_data['Sex'] = train_data['Sex'].map({'male':0, 'female':1})
test_data['Sex'] = test_data['Sex'].map({'male':0, 'female':1})

label = 'Survived'

# Functions

In [4]:
def makeOneHot(column, train_data, test_data):
    train_data_new = train_data.copy()
    test_data_new = test_data.copy()

    encoder_onehot = OneHotEncoder(sparse_output=False)

    train_data_onehot = encoder_onehot.fit_transform(pd.DataFrame(train_data[column]))
    train_feature_name_onehot = encoder_onehot.get_feature_names_out()
    train_data_onehot = pd.DataFrame(train_data_onehot, columns=train_feature_name_onehot, dtype=int)

    test_data_onehot = encoder_onehot.transform(pd.DataFrame(test_data[column]))
    test_data_onehot = pd.DataFrame(test_data_onehot, columns=train_feature_name_onehot, dtype=int)

    train_data_new = pd.concat([train_data_new, train_data_onehot], axis=1)
    test_data_new = pd.concat([test_data_new, test_data_onehot], axis=1)

    train_data_new = train_data_new.drop(column, axis=1)
    test_data_new = test_data_new.drop(column, axis=1)

    return train_data_new, test_data_new

In [5]:
def PrepareData(columns_to_drop, label, train_data, test_data):
    X_train = train_data.drop([label] + columns_to_drop + ['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    Y_train = train_data[label]
    X_test = test_data.drop(columns_to_drop + ['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    return X_train, Y_train, X_test

In [6]:
def Eval(model, X_train, Y_train):
    scores = cross_val_score(model, X_train, Y_train)
    return scores.mean()

# Base Model

In [120]:
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data, test_data)

In [121]:
model = DecisionTreeClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7845395769254911


In [122]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7879103634423451


In [123]:
model = RandomForestClassifier(n_estimators=500, random_state=0)
print(Eval(model, X_train, Y_train))

0.789033958947963


In [9]:
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, Y_train)
predictions = model.predict(X_test)

In [10]:
result = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions
})
result.to_csv('./Result/result_basic.csv', index=False)

# Improvements

## Univariate

### Fare

In [110]:
def processFare(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Fare'
    fill_value = train_data_new[feature].mode()[0]
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    return train_data_new, test_data_new

In [111]:
train_data_new, test_data_new = processFare(train_data, test_data)

In [112]:
columns_to_drop = ['Age', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [14]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7946833218253719


In [114]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7969367899064717


### Age

In [13]:
def processAge(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Age'
    fill_value = train_data_new[feature].mode()[0]
    train_data_new[feature] = train_data_new[feature].fillna(value=fill_value)
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    return train_data_new, test_data_new

In [14]:
train_data_new, test_data_new = processAge(train_data, test_data)

In [15]:
columns_to_drop = ['Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [18]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.8159437574540205


In [16]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8081162513338773


### Embarked

In [17]:
def processEmbarked(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Embarked'
    fill_value = train_data_new[feature].mode()[0]
    train_data_new[feature] = train_data_new[feature].fillna(value=fill_value)
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    train_data_new, test_data_new = makeOneHot('Embarked', train_data_new, test_data_new)

    return train_data_new, test_data_new

In [18]:
train_data_new, test_data_new = processEmbarked(train_data, test_data)

In [19]:
columns_to_drop = ['Age', 'Fare']
X_train, Y_train, X_test = PrepareData(columns_to_drop + columns_to_drop, label, train_data_new, test_data_new)

In [22]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7946268281965978


In [20]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7879229175820728


### Title

In [21]:
def processTitle(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Title'

    def process(name):
        new_feature = name.str.extract(r' ([A-Za-z]+)\.', expand=False)
        new_feature = new_feature.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
        new_feature = new_feature.replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        
        return new_feature
    
    train_data_new[feature] = process(train_data_new['Name'])
    test_data_new[feature] = process(test_data_new['Name'])

    train_data_new, test_data_new = makeOneHot(feature, train_data_new, test_data_new)

    return train_data_new, test_data_new

In [22]:
train_data_new, test_data_new = processTitle(train_data, test_data)

In [23]:
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [26]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.8081036971941498


In [24]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8047266336074321


### Family

In [25]:
def processFamily(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    train_data_new['Family'] = train_data_new['SibSp'] + train_data_new['Parch'] + 1
    test_data_new['Family'] = test_data_new['SibSp'] + test_data_new['Parch'] + 1

    train_data_new = train_data_new.drop(['SibSp', 'Parch'], axis=1)
    test_data_new = test_data_new.drop(['SibSp', 'Parch'], axis=1)

    return train_data_new, test_data_new

In [26]:
train_data_new, test_data_new = processFamily(train_data, test_data)

In [27]:
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [30]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7879166405122089


In [28]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7890402360178268


### Alone

In [29]:
def processAlone(train_data, test_data):
    train_data_new, test_data_new = processFamily(train_data, test_data)

    train_data_new['Alone'] = train_data_new['Family'].apply(lambda x: 1 if x == 1 else 0)
    test_data_new['Alone'] = test_data_new['Family'].apply(lambda x: 1 if x == 1 else 0)

    train_data_new = train_data_new.drop('Family', axis=1)
    test_data_new = test_data_new.drop('Family', axis=1)

    return train_data_new, test_data_new

In [30]:
train_data_new, test_data_new = processAlone(train_data, test_data)

In [31]:
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [34]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7912937040989265


In [32]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7912937040989265


### FareBinned

In [33]:
def processFareBin(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Fare'
    new_feature = 'FareBinned'

    fill_value = train_data[feature].mode()[0]
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    train_data_new[new_feature], bins = pd.cut(train_data_new[feature], 5, labels=[0,1,2,3,4], retbins=True)
    test_data_new[new_feature] = pd.cut(test_data_new[feature], bins=bins, labels=[0,1,2,3,4])

    train_data_new = train_data_new.drop(feature, axis=1)
    test_data_new = test_data_new.drop(feature, axis=1)

    return train_data_new, test_data_new

In [34]:
train_data_new, test_data_new = processFareBin(train_data, test_data)

In [35]:
columns_to_drop = ['Age', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [38]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7856631724311092


In [36]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7879103634423451


### AgeBinned

In [37]:
def processAgeBin(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Age'
    new_feature = 'AgeBinned'

    fill_value = train_data[feature].mode()[0]
    train_data_new[feature] = train_data_new[feature].fillna(value=fill_value)
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    train_data_new[new_feature], bins = pd.cut(train_data_new[feature], 5, labels=[0,1,2,3,4], retbins=True)
    test_data_new[new_feature] = pd.cut(test_data_new[feature], bins=bins, labels=[0,1,2,3,4])

    train_data_new = train_data_new.drop(feature, axis=1)
    test_data_new = test_data_new.drop(feature, axis=1)

    return train_data_new, test_data_new

In [38]:
train_data_new, test_data_new = processAgeBin(train_data, test_data)

In [39]:
columns_to_drop = ['Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [42]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7912685958194714


In [40]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8036281463812692


### TicketNumber

In [41]:
def processTicketNumber(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    def get_ticket_number(ticket):
        num = ticket.split(' ')[-1]
        if num.isdigit():
            return int(num)
        else:
            return -1

    train_data_new['TicketNumber'] = train_data_new['Ticket'].apply(get_ticket_number)
    test_data_new['TicketNumber'] = test_data_new['Ticket'].apply(get_ticket_number)

    return train_data_new, test_data_new

In [42]:
train_data_new, test_data_new = processTicketNumber(train_data, test_data)

In [43]:
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [46]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7677170296905405


In [44]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7811625133387736


## Multivariate

### Fare + Age

In [45]:
train_data_new, test_data_new = processFare(train_data, test_data)
train_data_new, test_data_new = processAge(train_data_new, test_data_new)

In [46]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [49]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7755759211600026


In [47]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8159939740129308


### Fare + AgeBinned

In [48]:
train_data_new, test_data_new = processFare(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)

In [49]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [52]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7867993220764549


In [50]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8092147385600402


### FareBinned + Age

In [51]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAge(train_data_new, test_data_new)

In [52]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [55]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.8181846713953927


In [53]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8069989328981231


### FareBinned + AgeBinned

In [54]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)

In [55]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [58]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7912811499591992


In [56]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8002448057246878


### FareBinned + Embarked

In [57]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)

In [58]:
columns_to_drop = ['Age']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [61]:
model = DecisionTreeClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8036093151716779


In [59]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7912937040989266


### FareBinned + Title

In [60]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processTitle(train_data_new, test_data_new)

In [61]:
columns_to_drop = ['Age', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [64]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.8114682066411399


In [62]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8047203565375682


### FareBinned + Family

In [63]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processFamily(train_data_new, test_data_new)

In [64]:
columns_to_drop = ['Age', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [67]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7890339589479631


In [65]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.789033958947963


### FareBinned + Alone

In [66]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)

In [67]:
columns_to_drop = ['Age', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [70]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7946519364760529


In [68]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.792404745464817


### AgeBinned + Embarked

In [69]:
train_data_new, test_data_new = processAgeBin(train_data, test_data)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)

In [70]:
columns_to_drop = ['Fare']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [73]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7912748728893352


In [71]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8092461239093591


### AgeBinned + Title

In [72]:
train_data_new, test_data_new = processAgeBin(train_data, test_data)
train_data_new, test_data_new = processTitle(train_data_new, test_data_new)

In [73]:
columns_to_drop = ['Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [76]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7901889398028998


In [74]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7958069173309898


### AgeBinned + Family

In [75]:
train_data_new, test_data_new = processAgeBin(train_data, test_data)
train_data_new, test_data_new = processFamily(train_data_new, test_data_new)

In [76]:
columns_to_drop = ['Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [79]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.8058565061829139


In [77]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8092272926997678


### AgeBinned + Alone

In [78]:
train_data_new, test_data_new = processAgeBin(train_data, test_data)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)

In [79]:
columns_to_drop = ['Fare', 'Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [82]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.8080911430544221


In [80]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8047391877471597


### FareBinned + AgeBinned + Embarked

In [81]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)

In [82]:
columns_to_drop = []
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [88]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7912748728893353


In [83]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8126169104262131


### FareBinned + AgeBinned + Title

In [84]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)

In [85]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [85]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7991212102190698


In [86]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8002448057246878


### FareBinned + AgeBinned + Family

In [87]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processFamily(train_data_new, test_data_new)

In [88]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [91]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.8036218693114054


In [89]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8081036971941498


### FareBinned + AgeBinned + Alone

In [90]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)

In [91]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [94]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7991212102190698


In [92]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.8002448057246878


### FareBinned + AgeBinned + TicketNumber

In [93]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processTicketNumber(train_data_new, test_data_new)

In [94]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [97]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7822798317745276


In [95]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7968740192078337


### FareBinned + AgeBinned + Embarked + Family

In [96]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)
train_data_new, test_data_new = processFamily(train_data_new, test_data_new)

In [97]:
columns_to_drop = []
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [101]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7991400414286611


In [98]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7980101688531793


### FareBinned + AgeBinned + Embarked + Alone

In [99]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)

In [100]:
columns_to_drop = []
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [104]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.8002448057246877


In [101]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7957629778419435


### FareBinned + AgeBinned + Embarked + Title

In [102]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)
train_data_new, test_data_new = processTitle(train_data_new, test_data_new)

In [103]:
columns_to_drop = []
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [107]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7946331052664616


In [104]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7980038917833155


### FareBinned + AgeBinned + Embarked + Alone + Title

In [105]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)
train_data_new, test_data_new = processTitle(train_data_new, test_data_new)

In [106]:
columns_to_drop = []
X_train, Y_train, X_test = PrepareData(columns_to_drop, label, train_data_new, test_data_new)

In [110]:
model = DecisionTreeClassifier(random_state=0) 
print(Eval(model, X_train, Y_train))

0.7912623187496076


In [107]:
model = RandomForestClassifier(random_state=0)
print(Eval(model, X_train, Y_train))

0.7957629778419433


In [119]:
model = RandomForestClassifier(n_estimators=500, random_state=0)
print(Eval(model, X_train, Y_train))

0.789033958947963
