# Import packages and Settings

In [1]:
import pandas as pd

import matplotlib.pyplot as plt

from xgboost.sklearn import XGBClassifier

from catboost import CatBoostClassifier

%matplotlib inline

In [2]:
from sklearn.base import clone
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import ShuffleSplit, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

from sklearnex import patch_sklearn

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
train_data = pd.read_csv('./Data/train.csv')
test_data = pd.read_csv('./Data/test.csv')

train_data['Sex'] = train_data['Sex'].map({'male':0, 'female':1})
test_data['Sex'] = test_data['Sex'].map({'male':0, 'female':1})

label = 'Survived'

id = test_data['PassengerId']

In [4]:
models = [
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    XGBClassifier(random_state=0),
    CatBoostClassifier(random_state=0, verbose=0, allow_writing_files=False)
]

# Functions

In [5]:
def makeOneHot(column, train_data, test_data):
    train_data_new = train_data.copy()
    test_data_new = test_data.copy()

    encoder_onehot = OneHotEncoder(sparse_output=False)

    train_data_onehot = encoder_onehot.fit_transform(pd.DataFrame(train_data[column]))
    train_feature_name_onehot = encoder_onehot.get_feature_names_out()
    train_data_onehot = pd.DataFrame(train_data_onehot, columns=train_feature_name_onehot, dtype=int)

    test_data_onehot = encoder_onehot.transform(pd.DataFrame(test_data[column]))
    test_data_onehot = pd.DataFrame(test_data_onehot, columns=train_feature_name_onehot, dtype=int)

    train_data_new = pd.concat([train_data_new, train_data_onehot], axis=1)
    test_data_new = pd.concat([test_data_new, test_data_onehot], axis=1)

    train_data_new = train_data_new.drop(column, axis=1)
    test_data_new = test_data_new.drop(column, axis=1)

    return train_data_new, test_data_new

In [6]:
def prepareData(columns_to_drop, label, train_data, test_data):
    X_train = train_data.drop([label] + columns_to_drop + ['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    Y_train = train_data[label]
    X_test = test_data.drop(columns_to_drop + ['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    return X_train, Y_train, X_test

In [7]:
def evaluation(models, X_train, Y_train):
    model_names = [model.__class__.__name__ for model in models]
    model_scores = []

    cv = ShuffleSplit(random_state=0)

    for model in models:
        model = clone(model)
        cv_scores = cross_val_score(model, X_train, Y_train, cv=cv)
        model_scores.append(cv_scores.mean())

    cv_results = pd.DataFrame(columns=['name', 'accuracy_mean'])
    cv_results['name'] = model_names
    cv_results['accuracy_mean'] = model_scores

    return cv_results

In [8]:
def tuneParameters(model, params, X_train, Y_train):
    cv = ShuffleSplit(random_state=0)

    tune_model = GridSearchCV(model, param_grid=params, scoring='accuracy', cv=cv)
    tune_model.fit(X_train, Y_train)

    return tune_model.best_params_, tune_model.best_score_

In [9]:
def predict(model, X_train, Y_train, X_test, id, file_name = 'result.csv'):
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)

    result = pd.DataFrame({
        'PassengerId': id,
        'Survived': predictions
    })
    result.to_csv('./Result/' + file_name, index=False)

# Base Model

In [10]:
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data, test_data)

In [32]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.795556
1      RandomForestClassifier       0.796667
2  GradientBoostingClassifier       0.813333
3               XGBClassifier       0.800000
4          CatBoostClassifier       0.810000


In [15]:
model = DecisionTreeClassifier(random_state=0)
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, None]}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'criterion': 'gini', 'max_depth': 4} 0.8111111111111112


In [37]:
model = RandomForestClassifier(random_state=0)
params = {
    'n_estimators': [100, 300, 500],
    'max_depth': [2, 4, 6, 8, 10, None]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'max_depth': 2, 'n_estimators': 500} 0.8233333333333335


In [53]:
model = GradientBoostingClassifier(random_state=0)
params = {
    'learning_rate': [0.1, 0.05, 0.01],
    'n_estimators': [100, 300, 500],
    'max_depth': [2, 4, 6, 8, 10, None]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 100} 0.8222222222222223


In [34]:
model = XGBClassifier(random_state=0)
params = {
    'learning_rate': [0.1, 0.05, 0.01],
    'n_estimators': [100, 300],
    'max_depth': [2, 4, 6, 8, 10, None]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 100} 0.8233333333333335


In [11]:
model = CatBoostClassifier(random_state=0, verbose=0, allow_writing_files=False)
params = {
    'learning_rate': [0.05, 0.03, 0.01],
    'iterations': [500, 800, 1000],
    'depth': [4, 6, 8]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'depth': 4, 'iterations': 500, 'learning_rate': 0.01} 0.8244444444444445


In [11]:
model = DecisionTreeClassifier(random_state=0)
predict(model, X_train, Y_train, X_test, id, 'result_basic.csv')

# Improvements

## Univariate

### Fare

In [12]:
def processFare(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Fare'
    fill_value = train_data_new[feature].mode()[0]
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    return train_data_new, test_data_new

In [13]:
train_data_new, test_data_new = processFare(train_data, test_data)

In [14]:
columns_to_drop = ['Age', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [15]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.816667
1      RandomForestClassifier       0.811111
2  GradientBoostingClassifier       0.827778
3               XGBClassifier       0.821111
4          CatBoostClassifier       0.816667


### Age

In [13]:
def processAge(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Age'
    fill_value = train_data_new[feature].mode()[0]
    train_data_new[feature] = train_data_new[feature].fillna(value=fill_value)
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    return train_data_new, test_data_new

In [17]:
train_data_new, test_data_new = processAge(train_data, test_data)

In [18]:
columns_to_drop = ['Fare', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [19]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.802222
1      RandomForestClassifier       0.808889
2  GradientBoostingClassifier       0.834444
3               XGBClassifier       0.818889
4          CatBoostClassifier       0.832222


### Embarked

In [14]:
def processEmbarked(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Embarked'
    fill_value = train_data_new[feature].mode()[0]
    train_data_new[feature] = train_data_new[feature].fillna(value=fill_value)
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    train_data_new, test_data_new = makeOneHot('Embarked', train_data_new, test_data_new)

    return train_data_new, test_data_new

In [21]:
train_data_new, test_data_new = processEmbarked(train_data, test_data)

In [22]:
columns_to_drop = ['Age', 'Fare']
X_train, Y_train, X_test = prepareData(columns_to_drop + columns_to_drop, label, train_data_new, test_data_new)

In [23]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.804444
1      RandomForestClassifier       0.805556
2  GradientBoostingClassifier       0.813333
3               XGBClassifier       0.798889
4          CatBoostClassifier       0.800000


### Title

In [15]:
def processTitle(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Title'

    def process(name):
        new_feature = name.str.extract(r' ([A-Za-z]+)\.', expand=False)
        new_feature = new_feature.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
        new_feature = new_feature.replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        
        return new_feature
    
    train_data_new[feature] = process(train_data_new['Name'])
    test_data_new[feature] = process(test_data_new['Name'])

    train_data_new, test_data_new = makeOneHot(feature, train_data_new, test_data_new)

    return train_data_new, test_data_new

In [25]:
train_data_new, test_data_new = processTitle(train_data, test_data)

In [26]:
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [27]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.815556
1      RandomForestClassifier       0.814444
2  GradientBoostingClassifier       0.830000
3               XGBClassifier       0.817778
4          CatBoostClassifier       0.831111


### Family

In [16]:
def processFamily(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    train_data_new['Family'] = train_data_new['SibSp'] + train_data_new['Parch'] + 1
    test_data_new['Family'] = test_data_new['SibSp'] + test_data_new['Parch'] + 1

    train_data_new = train_data_new.drop(['SibSp', 'Parch'], axis=1)
    test_data_new = test_data_new.drop(['SibSp', 'Parch'], axis=1)

    return train_data_new, test_data_new

In [29]:
train_data_new, test_data_new = processFamily(train_data, test_data)

In [30]:
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [31]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.808889
1      RandomForestClassifier       0.807778
2  GradientBoostingClassifier       0.824444
3               XGBClassifier       0.813333
4          CatBoostClassifier       0.822222


### Alone

In [17]:
def processAlone(train_data, test_data):
    train_data_new, test_data_new = processFamily(train_data, test_data)

    train_data_new['Alone'] = train_data_new['Family'].apply(lambda x: 1 if x == 1 else 0)
    test_data_new['Alone'] = test_data_new['Family'].apply(lambda x: 1 if x == 1 else 0)

    train_data_new = train_data_new.drop('Family', axis=1)
    test_data_new = test_data_new.drop('Family', axis=1)

    return train_data_new, test_data_new

In [33]:
train_data_new, test_data_new = processAlone(train_data, test_data)

In [34]:
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [35]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.814444
1      RandomForestClassifier       0.814444
2  GradientBoostingClassifier       0.814444
3               XGBClassifier       0.814444
4          CatBoostClassifier       0.814444


### FareBinned

In [18]:
def processFareBin(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Fare'
    new_feature = 'FareBinned'

    fill_value = train_data[feature].mode()[0]
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    train_data_new[new_feature], bins = pd.cut(train_data_new[feature], 5, labels=[0,1,2,3,4], retbins=True)
    train_data_new[new_feature] = train_data_new[new_feature].astype('int')
    test_data_new[new_feature] = pd.cut(test_data_new[feature], bins=bins, labels=[0,1,2,3,4])
    test_data_new[new_feature] = test_data_new[new_feature].astype('int')

    train_data_new = train_data_new.drop(feature, axis=1)
    test_data_new = test_data_new.drop(feature, axis=1)

    return train_data_new, test_data_new

In [37]:
train_data_new, test_data_new = processFareBin(train_data, test_data)

In [38]:
columns_to_drop = ['Age', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [39]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.791111
1      RandomForestClassifier       0.794444
2  GradientBoostingClassifier       0.808889
3               XGBClassifier       0.797778
4          CatBoostClassifier       0.817778


### AgeBinned

In [19]:
def processAgeBin(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    feature = 'Age'
    new_feature = 'AgeBinned'

    fill_value = train_data[feature].mode()[0]
    train_data_new[feature] = train_data_new[feature].fillna(value=fill_value)
    test_data_new[feature] = test_data_new[feature].fillna(value=fill_value)

    train_data_new[new_feature], bins = pd.cut(train_data_new[feature], 5, labels=[0,1,2,3,4], retbins=True)
    train_data_new[new_feature] = train_data_new[new_feature].astype('int')
    test_data_new[new_feature] = pd.cut(test_data_new[feature], bins=bins, labels=[0,1,2,3,4])
    test_data_new[new_feature] = test_data_new[new_feature].fillna(value=0)
    test_data_new[new_feature] = test_data_new[new_feature].astype('int')

    train_data_new = train_data_new.drop(feature, axis=1)
    test_data_new = test_data_new.drop(feature, axis=1)

    return train_data_new, test_data_new

In [44]:
train_data_new, test_data_new = processAgeBin(train_data, test_data)

In [45]:
columns_to_drop = ['Fare', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [46]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.798889
1      RandomForestClassifier       0.807778
2  GradientBoostingClassifier       0.826667
3               XGBClassifier       0.820000
4          CatBoostClassifier       0.827778


### TicketNumber

In [20]:
def processTicketNumber(train_data, test_data):
    train_data_new, test_data_new = train_data.copy(), test_data.copy()

    def get_ticket_number(ticket):
        num = ticket.split(' ')[-1]
        if num.isdigit():
            return int(num)
        else:
            return -1

    train_data_new['TicketNumber'] = train_data_new['Ticket'].apply(get_ticket_number)
    test_data_new['TicketNumber'] = test_data_new['Ticket'].apply(get_ticket_number)

    return train_data_new, test_data_new

In [48]:
train_data_new, test_data_new = processTicketNumber(train_data, test_data)

In [49]:
columns_to_drop = ['Age', 'Fare', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [50]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.777778
1      RandomForestClassifier       0.791111
2  GradientBoostingClassifier       0.822222
3               XGBClassifier       0.813333
4          CatBoostClassifier       0.815556


## Multivariate

### Fare + Age

In [51]:
train_data_new, test_data_new = processFare(train_data, test_data)
train_data_new, test_data_new = processAge(train_data_new, test_data_new)

In [52]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [53]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.795556
1      RandomForestClassifier       0.823333
2  GradientBoostingClassifier       0.843333
3               XGBClassifier       0.837778
4          CatBoostClassifier       0.840000


### Fare + AgeBinned

In [54]:
train_data_new, test_data_new = processFare(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)

In [55]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [56]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.810000
1      RandomForestClassifier       0.816667
2  GradientBoostingClassifier       0.835556
3               XGBClassifier       0.838889
4          CatBoostClassifier       0.831111


### FareBinned + Age

In [57]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAge(train_data_new, test_data_new)

In [58]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [59]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.802222
1      RandomForestClassifier       0.814444
2  GradientBoostingClassifier       0.836667
3               XGBClassifier       0.820000
4          CatBoostClassifier       0.837778


### FareBinned + AgeBinned

In [60]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)

In [61]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [62]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.792222
1      RandomForestClassifier       0.807778
2  GradientBoostingClassifier       0.824444
3               XGBClassifier       0.818889
4          CatBoostClassifier       0.834444


### FareBinned + Embarked

In [63]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)

In [64]:
columns_to_drop = ['Age']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [65]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.808889
1      RandomForestClassifier       0.811111
2  GradientBoostingClassifier       0.814444
3               XGBClassifier       0.801111
4          CatBoostClassifier       0.811111


### FareBinned + Title

In [66]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processTitle(train_data_new, test_data_new)

In [67]:
columns_to_drop = ['Age', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [68]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.814444
1      RandomForestClassifier       0.816667
2  GradientBoostingClassifier       0.833333
3               XGBClassifier       0.818889
4          CatBoostClassifier       0.831111


### FareBinned + Family

In [69]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processFamily(train_data_new, test_data_new)

In [70]:
columns_to_drop = ['Age', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [71]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.806667
1      RandomForestClassifier       0.806667
2  GradientBoostingClassifier       0.823333
3               XGBClassifier       0.806667
4          CatBoostClassifier       0.822222


### FareBinned + Alone

In [72]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)

In [73]:
columns_to_drop = ['Age', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [74]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.810000
1      RandomForestClassifier       0.810000
2  GradientBoostingClassifier       0.812222
3               XGBClassifier       0.810000
4          CatBoostClassifier       0.810000


### AgeBinned + Embarked

In [75]:
train_data_new, test_data_new = processAgeBin(train_data, test_data)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)

In [76]:
columns_to_drop = ['Fare']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [77]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.801111
1      RandomForestClassifier       0.812222
2  GradientBoostingClassifier       0.827778
3               XGBClassifier       0.816667
4          CatBoostClassifier       0.820000


### AgeBinned + Title

In [78]:
train_data_new, test_data_new = processAgeBin(train_data, test_data)
train_data_new, test_data_new = processTitle(train_data_new, test_data_new)

In [79]:
columns_to_drop = ['Fare', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [80]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.815556
1      RandomForestClassifier       0.822222
2  GradientBoostingClassifier       0.841111
3               XGBClassifier       0.818889
4          CatBoostClassifier       0.831111


### AgeBinned + Family

In [81]:
train_data_new, test_data_new = processAgeBin(train_data, test_data)
train_data_new, test_data_new = processFamily(train_data_new, test_data_new)

In [82]:
columns_to_drop = ['Fare', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [83]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.811111
1      RandomForestClassifier       0.817778
2  GradientBoostingClassifier       0.830000
3               XGBClassifier       0.821111
4          CatBoostClassifier       0.821111


### AgeBinned + Alone

In [84]:
train_data_new, test_data_new = processAgeBin(train_data, test_data)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)

In [85]:
columns_to_drop = ['Fare', 'Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [86]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.823333
1      RandomForestClassifier       0.818889
2  GradientBoostingClassifier       0.813333
3               XGBClassifier       0.816667
4          CatBoostClassifier       0.816667


### FareBinned + AgeBinned + Embarked

In [87]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)

In [88]:
columns_to_drop = []
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [89]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.811111
1      RandomForestClassifier       0.823333
2  GradientBoostingClassifier       0.830000
3               XGBClassifier       0.820000
4          CatBoostClassifier       0.834444


### FareBinned + AgeBinned + Title

In [90]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)

In [91]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [92]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.814444
1      RandomForestClassifier       0.812222
2  GradientBoostingClassifier       0.808889
3               XGBClassifier       0.811111
4          CatBoostClassifier       0.810000


### FareBinned + AgeBinned + Family

In [93]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processFamily(train_data_new, test_data_new)

In [94]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [95]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.805556
1      RandomForestClassifier       0.814444
2  GradientBoostingClassifier       0.825556
3               XGBClassifier       0.818889
4          CatBoostClassifier       0.821111


### FareBinned + AgeBinned + Alone

In [96]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)

In [97]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [98]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.814444
1      RandomForestClassifier       0.812222
2  GradientBoostingClassifier       0.808889
3               XGBClassifier       0.811111
4          CatBoostClassifier       0.810000


### FareBinned + AgeBinned + TicketNumber

In [99]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processTicketNumber(train_data_new, test_data_new)

In [100]:
columns_to_drop = ['Embarked']
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [101]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.791111
1      RandomForestClassifier       0.806667
2  GradientBoostingClassifier       0.841111
3               XGBClassifier       0.815556
4          CatBoostClassifier       0.837778


### FareBinned + AgeBinned + Embarked + Family

In [102]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)
train_data_new, test_data_new = processFamily(train_data_new, test_data_new)

In [103]:
columns_to_drop = []
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [104]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.810000
1      RandomForestClassifier       0.814444
2  GradientBoostingClassifier       0.823333
3               XGBClassifier       0.808889
4          CatBoostClassifier       0.817778


### FareBinned + AgeBinned + Embarked + Alone

In [105]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)

In [106]:
columns_to_drop = []
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [107]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.810000
1      RandomForestClassifier       0.823333
2  GradientBoostingClassifier       0.822222
3               XGBClassifier       0.818889
4          CatBoostClassifier       0.824444


### FareBinned + AgeBinned + Embarked + Title

In [108]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)
train_data_new, test_data_new = processTitle(train_data_new, test_data_new)

In [109]:
columns_to_drop = []
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [110]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.800000
1      RandomForestClassifier       0.813333
2  GradientBoostingClassifier       0.841111
3               XGBClassifier       0.820000
4          CatBoostClassifier       0.835556


### FareBinned + AgeBinned + Embarked + Family + Title

In [21]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)
train_data_new, test_data_new = processFamily(train_data_new, test_data_new)
train_data_new, test_data_new = processTitle(train_data_new, test_data_new)

In [22]:
columns_to_drop = []
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [113]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.802222
1      RandomForestClassifier       0.815556
2  GradientBoostingClassifier       0.827778
3               XGBClassifier       0.827778
4          CatBoostClassifier       0.831111


In [34]:
model = DecisionTreeClassifier(random_state=0)
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, None]}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'criterion': 'gini', 'max_depth': 4} 0.8433333333333334


In [40]:
model = RandomForestClassifier(random_state=0)
params = {
    'n_estimators': [100, 300, 500],
    'max_depth': [2, 4, 6, 8, 10, None]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'max_depth': 4, 'n_estimators': 100} 0.8466666666666667


In [21]:
model = GradientBoostingClassifier(random_state=0)
params = {
    'learning_rate': [0.1, 0.05, 0.01],
    'n_estimators': [100, 300],
    'max_depth': [2, 4, 6, None]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 100} 0.8466666666666667


In [51]:
model = XGBClassifier(random_state=0)
params = {
    'learning_rate': [0.1, 0.05, 0.01],
    'n_estimators': [100, 300, 500],
    'max_depth': [2, 4, 6, 8, 10, None]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 300} 0.8455555555555556


In [23]:
model = CatBoostClassifier(random_state=0, verbose=0, allow_writing_files=False)
params = {
    'learning_rate': [0.05, 0.03, 0.01],
    'iterations': [500, 800, 1000],
    'depth': [4, 6, 8]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'depth': 4, 'iterations': 500, 'learning_rate': 0.01} 0.8400000000000001


### FareBinned + AgeBinned + Embarked + Alone + Title

In [24]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)
train_data_new, test_data_new = processTitle(train_data_new, test_data_new)

In [25]:
columns_to_drop = []
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [116]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.800000
1      RandomForestClassifier       0.807778
2  GradientBoostingClassifier       0.831111
3               XGBClassifier       0.820000
4          CatBoostClassifier       0.817778


In [31]:
model = DecisionTreeClassifier(random_state=0)
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, None]}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'criterion': 'gini', 'max_depth': 4} 0.8300000000000001


In [43]:
model = RandomForestClassifier(random_state=0)
params = {
    'n_estimators': [100, 300, 500],
    'max_depth': [2, 4, 6, 8, 10, None]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'max_depth': 4, 'n_estimators': 100} 0.8277777777777778


In [24]:
model = GradientBoostingClassifier(random_state=0)
params = {
    'learning_rate': [0.1, 0.05, 0.01],
    'n_estimators': [100, 300],
    'max_depth': [2, 4, 6, None]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 100} 0.8333333333333334


In [48]:
model = XGBClassifier(random_state=0)
params = {
    'learning_rate': [0.1, 0.05, 0.01],
    'n_estimators': [100, 300, 500],
    'max_depth': [2, 4, 6, None]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100} 0.8344444444444445


In [26]:
model = CatBoostClassifier(random_state=0, verbose=0, allow_writing_files=False)
params = {
    'learning_rate': [0.05, 0.03, 0.01],
    'iterations': [500, 800, 1000],
    'depth': [4, 6, 8]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'depth': 4, 'iterations': 500, 'learning_rate': 0.01} 0.8355555555555556


### FareBinned + AgeBinned + Embarked + Family + Title + TicketNumber

In [27]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)
train_data_new, test_data_new = processFamily(train_data_new, test_data_new)
train_data_new, test_data_new = processTitle(train_data_new, test_data_new)
train_data_new, test_data_new = processTicketNumber(train_data_new, test_data_new)

In [28]:
columns_to_drop = []
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [127]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.804444
1      RandomForestClassifier       0.800000
2  GradientBoostingClassifier       0.845556
3               XGBClassifier       0.825556
4          CatBoostClassifier       0.831111


In [47]:
model = DecisionTreeClassifier(random_state=0)
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, None]}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'criterion': 'gini', 'max_depth': 4} 0.8422222222222222


In [46]:
model = RandomForestClassifier(random_state=0)
params = {
    'n_estimators': [100, 300, 500],
    'max_depth': [2, 4, 6, 8, 10, None]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'max_depth': 4, 'n_estimators': 300} 0.8400000000000001


In [27]:
model = GradientBoostingClassifier(random_state=0)
params = {
    'learning_rate': [0.1, 0.05],
    'n_estimators': [100, 300],
    'max_depth': [2, 4, 6, None]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 300} 0.8511111111111112


In [45]:
model = XGBClassifier(random_state=0)
params = {
    'learning_rate': [0.1, 0.05, 0.01],
    'n_estimators': [100, 300, 500],
    'max_depth': [2, 4, 6, 8, 10, None]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 300} 0.8455555555555556


In [29]:
model = CatBoostClassifier(random_state=0, verbose=0, allow_writing_files=False)
params = {
    'learning_rate': [0.05, 0.03, 0.01],
    'iterations': [500, 800, 1000],
    'depth': [4, 6, 8]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'depth': 4, 'iterations': 800, 'learning_rate': 0.05} 0.8477777777777777


### FareBinned + AgeBinned + Embarked + Alone + Title + TicketNumber

In [30]:
train_data_new, test_data_new = processFareBin(train_data, test_data)
train_data_new, test_data_new = processAgeBin(train_data_new, test_data_new)
train_data_new, test_data_new = processEmbarked(train_data_new, test_data_new)
train_data_new, test_data_new = processAlone(train_data_new, test_data_new)
train_data_new, test_data_new = processTitle(train_data_new, test_data_new)
train_data_new, test_data_new = processTicketNumber(train_data_new, test_data_new)

In [31]:
columns_to_drop = []
X_train, Y_train, X_test = prepareData(columns_to_drop, label, train_data_new, test_data_new)

In [124]:
cv_results = evaluation(models, X_train, Y_train)
print(cv_results)

                         name  accuracy_mean
0      DecisionTreeClassifier       0.806667
1      RandomForestClassifier       0.797778
2  GradientBoostingClassifier       0.841111
3               XGBClassifier       0.814444
4          CatBoostClassifier       0.834444


In [28]:
model = DecisionTreeClassifier(random_state=0)
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, None]}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'criterion': 'gini', 'max_depth': 6} 0.8322222222222223


In [50]:
model = RandomForestClassifier(random_state=0)
params = {
    'n_estimators': [100, 300, 500],
    'max_depth': [2, 4, 6, 8, 10, None]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'max_depth': 4, 'n_estimators': 100} 0.8355555555555556


In [30]:
model = GradientBoostingClassifier(random_state=0)
params = {
    'learning_rate': [0.1, 0.05],
    'n_estimators': [100, 300],
    'max_depth': [2, 4, 6, None]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100} 0.8466666666666667


In [54]:
model = XGBClassifier(random_state=0)
params = {
    'learning_rate': [0.1, 0.05, 0.01],
    'n_estimators': [100, 300, 500],
    'max_depth': [2, 4, 6, None]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100} 0.8411111111111111


In [32]:
model = CatBoostClassifier(random_state=0, verbose=0, allow_writing_files=False)
params = {
    'learning_rate': [0.05, 0.03, 0.01],
    'iterations': [500, 800, 1000],
    'depth': [4, 6, 8]
}
params, score = tuneParameters(model, params, X_train, Y_train)
print(params, score)

{'depth': 4, 'iterations': 500, 'learning_rate': 0.05} 0.8388888888888889
