Train a simple gradient-boosting (XGBoost) model for comparison with the neural net.

In [1]:
# (Re)load IPython's autoreload extension and switch it to mode 2, so that
# imported modules are reloaded automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import pickle
#import kaggle

import xgboost

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

In [3]:
# Directory (relative to the notebook) holding the input CSVs; the submission file is written here too.
path = Path('data')

## Load prepared data & passenger IDs for submission ##

In [4]:
# Load the pre-processed frame; it holds both train and test rows, told apart
# by the `is_test` column.
df = pd.read_csv(path / 'titanic_preproc.csv')

# pandas names a header-less CSV column "Unnamed: N". Such a column is
# presumably a row index saved by `to_csv`; it carries no information about a
# passenger and must not reach the model as a feature, so drop it if present.
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

# Only the passenger IDs are needed from the raw test file (for the submission).
test_ids = pd.read_csv(path / 'test.csv', usecols=['PassengerId'])['PassengerId']

In [5]:
# Show the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,is_test,LName,Title
0,0.0,3,male,22.0,1,0,7.25,N,S,False,Braund,Mr
1,1.0,1,female,38.0,1,0,71.2833,C,C,False,Cumings,Mrs
2,1.0,3,female,26.0,0,0,7.925,N,S,False,Heikkinen,Miss
3,1.0,1,female,35.0,1,0,53.1,C,S,False,Futrelle,Mrs
4,0.0,3,male,35.0,0,0,8.05,N,S,False,Allen,Mr


## Transform categorical values to ints ##

In [6]:
# Columns treated as categorical vs. continuous features.
cat_names = ['Pclass', 'LName', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked', 'Title']
cont_names = ['Age', 'Fare']

# Replace every categorical column by integer codes. A fresh encoder is fitted
# per column on all rows of `df`, so train and test rows share one code table.
for col in cat_names:
    df[col] = LabelEncoder().fit_transform(df[col].tolist())

## Normalize continuous values ##

In [7]:
# Rescale each continuous column to the [0, 1] range (MinMaxScaler default),
# fitting the scaler on all rows of `df` and writing the result back in place.
min_max_scaler = preprocessing.MinMaxScaler()
df[cont_names] = min_max_scaler.fit_transform(df[cont_names].values)

## Split back to train & test set ##

In [8]:
# Separate the combined frame again using the `is_test` flag, then discard the
# flag. The test rows additionally lose `Survived`, which is the target column.
df_train = df.loc[df['is_test'].eq(False)].drop(columns=['is_test'])
df_test = df.loc[df['is_test'].eq(True)].drop(columns=['is_test', 'Survived'])

In [9]:
# Feature matrix: every training column except the target.
X = df_train.drop(columns=['Survived'])
# Target vector, cast to int (the column is read from the CSV as 0.0 / 1.0).
y = df_train['Survived'].astype(int)

## Split off a validation set from the train set ##

In [10]:
# Hold out 20% of the labelled rows as a validation set.
# `random_state` pins the shuffle so the split (and every score derived from
# it) is reproducible across kernel restarts; `stratify` keeps the survival
# rate the same in both parts.
# Note: `test_X` / `test_y` are the *validation* split, not the Kaggle test set.
train_X, test_X, train_y, test_y = train_test_split(
    X.values,
    y.values,
    test_size=0.2,
    random_state=42,
    stratify=y.values,
)

## Learn ##
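All hyperparameters are left at their defaults except `n_estimators` and the learning rate; training stops early once the validation error has not improved for 5 rounds.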

In [11]:
# Boosted-tree classifier: up to 1000 trees with a learning rate of 0.9.
my_model = xgboost.XGBClassifier(n_estimators=1000, learning_rate=0.9)

# Monitor the classification error on the validation split and stop as soon as
# it has not improved for 5 consecutive rounds.
fit_options = dict(
    eval_set=[(test_X, test_y)],
    eval_metric=['error'],
    early_stopping_rounds=5,
    verbose=True,
)
my_model.fit(train_X, train_y, **fit_options)

[0]	validation_0-error:0.201117
Will train until validation_0-error hasn't improved in 5 rounds.
[1]	validation_0-error:0.217877
[2]	validation_0-error:0.21229
[3]	validation_0-error:0.22905
[4]	validation_0-error:0.217877
[5]	validation_0-error:0.223464
Stopping. Best iteration:
[0]	validation_0-error:0.201117



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.9, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

## Predict the labels of the test set & save submission ##

In [12]:
# Predict survival labels for the Kaggle test rows (plain array, same column
# order as the training features).
test_features = df_test.values
preds = my_model.predict(test_features)

In [13]:
# Pair each passenger ID with its predicted label and write the two-column
# submission file (no index column) into the data directory.
submission_file = path / 'submission_xgboost.csv'
submission = pd.DataFrame({'PassengerId': test_ids, 'Survived': preds})
submission.to_csv(submission_file, index=False)

## Submit to competition

In [14]:
# Left commented out; uncomment to upload the submission with the Kaggle CLI.
# The submission file is written to the `data` directory, so the path passed to
# `-f` has to include it.
#! kaggle competitions submit -c titanic -f data/submission_xgboost.csv -m "xgboost, simple model, first try"

## Results:

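85% __accuracy__ on the validation set (figure from an earlier run; the training log shown above reports a validation error of 0.201, i.e. about 80% accuracy),

75% __accuracy__ on the test set,

but this is without any hyperparameter tuning at all.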