# <center> Titanic - Modelling </center>

### Import libraries

In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

### Load data

In [70]:
test_data = pd.read_csv('dataset/test.csv')

In [71]:
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')

X_test = pd.read_csv('X_test.csv')

### Train data

In [72]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Title
0,2,1,45.0,0,0,13.5,2.0,1,3
1,3,1,0.75,2,1,19.2583,0.0,4,1
2,3,0,28.0,8,2,69.55,2.0,11,2
3,3,1,22.0,0,0,7.75,1.0,1,1
4,3,0,20.0,0,0,4.0125,0.0,1,2


In [73]:
X_train.shape

(891, 9)

In [74]:
y_train.head()

Unnamed: 0,Survived
0,1
1,1
2,0
3,1
4,0


In [75]:
y_train.Survived.shape

(891,)

### Test data

In [124]:
X_test = X_test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'FamilySize', 'Title']]

In [126]:
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Title
0,3,0,34.5,0,0,7.8292,1,1,2.0
1,3,1,47.0,1,0,7.0,2,2,3.0
2,2,0,62.0,0,0,9.6875,1,1,2.0
3,3,0,27.0,0,0,8.6625,2,1,2.0
4,3,1,22.0,1,1,12.2875,2,3,3.0


## <center> Modelling </center>

https://machinelearningmastery.com/evaluate-performance-machine-learning-algorithms-python-using-resampling/

In [77]:
type(X_train.Embarked[1])

numpy.float64

In [79]:
kfold = StratifiedKFold(n_splits=10)

DTC = DecisionTreeClassifier()

adaDTC = AdaBoostClassifier(DTC, random_state=7)

ada_param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
              "algorithm" : ["SAMME","SAMME.R"],
              "n_estimators" :[100,200, 500, 1000],
              "learning_rate":  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 1.5]}

gsadaDTC = GridSearchCV(adaDTC,param_grid = ada_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

gsadaDTC.fit(X_train,y_train.Survived)

ada_best = gsadaDTC.best_estimator_

Fitting 10 folds for each of 224 candidates, totalling 2240 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   23.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  3.7min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  7.6min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed: 12.1min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed: 17.8min
[Parallel(n_jobs=4)]: Done 2240 out of 2240 | elapsed: 22.2min finished


In [80]:
ada_best

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.1, n_estimators=100, random_state=7)

In [81]:
gsadaDTC.best_score_

0.8249158249158249

In [82]:
#ExtraTrees 
ExtC = ExtraTreesClassifier()


## Search grid for optimal parameters
ex_param_grid = {"max_depth": [2, 3, 10, 15],
              "max_features": [1, 3, 8],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300, 500, 1000],
              "criterion": ["gini"]}


gsExtC = GridSearchCV(ExtC, param_grid = ex_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

gsExtC.fit(X_train,y_train.Survived)

ExtC_best = gsExtC.best_estimator_

# Best score
gsExtC.best_score_

Fitting 10 folds for each of 432 candidates, totalling 4320 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    9.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   53.9s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  3.9min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  6.4min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  9.1min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed: 12.6min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed: 17.0min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed: 22.0min
[Parallel(n_jobs=4)]: Done 4320 out of 4320 | elapsed: 23.8min finished


0.8361391694725028

In [83]:
# RFC Parameters tunning 
RFC = RandomForestClassifier()


## Search grid for optimal parameters
# "max_depth": [None],
rf_param_grid = {
              "max_features": [1, 3, 8],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100, 300, 500, 1000],
              "criterion": ["gini"]}


gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose = 1)

gsRFC.fit(X_train,y_train.Survived)

RFC_best = gsRFC.best_estimator_

# Best score
gsRFC.best_score_

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   16.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.9min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  5.5min
[Parallel(n_jobs=4)]: Done 1080 out of 1080 | elapsed:  8.1min finished


0.8383838383838383

In [84]:
# Gradient boosting tunning

GBC = GradientBoostingClassifier()
gb_param_grid = {'loss' : ["deviance"],
              'n_estimators' : [100, 200, 300, 500, 1000],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 8, 10],
              'min_samples_leaf': [100, 150],
              'max_features': [0.3, 0.1]
              }

gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

gsGBC.fit(X_train,y_train.Survived)

GBC_best = gsGBC.best_estimator_

# Best score
gsGBC.best_score_

Fitting 10 folds for each of 180 candidates, totalling 1800 fits


[Parallel(n_jobs=4)]: Done  68 tasks      | elapsed:    5.8s
[Parallel(n_jobs=4)]: Done 219 tasks      | elapsed:   20.0s
[Parallel(n_jobs=4)]: Done 469 tasks      | elapsed:   44.3s
[Parallel(n_jobs=4)]: Done 819 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 1269 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 1800 out of 1800 | elapsed:  2.8min finished


0.8316498316498316

In [86]:
### SVC classifier
SVMC = SVC(probability=True)
svc_param_grid = {'kernel': ['rbf'], 
                  'gamma': [ 0.001, 0.01, 0.1, 1],
                  'C': [1, 10, 50, 100, 200, 300, 1000]}

gsSVMC = GridSearchCV(SVMC,param_grid = svc_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

gsSVMC.fit(X_train,y_train.Survived)

SVMC_best = gsSVMC.best_estimator_

# Best score
gsSVMC.best_score_

Fitting 10 folds for each of 28 candidates, totalling 280 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   18.6s
[Parallel(n_jobs=4)]: Done 280 out of 280 | elapsed:   37.5s finished


0.8125701459034792

In [100]:
from xgboost import XGBClassifier

In [106]:
clf = XGBClassifier(
        eval_metric = 'auc',
        num_class = 2,
        nthread = 4,
        silent = 1,
        )
parameters = {
        'learning_rate': [0.05],
        'max_depth': [6],
        'subsample': [0.9],
        'colsample_bytree': [0.9],
    }
clf = GridSearchCV(clf, parameters, n_jobs=1, cv=2)

In [109]:
y_train.head()

Unnamed: 0,Survived
0,1
1,1
2,0
3,1
4,0


In [132]:
xgb = XGBClassifier(n_estimators=6000, learning_rate=0.05, gamma=0, subsample=0.60, colsample_bytree=1, max_depth=100, cv=kfold)
model = xgb.fit(X_train, y_train.Survived)

In [133]:
xgb.predict(X_test)

  if diff:


array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [134]:
xgb.score

<bound method ClassifierMixin.score of XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1,
       cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=100,
       min_child_weight=1, missing=None, n_estimators=6000, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.6)>

In [92]:
votingC = VotingClassifier(estimators=[('rfc', RFC_best), ('extc', ExtC_best),
('adac',ada_best),('gbc',GBC_best)], voting='soft', n_jobs=4)

votingC = votingC.fit(X_train, y_train.Survived)

In [93]:
votingC.predict(X_test)

  if diff:


array([0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,

In [94]:
y_pred = votingC.predict(X_test)

  if diff:


In [95]:
predictions = pd.DataFrame({
    'PassengerId':test_data['PassengerId'],
    'Survived':y_pred
})

In [135]:
predictions.to_csv('6_predictions_xgboost.csv', index=False)

In [None]:
import pickle

# now you can save it to a file
with open('filename.pkl', 'wb') as f:
    pickle.dump(clf, f)

# and later you can load it
with open('filename.pkl', 'rb') as f:
    clf = pickle.load(f)