In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)

from matplotlib import pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style('darkgrid')

In [9]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [5]:
df = pd.read_csv('abt_titanic.csv')
print(df.shape)

(891, 12)


In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,missing_age,related,Embarked_C,Embarked_Q,Embarked_S,Embarked_missing
0,0,1,0,3,0,22.0,0,1,0,0,1,0
1,1,2,1,1,1,38.0,0,1,1,0,0,0
2,2,3,1,3,1,26.0,0,0,0,0,1,0
3,3,4,1,1,1,35.0,0,1,0,0,1,0
4,4,5,0,3,0,35.0,0,0,0,0,1,0


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
y = df.Survived

X = df.drop('Survived', axis = 1)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)

In [11]:
print( len (X_train), len(X_test), len(y_train), len(y_test))

712 179 712 179


In [12]:
X_train.describe()

Unnamed: 0.1,Unnamed: 0,PassengerId,Pclass,Sex,Age,missing_age,related,Embarked_C,Embarked_Q,Embarked_S,Embarked_missing
count,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0
mean,448.803371,449.803371,2.300562,0.359551,29.575724,0.198034,0.907303,0.182584,0.082865,0.733146,0.001404
std,257.67053,257.67053,0.845007,0.480206,13.322938,0.398798,1.62344,0.386597,0.275872,0.442626,0.037477
min,0.0,1.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0
25%,224.75,225.75,1.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,454.5,455.5,3.0,0.0,29.699118,0.0,0.0,0.0,0.0,1.0,0.0
75%,673.25,674.25,3.0,1.0,35.0,0.0,1.0,0.0,0.0,1.0,0.0
max,890.0,891.0,3.0,1.0,80.0,1.0,10.0,1.0,1.0,1.0,1.0


In [13]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [14]:
make_pipeline(StandardScaler(),RandomForestClassifier(random_state = 123) )

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min...s='warn', n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False))])

In [15]:
# Create pipelines dictionary
pipelines = {
    'RF' : make_pipeline(StandardScaler(),RandomForestClassifier(random_state = 123)),
    'GB' : make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state = 123))
}

In [16]:
pipelines['RF'].get_params()

{'memory': None,
 'steps': [('standardscaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('randomforestclassifier',
   RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
               max_depth=None, max_features='auto', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
               oob_score=False, random_state=123, verbose=0, warm_start=False))],
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'randomforestclassifier': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=

In [18]:
rf_hyperparameters = {
    'randomforestclassifier__n_estimators': [100,200],
    'randomforestclassifier__max_features': ['auto', 'sqrt', 0.33],
    'randomforestclassifier__min_samples_leaf': [1, 3, 5,10]
}

In [20]:
pipelines['GB'].get_params()

{'memory': None,
 'steps': [('standardscaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('gradientboostingclassifier',
   GradientBoostingClassifier(criterion='friedman_mse', init=None,
                 learning_rate=0.1, loss='deviance', max_depth=3,
                 max_features=None, max_leaf_nodes=None,
                 min_impurity_decrease=0.0, min_impurity_split=None,
                 min_samples_leaf=1, min_samples_split=2,
                 min_weight_fraction_leaf=0.0, n_estimators=100,
                 n_iter_no_change=None, presort='auto', random_state=123,
                 subsample=1.0, tol=0.0001, validation_fraction=0.1,
                 verbose=0, warm_start=False))],
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'gradientboostingclassifier': GradientBoostingClassifier(criterion='friedman_mse', init=None,
               learning_rate=0.1, loss='deviance', max_depth=3,
               max_features=None, max_leaf_nodes

In [22]:
gb_hyperparameters = {
    'gradientboostingclassifier__n_estimators': [100,200],
    'gradientboostingclassifier__learning_rate': [0.05, 0.1, 0.2 ],
    'gradientboostingclassifier__max_depth': [1, 3, 5]
}

In [23]:
hyperparameters = {
    'RF' : rf_hyperparameters,
    'GB' :gb_hyperparameters
}

In [24]:
from sklearn.model_selection import GridSearchCV

In [26]:
print(X_train.shape)
print(y_train.shape)

(712, 11)
(712,)


In [27]:
#create empty dictionnary called fitted_ model
fitted_models = {}

#Loop through model pipeline, tuning each one and saving it to fitted_model

for name, pipeline in pipelines.items():
    model = GridSearchCV(pipeline, hyperparameters[name], cv = 10, n_jobs=-1)
    
    #fit model on X_train y_train
    model.fit(X_train, y_train)
    
    #store model in fitted_models[name]
    fitted_models[name] = model
    
    #print{name} has)been fitted
    print(name, 'has been fitted')

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


RF has been fitted
GB has been fitted


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


In [30]:
#Auroc Review
#fitted_models

for name, model in fitted_models.items():
    print(name,model.best_score_)

RF 0.8160112359550562
GB 0.8174157303370787


In [31]:
pred = fitted_models['RF'].predict(X_test)
print(pred[0:10])


[1 0 0 1 1 0 0 1 0 0]


  Xt = transform.transform(Xt)


In [33]:
#confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(y_test, pred)
print(cm)

[[103   6]
 [ 22  48]]
