# Aim

- End-to-end example of a classification problem with the Titanic dataset in sklearn.
- Generate some scripts that can be used for model evaluation in other projects. 

Notes:
- data from: http://web.stanford.edu/class/archive/cs/cs109/cs109.1166/problem12.html

## Get data

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('./titanic.csv')

In [3]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [4]:
# Split off the target: pop() removes the 'Survived' column from df in place and returns it.
y = df.pop('Survived')
# X is just another name for the same DataFrame object as df (no copy is made),
# so the in-place edits applied to df in the cleaning cells below also appear in X.
X = df

In [5]:
X.shape

(887, 7)

In [6]:
y.shape

(887,)

Check to see imbalance in dataset:

In [7]:
np.unique(y,return_counts=True)

(array([0, 1], dtype=int64), array([545, 342], dtype=int64))

In [8]:
342/(545+342)

0.3855693348365276

# Data cleaning

In [9]:
df.isnull().sum()

Pclass                     0
Name                       0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64

In [10]:
df['Sex'] = (df.Sex == 'male').astype(int)

In [11]:
df.drop('Name',axis=1,inplace=True)

In [12]:
df.head()

Unnamed: 0,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,3,1,22.0,1,0,7.25
1,1,0,38.0,1,0,71.2833
2,3,0,26.0,0,0,7.925
3,1,0,35.0,1,0,53.1
4,3,1,35.0,0,0,8.05


## Make test train split

By default, *train_test_split* does not try to keep the class distribution the same in each set (that requires passing `stratify=y`). Normally, though, the train/test sets are large enough that you are likely to get a good representation of each class.

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=38)


In [14]:
X_train.shape

(594, 6)

In [15]:
np.unique(y_train,return_counts=True)

(array([0, 1], dtype=int64), array([357, 237], dtype=int64))

## Make baseline prediction

In [16]:
from sklearn.metrics import accuracy_score, f1_score

Making predictions based on the sex of the passenger only; male = survived.

In [17]:
class SexModel():
    """
    Baseline classifier that echoes the 'Sex' feature back as its prediction.

    No fitting is involved: whatever value a case has in its 'Sex' column is
    returned as the predicted label (so a case encoded as 1 is predicted 1).
    """
    def __init__(self):
        """Nothing to configure; the model has no parameters or state."""

    def pred(self, X):
        """
        Return the predictions for every row of X.

        Input
        -----
        X, dataframe, must contain a column called 'Sex'.

        Returns
        -------
        The values of the 'Sex' column as an array.
        """
        sex_column = X['Sex']
        return sex_column.values

In [18]:
base_model = SexModel()
pred_train = base_model.pred(X_train)

Get accuracy and F1 score on the training set:

In [19]:
accuracy_score(y_train,pred_train)

0.2239057239057239

In [20]:
f1_score(y_train,pred_train)

0.25040650406504067

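NOTE: very low score with this basic model. An accuracy of 0.22 is far below chance for a binary problem, which means the rule is inverted: predicting female = survived would give an accuracy of 1 - 0.224 = 0.776 on the same data.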

## Make basic reg model

This model does not require hyper-parameter tuning, so no cross-validation is needed. We just fit to the training data and the model is ready for evaluation on the test data.

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
lgr = LogisticRegression(C=10000, solver='lbfgs') # high C value makes into normal logistic regression

In [23]:
lgr1 = lgr.fit(X_train,y_train)

#### Model eval

Using test data to eval model perf.

In [24]:
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, roc_curve
from matplotlib import pyplot as plt

class BinClassEval():
    """
    Runs the standard evaluations needed to assess a binary classification model.

    Suggested use is the final evaluation once models have been decided,
    e.g. running on the test set. Built for sklearn-style models: the AUC
    score and the plots need the model to expose either ``predict_proba`` or
    ``decision_function``; F1 and accuracy only need ``predict``.

    Attributes set by the constructor
    ---------------------------------
    proba_preds, continuous scores used for AUC and the curves. Taken from
        predict_proba (positive-class column) when available, otherwise from
        decision_function, otherwise None.
    proba_preds_avail, bool, True only when proba_preds came from predict_proba.
    DF, output of decision_function (only set when the model has one).
    label_preds, output of model.predict(X).
    auc_, f1_, accuracy_, the scores rounded to 3 decimals (auc_ is None when
        no continuous score is available).

    The scores are stored under trailing-underscore names so that they do not
    overwrite the AUC(), F1() and accuracy() methods, which can therefore be
    called again after construction.
    """


    def __init__(self, model, X, y, plot_title='Model Evaluation', plot = False):
        """
        Input
        -----
        model, sklearn model, (that has had .fit() method called already).
        X, df/numpy array, containing features
        y, df/numpy array, containing binary target
        plot_title, str, title used for the figure made by plot_AUC_PR.
        plot, bool, when truthy the PR/ROC figure is drawn straight away.
        """
        #### assign inputs
        self.model = model
        self.X = X
        self.y = y
        self.plot_title = plot_title

        # Defaults for models that expose neither scoring method.
        self.proba_preds = None
        self.proba_preds_avail = False

        #### calc prob and decision functions (where possible).
        ### NOTE: both are assigned to the same proba_preds attribute.
        ### predict_proba is checked last so it takes precedence when both exist.
        if hasattr(model, 'decision_function'):
            self.DF = model.decision_function(X)
            self.proba_preds = self.DF
            print('Model has decision_function.')

        if hasattr(model, 'predict_proba'):
            self.proba_preds = model.predict_proba(X)[:,1]
            self.proba_preds_avail = True
            print('Model has predict_proba.')

        self.label_preds = self.model.predict(X)

        #### run evaluation
        self.AUC()
        self.F1()
        self.accuracy()

        if plot:
            self.plot_AUC_PR()

    def AUC(self):
        """
        Prints and returns the ROC AUC score (stored as self.auc_).

        Uses proba_preds, i.e. probabilities when available and the decision
        function otherwise. Returns None (after printing a message) when the
        model provides neither.
        """
        if self.proba_preds is None:
            self.auc_ = None
            print('AUC: not available (model has neither predict_proba nor decision_function).')
            return None
        # Built-in round() works whether the metric returns a numpy or a plain float.
        score = round(float(roc_auc_score(self.y, self.proba_preds)), 3)
        self.auc_ = score
        print('AUC: ', score)
        return score

    def F1(self):
        "Prints and returns the F1 score of the label predictions (stored as self.f1_)."
        score = round(float(f1_score(self.y, self.label_preds)), 3)
        self.f1_ = score
        print('F1 score: ', score)
        return score

    def accuracy(self):
        "Prints and returns the accuracy of the label predictions (stored as self.accuracy_)."
        score = round(float(accuracy_score(self.y, self.label_preds)), 3)
        self.accuracy_ = score
        print('accuracy: ', score)
        return score

    def confusion_matrix(self):
        "Prints and returns the confusion matrix of the label predictions."
        # Imported locally under an alias: the metric has the same name as this method.
        from sklearn.metrics import confusion_matrix as _confusion_matrix
        matrix = _confusion_matrix(self.y, self.label_preds)
        print(matrix)
        return matrix

    def plot_AUC_PR(self):
        """
        Plots the Precision-Recall curve and the ROC curve side by side in one figure.

        Raises AttributeError when the model provides no continuous score
        (neither predict_proba nor decision_function).
        """
        if self.proba_preds is None:
            raise AttributeError(
                'Cannot plot PR/ROC curves: model has neither predict_proba nor decision_function.')

        precisions, recalls, _ = precision_recall_curve(self.y, self.proba_preds)
        fpr, tpr, _ = roc_curve(self.y, self.proba_preds)

        fig,ax = plt.subplots(1,2,figsize=(9,4))
        fig.suptitle(self.plot_title)
        # prec-recall plot (final point dropped, as in the original plot);
        # the line is labelled so that legend() has an entry to show.
        ax[0].plot(recalls[:-1],precisions[:-1],'g-',label='Precision-Recall curve')
        ax[0].set_ylabel('Precision')
        ax[0].set_xlabel('Recall')
        ax[0].legend(frameon=True,loc='center right')
        ax[0].set_ylim([0,1.1])
        # AUC plot
        ax[1].plot(fpr,tpr)
        ax[1].plot([0,1],[0,1],'k--') # 45 degree reference line
        ax[1].set_xlabel('F positive rate')
        ax[1].set_ylabel('T positve rate')

In [25]:
BinClassEval(lgr1, X_train, y_train)

Model has decision_function.
Model has predict_proba.
AUC:  0.854
F1 score:  0.752
accuracy:  0.81


<__main__.BinClassEval at 0x1d4d2705358>

The above evaluation scores the model on the whole training set after it has been fit on that same set. It is better practice to use a cross-validation score: the training data is split into folds, the model is fit on all but one fold and evaluated on the held-out 'validation' fold, and the scores are averaged over the folds. This means that each model is tested on data that was not used to train it, so it should give a better idea of how the model will generalise to new data.

In [26]:
from sklearn.model_selection import cross_val_score, GridSearchCV

In [29]:
scores = cross_val_score(lgr1, X_train, y_train, cv=4, scoring='accuracy')

def disp_scores(scores):
    """
    Print the mean and standard deviation of a set of scores.

    `scores` must provide mean() and std() methods (e.g. a numpy array of
    cross-validation scores). Nothing is returned.
    """
    for label, stat_name in (('Mean: ', 'mean'), ('Std: ', 'std')):
        print(label, getattr(scores, stat_name)())
    
disp_scores(scores)

Mean:  0.7945045045045045
Std:  0.025328857267352796


In [30]:
scores = cross_val_score(lgr1, X_train, y_train, cv=4, scoring='f1')

def disp_scores(scores):
    """
    Report the average and spread of an array of scores on stdout.

    `scores` is expected to expose mean() and std() (e.g. a numpy array).
    The function only prints; it has no return value.
    """
    average = scores.mean()
    print('Mean: ', average)
    spread = scores.std()
    print('Std: ', spread)
    
disp_scores(scores)

Mean:  0.7344393207436685
Std:  0.018732595805414292


NOTE: accuracy and F1 score have both reduced when we use CV scoring. This is expected, as each fold is scored on data the model was not fitted on, so the score is no longer flattered by overfitting. It is likely a closer estimate of the true performance.

## Log regression with regularisation

If we need to tune hyperparameters then we need to use Cross Validation within the training set.

NOTE: the scores do not change when re-running, even though one might expect them to depend on the CV splitting. This is because passing an integer `cv` makes sklearn build the folds without shuffling, so the same splits are produced on every run.

In [38]:
lgrr = LogisticRegression(solver='liblinear') # C is not set here; the candidate values are supplied by the grid search below

In [39]:
param_grid = {'C':[0.1,1,10,100,1000,10000]}
grid_search = GridSearchCV(lgrr, param_grid, scoring='f1', cv=10, return_train_score=True)

In [40]:
grid_search.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.1, 1, 10, 100, 1000, 10000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=0)

In [41]:
grid_search.best_estimator_

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [42]:
# Print the mean and std of the cross-validated test score for each C value tried.
cvres = grid_search.cv_results_
for mean_score, std_test_score, params in zip(cvres["mean_test_score"], cvres["std_test_score"], cvres["params"]):
    print((mean_score), std_test_score , params)

0.6939070448789496 0.05895751199200433 {'C': 0.1}
0.7140921372539883 0.055690204322775244 {'C': 1}
0.7380263289463946 0.07047613587559015 {'C': 10}
0.7440226395540406 0.07629141689060742 {'C': 100}
0.7440226395540406 0.07629141689060742 {'C': 1000}
0.7440226395540406 0.07629141689060742 {'C': 10000}


NOTE: the optimal 'C' changes when the training and test sets differ (i.e. when the random state is not set). This could be a convergence issue of the logistic regression model (the features are not scaled).


Performance on each fold is available for both the test and train splits:

In [44]:
cvres

{'mean_fit_time': array([0.00419209, 0.00259681, 0.00248897, 0.00278914, 0.002595  ,
        0.00289316]),
 'mean_score_time': array([0.00159838, 0.00079763, 0.00099988, 0.00099483, 0.00099699,
        0.00129437]),
 'mean_test_score': array([0.69390704, 0.71409214, 0.73802633, 0.74402264, 0.74402264,
        0.74402264]),
 'mean_train_score': array([0.70776571, 0.72309439, 0.74392428, 0.7488709 , 0.74929759,
        0.74960654]),
 'param_C': masked_array(data=[0.1, 1, 10, 100, 1000, 10000],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.1},
  {'C': 1},
  {'C': 10},
  {'C': 100},
  {'C': 1000},
  {'C': 10000}],
 'rank_test_score': array([6, 5, 4, 1, 1, 1]),
 'split0_test_score': array([0.66666667, 0.71428571, 0.71428571, 0.71428571, 0.71428571,
        0.71428571]),
 'split0_train_score': array([0.7025641 , 0.72361809, 0.75425791, 0.75662651, 0.75662651,
        0.75662651]),
 'split1_test_score': a

## Make SVC with model selection

In [45]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
# The grid holds a single candidate (linear kernel, C=100), so the search below
# evaluates one configuration with 3-fold cross-validation scored on F1.
parameters = {'kernel':['linear'], 'C':[100]}
svc = svm.SVC(gamma="scale")
clf2 = GridSearchCV(svc, parameters, cv=3, scoring='f1')
clf2.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [100], 'kernel': ['linear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [46]:
# Print the mean and std of the cross-validated test score for each parameter setting tried.
cvres = clf2.cv_results_
for mean_score, std_test_score, params in zip(cvres["mean_test_score"], cvres["std_test_score"], cvres["params"]):
    print((mean_score), std_test_score , params)

0.712117971242059 0.012689565237110768 {'kernel': 'linear', 'C': 100}


In [47]:
BinClassEval(clf2, X_train, y_train, 'Training Set Eval')

Model has decision_function.
AUC:  0.841
F1 score:  0.715
accuracy:  0.783


<__main__.BinClassEval at 0x1d4d31e2e48>

In [55]:
scores = cross_val_score(clf2, X_train, y_train, cv=3, scoring='f1')

def disp_scores(scores):
    """
    Show summary statistics (mean, then standard deviation) for `scores`.

    `scores` needs mean() and std() methods, as a numpy array has.
    Output goes to stdout; the function returns nothing.
    """
    statistics = (('Mean: ', 'mean'), ('Std: ', 'std'))
    for heading, method_name in statistics:
        value = getattr(scores, method_name)()
        print(heading, value)
    
disp_scores(scores)

Mean:  0.7121179712420589
Std:  0.012689565237110768


NOTE: we would now compare the mean score and std of each model's performance and pick one or two suitable candidates for evaluation on the test set.

# Eval on test set

Slight drop in F1 for the non-regularised logistic regression (from 0.73 in cross-validation):

In [59]:
BinClassEval(lgr1, X_test, y_test)

Model has decision_function.
Model has predict_proba.
AUC:  0.861
F1 score:  0.724
accuracy:  0.812


<__main__.BinClassEval at 0x1d4d320ddd8>

Regularised LogR model sees a drop in f1 from 0.74

In [52]:
BinClassEval(grid_search, X_test, y_test)

Model has decision_function.
Model has predict_proba.
AUC:  0.861
F1 score:  0.72
accuracy:  0.809


<__main__.BinClassEval at 0x1d4d31e24a8>

SVM sees a slight increase from 0.71:

In [54]:
BinClassEval(clf2, X_test, y_test)

Model has decision_function.
AUC:  0.846
F1 score:  0.726
accuracy:  0.812


<__main__.BinClassEval at 0x1d4d320d860>

# Code to compare multiple models within training set with CV

In [67]:
from sklearn.model_selection import cross_val_score

class CompareBinModels():
    """
    Takes a dict of models and evaluates each on the same set of data.

    Every model is scored on F1 with cross-validation (through
    get_cross_validation_scores, which also decides the number of folds) and
    the mean and standard deviation of the fold scores are collected in the
    `modelevals` dataframe.

    This may take some time to run, as each model is retrained on each fold.
    """
    def __init__(self, models, X, y):
        """
        Input
        ------
        models, dict, name (string) and sklearn model pairs,
        X, df/np array , with features,
        y, df/np array , with target (binary),

        The evaluations are run immediately; results are in self.modelevals.
        """
        #### assign inputs as attributes
        self.models = models
        self.X = X
        self.y = y

        #### run evals
        self.run_evaluations()

    def run_evaluations(self):
        """
        Cross-validates every model in self.models and stores the summary.

        Sets self.modelevals to a dataframe with one row per model and the
        columns 'model_name', 'mean' and 'std' (mean/std of the F1 fold scores).
        """
        print('start run_evals')

        names = []
        means = []
        stds = []

        # Iterate the models given to this instance, not a same-named global.
        for modelname, model in self.models.items():
            scores = get_cross_validation_scores(model, self.X, self.y, 'f1')
            names.append(modelname)
            means.append(np.mean(scores))
            stds.append(np.std(scores))

        #### make model evaluations, scores etc into df
        # Built column-wise so 'mean' and 'std' keep a numeric dtype.
        cols = ['model_name', 'mean', 'std']
        self.modelevals = pd.DataFrame(
            {'model_name': names, 'mean': means, 'std': stds}, columns=cols)

    def compare_models(self):
        "Placeholder: not implemented yet, does nothing and returns None."
        pass
    
def get_cross_validation_scores(model, X, y, scoring='f1', cv=4):
    """
    Calc cross validation scores.

    Input
    -----
    model, sklearn estimator to evaluate,
    X, df/np array, with features,
    y, df/np array, with target,
    scoring, str, name of the sklearn scorer to use (default 'f1'),
    cv, number of folds / splitter passed straight to cross_val_score
        (default 4, the value that used to be hard-coded).

    Returns
    -------
    The array of per-fold scores returned by cross_val_score.
    """
    return cross_val_score(model, X, y, cv=cv, scoring=scoring)
    
    

In [68]:
models = {'simple log reg': lgr1, 'reg log reg': grid_search}
evals = CompareBinModels(models, X_train, y_train)

start run_evals




In [69]:
evals.modelevals

Unnamed: 0,model_name,mean,std
0,simple log reg,0.734439,0.0187326
1,reg log reg,0.731617,0.0222615
