In [35]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 

In [36]:
df = sns.load_dataset('titanic')
df.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [37]:
df['pclass'].unique()

array([3, 1, 2], dtype=int64)

Bagging(oob_Score = False)

In [38]:
df['sex'].unique()

array(['male', 'female'], dtype=object)

Data Pre-Processing

In [39]:
subset = df[['pclass', 'sex', 'age', 'survived']].copy()
subset.dropna(inplace=True)

In [40]:
X = subset[['pclass', 'sex', 'age']].copy()

In [41]:
# lb = preprocessing.LabelBinarizer()
le = preprocessing.LabelEncoder()
X['sex'] = le.fit_transform(subset['sex'])
y = subset['survived'].copy()

In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [43]:
def print_score(clf, X_train, X_test, y_train, y_test, train=True):
    '''
    v0.1 Follow the scikit learn library format in terms of input
    print the accuracy score, classification report and confusion matrix of classifier
    '''
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)
    if train:
        '''
        training performance
        '''
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report: \n {}\n".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, clf.predict(X_train))))
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_train),
                                                        lb.transform(clf.predict(X_train)))))
 
        #cv_res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        #print("Average Accuracy: \t {0:.4f}".format(np.mean(cv_res)))
        #print("Accuracy SD: \t\t {0:.4f}".format(np.std(cv_res)))
 
    elif train==False:
        '''
        test performance
        '''
        res_test = clf.predict(X_test)
        print("Test Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report: \n {}\n".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, clf.predict(X_test))))
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_test), lb.transform(res_test))))

In [44]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
print_score(clf, X_train, X_test, y_train, y_test, train=True)
print("\n********************************\n")
print_score(clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.8998

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.94      0.92       294
           1       0.90      0.85      0.87       205

    accuracy                           0.90       499
   macro avg       0.90      0.89      0.90       499
weighted avg       0.90      0.90      0.90       499


Confusion Matrix: 
 [[275  19]
 [ 31 174]]

ROC AUC: 0.8921


********************************

Test Result:

accuracy score: 0.8233

Classification Report: 
               precision    recall  f1-score   support

           0       0.87      0.83      0.85       130
           1       0.76      0.81      0.78        85

    accuracy                           0.82       215
   macro avg       0.81      0.82      0.82       215
weighted avg       0.83      0.82      0.82       215


Confusion Matrix: 
 [[108  22]
 [ 16  69]]

ROC AUC: 0.8213



Bagging False

In [45]:
bag_clf = BaggingClassifier(estimator=clf, n_estimators=20,
                            bootstrap=True, n_jobs=None,
                            random_state=42)
bag_clf.fit(X_train, y_train)
print_score(bag_clf, X_train, X_test, y_train, y_test, train=True)
print("\n********************************\n")
print_score(bag_clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.8717

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.93      0.90       294
           1       0.89      0.79      0.83       205

    accuracy                           0.87       499
   macro avg       0.88      0.86      0.86       499
weighted avg       0.87      0.87      0.87       499


Confusion Matrix: 
 [[274  20]
 [ 44 161]]

ROC AUC: 0.8587


********************************

Test Result:

accuracy score: 0.8326

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.87      0.86       130
           1       0.80      0.78      0.79        85

    accuracy                           0.83       215
   macro avg       0.83      0.82      0.82       215
weighted avg       0.83      0.83      0.83       215


Confusion Matrix: 
 [[113  17]
 [ 19  66]]

ROC AUC: 0.8229



In [46]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
rf_clf.fit(X_train, y_train)
print_score(rf_clf, X_train, X_test, y_train, y_test, train=True)
print("\n******************************\n")
print_score(rf_clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.8998

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.94      0.92       294
           1       0.90      0.85      0.87       205

    accuracy                           0.90       499
   macro avg       0.90      0.89      0.90       499
weighted avg       0.90      0.90      0.90       499


Confusion Matrix: 
 [[275  19]
 [ 31 174]]

ROC AUC: 0.8921


******************************

Test Result:

accuracy score: 0.8233

Classification Report: 
               precision    recall  f1-score   support

           0       0.87      0.83      0.85       130
           1       0.76      0.81      0.78        85

    accuracy                           0.82       215
   macro avg       0.81      0.82      0.82       215
weighted avg       0.83      0.82      0.82       215


Confusion Matrix: 
 [[108  22]
 [ 16  69]]

ROC AUC: 0.8213



In [47]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
params_grid = {"max_depth": [3, None],
               "min_samples_split": [2, 3, 10],
               "min_samples_leaf": [1, 3, 10],
               "bootstrap": [True, False],
               "criterion": ['gini', 'entropy']}
grid_search = GridSearchCV(rf_clf, params_grid,
                       n_jobs=-1, cv=5,
                       verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)
grid_search.best_estimator_.get_params()
print_score(grid_search, X_train, X_test, y_train, y_test, train=True)
print("\n******************************\n")
print_score(grid_search, X_train, X_test, y_train, y_test, train=False)
 

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Train Result:

accuracy score: 0.8156

Classification Report: 
               precision    recall  f1-score   support

           0       0.77      0.97      0.86       294
           1       0.93      0.60      0.73       205

    accuracy                           0.82       499
   macro avg       0.85      0.78      0.79       499
weighted avg       0.84      0.82      0.81       499


Confusion Matrix: 
 [[285   9]
 [ 83 122]]

ROC AUC: 0.7823


******************************

Test Result:

accuracy score: 0.8233

Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.96      0.87       130
           1       0.91      0.61      0.73        85

    accuracy                           0.82       215
   macro avg       0.85      0.79      0.80       215
weighted avg       0.84      0.82      0.81       215


Confusion Matrix: 
 [[125   5]
 [ 33  52]]

ROC AUC: 0.7867



In [48]:
grid_search.best_estimator_.get_params()
 
#EXTRA TREE
 
from sklearn.ensemble import ExtraTreesClassifier
xt_clf = ExtraTreesClassifier(random_state=42, n_estimators=10)
xt_clf.fit(X_train, y_train)
print_score(xt_clf, X_train, X_test, y_train, y_test, train=True)
print("\n******************************\n")
print_score(xt_clf, X_train, X_test, y_train, y_test, train=False)
 

Train Result:

accuracy score: 0.8998

Classification Report: 
               precision    recall  f1-score   support

           0       0.87      0.98      0.92       294
           1       0.97      0.78      0.86       205

    accuracy                           0.90       499
   macro avg       0.92      0.88      0.89       499
weighted avg       0.91      0.90      0.90       499


Confusion Matrix: 
 [[289   5]
 [ 45 160]]

ROC AUC: 0.8817


******************************

Test Result:

accuracy score: 0.8233

Classification Report: 
               precision    recall  f1-score   support

           0       0.83      0.88      0.86       130
           1       0.81      0.73      0.77        85

    accuracy                           0.82       215
   macro avg       0.82      0.81      0.81       215
weighted avg       0.82      0.82      0.82       215


Confusion Matrix: 
 [[115  15]
 [ 23  62]]

ROC AUC: 0.8070

