# Boosting

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

In [2]:
df = sns.load_dataset('titanic')

In [3]:
df.dropna(inplace=True)

In [4]:
X = df[['pclass', 'sex', 'age']]

In [5]:
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()

In [6]:
X['sex'] = lb.fit_transform(X['sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [7]:
y = df['survived']

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.3)

In [10]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [11]:
def print_score(clf, X_train, X_test, y_train, y_test, train = True):
    '''
    print accuracy score and confusion matrix
    '''
    
    if train:
        '''
        train performance
        '''
        print('Train Result: \n')
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_train, clf.predict(X_train))))
        print('Classification Report: \n {} \n'.format(classification_report(y_train, clf.predict(X_train))))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv = 10, scoring='accuracy')
        print('Average Accuracy: \t {0:.4f}'.format(np.mean(res)))
        print('Average SD: \t\t {0:.4f}'.format(np.std(res)))
        
    elif train == False:
        '''
        test performance
        '''
        print('Test Result \n')
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_test, clf.predict(X_test))))
        print('Classification Report: \n {} \n'.format(classification_report(y_test, clf.predict(X_test))))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_test, clf.predict(X_test))))

## AdaBoost

In [12]:
from sklearn.ensemble import AdaBoostClassifier

In [13]:
ada_clf = AdaBoostClassifier(random_state=42)

In [14]:
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=42)

In [15]:
print_score(ada_clf, X_train, X_test, y_train, y_test, train = True)

Train Result: 

Accuracy Score: 0.9055

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.82      0.84        38
           1       0.92      0.94      0.93        89

   micro avg       0.91      0.91      0.91       127
   macro avg       0.89      0.88      0.89       127
weighted avg       0.90      0.91      0.90       127
 

Confusion Matrix: 
 [[31  7]
 [ 5 84]] 

Average Accuracy: 	 0.7547
Average SD: 		 0.0920


In [17]:
print_score(ada_clf, X_train, X_test, y_train, y_test, train = False)

Test Result 

Accuracy Score: 0.6909

Classification Report: 
               precision    recall  f1-score   support

           0       0.60      0.57      0.59        21
           1       0.74      0.76      0.75        34

   micro avg       0.69      0.69      0.69        55
   macro avg       0.67      0.67      0.67        55
weighted avg       0.69      0.69      0.69        55
 

Confusion Matrix: 
 [[12  9]
 [ 8 26]] 



## AdaBoost with Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
ada_clf = AdaBoostClassifier(RandomForestClassifier())

In [20]:
ada_clf.fit(X_train, y_train)



AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=None)

In [21]:
print_score(ada_clf, X_train, X_test, y_train, y_test, train = True)

Train Result: 

Accuracy Score: 0.9291

Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.84      0.88        38
           1       0.93      0.97      0.95        89

   micro avg       0.93      0.93      0.93       127
   macro avg       0.92      0.90      0.91       127
weighted avg       0.93      0.93      0.93       127
 

Confusion Matrix: 
 [[32  6]
 [ 3 86]] 





















Average Accuracy: 	 0.8043
Average SD: 		 0.1032


In [22]:
print_score(ada_clf, X_train, X_test, y_train, y_test, train = False)

Test Result 

Accuracy Score: 0.7636

Classification Report: 
               precision    recall  f1-score   support

           0       0.65      0.81      0.72        21
           1       0.86      0.74      0.79        34

   micro avg       0.76      0.76      0.76        55
   macro avg       0.76      0.77      0.76        55
weighted avg       0.78      0.76      0.77        55
 

Confusion Matrix: 
 [[17  4]
 [ 9 25]] 



***

# Gradient Boosting Machine (GBM)

In [24]:
from sklearn.ensemble import GradientBoostingClassifier

In [26]:
gbc_clf = GradientBoostingClassifier()
gbc_clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [27]:
print_score(gbc_clf, X_train, X_test, y_train, y_test, train = True)

Train Result: 

Accuracy Score: 0.9291

Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.87      0.88        38
           1       0.94      0.96      0.95        89

   micro avg       0.93      0.93      0.93       127
   macro avg       0.92      0.91      0.91       127
weighted avg       0.93      0.93      0.93       127
 

Confusion Matrix: 
 [[33  5]
 [ 4 85]] 

Average Accuracy: 	 0.8210
Average SD: 		 0.0956


In [28]:
print_score(ada_clf, X_train, X_test, y_train, y_test, train = False)

Test Result 

Accuracy Score: 0.7636

Classification Report: 
               precision    recall  f1-score   support

           0       0.65      0.81      0.72        21
           1       0.86      0.74      0.79        34

   micro avg       0.76      0.76      0.76        55
   macro avg       0.76      0.77      0.76        55
weighted avg       0.78      0.76      0.77        55
 

Confusion Matrix: 
 [[17  4]
 [ 9 25]] 



***

| Classifier | Decision Tree | Bagging | Random Forest | Optimized RF | Extra-Trees | AdaBoost(CART) | AdaBoost(RF) | Gradient Boosting |
|:-|:-|:-|:-|:-|:-|:-|:-|:-|
| Train Accuracy Score | 0.9669 | 0.9669 | 0.9134 | 0.8740 | 0.9370 | 0.9055 | 0.9291 | 0.9291 |
| Average Accuracy Score | 0.8198 | 0.8044 | 0.7478 | 0.7580 | 0.7407 | 0.7547 | 0|.8043 | 0.8210 |
| SD | 0.1217 | 0.1360 | 0.1003 | 0.0836 | 0.0584 |  0.0920 | 0.1032 | 0.0956 |
| Test Accuracy Score | 0.7213 | 0.7541 | 0.7636 | 0.8545 | 0.8182 |  0.6909 | 0.7636 | 0.7636 