# Boosting Machine Learning Algorithm

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

In [None]:
# Load the 'titanic' example dataset through seaborn's dataset loader.
df = sns.load_dataset('titanic')

In [None]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

In [None]:
# Preview the first few rows to see which columns are available.
df.head()

In [None]:
# Drop only the rows that lack a value the model actually uses (the three
# features plus the target). A bare dropna() would also throw away every
# passenger with a gap in any *unrelated* column, needlessly shrinking the
# training data.
# NOTE(review): compare df.shape before/after to confirm how many rows are kept.
df.dropna(subset=['pclass', 'sex', 'age', 'survived'], inplace=True)

In [None]:
# Distinct values present in the 'pclass' column.
df['pclass'].unique()

In [None]:
# Number of rows per 'pclass' value.
df['pclass'].value_counts()

In [None]:
# Distinct values present in the 'sex' column (encoded numerically further down).
df['sex'].unique()

In [None]:
# Number of rows per 'sex' value.
df['sex'].value_counts()

In [None]:
# Histogram of 'age' using 50 bins. The trailing semicolon keeps the notebook
# from echoing the repr of the returned plot object under the figure.
df['age'].hist(bins=50);

## Data Pre-processing

In [None]:
# Feature matrix: passenger class, sex and age.
# Take an explicit copy so that the later in-place re-encoding of X['sex']
# writes to an independent frame instead of a slice of df (which triggers
# pandas' SettingWithCopyWarning and leaves it ambiguous whether df changes).
X = df[['pclass', 'sex', 'age']].copy()

In [None]:
from sklearn import preprocessing
# Encoder used below to turn the 'sex' column into a numeric feature.
lb = preprocessing.LabelBinarizer()

In [None]:
# Replace the 'sex' column with its LabelBinarizer encoding (fit and transform
# in a single step, so `lb` remembers the classes it saw).
X['sex'] = lb.fit_transform(X['sex'])

In [None]:
# Preview the feature matrix after encoding 'sex'.
X.head()

In [None]:
# (rows, columns) of the feature matrix.
X.shape

In [None]:
# Summary statistics of the feature columns.
X.describe()

In [None]:
# Column dtypes and non-null counts -- a quick check that no feature is
# left with missing values or an unexpected type.
X.info()

In [None]:
# Target vector: the 'survived' column of the cleaned frame.
y = df['survived']

In [None]:
# Class balance of the target.
y.value_counts()

***

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing. The seed is fixed so the split is the
# same on every run; without it each execution of the notebook evaluates on a
# different random partition and results cannot be compared between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report and confusion matrix of a classifier.

    Args:
        clf: Fitted classifier exposing ``predict``. When ``train`` is truthy
            it is also handed to ``cross_val_score``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.
        train: When truthy, report on the training split and additionally
            print the mean and standard deviation of a 10-fold
            cross-validated accuracy. Any falsy value reports on the test
            split instead.

    Returns:
        None. All results are written to stdout.
    """
    if train:
        header, features, labels = "Train Result:\n", X_train, y_train
    else:
        header, features, labels = "Test Result:\n", X_test, y_test

    # Predict once and reuse the result for all three metrics instead of
    # re-running inference for each of them.
    predictions = clf.predict(features)

    print(header)
    print("accuracy score: {0:.4f}\n".format(accuracy_score(labels, predictions)))
    print("Classification Report: \n {}\n".format(classification_report(labels, predictions)))
    print("Confusion Matrix: \n {}\n".format(confusion_matrix(labels, predictions)))

    if train:
        # Cross-validation is only meaningful on the training data; the test
        # split is scored once, as-is.
        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        

***

# Boosting (Hypothesis Boosting)

* Combine several weak learners into a strong learner. 

* Train predictors sequentially, each one trying to correct the errors of its predecessor.

# AdaBoost / Adaptive Boosting

[Robert Schapire](http://rob.schapire.net/papers/explaining-adaboost.pdf)

[Wikipedia](https://en.wikipedia.org/wiki/AdaBoost)

[Chris McCormick](http://mccormickml.com/2013/12/13/adaboost-tutorial/)

[Scikit Learn AdaBoost](http://scikit-learn.org/stable/modules/ensemble.html#adaboost)

AdaBoost was introduced by Yoav Freund and Robert Schapire in 1995.

As above for Boosting:
* Similar to human learning, the algo learns from past mistakes by focusing more on difficult problems it did not get right in prior learning. 
* In machine learning speak, it pays more attention to the training instances that the previous predictors underfitted (i.e., got wrong).

Source: Scikit-Learn:

* Fit a sequence of weak learners (i.e., models that are only slightly better than random guessing, such as small decision trees) on repeatedly modified versions of the data. 
* The predictions from all of them are then combined through a weighted majority vote (or sum) to produce the final prediction.
* The data modifications at each so-called boosting iteration consist of applying weights $w_1, w_2, \ldots, w_N$ to each of the training samples. 
* Initially, those weights are all set to $w_i = 1/N$, so that the first step simply trains a weak learner on the original data. 
* For each successive iteration, the sample weights are individually modified and the learning algorithm is reapplied to the reweighted data. 
* At a given step, those training examples that were incorrectly predicted by the boosted model induced at the previous step have their weights increased, whereas the weights are decreased for those that were predicted correctly. 
* As iterations proceed, examples that are difficult to predict receive ever-increasing influence. Each subsequent weak learner is thereby forced to concentrate on the examples that are missed by the previous ones in the sequence.



In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost with the library defaults: no base estimator or hyper-parameters
# are specified here.
ada_clf = AdaBoostClassifier()

In [None]:
# Train the boosted ensemble on the training split only.
ada_clf.fit(X_train, y_train)

Scikit-Learn's `AdaBoostClassifier` uses a multiclass version of AdaBoost called [SAMME](https://web.stanford.edu/~hastie/Papers/samme.pdf) (Stagewise Additive Modeling using a Multiclass Exponential loss function).

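In the `SAMME.R` variant, the R stands for *Real*: it relies on predicted class probabilities rather than hard class predictions.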

In [None]:
# Training-split report plus 10-fold cross-validated accuracy for the default AdaBoost model.
print_score(ada_clf, X_train, y_train, X_test, y_test, train=True)

In [None]:
# Held-out test report for the default AdaBoost model.
print_score(ada_clf, X_train, y_train, X_test, y_test, train=False)

## AdaBoost with Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Boost a random forest instead of the default base learner: the first
# positional argument of AdaBoostClassifier is the estimator to be boosted.
ada_clf = AdaBoostClassifier(RandomForestClassifier())

In [None]:
# Train the random-forest-based AdaBoost ensemble on the training split.
ada_clf.fit(X_train, y_train)

In [None]:
# Training-split report plus 10-fold cross-validated accuracy for AdaBoost over random forests.
print_score(ada_clf, X_train, y_train, X_test, y_test, train=True)

In [None]:
# Held-out test report for AdaBoost over random forests.
print_score(ada_clf, X_train, y_train, X_test, y_test, train=False)

***

In [None]:
# Pass the base learner positionally: the `base_estimator=` keyword was renamed
# to `estimator=` in scikit-learn 1.2 and removed in 1.4, so spelling out either
# keyword breaks on some releases. The first positional argument is the
# estimator to boost on both old and new versions.
ada_clf = AdaBoostClassifier(RandomForestClassifier())

In [None]:
# Re-train the random-forest-based AdaBoost ensemble on the training split.
ada_clf.fit(X_train, y_train)

In [None]:
# Training-split report plus 10-fold cross-validated accuracy for the re-trained ensemble.
print_score(ada_clf, X_train, y_train, X_test, y_test, train=True)

In [None]:
# Held-out test report for the re-trained ensemble.
print_score(ada_clf, X_train, y_train, X_test, y_test, train=False)

***

***