In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython.display import Image


# data generation
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

#evaluation
from sklearn.metrics import fbeta_score , accuracy_score , recall_score , precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import learning_curve

# Ensemble Methods
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor

## Vocabulary

__Wisdom of crowds:__

__Ensemble:__

__Ensemble learning:__

__Ensemble method:__

__Weak learner:__

__Strong learner:__

__Hard voting:__

__Soft voting:__

__Bagging:__

__Pasting:__

__Boosting:__

__out of bag instances:__

__oob_score:__

__Feature sampling:__

__Random patches method:__

__Feature importance:__

In [None]:
# Read the CSV into a DataFrame; the path is relative to the working directory.
data_train = pd.read_csv(filepath_or_buffer="clean_charity_ml.csv")

In [None]:
# Split off the 'income' column as the target; every other column is a feature.
X = data_train.drop(columns='income')
y = data_train['income']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Recap 

##### Base Accuracy

In [None]:
# Baseline accuracy: the score of a model that always predicts the most
# frequent class. Assumes `y_train` holds 0/1 labels, so its mean is the share
# of class 1. Taking the larger of the two shares keeps the baseline correct
# whichever class is the majority.
positive_rate = np.mean(y_train)
max(positive_rate, 1 - positive_rate)

##### Single Decision Tree

In [None]:
# Single decision tree as the reference model (despite the `_reg` suffix this
# is a classifier). The seed matches the one used for the train/test split so
# the fitted tree, and every score derived from it, is the same on each run.
dt_reg = DecisionTreeClassifier(random_state=42)
dt_reg.fit(X_train, y_train)

In [None]:
# Score the fitted tree on the held-out rows.
dt_reg.score(X=X_test, y=y_test)

In [None]:
# Keep the tree's test-set predictions for the explicit metric computed next.
y_hat = dt_reg.predict(X=X_test)

In [None]:
# Accuracy of the single tree, computed from the stored predictions.
acc_score = accuracy_score(y_true=y_test, y_pred=y_hat)
# A bare last expression makes the notebook display the value; the assignment
# alone would leave the cell without any output.
acc_score

### Random Forest

In [None]:
# `oob_score=True` makes the fitted forest expose `oob_score_`, which the
# section below asks to inspect; without it that attribute is never created.
# The seed matches the train/test split so results are repeatable.
rf_clf = RandomForestClassifier(oob_score=True, random_state=42)

In [None]:
# Train the forest on the training split.
rf_clf.fit(X=X_train, y=y_train)

#### Check the accuracy and oob_score of the random forest

#### What are the attributes of a Random Forest and what do they mean?


#### Create a Random Forest with more and smaller trees

#### Let's figure out the optimal number of trees

In [None]:
def cumulative_accuracy(predictors, X_test,  y_test, random_state=None):
    """Accuracy of a majority-vote ensemble as members are added one by one.

    The predictors are visited in a random order. After each one is added, the
    predictions collected so far are averaged, thresholded at 0.5 and scored
    against ``y_test``.

    Parameters
    ----------
    predictors : sequence of fitted estimators
        Each must provide ``predict(X)``. Predictions are assumed to be 0/1.
        The sequence itself is left untouched; only a private index
        permutation is drawn, so passing ``forest.estimators_`` is safe.
    X_test : array-like or DataFrame
        Samples to predict.
    y_test : array-like
        True 0/1 labels, same length as ``X_test``.
    random_state : int or None, default None
        Seed for the visiting order. ``None`` draws from NumPy's global random
        state.

    Returns
    -------
    numpy.ndarray of shape (len(predictors),)
        Entry ``i`` is the accuracy of the vote over the first ``i + 1``
        predictors in the drawn order.
    """
    n_predictors = len(predictors)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Permute indices instead of shuffling `predictors`, which would reorder
    # the caller's list in place.
    visiting_order = rng.permutation(n_predictors)

    # Members without `feature_names_in_` (e.g. a forest's sub-trees) get a
    # plain array, so a DataFrame input does not trigger one warning per call.
    X_plain = X_test.to_numpy() if isinstance(X_test, pd.DataFrame) else X_test

    vote_sum = np.zeros(len(y_test))
    acc_score = np.zeros(n_predictors)

    for step, member_idx in enumerate(visiting_order):
        member = predictors[member_idx]
        X_in = X_test if hasattr(member, "feature_names_in_") else X_plain
        vote_sum += member.predict(X_in)

        # Mean vote strictly above 0.5 -> class 1; an exact tie falls to class 0.
        majority_vote = (vote_sum / float(step + 1) > 0.5).astype(int)
        acc_score[step] = accuracy_score(y_test, majority_vote)

    return acc_score

In [None]:
# Draw ten learning curves, each for a different random ordering of the trees.
for k in range(10):
    # Hand over a copy so the fitted forest's own estimator list can never be
    # reordered by the helper.
    scores = cumulative_accuracy(list(rf_clf.estimators_), X_test, y_test)
    # x starts at 1 so the axis reads as "number of trees in the vote".
    plt.plot(np.arange(1, len(scores) + 1), scores)

plt.xlabel('number of trees')
plt.ylabel('test accuracy');

# Ada Boost

In [None]:
# Display the image file figs/BaggingVsBoosting.png (path relative to the working directory).
Image(filename='figs/BaggingVsBoosting.png') 

#### Let's create an AdaBoost Classifier

In [None]:
# AdaBoost with 100 boosting rounds and a learning rate of 0.8.
# NOTE(review): the `algorithm` argument has been deprecated in recent
# scikit-learn releases; confirm against the installed version.
ab_clf = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=0.8,
    algorithm='SAMME',
)

#### staged_score

#### How do the feature importances change?

#### Different ensemble methods create different decision boundaries

In [None]:
# Display the image file figs/DecisionBoundaries.PNG (path relative to the working directory).
Image(filename='figs/DecisionBoundaries.PNG') 

image from https://scikit-learn.org/stable/auto_examples/ensemble/plot_adaboost_twoclass.html#sphx-glr-auto-examples-ensemble-plot-adaboost-twoclass-py

### Let's make our own decision boundaries

In [None]:
def create_blob_data(centers=[(0,0), (0,1), (1,0), (1,1)], sigma=0.3, plot=True,
                     n_samples=100, random_state=None):
    """Generate 2-D Gaussian blobs with a binary label and optionally plot them.

    Points are drawn with ``make_blobs``. A point from cluster ``k`` gets label
    ``int(k % 3 != 0)``, so with the default four corner centers clusters 0 and
    3 form class 0 and clusters 1 and 2 form class 1.

    Parameters
    ----------
    centers : array-like of (x, y) pairs
        Cluster centers passed to ``make_blobs``. The default list is only
        read, never modified.
    sigma : float
        Standard deviation of every cluster.
    plot : bool
        If True, scatter the points on the current matplotlib axes.
    n_samples : int, default 100
        Total number of points (same default as ``make_blobs``).
    random_state : int or None, default None
        Seed forwarded to ``make_blobs`` for reproducible data.

    Returns
    -------
    features : pandas.DataFrame with columns ``x1`` and ``x2``
    targets : pandas.DataFrame with a single column ``y`` of 0/1 labels
    """
    X, y = make_blobs(n_samples=n_samples, centers=centers, cluster_std=sigma,
                      random_state=random_state)
    labels = (y % 3 != 0).astype(int)
    x1, x2 = X[:, 0], X[:, 1]

    if plot:
        # The axis limits are only needed for the plot; pad each axis by 10%.
        x1_lo, x1_hi = x1.min(), x1.max()
        x2_lo, x2_hi = x2.min(), x2.max()
        x1_pad = 0.1 * (x1_hi - x1_lo)
        x2_pad = 0.1 * (x2_hi - x2_lo)

        plt.scatter(x1, x2, c=labels)
        plt.xlabel('$X_1$', size=16)
        plt.xlim(x1_lo - x1_pad, x1_hi + x1_pad)
        plt.ylabel('$X_2$', size=16)
        plt.ylim(x2_lo - x2_pad, x2_hi + x2_pad)

    features = pd.DataFrame({"x1": x1, "x2": x2})
    targets = pd.DataFrame({"y": labels})
    return features, targets

In [None]:
def plot_decision_beundary(features, targets, predictor, res=30):
    """Plot a fitted classifier's predictions over the 2-D feature plane.

    A ``res`` x ``res`` grid is laid over the padded data range, the predictor
    is evaluated on every grid point, and the result is drawn as filled
    contours with the data points scattered on top.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain the columns ``x1`` and ``x2``.
    targets : pandas.DataFrame
        Must contain the column ``y``, used to colour the points.
    predictor : fitted estimator
        Must provide ``predict`` for two-feature input.
    res : int
        Number of grid points along each axis.
    """
    x_min, x_max = features['x1'].min(), features['x1'].max()
    y_min, y_max = features['x2'].min(), features['x2'].max()

    # The axes are padded by 10% per side. Build the grid over that same
    # padded range so the filled contour covers the whole visible area.
    x_pad = 0.1 * (x_max - x_min)
    y_pad = 0.1 * (y_max - y_min)
    x_lo, x_hi = x_min - x_pad, x_max + x_pad
    y_lo, y_hi = y_min - y_pad, y_max + y_pad

    xv, yv = np.meshgrid(np.linspace(x_lo, x_hi, res), np.linspace(y_lo, y_hi, res))
    mesh = np.column_stack((xv.ravel(), yv.ravel()))
    if hasattr(predictor, "feature_names_in_"):
        # A model fitted on a DataFrame expects the same column names at
        # predict time; a bare array would raise a feature-name warning.
        mesh = pd.DataFrame(mesh, columns=predictor.feature_names_in_)
    pred = np.asarray(predictor.predict(mesh))

    cs = plt.contourf(xv, yv, pred.reshape(xv.shape), cmap=plt.cm.coolwarm)

    # Tie the colorbar to the contour plot explicitly.
    plt.colorbar(cs)
    # Dark edges keep the points visible on top of the filled contour.
    plt.scatter(features['x1'], features['x2'], c=targets['y'], edgecolors='k')
    plt.xlabel('$X_1$', size=16)
    plt.xlim(x_lo, x_hi)
    plt.ylabel('$X_2$', size=16)
    plt.ylim(y_lo, y_hi)


# Correctly spelled alias; the original name is kept for existing callers.
plot_decision_boundary = plot_decision_beundary

In [None]:
# Tight clusters (sigma=0.1) give a cleanly separable toy problem.
features, targets = create_blob_data(sigma=0.1)
clf = RandomForestClassifier()
# `targets` is a one-column DataFrame. scikit-learn expects a 1-d target and
# emits a DataConversionWarning for a column vector, so pass the column itself.
clf.fit(features, targets['y'])

In [None]:
# Evaluate the forest on a coarse 10 x 10 grid and draw the result.
plot_decision_beundary(features=features, targets=targets, predictor=clf, res=10)

### Summary: Bagging vs Boosting

There’s not an outright winner; it depends on the data, the simulation and the circumstances. Bagging and Boosting decrease the variance of your single estimate as they combine several estimates from different models. So the result may be a model with higher stability.

If the problem is that the single model gets a very low performance, Bagging will rarely get a better bias. However, Boosting could generate a combined model with lower errors as it optimises the advantages and reduces pitfalls of the single model.

By contrast, if the difficulty of the single model is over-fitting, then Bagging is the best option. Boosting for its part doesn’t help to avoid over-fitting; in fact, this technique is faced with this problem itself. For this reason, Bagging is effective more often than Boosting.

https://quantdare.com/what-is-the-difference-between-bagging-and-boosting/


# Gradient Boost

In [None]:
# Display the image file figs/GradientVsAda.PNG (path relative to the working directory).
Image(filename='figs/GradientVsAda.PNG') 

In [None]:
def make_data(N=100, random_state=None):
    """Sample a noisy parabola ``y = 2 * x**2 + noise`` on ``[-1, 1]``.

    Parameters
    ----------
    N : int
        Number of evenly spaced x values.
    random_state : int or None, default None
        Seed for the Gaussian noise. ``None`` draws from NumPy's global random
        state.

    Returns
    -------
    x : numpy.ndarray of shape (N, 1)
        Column vector, the 2-D layout scikit-learn estimators expect.
    y : numpy.ndarray of shape (N,)
        Parabola values plus zero-mean Gaussian noise with std 0.3.
    """
    x = np.linspace(-1, 1, N)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    noise = rng.normal(loc=0, scale=0.3, size=N)
    y = 2 * (x ** 2) + noise
    return x.reshape(-1, 1), y

In [None]:
# Draw one noisy-parabola sample and look at it.
# NOTE: this rebinds the notebook-level name `y`, which earlier held the
# 'income' target column.
x, y = make_data(N=100)
plt.scatter(x, y)

In [None]:
# Fit a gradient-boosting regressor (the `_clf` suffix notwithstanding) on the
# parabola sample, then plot what a few individual boosting stages predict on
# their own. Index 99 is the last stage under the default of 100 estimators.
gb_clf = GradientBoostingRegressor().fit(x, y)
ests = gb_clf.estimators_
for i in (0, 1, 10, 50, 99):
    # Each row of `estimators_` holds that stage's single regression tree.
    plt.plot(x, ests[i][0].predict(x), label=str(i), linewidth=3)

plt.legend()

### Some more resources

https://quantdare.com/what-is-the-difference-between-bagging-and-boosting/

https://medium.com/datadriveninvestor/understanding-adaboost-and-scikit-learns-algorithm-c8d8af5ace10

https://scikit-learn.org/stable/auto_examples/ensemble/plot_adaboost_twoclass.html#sphx-glr-auto-examples-ensemble-plot-adaboost-twoclass-py

https://www.mygreatlearning.com/blog/gradient-boosting/