# Bagging (with scikit-learn)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import sklearn.tree
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from dtreeviz.trees import dtreeviz

# Moons

In [None]:
from sklearn.datasets import make_moons

In [None]:
# Toy data set: 400 noisy points drawn from two interleaved half-moons (seeded).
x, y = make_moons(
    n_samples=400,
    noise=0.3,
    random_state=42,
)

In [None]:
# Quick look at the raw data: first feature vs. second feature, coloured by label.
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y)

In [None]:
# Hold out a test set (train_test_split's default split sizes); seeded so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

In [None]:
# Three different base learners; they are evaluated on their own and also
# combined in the voting ensembles.
knn_clf = KNeighborsClassifier()
logreg_clf = LogisticRegression()
# Seed the tree: scikit-learn randomly permutes the features at each split,
# so an unseeded tree can yield different accuracies and decision boundaries
# from one run to the next.
tree_clf = DecisionTreeClassifier(random_state=42)

In [None]:
# Hard voting: the ensemble predicts the class chosen by the majority of the
# base learners.
voting_clf = VotingClassifier(
    voting='hard',
    estimators=[
        ('lr', logreg_clf),
        ('knn', knn_clf),
        ('tr', tree_clf),
    ],
)

In [None]:
# Train the hard-voting ensemble on the training split.
voting_clf.fit(X=x_train, y=y_train)

In [None]:
# Models to compare: the three base learners plus the hard-voting ensemble.
classifiers = [
    knn_clf,
    logreg_clf,
    tree_clf,
    voting_clf,
]

In [None]:
# Fit every model on the training split and report its test-set accuracy.
for clf in classifiers:
    # fit() returns the estimator itself, so predict() can be chained.
    y_pred = clf.fit(x_train, y_train).predict(x_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

In [None]:
# Soft voting: the ensemble picks the class with the highest summed predicted
# probability across the base learners.
votingsoft_clf = VotingClassifier(
    voting='soft',
    estimators=[
        ('lr', logreg_clf),
        ('knn', knn_clf),
        ('tr', tree_clf),
    ],
)
votingsoft_clf.fit(x_train, y_train)

# Compare the base learners with both voting ensembles.
classifiers = [
    knn_clf,
    logreg_clf,
    tree_clf,
    voting_clf,
    votingsoft_clf,
]

# Fit every model on the training split and report its test-set accuracy.
for clf in classifiers:
    # fit() returns the estimator itself, so predict() can be chained.
    y_pred = clf.fit(x_train, y_train).predict(x_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

# Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging: 500 decision trees, each trained on a bootstrap sample
# (drawn with replacement) of 100 training points.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    # Seed the bootstrap draws (and, through them, the individual trees) so
    # the reported accuracy is reproducible, like the other seeded ensembles
    # in this notebook.
    random_state=42,
)
bag_clf.fit(x_train, y_train)
y_pred = bag_clf.predict(x_test)
print(bag_clf.__class__.__name__, accuracy_score(y_test, y_pred))

In [None]:
# Decision regions of the single decision tree, with the data drawn on top.
tree_display = DecisionBoundaryDisplay.from_estimator(
    tree_clf,
    x,
    response_method="predict",
    cmap="RdBu",
    alpha=0.5,
)

# Draw the samples on the same axes that hold the decision regions.
sns.scatterplot(
    x=x[:, 0],
    y=x[:, 1],
    hue=y,
    palette=['red', 'green'],
    ax=tree_display.ax_,
)

# Place the legend just outside the right edge of the axes.
tree_display.ax_.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
# Decision regions of the bagging ensemble, with the data drawn on top.
bag_display = DecisionBoundaryDisplay.from_estimator(
    bag_clf,
    x,
    response_method="predict",
    cmap="RdBu",
    alpha=0.5,
)

# Draw the samples on the same axes that hold the decision regions.
sns.scatterplot(
    x=x[:, 0],
    y=x[:, 1],
    hue=y,
    palette=['red', 'green'],
    ax=bag_display.ax_,
)

# Place the legend just outside the right edge of the axes.
bag_display.ax_.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
# Bagging with out-of-bag (OOB) evaluation: each tree is scored on the
# training samples that were left out of its bootstrap sample, which gives a
# validation estimate without touching the test set.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
    # Seed the bootstrap draws so the OOB score is reproducible, like the
    # other seeded ensembles in this notebook.
    random_state=42,
)

bag_clf.fit(x_train, y_train)
bag_clf.oob_score_

In [None]:
# Test-set accuracy of the bagging ensemble, for comparison with its
# out-of-bag estimate.
y_pred = bag_clf.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest: 500 trees with at most 16 leaves each, seeded.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)

# fit() returns the estimator itself, so predict() can be chained.
y_pred_rf = rnd_clf.fit(x_train, y_train).predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred_rf)

In [None]:
# Bagging ensemble configured to mimic the RandomForestClassifier above.
# A random forest restricts every split to a random subset of the features
# (sqrt(n_features) by default for classification), whereas a plain
# DecisionTreeClassifier considers all features. Without max_features="sqrt"
# on the base tree this would be ordinary bagging, not an equivalent forest.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features="sqrt", max_leaf_nodes=16),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Train the bagging ensemble and score it on the test split.
# fit() returns the estimator itself, so predict() can be chained.
y_pred_rf = bag_clf.fit(x_train, y_train).predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred_rf)

In [None]:
# Models to compare: base learners, both voting ensembles and the random forest.
classifiers = [
    knn_clf,
    logreg_clf,
    tree_clf,
    voting_clf,
    votingsoft_clf,
    rnd_clf,
]

In [None]:
# Fit every model on the training split and report its test-set accuracy.
for clf in classifiers:
    # fit() returns the estimator itself, so predict() can be chained.
    y_pred = clf.fit(x_train, y_train).predict(x_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

In [None]:
# Decision regions of the random forest, with the data drawn on top.
forest_display = DecisionBoundaryDisplay.from_estimator(
    rnd_clf,
    x,
    response_method="predict",
    cmap="RdBu",
    alpha=0.5,
)

# Draw the samples on the same axes that hold the decision regions.
sns.scatterplot(
    x=x[:, 0],
    y=x[:, 1],
    hue=y,
    palette=['red', 'green'],
    ax=forest_display.ax_,
)

# Place the legend just outside the right edge of the axes.
forest_display.ax_.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

## This also works for our regression example

In [None]:
# Regression input: 100 evenly spaced points covering [0, 10].
x = np.linspace(0, 10, num=100)

In [None]:
# Seed NumPy's global generator so the noise is the same on every run.
np.random.seed(42)
# Gaussian noise: mean 0, standard deviation 1.5, one value per sample.
noise = np.random.normal(loc=0, scale=1.5, size=100)

# Linear target plus noise.
# Alternative (cubic) target kept for experimentation: 0.3*(x-5)**3 + noise
y = x + noise

In [None]:
# Raw regression data as black dots.
plt.plot(x, y, 'ko')

## Straightforward linear regression

In [None]:
# Seeded train/test split of the regression data.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x.reshape(-1, 1),  # single feature as a column: shape (n_samples, 1)
    y,
    random_state=42,
)

In [None]:
# Ordinary least-squares baseline; fit() returns the fitted estimator.
lin_reg = sklearn.linear_model.LinearRegression().fit(x_train, y_train)
# score() reports R^2 on the held-out data.
print('R2 score: ', lin_reg.score(x_test, y_test))

In [None]:
# Training points in black, test points in blue.
plt.plot(x_train, y_train, 'ko')
plt.plot(x_test, y_test, 'bo')
# A straight line is fully determined by its predictions at the two ends
# of the x range.
plt.plot(
    [0, 10],
    lin_reg.predict([[0], [10]]),
)

## Decision Tree

In [None]:
# Single regression tree with default settings; fit() returns the fitted
# estimator.
tree_reg = sklearn.tree.DecisionTreeRegressor().fit(x_train, y_train)
# score() reports R^2 on the held-out data.
print('R2 score: ', tree_reg.score(x_test, y_test))

In [None]:
# Dense grid over [0, 10] (as a column vector) on which to draw the tree's
# prediction curve.
xnew = np.linspace(0, 10, 1000)[:, np.newaxis]
ynew = tree_reg.predict(xnew)

# Training points in black, test points in blue, prediction curve on top.
plt.plot(x_train, y_train, 'ko')
plt.plot(x_test, y_test, 'bo')
plt.plot(xnew, ynew)

## Random Forest

In [None]:
# Random forest regressor with default tree settings, seeded.
rf_reg = sklearn.ensemble.RandomForestRegressor(n_jobs=-1, random_state=42)
rf_reg = rf_reg.fit(x_train, y_train)  # fit() returns the same estimator
# score() reports R^2 on the held-out data.
print('R2 score: ', rf_reg.score(x_test, y_test))

In [None]:
# Dense grid over [0, 10] (as a column vector) on which to draw the forest's
# prediction curve.
xnew = np.linspace(0, 10, 1000)[:, np.newaxis]
ynew = rf_reg.predict(xnew)

# Training points in black, test points in blue, prediction curve on top.
plt.plot(x_train, y_train, 'ko')
plt.plot(x_test, y_test, 'bo')
plt.plot(xnew, ynew)

In [None]:
# Same forest, but with every tree limited to depth 3, seeded.
rf_reg = sklearn.ensemble.RandomForestRegressor(
    max_depth=3,
    n_jobs=-1,
    random_state=42,
)
rf_reg = rf_reg.fit(x_train, y_train)  # fit() returns the same estimator
# score() reports R^2 on the held-out data.
print('R2 score: ', rf_reg.score(x_test, y_test))

In [None]:
# Dense grid over [0, 10] (as a column vector) on which to draw the
# depth-limited forest's prediction curve.
xnew = np.linspace(0, 10, 1000)[:, np.newaxis]
ynew = rf_reg.predict(xnew)

# Training points in black, test points in blue, prediction curve on top.
plt.plot(x_train, y_train, 'ko')
plt.plot(x_test, y_test, 'bo')
plt.plot(xnew, ynew)