# Chapter 7. Ensemble Learning and Random Forests

##  Voting Classifiers

Voting Classifiers combine classifiers and predicts the class with majority votes 

In [1]:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
# data splitting
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# voting classifier model
voting_clf = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', SVC(random_state=42))
    ]
)

voting_clf.fit(X_train, y_train)  # fit data

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', SVC(random_state=42))])

In [3]:
# accessing the results of the classifiers
for name, clf in voting_clf.named_estimators_.items():
    print(name, "=", clf.score(X_test, y_test))

lr = 0.864
rf = 0.896
svc = 0.896


In [4]:
# voting clf prediction
voting_clf.predict(X_test[:1])

array([1], dtype=int64)

In [5]:
[clf.predict(X_test[:1]) for clf in voting_clf.estimators_]

[array([1], dtype=int64), array([1], dtype=int64), array([0], dtype=int64)]

In [6]:
# accuracy on the classifier 
voting_clf.score(X_test, y_test)

0.912

The voting classifier performed well compared to the 3 classifier

**using the `predict_proba()` method**:

In [7]:
voting_clf.voting = "soft"
voting_clf.named_estimators['svc'].probability = True
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)

0.92

92% accuracy for soft voting. not bad! 

## Bagging and Pasting

This `BaggingClassifier` trains an ensemble of 500 `DecisionTreeClassifier`: with each trained on 100 training instances randomly sampled from the training set

In [8]:
from sklearn.ensemble import BaggingClassifier 
from sklearn.tree import DecisionTreeClassifier 

bagging_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, 
                                max_samples=100, random_state=42)
bagging_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, random_state=42)

In [9]:
bagging_clf.score(X_test, y_test)

0.904

**Pasting:** 

In [10]:
pasting_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, 
                                max_samples=100, bootstrap=False, random_state=42)
pasting_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_samples=100, n_estimators=500, random_state=42)

In [11]:
pasting_clf.score(X_test, y_test)

0.92

Pasting achieved a score of 92%

## Out-Of-Bag Evaluation

Gives you the likely estimate of the models accuracy

In [12]:
bagging_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, 
                                max_samples=100, oob_score=True, random_state=42)
bagging_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, oob_score=True, random_state=42)

In [13]:
bagging_clf.oob_score_

0.9253333333333333

In [14]:
from sklearn.metrics import accuracy_score
y_pred = bagging_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.904

The oob was 2% higher.

In [15]:
# decison_function()
bagging_clf.oob_decision_function_[:3]  # probas for the first 3 instances

array([[0.35579515, 0.64420485],
       [0.43513514, 0.56486486],
       [1.        , 0.        ]])

The oob evaluation estimates that the first training
instance has a 64.2% probability of belonging to the positive class, and 35.5% of
belonging to the negative class

## Random Patches and Random Subspaces

Sampling both training instances and features is called the **Random Patches method**. Keeping all training instances (by setting `bootstrap=False` and `max_samples=1.0`) but sampling features (by setting `bootstrap_features=True` and/or `max_features < 1.0`) is called the **Random Subspaces method**.

## Random Forests

In [16]:
from sklearn.ensemble import RandomForestClassifier 

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16,
                                n_jobs=-1, random_state=42)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)

In [17]:
print(accuracy_score(y_test, y_pred_rf))

0.912


Equivalent model:

In [18]:
bagging_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features="sqrt", max_leaf_nodes= 16),
                          n_estimators=500, n_jobs=-1, random_state=42)
bagging_clf.fit(X_train, y_train)
y_pred_bag_clf = bagging_clf.predict(X_test)

In [19]:
print(accuracy_score(y_test, y_pred_bag_clf))

0.912


## Feature Importance

The following code trains a RandomForestClassifier on the iris dataset and outputs each feature’s importance.

In [20]:
from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)  # load data 
X_iris, y_iris = iris.data, iris.target  # split data

# create and fit data in model
rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42)
rnd_clf.fit(X_iris, y_iris)

RandomForestClassifier(n_estimators=500, random_state=42)

In [21]:
for score, name in zip(rnd_clf.feature_importances_, X_iris.columns):
    print(round(score, 2), name)

0.11 sepal length (cm)
0.02 sepal width (cm)
0.44 petal length (cm)
0.42 petal width (cm)


Random Forest helps interpret **feature importances** and aids in feature selection 

## Boosting

The general idea of most boosting methods is to train predictors sequentially, each trying to correct its predecessor.

### AdaBoost

In [22]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=30,
    learning_rate=0.5, random_state=42)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=30, random_state=42)

In [23]:
ada_clf.score(X_test, y_test)

0.904

### Gradient Boosting 

In [24]:
import numpy as np 
from sklearn.tree import DecisionTreeRegressor 

np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3 * X[:, 0] ** 2 + 0.05 * np.random.randn(100)  # quadratic function with Gaussian noise

tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)

DecisionTreeRegressor(max_depth=2, random_state=42)

A second DecisionTreeRegressor on the residual errors made
by the first predictor

In [25]:
y2= y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=43)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(max_depth=2, random_state=43)

A third regressor on the residual errors made by the second predictor

In [26]:
y3= y - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=44)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(max_depth=2, random_state=44)

In [27]:
X_new = np.array([[-0.4], [0.], [0.5]])

An ensemble containing three trees

In [28]:
sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

array([0.97559364, 0.1703819 , 1.08692075])

The ensemble’s predictions gradually get better as trees are added to the ensemble

In [29]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=5,
                                 learning_rate=1, random_state=42)
gbrt.fit(X, y)

GradientBoostingRegressor(learning_rate=1, max_depth=2, n_estimators=5,
                          random_state=42)

In [30]:
gbrt.score(X, y)

0.9385421713141573

In [31]:
gbrt2 = GradientBoostingRegressor(max_depth=2, n_estimators=100,
                                 learning_rate=0.05,random_state=42)
gbrt2.fit(X, y)

GradientBoostingRegressor(learning_rate=0.05, max_depth=2, random_state=42)

In [32]:
gbrt2.score(X, y)

0.9804226792230585

In [33]:
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=500,
                                 learning_rate=0.05, n_iter_no_change=10, random_state=42)
gbrt_best.fit(X, y)

GradientBoostingRegressor(learning_rate=0.05, max_depth=2, n_estimators=500,
                          n_iter_no_change=10, random_state=42)

In [34]:
gbrt_best.n_estimators_

92

### Histogram-Based Gradient Boosting

Scikit-Learn also provides another GBRT implementation, optimized for large datasets: Histogram-based Gradient Boosting (HGB). It works by binning the inputs features,
replacing them with integers

Doesn't require more preprocessing since it supports missing values and categorical features.
**Code Example:**

In [35]:
from sklearn.pipeline import make_pipeline 
from sklearn.compose import make_column_transformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import OrdinalEncoder

In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("datasets/housing.csv")
                 
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)
housing_labels = train_set["median_house_value"]
housing = train_set.drop("median_house_value", axis=1)               

In [41]:
hgb_reg = make_pipeline(
    make_column_transformer((OrdinalEncoder(), ['ocean_proximity']),
                           remainder="passthrough"),
    HistGradientBoostingRegressor(categorical_features=[0], random_state=42)
)
hgb_reg.fit(housing, housing_labels)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ordinalencoder',
                                                  OrdinalEncoder(),
                                                  ['ocean_proximity'])])),
                ('histgradientboostingregressor',
                 HistGradientBoostingRegressor(categorical_features=[0],
                                               random_state=42))])

## Stacking 

Unlike Voting Classifier, trains a model to perform this aggregation.

In [42]:
from sklearn.ensemble import StackingClassifier 

stacking_clf = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', SVC(probability=True, random_state=42))
    ],
    final_estimator=RandomForestClassifier(random_state=43),
    cv=5  # number of cross-validdation folds
)
stacking_clf.fit(X_train, y_train)

StackingClassifier(cv=5,
                   estimators=[('lr', LogisticRegression(random_state=42)),
                               ('rf', RandomForestClassifier(random_state=42)),
                               ('svc', SVC(probability=True, random_state=42))],
                   final_estimator=RandomForestClassifier(random_state=43))

In [43]:
stacking_clf.score(X_test, y_test)

0.928

In [44]:
y_pred_stk = stacking_clf.predict(X_test)

In [45]:
from sklearn.metrics import classification_report

In [46]:
print(classification_report(y_test, y_pred_stk))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93        61
           1       0.95      0.91      0.93        64

    accuracy                           0.93       125
   macro avg       0.93      0.93      0.93       125
weighted avg       0.93      0.93      0.93       125



# Exercises

## 8.

In [47]:
from sklearn.datasets import fetch_openml

X_mnist, y_mnist = fetch_openml('mnist_784', return_X_y=True, as_frame=False)

In [48]:
X_mnist.shape

(70000, 784)

In [50]:
X_train, y_train = X_mnist[:50_000], y_mnist[:50_000]
X_valid, y_valid = X_mnist[50_000:60_000], y_mnist[50_000:60_000]
X_test, y_test = X_mnist[60_000:], y_mnist[60_000:]

In [57]:
# Random Forest
# Extra-Trees
# SVM

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC

rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

et_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)

svm_clf = SVC(max_iter=100, probability=True, random_state=42)

In [58]:
estimators =[rf_clf, et_clf, svm_clf]
for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train, y_train)
    print(f"{estimator} score {estimator.score(X_valid, y_valid)}")

Training the RandomForestClassifier(random_state=42)
RandomForestClassifier(random_state=42) score 0.9736
Training the ExtraTreesClassifier(random_state=42)
ExtraTreesClassifier(random_state=42) score 0.9743
Training the SVC(max_iter=100, probability=True, random_state=42)




SVC(max_iter=100, probability=True, random_state=42) score 0.9397


In [59]:
# voting classifier
voting_clf = VotingClassifier(
    estimators=[
        ('random_forest', rf_clf),
        ('extra_trees', et_clf),
        ('svm', svm_clf)
    ]
)

voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('random_forest',
                              RandomForestClassifier(random_state=42)),
                             ('extra_trees',
                              ExtraTreesClassifier(random_state=42)),
                             ('svm',
                              SVC(max_iter=100, probability=True,
                                  random_state=42))])

In [60]:
voting_clf.score(X_valid, y_valid)

0.9768