In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Hard Voting

In [3]:
log_reg = LogisticRegression(solver="liblinear")
svm = SVC(gamma="auto")
random_forest = RandomForestClassifier(n_estimators=10)

In [4]:
voting_clf = VotingClassifier(
                            estimators = [('lr', log_reg), ('rf', random_forest), ('svc', svm)],
                            voting = "hard")

In [5]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='g...
                                        

In [6]:
from sklearn.metrics import accuracy_score

for clf in (log_reg, svm, random_forest, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
SVC 0.888
RandomForestClassifier 0.856
VotingClassifier 0.88


## Soft Voting

In [7]:
log_reg = LogisticRegression(solver="liblinear")
svm = SVC(gamma="auto", probability = True)
random_forest = RandomForestClassifier(n_estimators=10)

In [8]:
voting_clf = VotingClassifier(
                            estimators = [('lr', log_reg), ('rf', random_forest), ('svc', svm)],
                            voting = "soft")

## Bagging

In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
                            DecisionTreeClassifier(), n_estimators = 500,
                            max_samples = 100, bootstrap = True, n_jobs = -1, oob_score = True)

bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
    

In [10]:
bag_clf.predict(X_test)

array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0], dtype=int64)

In [11]:
bag_clf.oob_score_

0.928

In [12]:
from sklearn.metrics import accuracy_score

y_pred = bag_clf.predict(X_test)
accuracy_score(y_pred, y_test)

0.904

In [13]:
bag_clf.oob_decision_function_

array([[0.37726098, 0.62273902],
       [0.39583333, 0.60416667],
       [1.        , 0.        ],
       [0.008     , 0.992     ],
       [0.01902174, 0.98097826],
       [0.11082474, 0.88917526],
       [0.41909814, 0.58090186],
       [0.06005222, 0.93994778],
       [0.93888889, 0.06111111],
       [0.8515625 , 0.1484375 ],
       [0.54636591, 0.45363409],
       [0.0390625 , 0.9609375 ],
       [0.74615385, 0.25384615],
       [0.85449735, 0.14550265],
       [0.89420655, 0.10579345],
       [0.08423913, 0.91576087],
       [0.03439153, 0.96560847],
       [0.92972973, 0.07027027],
       [0.67010309, 0.32989691],
       [0.97043011, 0.02956989],
       [0.05221932, 0.94778068],
       [0.2434555 , 0.7565445 ],
       [0.88579387, 0.11420613],
       [0.99234694, 0.00765306],
       [0.96216216, 0.03783784],
       [0.00529101, 0.99470899],
       [0.98172324, 0.01827676],
       [0.9972752 , 0.0027248 ],
       [0.02717391, 0.97282609],
       [0.7020202 , 0.2979798 ],
       [0.

## Random Forests

In [14]:
rnd_clf = RandomForestClassifier(n_estimators = 500, max_leaf_nodes = 16, n_jobs = -1)
rnd_clf.fit(X_train, y_train)

y_pred = rnd_clf.predict(X_test)

In [15]:
accuracy_score(y_pred, y_test)

0.92

### Important

The following BaggingClassifier is roughly equivalent to the previous RandomForestClassifier:

In [16]:
bag_clf = BaggingClassifier(
                            DecisionTreeClassifier(splitter = "random", max_leaf_nodes = 16),
                            n_estimators = 500, max_samples = 1.0, bootstrap = True, n_jobs = -1)

In [17]:
bag_clf.fit(X_train, y_train)
accuracy_score(bag_clf.predict(X_test), y_test)

0.904

## AdaBoost

In [18]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
                            DecisionTreeClassifier(max_depth=1), n_estimators=200,
                            algorithm="SAMME.R", learning_rate=0.5)

ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

In [19]:
accuracy_score(ada_clf.predict(X_test), y_test)

0.896

## Gradient Boosting

In [20]:
import numpy as np

np.random.seed(42)

X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

In [21]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')

In [22]:
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')

In [23]:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')

In [24]:
X_new = np.array([[0.8]])

In [25]:
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [26]:
y_pred

array([0.75026781])

### Gradient Boosting with early stopping
Instead of performing Gradient Boosting manually, you can use method from sklearn `GradientBoostingRegressor`


In [27]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

X_train, X_val, y_train, y_val = train_test_split(X, y)

gbrt = GradientBoostingRegressor(max_depth = 2, n_estimators = 120)
gbrt.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=120,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [28]:
errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)

In [29]:
import matplotlib.pyplot as plt

plt.plot(range(1, len(errors)+1), errors, '-o', markersize = 2)
plt.ylim([-0.01, 0.07])

(-0.01, 0.07)

In [30]:
gbrt_best = GradientBoostingRegressor(max_depth = 2, n_estimators = bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=79,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

### Another Approach

In [31]:
gbrt = GradientBoostingRegressor(max_depth = 2, warm_start = True)

min_val_error = np.float('inf')
error_going_up = 0

for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_pred, y_val)
    
    if val_error<min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up +=1
        if error_going_up == 5:
            break

In [32]:
min_val_error

0.0026930464329994377

In [33]:
gbrt.n_estimators

59

## Exercise 8

Load the MNIST data (introduced in Chapter 3), and split it into a training set, a validation set, and a
test set (e.g., use 40,000 instances for training, 10,000 for validation, and 10,000 for testing). Then
train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM.
Next, try to combine them into an ensemble that outperforms them all on the validation set, using a
soft or hard voting classifier. Once you have found one, try it on the test set. How much better does it
perform compared to the individual classifiers?

In [34]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784')

In [35]:
X_train_val, X_test, y_train_val, y_test = train_test_split(mnist.data, mnist.target, test_size = 10000)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size = 10000)

In [47]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [48]:
random_forest = RandomForestClassifier(n_estimators = 10)
extra_trees = ExtraTreesClassifier(n_estimators = 10)
svm_clf = LinearSVC()
mlp_clf = MLPClassifier()

In [38]:
estimators = [random_forest, extra_trees, svm_clf]

for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train, y_train)

Training the RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Training the ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
Trainin



In [39]:
[estimator.score(X_val, y_val) for estimator in estimators]

[0.9474, 0.9486, 0.8484]

In [49]:
named_estimators = [
    ('random_forest_clf', random_forest),
    ('ExtraTrees_clf', extra_trees),
    ("mlp_clf", mlp_clf)

]

In [50]:
from sklearn.ensemble import VotingClassifier

vote_clf = VotingClassifier(named_estimators)

In [51]:
vote_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('random_forest_clf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=10,
                                                     n_jobs=No

In [52]:
vote_clf.score(X_val, y_val)

0.9645

In [55]:
vote_clf.voting = "soft"

In [56]:
vote_clf.score(X_val, y_val)

0.9704