# Ensemble learning

In [1]:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
X, y = make_moons(n_samples = 500, noise = 0.30, random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42 )

In [3]:
voting_clf = VotingClassifier(
    estimators = [('lr', LogisticRegression(random_state =42)),
                 ('rf', RandomForestClassifier(random_state = 42)),
                 ('svc', SVC(random_state=42))]
)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', SVC(random_state=42))])

In [4]:
for name, clf in voting_clf.named_estimators_.items():
    print(name, "=", clf.score(X_test, y_test))

lr = 0.864
rf = 0.896
svc = 0.896


In [5]:
#predicting classes (hard voting) - voting based on largest group of predictions

voting_clf.predict(X_test[:1])

array([1])

In [6]:
[clf.predict(X_test[:1]) for clf in voting_clf.estimators_]

[array([1]), array([1]), array([0])]

In [7]:
#scoring
voting_clf.score(X_test, y_test) #outperforms all individual classifiers

0.912

In [8]:
#soft voting - voting based on avg probabilites 
voting_clf.voting = "soft"
voting_clf.named_estimators["svc"].probability = True
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test) #higher than hard voting

0.92

## Bagging (bootstrap aggregating) and pasting
* training same algorithms on different subsets of data (with replacement) - bagging
* training same algorithms on different subsets of data (w/o replacement) - pasting

In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [10]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators = 500,
                           max_samples = 100, n_jobs = -1, random_state = 42)
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, n_jobs=-1, random_state=42)

In [11]:
bag_clf.score(X_test, y_test) #bagging has slightly higher bias than pasting

0.904

### out of bag evaluation
Mathematically only about 63% of the training instances are sampled on an average for each predictor. The remaining 37% of the training instances that are not sampled are called out-of-bag(OOB) instances.

A bagging ensemble can be evaluated using OOB instances, without the need for a separate validation se

In [12]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, 
                           oob_score = True, n_jobs = -1, random_state=42)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_ #classifier is likely to achieve about 89.6% accuracy on test set

0.896

In [13]:
from sklearn.metrics import accuracy_score

In [14]:
accuracy_score(y_test, bag_clf.predict(X_test))

0.92

In [15]:
# OOB decision function (because underlying algo has predict_proba() method)
bag_clf.oob_decision_function_[:3]

array([[0.32352941, 0.67647059],
       [0.3375    , 0.6625    ],
       [1.        , 0.        ]])

## Random patches and Random subspaces

just as we choose subset of instances for each estimator, we can choose features instead with `max_features` and `bootstrap_features` hyperparameters.

Sampling both training instances and features is called the `random patches method`

Keeping all training instances (by setting `bootstrap=False` and `max_samples=1.0`) but sampling features (by setting bootstrap_features to True and/or max_features to a value smaller than 1.0) is called the `random subspaces` method.⁠

## Random Forests

* same as training an ensemble classifier with decision trees via bagging method <br>
* at each node only a random subset of the 'best' features is considered for splitting

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
rnd_clf = RandomForestClassifier(n_estimators = 500, max_leaf_nodes = 16,
                                n_jobs = -1, random_state = 42)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)

In [18]:
accuracy_score(y_test, y_pred_rf)

0.912

by default uses $\sqrt{n}$ features out of n.<br>
Trades higher bias for a lower variance

In [19]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(max_features = "sqrt", max_leaf_nodes = 16),
                           n_estimators = 500, n_jobs = -1, random_state = 42)

### extremely random trees (extra-trees)
* random forest with splitting nodes using random thresholds for each features

In [20]:
#extra trees; making trees even more random 
from sklearn.ensemble import ExtraTreesClassifier

In [21]:
extra_clf = ExtraTreesClassifier(random_state=42,)
extra_clf.fit(X_train, y_train)
y_pred_extra  = extra_clf.predict(X_test)

In [22]:
accuracy_score(y_pred_extra, y_test)

0.88

In [23]:
RandomForestClassifier(bootstrap=False,)

RandomForestClassifier(bootstrap=False)

In [24]:
ex_clf = BaggingClassifier(DecisionTreeClassifier(max_features = "sqrt", max_leaf_nodes = 16, splitter='random'),
                           n_estimators = 500, n_jobs = -1, random_state = 42)

In [25]:
ex_clf.fit(X_train, y_train)
accuracy_score(y_test, ex_clf.predict(X_test))

0.912

### feature importance

In [26]:
from sklearn.datasets import load_iris
iris = load_iris(as_frame = True)
rnd_clf = RandomForestClassifier(n_estimators=500, random_state = 42)
rnd_clf.fit(iris.data, iris.target)

RandomForestClassifier(n_estimators=500, random_state=42)

In [27]:
for score,name in zip(rnd_clf.feature_importances_,iris.data.columns):
    print(name, ":", round(score,2))

sepal length (cm) : 0.11
sepal width (cm) : 0.02
petal length (cm) : 0.44
petal width (cm) : 0.42


## Boosting

Boosting (originally called hypothesis boosting) refers to any Ensemble method that
can combine several weak learners into a strong learner. The general idea of most
boosting methods is to train predictors sequentially, each trying to correct its predecessor

### 1. Adaptive Boosting (AdaBoost)

One way for a new predictor to correct its predecessor is to pay a bit more attention
to the training instances that the predecessor underfitted. This results in new predic‐
tors focusing more and more on the hard cases.

In [28]:
# stagewise additive modeling using a multiclass Exponential loss function
# SAMME and SAMME.R

from sklearn.ensemble import AdaBoostClassifier

In [29]:
ada_clf = AdaBoostClassifier(
DecisionTreeClassifier(max_depth = 1), n_estimators= 200,
algorithm = "SAMME.R", learning_rate = 0.5)

In [30]:
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

### 2. Gradient Boosting

In [44]:
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

In [32]:
from sklearn.tree import DecisionTreeRegressor

In [45]:
tree_reg1 = DecisionTreeRegressor(max_depth = 2)
tree_reg1.fit(X,y)

DecisionTreeRegressor(max_depth=2)

In [46]:
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth = 2)
tree_reg2.fit(X,y2)

DecisionTreeRegressor(max_depth=2)

In [47]:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth = 2)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(max_depth=2)

In [49]:
X_new = np.array([[0.8]])

In [50]:
y_pred = [sum(tree.predict(X_new)) for tree in (tree_reg1, tree_reg2, tree_reg3)]

In [40]:
from sklearn.ensemble import GradientBoostingRegressor

In [41]:
gbrt = GradientBoostingRegressor(max_depth = 2, n_estimators = 3, learning_rate = 1.0)
gbrt.fit(X,y)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

In [42]:
# finding the optimal number of estimators in Gradient Boost; early stopping
import numpy as np
from sklearn.metrics import mean_squared_error

In [51]:
X_train, X_val, y_train, y_val = train_test_split(X,y)


In [53]:
gbrt = GradientBoostingRegressor(max_depth = 2, n_estimators = 120)
gbrt.fit(X_train,y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=120)

In [54]:
errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]

In [55]:
bst_n_estimators = np.argmin(errors)

In [56]:
gbrt_best = GradientBoostingRegressor(max_depth =2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=52)

In [58]:
# xgboost
import xgboost

  from pandas import MultiIndex, Int64Index


In [59]:
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [60]:
y_pred = xgb_reg.predict(X_val)

## Stacking

In [61]:
from sklearn.ensemble import StackingClassifier

stacking_clf = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', SVC(probability=True, random_state=42))
    ],
    final_estimator=RandomForestClassifier(random_state=43),
    cv=5  # number of cross-validation folds
)
stacking_clf.fit(X_train, y_train)

ValueError: Unknown label type: 'continuous'