# Ensemble learning

In [1]:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
# Build a noisy two-moons toy dataset (500 samples) and hold part of it out
# for testing. Both steps are seeded so a re-run gives the same data.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [3]:
# Voting ensemble over three different model families; each member is seeded.
voting_clf = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(random_state=42)),
        ("rf", RandomForestClassifier(random_state=42)),
        ("svc", SVC(random_state=42)),
    ]
)
# Fitting is the last expression, so the fitted ensemble is the cell output.
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', SVC(random_state=42))])

In [7]:
# Test-set accuracy of each fitted base classifier, one line per model.
for name, clf in voting_clf.named_estimators_.items():
    print(f"{name} = {clf.score(X_test, y_test)}")

lr = 0.864
rf = 0.896
svc = 0.896


In [8]:
# Predict classes with hard voting: the ensemble returns the class predicted
# by the largest group of base classifiers.
# Only the first test instance is passed in here.

voting_clf.predict(X_test[:1])

array([1])

In [9]:
# Individual votes of the fitted base classifiers for the same first test instance.
[estimator.predict(X_test[:1]) for estimator in voting_clf.estimators_]

[array([1]), array([1]), array([0])]

In [10]:
# Score the whole voting ensemble on the test set.
voting_clf.score(X_test, y_test)  # outperforms all individual classifiers

0.912

In [11]:
# Soft voting: vote based on the average of the predicted class probabilities.
# SVC needs `probability=True` to produce probability estimates, so both
# settings are changed on the existing ensemble, which is then refit.
# NOTE: this mutates `voting_clf` in place; after this cell it is a soft-voting model.
voting_clf.set_params(voting="soft", svc__probability=True)
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)  # higher than hard voting

0.92

## Bagging (bootstrap aggregating) and pasting
* training the same algorithm on different random subsets of the training data, sampled with replacement - bagging
* training the same algorithm on different random subsets of the training data, sampled without replacement - pasting

In [12]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [13]:
# Bagging ensemble of 500 decision trees, each trained on 100 instances
# drawn from the training set; n_jobs=-1 uses all available CPU cores.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    n_jobs=-1,
    random_state=42,
)
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, n_jobs=-1, random_state=42)

In [14]:
# Score the fitted bagging ensemble on the held-out test set.
bag_clf.score(X_test, y_test)  # bagging has slightly higher bias than pasting

0.904

### out of bag evaluation
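With bootstrap sampling (drawing $m$ instances with replacement from a training set of size $m$), only about 63% of the training instances are sampled on average for each predictor ($1 - 1/e \approx 0.632$). The remaining 37% of the training instances that are not sampled are called out-of-bag (OOB) instances. Note that they are not the same 37% for every predictor.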

A bagging ensemble can be evaluated using OOB instances, without the need for a separate validation set.

In [17]:
# Rebuild the bagging ensemble with out-of-bag scoring enabled. The cell shows
# the OOB accuracy, which is computed from the training data only.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_  # estimate of the accuracy to expect on the test set

0.896

In [18]:
from sklearn.metrics import accuracy_score

In [19]:
# Actual test-set accuracy, for comparison with the out-of-bag estimate.
accuracy_score(y_true=y_test, y_pred=bag_clf.predict(X_test))

0.92

In [20]:
# OOB decision function for the first three training instances.
# It is available because the underlying algorithm has a predict_proba() method;
# each row holds the per-class values for one instance.
bag_clf.oob_decision_function_[:3]

array([[0.32352941, 0.67647059],
       [0.3375    , 0.6625    ],
       [1.        , 0.        ]])

## Random patches and Random subspaces

Just as we sample a subset of instances for each estimator, we can sample features instead, using the `max_features` and `bootstrap_features` hyperparameters.

Sampling both training instances and features is called the `random patches method`

Keeping all training instances (by setting `bootstrap=False` and `max_samples=1.0`) but sampling features (by setting `bootstrap_features` to `True` and/or `max_features` to a value smaller than 1.0) is called the `random subspaces` method.

## Random Forests

* essentially an ensemble of decision trees trained via the bagging method <br>
* at each node, the best split is searched for among a random subset of the features, rather than among all features

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
# Random forest of 500 trees, each limited to 16 leaf nodes, trained on all
# available CPU cores; its test-set predictions are kept for scoring.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)

In [27]:
# Test-set accuracy of the random forest predictions.
accuracy_score(y_true=y_test, y_pred=y_pred_rf)

0.912

By default, each split considers $\sqrt{n}$ features out of $n$.<br>
This trades a higher bias for a lower variance.

In [25]:
# Bagging ensemble whose trees use the same per-tree limits as the forest above
# (sqrt feature subsampling, at most 16 leaf nodes).
# NOTE: this rebinds `bag_clf` to a new, unfitted estimator; it is not fitted
# anywhere in the visible part of the notebook.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features="sqrt", max_leaf_nodes=16),
    n_estimators=500,
    n_jobs=-1,
    random_state=42,
)

### extremely random trees (extra-trees)
* a random forest whose nodes are split using random thresholds for each candidate feature, instead of searching for the best possible threshold

In [26]:
#extra trees; making trees even more random 
from sklearn.ensemble import ExtraTreesClassifier

In [30]:
# Seeded extra-trees classifier with otherwise default settings; keep its
# test-set predictions for scoring.
extra_clf = ExtraTreesClassifier(random_state=42)
extra_clf.fit(X_train, y_train)
y_pred_extra = extra_clf.predict(X_test)

In [31]:
# Test-set accuracy of the extra-trees predictions.
# Arguments follow the accuracy_score(y_true, y_pred) signature, matching the
# other accuracy_score calls in this notebook.
accuracy_score(y_test, y_pred_extra)

0.88

In [32]:
# Constructs (and displays) a random forest with bootstrapping turned off; the
# instance is not assigned to a name, so it is not used by any other cell.
# NOTE(review): the saved output of this cell does not match this expression
# (it shows a bound get_params method of a different forest) - re-run to refresh.
RandomForestClassifier(bootstrap=False)

<bound method BaseEstimator.get_params of RandomForestClassifier(max_leaf_nodes=16, n_estimators=500, n_jobs=-1,
                       random_state=42)>

In [33]:
# Bagging ensemble of 500 trees that use random split thresholds
# (splitter="random"), sqrt feature subsampling and at most 16 leaf nodes.
ex_clf = BaggingClassifier(
    DecisionTreeClassifier(
        max_features="sqrt",
        max_leaf_nodes=16,
        splitter="random",
    ),
    n_estimators=500,
    n_jobs=-1,
    random_state=42,
)

In [34]:
# Train the random-splitter bagging ensemble, then report its test-set accuracy.
ex_clf.fit(X_train, y_train)
accuracy_score(y_true=y_test, y_pred=ex_clf.predict(X_test))

0.912

### feature importance

In [35]:
from sklearn.datasets import load_iris

# Load iris with as_frame=True so the feature names are available as
# DataFrame columns, then fit a 500-tree forest on the full dataset.
# NOTE: this rebinds `rnd_clf`, replacing the forest fitted on the moons data.
iris = load_iris(as_frame=True)
rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42)
rnd_clf.fit(iris.data, iris.target)

RandomForestClassifier(n_estimators=500, random_state=42)

In [37]:
# Print each iris feature next to its importance in the fitted forest,
# rounded to two decimals.
for name, score in zip(iris.data.columns, rnd_clf.feature_importances_):
    print(f"{name} : {round(score, 2)}")

sepal length (cm) : 0.11
sepal width (cm) : 0.02
petal length (cm) : 0.44
petal width (cm) : 0.42
