<a href="https://colab.research.google.com/github/ArezooAalipanah/machine_learning3/blob/main/HML3_Ch7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ensemble Learning and Random Forests 🍀

In [1]:
# Voting Classifier

from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X,y = make_moons(n_samples = 500, noise = 0.30, random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

voting_clf = VotingClassifier(
    estimators = [
        ("lr", LogisticRegression(random_state = 42)),
        ("rf", RandomForestClassifier(random_state = 42)),
        ("svc", SVC(random_state = 42))
    ]
)
voting_clf.fit(X_train, y_train)

In [2]:
"""
When you fit a VotingClassifier, it clones every estimator and fits the clones.
The original estimators are available via the estimators attribute,
while the fitted clones are available via the estimators_ attribute.
If you prefer a dict rather than a list, you
can use named_estimators or named_estimators_ instead."""

# accuracy for each:

for name, clf in voting_clf.named_estimators_.items():
  print(name, "=", clf.score(X_test, y_test))
  

lr = 0.864
rf = 0.896
svc = 0.896


In [3]:
"""
When you call the voting classifier’s predict() method, it performs hard voting.
 For example, the voting classifier predicts class 1 for the first instance
of the test set, because two out of three classifiers predict that class:
"""
voting_clf.predict(X_test[:1])

array([1])

In [4]:
[clf.predict(X_test[:1]) for clf in voting_clf.estimators_]

[array([1]), array([1]), array([0])]

In [5]:
# the performance of the voting classifiers:
voting_clf.score(X_test, y_test)

0.912

In [6]:
# soft voting
voting_clf.voting = "soft"
voting_clf.named_estimators["svc"].probability = True
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)

0.92

In [7]:
# Baggign and Pasting
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                            max_samples=100, n_jobs=-1, random_state=42)
bag_clf.fit(X_train, y_train)

In [8]:
"""
you can set oob_score=True when creating a BaggingClassifier
to request an automatic OOB evaluation after training. 
The resulting evaluation score is available in the oob_score_ attribute:
"""
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators= 500,
                            oob_score = True, n_jobs = -1, random_state = 42)

bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.896

In [9]:
# Let's Verify
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)

In [10]:
accuracy_score(y_test, y_pred)

0.92

In [11]:
bag_clf.oob_decision_function_[:3] 
# probas for the first 3 instances

array([[0.32352941, 0.67647059],
       [0.3375    , 0.6625    ],
       [1.        , 0.        ]])

In [12]:
"""
The following code trains a random forest classifier with 500 trees, 
each limited to maximum 16 leaf nodes, using all available CPU cores"""

from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators= 500, max_leaf_nodes=16, 
                                 n_jobs = -1, random_state=42)
rnd_clf.fit (X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [13]:
#following BaggingClassifier is equivalent to previous RandomForestClassifier:

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features = "sqrt", max_leaf_nodes = 16),
    n_estimators = 500, n_jobs = -1, random_state = 42)


In [14]:
"""
the following code trains a RandomForestClassifier on the iris dataset and
outputs each feature’s importance. It seems that the most important features are
the petal length (44%) and width (42%), while sepal length and width are rather
unimportant in comparison (11% and 2%, respectively)"""

from sklearn.datasets import load_iris
iris= load_iris(as_frame = True)
rnd_clf = RandomForestClassifier(n_estimators = 500, random_state = 42)
rnd_clf.fit(iris.data, iris.target)
for score, name in zip(rnd_clf.feature_importances_, iris.data.columns):
  print(round(score, 2), name)


0.11 sepal length (cm)
0.02 sepal width (cm)
0.44 petal length (cm)
0.42 petal width (cm)


In [15]:
"""
following code trains an AdaBoost classifier based on 30 decision stumps using
Scikit-Learn’s AdaBoostClassifier class"""

from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=30,
    learning_rate = 0.5, random_state = 42)

ada_clf.fit(X_train, y_train)

In [17]:
"""
a simple regression example, using decision trees as the base predictors; 
gradient tree boosting, or gradient boosted regression trees (GBRT).

a noisy quadratic dataset and fit a DecisionTreeRegressor to it:"""

import numpy as np
from sklearn.tree import DecisionTreeRegressor

np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3 * X[:, 0]**2 + 0.05 * np.random.randn(100) # y = 3x² + Gaussian noise

tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state = 42)
tree_reg1.fit(X, y)

In [18]:
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state = 43)
tree_reg2.fit(X, y2)


In [19]:
#a third regressor on the residual errors made by the second predictor:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth = 2, random_state = 44)
tree_reg3.fit(X, y3)

In [20]:
#we have an ensemble containing three trees. 
#It can make predictions on a new
# instance simply by adding up the predictions of all the trees
X_new = np.array([[-0.4], [0.],[0.5]])
sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

array([0.49484029, 0.04021166, 0.75026781])

In [21]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3,
                                 learning_rate = 1.0, random_state = 42)
gbrt.fit(X, y)

In [22]:
gbrt_best = GradientBoostingRegressor(
    max_depth = 2, learning_rate = 0.05, n_estimators = 500,
    n_iter_no_change=10, random_state = 42)
gbrt_best.fit(X, y)


In [24]:
gbrt_best.n_estimators_

92

In [26]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import OrdinalEncoder
hgb_reg = make_pipeline(
 make_column_transformer((OrdinalEncoder(), ["ocean_proximity"]),
 remainder="passthrough"),
 HistGradientBoostingRegressor(categorical_features=[0], random_state=42)
)
hgb_reg.fit(housing, housing_labels)

NameError: ignored

In [27]:
from sklearn.ensemble import StackingClassifier
stacking_clf = StackingClassifier(
 estimators=[
 ('lr', LogisticRegression(random_state=42)),
 ('rf', RandomForestClassifier(random_state=42)),
 ('svc', SVC(probability=True, random_state=42))
 ],
 final_estimator=RandomForestClassifier(random_state=43),
 cv=5 # number of cross-validation folds
)
stacking_clf.fit(X_train, y_train)
