<a href="https://colab.research.google.com/github/AashiDutt/Hands-on-Machine-Learning-with-sklearn-keras-and-tensorflow/blob/main/Ensemble_Learning_%26_Random_Forests_Chapter_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Creating an ensemble of 3 classifiers

# Imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC # support vector machine
from sklearn.ensemble import VotingClassifier # used for voting for best classifier


In [2]:
# get data

from sklearn.datasets import make_moons
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

# Build the make_moons dataset (500 samples, noise 0.30) and split it into
# train and test sets. Both calls are seeded so the cell is reproducible.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)


In [3]:
# The three base classifiers that will vote in the ensemble, all seeded.
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", random_state=42)


# test with hard voting i.e  majority voting.



In [4]:
# Hard voting: the ensemble predicts the class chosen by most base classifiers.
named_estimators = [('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)]
voting_clf = VotingClassifier(estimators=named_estimators, voting="hard")

voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', SVC(random_state=42))])

In [5]:
from sklearn.metrics import accuracy_score

# Fit each base classifier and then the ensemble, printing the test accuracy
# of each one next to its class.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(type(clf), accuracy_score(y_test, y_pred))


<class 'sklearn.linear_model._logistic.LogisticRegression'> 0.864
<class 'sklearn.ensemble._forest.RandomForestClassifier'> 0.896
<class 'sklearn.svm._classes.SVC'> 0.896
<class 'sklearn.ensemble._voting.VotingClassifier'> 0.912


# test with soft voting i.e to predict the class with highest class probability averaged over all individual classifiers.

In [6]:
# Same three base classifiers as for hard voting, except that the SVC is
# created with probability=True so it can supply class probabilities.
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", probability=True, random_state=42)

In [7]:
# Soft voting: the ensemble averages the class probabilities of the base
# classifiers and predicts the class with the highest average.
named_estimators = [('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)]
voting_clf = VotingClassifier(estimators=named_estimators, voting="soft")

voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', SVC(probability=True, random_state=42))],
                 voting='soft')

In [8]:
# Report test accuracy for every base classifier and for the soft-voting
# ensemble (each estimator is refit on the training set first).
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(type(clf), accuracy_score(y_test, y_pred))

<class 'sklearn.linear_model._logistic.LogisticRegression'> 0.864
<class 'sklearn.ensemble._forest.RandomForestClassifier'> 0.896
<class 'sklearn.svm._classes.SVC'> 0.896
<class 'sklearn.ensemble._voting.VotingClassifier'> 0.92


Bagging(Bootstrap Aggregating) and Pasting

Bagging - instead of using a different algorithm for every predictor, we use the same training algorithm and train each predictor on a different random subset of the training set. When this sampling is performed with replacement, the method is called bagging.

Pasting - when sampling is performed without replacement

Bagging and pasting scale very well.

In [10]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of decision trees:
#   n_estimators - number of trees in the ensemble
#   max_samples  - each tree is trained on 100 sampled training instances
#   bootstrap    - True samples with replacement (bagging); False would be pasting
#   n_jobs       - number of CPU cores to use
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=1,
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [14]:
# Out-of-bag (oob) evaluation: because sampling is done with replacement, each
# predictor never sees some of the training instances. Those unseen instances
# can be used to validate it, so no separate validation set is needed.
# oob_score=True requests this evaluation after training.

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=1,
    oob_score=True,
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_


0.9013333333333333

In [15]:
# lets verify using test set
from sklearn.metrics import accuracy_score

# Accuracy of the oob-scored bagging ensemble on the held-out test set, to
# compare against bag_clf.oob_score_.
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.904

In [16]:
# Out-of-bag decision function: one row per training instance, one column per
# class (each row of the displayed output sums to 1).
# NOTE(review): this displays the whole array; consider slicing (e.g. [:5]).
bag_clf.oob_decision_function_

array([[0.37634409, 0.62365591],
       [0.33526012, 0.66473988],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.07853403, 0.92146597],
       [0.36413043, 0.63586957],
       [0.00520833, 0.99479167],
       [0.99468085, 0.00531915],
       [0.97814208, 0.02185792],
       [0.8030303 , 0.1969697 ],
       [0.00574713, 0.99425287],
       [0.82758621, 0.17241379],
       [0.84126984, 0.15873016],
       [0.96174863, 0.03825137],
       [0.02604167, 0.97395833],
       [0.        , 1.        ],
       [0.97849462, 0.02150538],
       [0.96195652, 0.03804348],
       [1.        , 0.        ],
       [0.00564972, 0.99435028],
       [0.28804348, 0.71195652],
       [0.92777778, 0.07222222],
       [1.        , 0.        ],
       [0.96132597, 0.03867403],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.63218391, 0.36781609],
       [0.

1.Random patches method - sampling both training instances and features. Feature sampling is controlled by two key hyperparameters: max_features and bootstrap_features.

2.Random subspaces method - keeping all training instances but sampling features 



# Random Forests

In [17]:
from sklearn.ensemble import RandomForestClassifier


In [18]:
# Random forest of 500 trees, each limited to at most 16 leaf nodes.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=1,
)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)

In [20]:
# Bagging ensemble of decision trees configured like the RandomForestClassifier
# above: each tree considers a random subset of features at every split and is
# limited to 16 leaf nodes.
# max_features="sqrt" replaces the former "auto" value. For classifiers "auto"
# meant "sqrt", and it is no longer accepted by current scikit-learn releases.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features="sqrt", max_leaf_nodes=16),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=True,
    n_jobs=-1,
)

In [21]:
# Feature importance - relative importance of each feature, as reported by a
# random forest trained on the iris dataset.

from sklearn.datasets import load_iris

iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris['data'], iris['target'])

# Print each feature name next to its importance score.
for name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.1059020727323791
sepal width (cm) 0.023519136593076948
petal length (cm) 0.4369261110999872
petal width (cm) 0.43365267957455683


Boosting - ensemble that combines several weak learners into a strong learner

The general idea of most boosting methods is to train predictors sequentially, each one trying to correct its predecessor.

1. Adaboosting - Adaptive Boosting

Each new predictor corrects its predecessor by paying more attention to the training instances that the predecessor underfitted.
The algorithm first trains a base classifier (e.g. a decision tree) and uses it to make predictions on the training set. It then increases the relative weight of the misclassified training instances and trains the next classifier using the updated weights.

2. Gradient Boosting

Unlike AdaBoosting, Gradient boosting tries to fit the new predictor to residual errors made by the predecessor.


In [22]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over 200 decision stumps (trees with max_depth=1), learning rate 0.5.
# The `algorithm` argument is intentionally left at its default. On the older
# scikit-learn releases where "SAMME.R" exists it is the default, so behaviour
# is unchanged; newer releases reject an explicit "SAMME.R", so passing it
# would make this cell fail.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    learning_rate=0.5,
)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

In [32]:
import numpy as np

# Noisy quadratic toy dataset for the gradient boosting examples:
# 100 samples with a single feature in [-0.5, 0.5), target 3*x^2 plus
# Gaussian noise scaled by 0.05. The global NumPy seed makes it reproducible.
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3 * X[:, 0] ** 2 + 0.05 * np.random.randn(100)

In [33]:
from sklearn.tree import DecisionTreeRegressor
# First predictor of the hand-built boosting sequence: a depth-2 regression
# tree fit directly on the targets.
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

DecisionTreeRegressor(max_depth=2)

In [34]:
# Second predictor: fit a new depth-2 tree on the residual errors left by
# tree_reg1 (targets minus tree_reg1's predictions).

y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(max_depth=2)

In [39]:
# Third predictor: fit on what remains of the residuals after tree_reg2.
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(max_depth=2)

In [40]:
# A single new instance (one row, one feature) to predict with the ensemble.
X_new = np.array([[0.8]])

In [41]:
# The ensemble prediction is the sum of the three trees' predictions: the
# first tree's estimate plus the two residual corrections.
y_pred = (
    tree_reg1.predict(X_new)
    + tree_reg2.predict(X_new)
    + tree_reg3.predict(X_new)
)


In [42]:
# Display the summed prediction of the three trees for X_new.
y_pred

array([0.76670256])

In [45]:
from sklearn.ensemble import GradientBoostingRegressor

# Library counterpart of the three hand-chained trees: a seeded gradient
# boosting regressor with 3 estimators of depth 2.
gbrt = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    random_state=42,
)
gbrt.fit(X, y)

GradientBoostingRegressor(max_depth=2, n_estimators=3, random_state=42)

In [47]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

# Split the toy regression data into training and validation sets.
# The split is seeded so the validation errors, and everything derived from
# them in the following cells, are the same on every Restart & Run All.
# Note: this rebinds X_train / y_train, which earlier cells used for the
# moons classification data.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)


# Train a deliberately large ensemble (120 trees of depth 2); the best
# number of trees is picked afterwards from the validation error.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=120)

In [49]:
# Validation MSE after each boosting stage of the 120-tree model;
# staged_predict yields the ensemble's predictions one stage at a time.
errors = [
    mean_squared_error(y_val, staged_pred)
    for staged_pred in gbrt.staged_predict(X_val)
]
# argmin is a 0-based stage index, so add 1 to get a number of trees.
bst_n_estimators = np.argmin(errors) + 1

# Retrain a fresh model using exactly that number of trees.
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=110)

In [51]:
# Early stopping implemented by hand (this is not stochastic gradient boosting).
# With warm_start=True, each call to fit() keeps the trees already trained and
# only adds new ones, so the ensemble grows one tree per iteration.
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0       # consecutive iterations without a new best error
best_n_estimators = 0    # number of trees at which min_val_error was reached

for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)

    if val_error < min_val_error:
        # New best validation error: remember it and reset the counter.
        min_val_error = val_error
        best_n_estimators = n_estimators
        error_going_up = 0
    else:
        error_going_up += 1
        # Stop after 5 consecutive iterations without improvement. gbrt then
        # still holds the extra trees added after the best iteration.
        if error_going_up == 5:
            break

In [52]:
# Number of trees set on gbrt when the loop above ended. If early stopping
# triggered, this is 5 more than the count with the lowest validation error.
print(gbrt.n_estimators)

72


In [53]:
# Lowest validation MSE observed during the early-stopping loop.
print("min validation MSE", min_val_error)

min validation MSE 0.0028209980081892543


In [54]:
# using XGBoost

import xgboost

# XGBoost regressor with default hyperparameters, trained on the regression
# training split and used to predict the validation split.
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(
    X_train,
    y_train,
)
y_pred = xgb_reg.predict(X_val)



In [55]:
# Alternative: refit with early stopping. The validation set is passed as
# eval_set, and training stops once its metric has not improved for 2 rounds.
# NOTE(review): newer xgboost releases expect early_stopping_rounds on the
# XGBRegressor constructor rather than on fit() - confirm against the
# installed version.
xgb_reg.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=2,
)
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.252919
Will train until validation_0-rmse hasn't improved in 2 rounds.
[1]	validation_0-rmse:0.22967
[2]	validation_0-rmse:0.208932
[3]	validation_0-rmse:0.190537
[4]	validation_0-rmse:0.174254
[5]	validation_0-rmse:0.159642
[6]	validation_0-rmse:0.146745
[7]	validation_0-rmse:0.134787
[8]	validation_0-rmse:0.123794
[9]	validation_0-rmse:0.11347
[10]	validation_0-rmse:0.10435
[11]	validation_0-rmse:0.095989
[12]	validation_0-rmse:0.0889
[13]	validation_0-rmse:0.083319
[14]	validation_0-rmse:0.078312
[15]	validation_0-rmse:0.073715
[16]	validation_0-rmse:0.06965
[17]	validation_0-rmse:0.06641
[18]	validation_0-rmse:0.063814
[19]	validation_0-rmse:0.061651
[20]	validation_0-rmse:0.05997
[21]	validation_0-rmse:0.058577
[22]	validation_0-rmse:0.057521
[23]	validation_0-rmse:0.056667
[24]	validation_0-rmse:0.055901
[25]	validation_0-rmse:0.055247
[26]	validation_0-rmse:0.054666
[27]	validation_0-rmse:0.054397
[28]	validation_0-rmse:0.054156
[29]	validation_0-rmse:0.0