## Voting Classifier

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

#usingmoons dataset that we used chapter 5
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

#training a voting classifier using 3 classifiers

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf),('rf',rnd_clf),('svc', svm_clf)],
    voting='hard')

voting_clf.fit(X_train,y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [3]:
#lets look at each classifier's accuracy
#this is an example of hard voting

from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.904
SVC 0.896
VotingClassifier 0.904


what we notice here is that the voting classifier outperforms all other clasifiers, taht is due to the fact that the voting classifier uses all their outcomes to predict a more precise one; askinga crowd for answer and compiling them gives more information than asking one expert.

In [4]:
#lets looka t an example of soft voting

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf),('rf',rnd_clf),('svc', svm_clf)],
    voting='soft')

voting_clf.fit(X_train,y_train)

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.92


soft voting gave us an even better performance of 91% !!!

## Bagging and Pasting in SkLearn

In [5]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

#example of bagging uèsing SkLearn, we can use either bagging or pasting but most of the time we can just got for bagging as
#default since it gives us overall better models, but if we have the time and cpu power required, we can go for 
#cross validation between the 2 and use the one with better results
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500, 
    max_samples=100, bootstrap=True, n_jobs=-1)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

## Out of Bag Evaluation

Basically when using baging, some of the instances are not used (roughly 37% usually), so data taht is not part of the dataset used for the training is called "out of bag" data.
We can use that data to test our model( by changing the value of "oob_score" to true ) 

In [6]:
#we create a bagging classifier and turn oob_score on 
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators = 500, 
    bootstrap=True, n_jobs=-1, oob_score=True)

bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.896

In [7]:
#we compare the oob_score to the accuracy score

from sklearn.metrics import accuracy_score 
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.904

turns out they are close 

In [8]:
#returning the class probabilities for each of each instance used 
bag_clf.oob_decision_function_

array([[0.37790698, 0.62209302],
       [0.36931818, 0.63068182],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.09278351, 0.90721649],
       [0.38743455, 0.61256545],
       [0.01142857, 0.98857143],
       [0.98529412, 0.01470588],
       [0.97252747, 0.02747253],
       [0.79679144, 0.20320856],
       [0.00574713, 0.99425287],
       [0.83510638, 0.16489362],
       [0.84126984, 0.15873016],
       [0.94972067, 0.05027933],
       [0.07692308, 0.92307692],
       [0.        , 1.        ],
       [0.99438202, 0.00561798],
       [0.96363636, 0.03636364],
       [1.        , 0.        ],
       [0.03888889, 0.96111111],
       [0.32786885, 0.67213115],
       [0.90804598, 0.09195402],
       [1.        , 0.        ],
       [0.97282609, 0.02717391],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.65      , 0.35      ],
       [0.

## Random Patches and Random Subspaces

BaggingClassifier supports sampling features as well. this is usefull when dealing with high dimensionnal inputs (such as images). This can be done by keeping all training instances (bootstrap=false and max_samples=1.0) but sampling features (bootstrap_features = true and max_features= a value smaller than 1.0) 
This trades a bit of bias for lower variance.

# Random Forests

In [9]:
#training a random forest classifier with 500 trees

from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train,y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [10]:
#this is a BaggingClassifier that is roughly the equivalent of the previous randomForestClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1)

## Extra Trees

extra-trees is actually called Extremely Randomized Trees; what it does is set a random number of features for splitting for each node.
its api is the exact same as RandomForestClassifier(similarly, ExtraTreesRegressor has the same API as RandomForestRegressor.

## Feature Importance

In [11]:
#RandomForest algorythme has a variable that stores the importance of each feature in the dataset
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"],iris["target"])
#prints name and scores of each feature 
for name,score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.10136174131071805
sepal width (cm) 0.02482022738446275
petal length (cm) 0.44656247115207326
petal width (cm) 0.42725556015274596


## Boosting 

The general idea of most boosting methods is to train predictors sequentially, each trying to correct its prede‐
cessor. 

### AdaBoost

adaboost works by training a base classifier (of our choosing) then uses it to make predictions ont he training set. 
The algorithme then increases the relative weight of missclassified training instances to focus harder on them and then trains a new model based on the new weights.

In [14]:
#an example of adaboost algorithme
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200, 
    algorithm="SAMME.R", learning_rate = 0.5)

ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

### Gradient Boosting

works the same way as AdaBoost except instead of adding predictors that correct old ones, it tries to fit the new model to the residual errors of the old predictor.

In [28]:
#let's look at a simple example
from sklearn.tree import DecisionTreeRegressor
import numpy as np
#lets build a regression tree first
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X,y)

DecisionTreeRegressor(max_depth=2)

In [17]:
#then lets build a regression tree on the residual errors made by the first predictor

y2= y-tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X,y2)

DecisionTreeRegressor(max_depth=2)

In [25]:
y3 = y2-tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X,y3)

DecisionTreeRegressor(max_depth=2)

In [34]:
X_new = np.array([[0.8,0.8]])
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [35]:
y_pred

array([-0.06976324])

In [37]:
#all of that can also be done gradientBoostingRegressor in one line 

from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X,y)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

In [40]:
#this is training a GBRT while using early stopping (in order to find the optimal number of trees (estimators))

import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X,y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train,y_train)

errors = [mean_squared_error(y_val, y_pred)
         for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=78)

In [42]:
gbrt_best.n_estimators

78

In [45]:
#another way to implement it but this time instead of training all 120 and then looking back to the one the most fitting 
#as we train we look at the MSE of our model and if it doesnt improve 5 times in a row we stop the training and
#take the one with the lowest MSE 
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)
min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
    if error_going_up == 5:
        break # early stopping

In [49]:
#another library we can use for our purposes is xgboost, think of it as a better optimized sklearn (with very similar library)
#an example of its uses 

import xgboost

xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)

#it also automatically takes care of early stopping 
xgb_reg.fit(X_train, y_train,
 eval_set=[(X_val, y_val)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val)

ModuleNotFoundError: No module named 'xgboost'

### Stacking