<a href="https://colab.research.google.com/github/AndrewDavidRatnam/HandsonWorkingML/blob/main/Ensemble_Methods_and_Random_Forests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.datasets import fetch_openml

# Fetch the 'mnist_784' dataset from OpenML, unpacked straight into a
# (features, labels) pair; as_frame=False asks for arrays, not a DataFrame.
X_mnist, y_mnist = fetch_openml(
    name="mnist_784",
    return_X_y=True,
    as_frame=False,
    parser="auto",
)

In [2]:
# Train / validation / test split by position: rows [0, 50_000) train,
# [50_000, 60_000) validation, and everything from 60_000 onward is test.
X_train, X_valid, X_test = np.split(X_mnist, [50_000, 60_000])
y_train, y_valid, y_test = np.split(y_mnist, [50_000, 60_000])

In [3]:
#get different classifiers and make an ensemble
from sklearn.ensemble import ExtraTreesClassifier # faster than RandomForestClassifier
#from sklearn.linear_model import LogisticRegression #way too long
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Four base learners, each seeded with random_state=42.
random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
extra_tress_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
# Original note: the dual formulation is useful when there are more features
# than instances.
# NOTE(review): this cell trains on 50,000 rows, so confirm dual=True (and the
# very loose tol=20 / max_iter=100) is intentional here.
svm_clf = LinearSVC(
    max_iter=100,
    tol=20,
    dual=True,
    random_state=42,
)
mlp_clf = MLPClassifier(random_state=42)

estimators = [random_forest_clf, extra_tress_clf, svm_clf, mlp_clf]

# Fit every model on the training split, announcing each one first.
for clf in estimators:
    print("Training the", clf)
    clf.fit(X_train, y_train)

Training the RandomForestClassifier(random_state=42)
Training the ExtraTreesClassifier(random_state=42)
Training the LinearSVC(max_iter=100, random_state=42, tol=20)
Training the MLPClassifier(random_state=42)


In [5]:
# Validation-set score of each individually trained classifier, in list order.
[clf.score(X_valid, y_valid) for clf in estimators]

[0.9736, 0.9743, 0.8662, 0.9613]

In [37]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs handed to the ensemble classes. The names double as
# keyword names later on (e.g. set_params(svm_clf="drop")), so each one must be
# a plain identifier.
named_estimators = [
    ("random_forest_clf", random_forest_clf),
    # Fixed typo: this name used to be "extra_trees,clf" (comma instead of
    # underscore), which cannot be used as a set_params keyword.
    ("extra_trees_clf", extra_tress_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]

In [7]:
# Build the voting ensemble from the (name, estimator) pairs; all other
# options are left at their defaults.
voting_clf = VotingClassifier(estimators=named_estimators)

In [8]:
# Train the whole ensemble on the training split.
voting_clf.fit(X=X_train, y=y_train)

In [11]:
# Score of the full four-member ensemble on the validation split.
voting_clf.score(X=X_valid, y=y_valid)

0.975

In [12]:
# Convert the class labels to integer class indices with a LabelEncoder.
from sklearn.preprocessing import LabelEncoder

# Not strictly needed for MNIST: the class names are the digits themselves,
# so they already correspond to the class indices.
encoder = LabelEncoder().fit(y_valid)
y_valid_encoded = encoder.transform(y_valid)

In [14]:
# MNIST shortcut: the labels are digit strings, so a plain integer cast gives
# the class indices directly. The result is stored as `y_valid_encoded`, the
# name the scoring cell reads; it was previously misspelled `y_valid_encoder`,
# which left this cast unused.
y_valid_encoded = y_valid.astype(np.int64)

In [15]:
# Score each fitted member of the ensemble against the integer-encoded
# validation labels.
[clf.score(X_valid, y_valid_encoded) for clf in voting_clf.estimators_]

[0.9736, 0.9743, 0.8662, 0.9613]

In [16]:
# Remove the SVM from the ensemble's configuration by setting its named slot
# to "drop".
voting_clf.set_params(svm_clf="drop")

In [17]:
# Detach the already-fitted SVM from the fitted ensemble: pop it out of
# named_estimators_ by name, then remove that same object from estimators_.
# NOTE(review): not re-runnable as written — a second run presumably fails
# because "svm_clf" has already been popped; confirm before re-executing.
svm_clf_trained = voting_clf.named_estimators_.pop("svm_clf")
voting_clf.estimators_.remove(svm_clf_trained)

In [19]:
# Validation score of the ensemble now that the SVM has been removed.
voting_clf.score(X=X_valid, y=y_valid)

0.9761

In [22]:
# Compare voting modes on the validation split: score with the current
# setting first, switch the ensemble to soft voting, then score again.
print("Normal", voting_clf.score(X_valid, y_valid))
voting_clf.set_params(voting="soft")
print("Soft voting", voting_clf.score(X_valid, y_valid))


Normal 0.9703
Soft voting 0.9703


In [23]:
# Switch back to hard voting and evaluate the ensemble on the test split.
voting_clf.set_params(voting="hard")
voting_clf.score(X=X_test, y=y_test)

0.9733

In [24]:
# Test score of each remaining fitted ensemble member; the labels are cast to
# int64 first, as was done for the validation labels.
[clf.score(X_test, y_test.astype(np.int64)) for clf in voting_clf.estimators_]

[0.968, 0.9703, 0.9618]

## Stacking Ensemble

In [25]:
# Blender training features: one row per validation instance, one column per
# base estimator, each column holding that estimator's predicted labels.
X_valid_predictions = np.empty(shape=(len(X_valid), len(estimators)), dtype=object)

for col, clf in enumerate(estimators):
    X_valid_predictions[:, col] = clf.predict(X_valid)

In [26]:
# Display the prediction matrix: rows are validation instances, columns are
# the base estimators in `estimators` order.
X_valid_predictions

array([['3', '3', '3', '3'],
       ['8', '8', '8', '8'],
       ['6', '6', '6', '6'],
       ...,
       ['5', '5', '5', '5'],
       ['6', '6', '6', '6'],
       ['8', '8', '8', '8']], dtype=object)

In [27]:
# Blender: a random forest trained on the base estimators' validation-set
# predictions. oob_score=True makes an out-of-bag score available after fit.
rnd_forest_blender = RandomForestClassifier(
    n_estimators=200,
    oob_score=True,
    random_state=42,
)

rnd_forest_blender.fit(X=X_valid_predictions, y=y_valid)

In [28]:
# Out-of-bag score of the blender (it was built with oob_score=True).
rnd_forest_blender.oob_score_

0.9738

In [30]:
# Same layout as the blender's training features, built from the test split:
# column i holds the test-set predictions of estimators[i].
X_test_predictions = np.empty(shape=(len(X_test), len(estimators)), dtype=object)

for col, clf in enumerate(estimators):
    X_test_predictions[:, col] = clf.predict(X_test)

In [31]:
# Final stacked prediction: the blender maps the base estimators' test
# predictions to a single label per instance.
y_pred = rnd_forest_blender.predict(X=X_test_predictions)

In [33]:
from sklearn.metrics import accuracy_score

# Accuracy of the hand-built stacking ensemble on the test labels.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9688

In [34]:
# StackingClassifier also uses cross-validation, so it does not need a
# separate validation set: take the first 60,000 rows as the full training set.
X_train_full = X_mnist[:60_000]
y_train_full = y_mnist[:60_000]

In [42]:
from sklearn.ensemble import StackingClassifier

# Stacking ensemble over the same named base estimators, with the random
# forest blender as the final estimator.
stack_clf = StackingClassifier(
    estimators=named_estimators,
    final_estimator=rnd_forest_blender,
)
# Left disabled: the fit took about 30 minutes to execute.
# stack_clf.fit(X_train_full, y_train_full)
# stack_clf.score(X_test, y_test)