### Load the MNIST dataset

In [1]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml(name="mnist_784")

X, y = mnist.data, mnist.target

  warn(


### Split it to train, validation and test sets

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=20000, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=10000, random_state=42)


### Train several classifiers

In [3]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb

y_train = y_train.astype(int)
y_val = y_val.astype(int)
y_test = y_test.astype(int)

In [4]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
rf_predictions = rf_classifier.predict(X_val)
rf_accuracy = accuracy_score(y_val, rf_predictions)
print("Random Forest Classifier Accuracy:", rf_accuracy)


Random Forest Classifier Accuracy: 0.9677


In [5]:
et_classifier = ExtraTreesClassifier(n_estimators=100, random_state=42)
et_classifier.fit(X_train, y_train)
et_predictions = et_classifier.predict(X_val)
et_accuracy = accuracy_score(y_val, et_predictions)
print("Extra-Trees Classifier Accuracy:", et_accuracy)

Extra-Trees Classifier Accuracy: 0.9689


In [6]:

xgb_classifier = xgb.XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42)
xgb_classifier.fit(X_train, y_train)
xgb_predictions = xgb_classifier.predict(X_val)
xgb_accuracy = accuracy_score(y_val, xgb_predictions)
print("XGBoost Classifier Accuracy:", xgb_accuracy)

XGBoost Classifier Accuracy: 0.9348


### We create a new training set with the prediction of the previous classifiers on the validation set

In [7]:
import numpy as np

rf_predictions = rf_classifier.predict(X_val)
et_predictions = et_classifier.predict(X_val)
xgb_predictions = xgb_classifier.predict(X_val)

stacked_predictions = np.column_stack((rf_predictions, et_predictions, xgb_predictions))

### We train a new classifier on the new training set (stacked_predictions) the target is y_val

In [8]:
new_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
new_classifier.fit(stacked_predictions, y_val) 
new_classifier_predictions = new_classifier.predict(stacked_predictions)

new_classifier_accuracy = accuracy_score(y_val, new_classifier_predictions)
print("New Classifier (Random Forest) Accuracy on Stacked Predictions:", new_classifier_accuracy)


New Classifier (Random Forest) Accuracy on Stacked Predictions: 0.9753


### We stack the test predictions for each image into a new feature matrix and then we use the trained blender (new classifier) to make predictions on the stacked test predictions

In [9]:
rf_test_predictions = rf_classifier.predict(X_test)
et_test_predictions = et_classifier.predict(X_test)
xgb_test_predictions = xgb_classifier.predict(X_test)

stacked_test_predictions = np.column_stack((rf_test_predictions, et_test_predictions, xgb_test_predictions))

ensemble_test_predictions = new_classifier.predict(stacked_test_predictions)

test_accuracy = accuracy_score(y_test, ensemble_test_predictions)
print("Ensemble (Random Forest Blender) Accuracy on Test Set:", test_accuracy)

Ensemble (Random Forest Blender) Accuracy on Test Set: 0.9664


### It is slightly better than the voting