# Exercise: Load the MNIST data and split it into a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for validation, and 10,000 for testing).

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
import ssl
# SECURITY: swaps the default HTTPS context factory for the *unverified* one, so
# TLS certificates are no longer checked for HTTPS requests that rely on the
# default context anywhere in this kernel process.
# NOTE(review): presumably a workaround for certificate errors during the
# fetch_openml download below — confirm, and prefer fixing the local CA bundle.
ssl._create_default_https_context = ssl._create_unverified_context

# as_frame=False: request array data instead of a pandas DataFrame.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X_all, y_all = mnist["data"], mnist["target"]

# Sizes of the three consecutive, non-overlapping partitions.
train_size: int = 50000
val_size: int = 10000
test_size: int = 10000

# Slice boundaries:
#   [0, train_end)        -> training set
#   [train_end, val_end)  -> validation set
#   [val_end, test_end)   -> test set
train_end = train_size
val_end = train_end + val_size
test_end = val_end + test_size

X_train, y_train = X_all[:train_end], y_all[:train_end]
X_val, y_val = X_all[train_end:val_end], y_all[train_end:val_end]
X_test, y_test = X_all[val_end:test_end], y_all[val_end:test_end]


# Exercise: Then train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM.


In [4]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC

# Three individual classifiers, all seeded with random_state=42.
rfc_clf = RandomForestClassifier(random_state=42, n_estimators=85)
etc_clf = ExtraTreesClassifier(random_state=42, n_estimators=85)
# NOTE(review): max_iter=85 is well below LinearSVC's documented default of
# 1000 — confirm this is intentional and not copied from n_estimators.
svm_clf = LinearSVC(random_state=42, max_iter=85)

# Fit on the training split, in the order: random forest, extra-trees, SVM.
for _clf in (rfc_clf, etc_clf, svm_clf):
    _clf.fit(X_train, y_train)

# fit() returns the estimator itself, so this shows the same cell output
# as ending the cell on the last fit() call.
svm_clf



In [7]:
# Print the validation-set score of each individually trained classifier,
# in the order: random forest, extra-trees, linear SVM.
estimators = [rfc_clf, etc_clf, svm_clf]
for clf in estimators:
    val_accuracy = clf.score(X_val, y_val)
    print(val_accuracy)

0.9713
0.975
0.9215


# Exercise: Next, try to combine them into an ensemble that outperforms them all on the validation set, using a soft or hard voting classifier.

In [13]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs handed to the voting ensemble.
voter_names = ("random_forest_clf", "extra_trees_clf", "svm_clf")
voter_models = (rfc_clf, etc_clf, svm_clf)
estimators_all = list(zip(voter_names, voter_models))

vc_clf_hard = VotingClassifier(
    estimators=estimators_all,
    voting="hard",  # majority vote over the predicted class labels
    n_jobs=-1,      # use all available processors
)

# Last expression of the cell: fit the ensemble and display the fitted estimator.
vc_clf_hard.fit(X_train, y_train)



# Exercise: Once you have found one, try it on the test set. How much better does it perform compared to the individual classifiers?

In [15]:
# Validation-set score of the hard-voting ensemble.
hard_val_accuracy = vc_clf_hard.score(X_val, y_val)
print(hard_val_accuracy)

0.9729


In [None]:
# Soft voting needs predict_proba from every member, so keep only the
# estimators that expose it (LinearSVC does not).
# The name is rebound to a new filtered list instead of deleting by index:
#   * re-running this cell is a no-op rather than an IndexError, and
#   * the list object already passed to vc_clf_hard is left untouched.
estimators_all = [
    (name, estimator)
    for name, estimator in estimators_all
    if hasattr(estimator, "predict_proba")
]

In [20]:
# Soft-voting ensemble over the estimators currently listed in estimators_all.
# fit() returns the estimator itself, so chaining keeps vc_clf_soft bound to
# the fitted VotingClassifier.
vc_clf_soft = VotingClassifier(
    estimators=estimators_all,
    voting="soft",
    n_jobs=-1,
).fit(X_train, y_train)

# Validation-set score of the soft-voting ensemble.
soft_val_accuracy = vc_clf_soft.score(X_val, y_val)
print(soft_val_accuracy)

0.9747


# Stacking Ensemble

In [24]:
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import AdaBoostClassifier, StackingClassifier
import xgboost

# Base learners whose predictions become the input of the final estimator.
# Each one is seeded with random_state=42, matching the individually trained
# classifiers earlier in the notebook, so a Restart & Run All does not change
# the stacked model's score because of unseeded randomness.
base_models = [
    ('random_forest_clf', RandomForestClassifier(random_state=42)),
    ('extra_trees_clf', ExtraTreesClassifier(random_state=42)),
    ('xgboost', xgboost.XGBClassifier(random_state=42)),
]

# Final estimator trained on the base learners' outputs; seeded for the same reason.
meta_model = AdaBoostClassifier(n_estimators=85, random_state=42)

stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    n_jobs=-1,  # use all available processors
)

# Last expression of the cell: fit the stack and display the fitted estimator.
stacking_clf.fit(X_train, y_train)




In [25]:
# Store model
import joblib

# Persist both fitted ensembles to the current working directory.
stacking_model_path = 'stacking_classifier_model.pkl'
soft_voting_model_path = 'vc_classifier_soft_model.pkl'

joblib.dump(value=stacking_clf, filename=stacking_model_path)
# Last expression of the cell: its return value is what the cell displays.
joblib.dump(value=vc_clf_soft, filename=soft_voting_model_path)

['vc_classifier_soft_model.pkl']

In [26]:
from sklearn.metrics import accuracy_score
# Predict labels for the test split with the stacked ensemble.
y_pred = stacking_clf.predict(X_test)

# Last expression of the cell: test-set accuracy of those predictions.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8178