In [1]:
# importing the libraries
import pandas as pd
import numpy as np

In [2]:
# Download the MNIST digits from OpenML as plain arrays (as_frame=False)
# rather than as a pandas DataFrame.
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', as_frame=False)

# Features and labels are kept in separate variables for the rest of the notebook.
X = mnist.data
y = mnist.target

In [3]:
# Peek at the first ten rows of the feature matrix.
X[:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [4]:
# Peek at the first ten labels; note they are digit *strings* (dtype=object), not integers.
y[:10]

array(['5', '0', '4', '1', '9', '2', '1', '3', '1', '4'], dtype=object)

In [5]:
# Dimensions of the feature matrix (rows, columns).
X.shape

(70000, 784)

In [6]:
# Number of labels; should match the number of rows in X.
y.shape

(70000,)

In [7]:
# Partition the data, in order, into training / validation / test sets by
# cutting at row 50,000 and row 60,000 (no shuffling is performed here).
X_train, X_valid, X_test = np.split(X, [50_000, 60_000])
y_train, y_valid, y_test = np.split(y, [50_000, 60_000])

In [8]:
# Sanity check: 50,000 training, 10,000 validation and 10,000 test samples.
# One shape per line, features before labels for each split.
print(
    *(part.shape for part in (X_train, y_train, X_valid, y_valid, X_test, y_test)),
    sep='\n',
)

(50000, 784)
(50000,)
(10000, 784)
(10000,)
(10000, 784)
(10000,)


In [9]:
# Instantiate the four candidate classifiers that will be compared and later
# combined into a voting ensemble. The seeded models use random_state=42 so
# their results are repeatable.
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): tol=20 with max_iter=100 is a very loose stopping criterion --
# presumably chosen to keep training fast; confirm this is intentional.
svc_classifier = LinearSVC(
    max_iter=100,
    tol=20,
    dual=True,
    random_state=42,
)

# k-nearest neighbours with the library defaults.
neighbors_classifier = KNeighborsClassifier()

# Two tree ensembles of 100 trees each, both splitting on entropy.
random_classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=42,
)
extra_classifier = ExtraTreesClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=42,
)

In [10]:
# Fit every candidate on the training split, announcing each model before it trains.
models = [
    svc_classifier,
    random_classifier,
    extra_classifier,
    neighbors_classifier,
]
for model in models:
    print(f'Training {model}')
    model.fit(X_train, y_train)

Training LinearSVC(dual=True, max_iter=100, random_state=42, tol=20)
Training RandomForestClassifier(criterion='entropy', random_state=42)
Training ExtraTreesClassifier(criterion='entropy', random_state=42)
Training KNeighborsClassifier()


In [11]:
# Validation accuracy of each individually trained model, in the order of `models`.
[clf.score(X_valid, y_valid) for clf in models]

[0.8662, 0.9716, 0.9733, 0.9718]

In [12]:
# (name, classifier) pairs handed to the voting ensemble.
estimator = [
    ('random_forest_clf', random_classifier),
    ('svm', svc_classifier),
    ('k_classifier', neighbors_classifier),
    ('extra_classifier', extra_classifier),
]

In [15]:
# Combine the candidates into a hard-voting ensemble and fit it on the training split.
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators=estimator,
    voting='hard',
)
voting_clf.fit(X_train, y_train)

In [16]:
# Validation accuracy of the voting ensemble as a whole.
voting_clf.score(X_valid,y_valid)

0.9731

To evaluate each fitted sub-estimator on the validation set, we first need to convert `y_valid` to class indices, because the voting classifier encodes the class labels as integer indices and trains its sub-estimators on those indices. For the MNIST dataset we can simply cast the class names to integers, since the digits match the class indices.

In [17]:
# Cast the digit-string labels to int64 so they can be compared against the
# integer class indices used by the ensemble's fitted sub-estimators.
y_valid_encoded = y_valid.astype(np.int64)

In [18]:
# Validation accuracy of each fitted sub-estimator inside the ensemble,
# scored against the integer-encoded labels.
[fitted_clf.score(X_valid, y_valid_encoded) for fitted_clf in voting_clf.estimators_]

[0.9716, 0.8662, 0.9718, 0.9733]

In [21]:
# Rebuild the (name, classifier) list without the linear SVM entry.
estimator = [
    ('random_forest_clf', random_classifier),
    ('k_classifier', neighbors_classifier),
    ('extra_classifier', extra_classifier),
]

In [22]:
# Recreate the hard-voting ensemble from the current estimator list and refit it.
voting_clf = VotingClassifier(
    estimators=estimator,
    voting='hard',
)
voting_clf.fit(X_train, y_train)

In [24]:
# Validation accuracy of the refitted voting ensemble.
voting_clf.score(X_valid,y_valid)

0.9751

In [25]:
# Accuracy of the current voting ensemble on the held-out test set.
voting_clf.score(X_test,y_test)

0.972

In [26]:
# Test accuracy of each fitted sub-estimator; the string labels are cast to
# int64 to match the integer class indices the sub-estimators were trained on.
[
    fitted_clf.score(X_test, y_test.astype(np.int64))
    for fitted_clf in voting_clf.estimators_
]

[0.9677, 0.9664, 0.9703]