# 8

In [12]:
import numpy as np

In [None]:
from sklearn.datasets import fetch_openml

# Fetch the 'mnist_784' dataset from OpenML. as_frame=False asks for plain
# arrays instead of a DataFrame; the later cells slice .data / .target by
# integer position.
mnist = fetch_openml(
    "mnist_784",
    as_frame=False,
    parser="auto",
)

In [5]:
# Show the description text attached to the fetched dataset.
print(mnist.DESCR) 

**Author**: Yann LeCun, Corinna Cortes, Christopher J.C. Burges  
**Source**: [MNIST Website](http://yann.lecun.com/exdb/mnist/) - Date unknown  
**Please cite**:  

The MNIST database of handwritten digits with 784 features, raw data available at: http://yann.lecun.com/exdb/mnist/. It can be split in a training set of the first 60,000 examples, and a test set of 10,000 examples  

It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image. It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting. The original black and white (bilevel) images from NIST were size normalized to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting images contain grey levels as a result of the anti-aliasing technique used by the normalization algorithm. the images were centered in a 28x28 image b

In [6]:
# Positional split: the first 50,000 rows train, the next 10,000 validate,
# and everything from row 60,000 onward is the test split.
X_train, X_valid, X_test = mnist.data[:50000], mnist.data[50000:60000], mnist.data[60000:]
y_train, y_valid, y_test = mnist.target[:50000], mnist.target[50000:60000], mnist.target[60000:]

In [7]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC

# Build and train the three base classifiers on the training split, all
# seeded with random_state=42. fit() hands back the estimator it was called
# on, so construction and training are chained for the two tree ensembles.
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
et_clf = ExtraTreesClassifier(random_state=42).fit(X_train, y_train)

# probability=True is kept from the original configuration of the SVC.
svm_clf = SVC(probability=True, random_state=42)

# Left as a bare expression so the cell still displays the fitted SVC.
svm_clf.fit(X_train, y_train)

In [8]:
# Print one "<ClassName> score: <value>" line per base classifier, scored on
# the validation split.
for clf in (rf_clf, et_clf, svm_clf):
    print(type(clf).__name__, "score:", clf.score(X_valid, y_valid))

RandomForestClassifier score: 0.9736
ExtraTreesClassifier score: 0.9743
SVC score: 0.9802


In [9]:
from sklearn.ensemble import VotingClassifier

# Ensemble the three base classifiers under short names. No `voting`
# argument is passed, so the library default applies.
voting_clf = VotingClassifier(
    estimators=[("rf", rf_clf), ("et", et_clf), ("svm", svm_clf)]
)

# Train the ensemble on the training split (bare expression: the cell shows
# the fitted ensemble).
voting_clf.fit(X_train, y_train)

In [10]:
# Score the fitted voting ensemble on the validation split.
voting_clf.score(X_valid, y_valid)

0.9778

In [20]:
# Score the fitted voting ensemble on the test split.
voting_clf.score(X_test, y_test)

0.9737

In [13]:
# Test-split score of each fitted sub-estimator held by the voting ensemble.
# The labels are cast to int64 here, unlike the plain y_test passed to the
# other score calls -- presumably because the ensemble's internal estimators
# are fit on integer-encoded classes (confirm against the VotingClassifier docs).
[
    fitted.score(X_test, y_test.astype(np.int64))
    for fitted in voting_clf.estimators_
]

[0.968, 0.9703, 0.9785]

# 9

In [14]:
# Uninitialised object-dtype buffer: one row per validation sample, one
# column per fitted sub-estimator of the voting ensemble. It is filled
# column by column afterwards.
X_valid_pred = np.empty(
    shape=(len(X_valid), len(voting_clf.estimators_)),
    dtype=object,
)

In [16]:
# Column j receives the validation-split predictions of the j-th fitted
# sub-estimator.
for col, fitted in enumerate(voting_clf.estimators_):
    X_valid_pred[:, col] = fitted.predict(X_valid)

In [18]:
# Blender: a random forest trained on the sub-estimators' validation
# predictions (X_valid_pred) against the validation labels. oob_score=True
# is requested so an out-of-bag score can be read after fitting.
rf_blender = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    oob_score=True,
)
rf_blender.fit(X_valid_pred, y_valid)

In [19]:
# Out-of-bag score of the fitted blender (it was constructed with oob_score=True).
rf_blender.oob_score_

0.9762

In [21]:
# Same layout as the validation prediction matrix, built for the test split:
# one row per test sample, one column per fitted sub-estimator.
X_test_pred = np.empty(
    shape=(len(X_test), len(voting_clf.estimators_)),
    dtype=object,
)
for col, fitted in enumerate(voting_clf.estimators_):
    X_test_pred[:, col] = fitted.predict(X_test)

In [22]:
# Blender's predictions for the test split, computed from the sub-estimators'
# test predictions stored in X_test_pred.
y_test_pred = rf_blender.predict(X_test_pred)

In [23]:
from sklearn.metrics import accuracy_score

# Accuracy of the blender's test predictions against the test labels.
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9749

In [24]:
from sklearn.ensemble import StackingClassifier

# Stacking ensemble over the same three named base classifiers, with
# rf_blender supplied as the final estimator.
stack = StackingClassifier(
    final_estimator=rf_blender,
    estimators=[("rf", rf_clf), ("et", et_clf), ("svm", svm_clf)],
)

In [25]:
# Train the stacking ensemble on the training split.
stack.fit(X_train, y_train)

In [26]:
# Score the fitted stacking ensemble on the test split.
stack.score(X_test, y_test)

0.9801