## Voting and Stack Exploration
  
Here we experiment with voting and stack ensemble methods on the MNIST Dataset  
  
We begin with designing a voting classifier from Random Forest, SVM, and Logistic Regression Models

In [1]:
import numpy as np
from sklearn.datasets import fetch_mldata
from sklearn.preprocessing import StandardScaler

# Get MNIST data

mnist = fetch_mldata('MNIST original')
X = StandardScaler().fit_transform(mnist['data'].astype(np.float))
y = mnist['target']



In [2]:
from sklearn.model_selection import train_test_split

# Split into train, validation, and test sections

X_val, X_test, y_val, y_test = train_test_split(X,y, test_size=10000)
X_train, X_val, y_train, y_val = train_test_split(X_val,y_val, test_size=10000)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# Train independent models

for_clf = RandomForestClassifier(n_estimators=15)
svm_clf = LinearSVC()
log_clf = LogisticRegression()

estimators = [for_clf, svm_clf, log_clf]

In [4]:
# View models performance

for model in estimators:
    model.fit(X_train,y_train)

[model.score(X_val,y_val) for model in estimators]



[0.9544, 0.9088, 0.9178]

In [11]:
from sklearn.ensemble import VotingClassifier

# Create a voting classifier

vot_clf = VotingClassifier([('for',for_clf),
                            ('svm',svm_clf),
                            ('log',log_clf)])
vot_clf.fit(X_train,y_train)



VotingClassifier(estimators=[('for', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_w...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [12]:
# Voting performance

vot_clf.score(X_val,y_val)

0.9231

In [13]:
# Remove weakest classifier and implement soft voting?

del vot_clf.estimators_[1]
vot_clf.voting = 'soft'
vot_clf.score(X_val,y_val)

0.9407

Next we build a stack classifier using the estimators we've already trained

In [5]:
# Get predictions of models as inputs for stack classifier

est_outputs = np.empty((len(X_val),len(estimators)), dtype=np.float32)

for index, model in enumerate(estimators):
    est_outputs[:,index] = model.predict(X_val)

est_outputs

array([[5., 5., 5.],
       [2., 2., 2.],
       [7., 7., 7.],
       ...,
       [1., 1., 1.],
       [4., 4., 4.],
       [6., 6., 6.]], dtype=float32)

In [9]:
# train the stack classifier 

stk_clf = RandomForestClassifier(n_estimators=15)

stk_clf.fit(est_outputs, y_val)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
from sklearn.metrics import accuracy_score

# Test performance of estimator

est_preds = np.empty((len(X_test),len(estimators)), dtype=np.float32)

for index, model in enumerate(estimators):
    est_preds[:,index] = model.predict(X_test)

accuracy_score(stk_clf.predict(est_preds),y_test)

0.9424

Stack is still performing below a regular Random Forest classifier.. at least we tried