# Question 8
## First load and split the data

In [1]:
from sklearn.datasets import fetch_mldata
import os

datasets_path = "datasets/"

mnist_data = os.path.join(datasets_path, "mnist_data/")
if not os.path.isdir(mnist_data):
    os.makedirs(mnist_data)

mnist = fetch_mldata('MNIST original', data_home=mnist_data)

In [2]:
# Check the data shape
mnist.data.shape

(70000, 784)

In [84]:
from sklearn.model_selection import StratifiedShuffleSplit


sss = StratifiedShuffleSplit(n_splits=1, test_size=10000/70000, train_size=50000/70000)

X = mnist.data
y = mnist.target

train_index, test_index = next(sss.split(X, y))
X_train, y_train, X_test, y_test = X[train_index], y[train_index], X[test_index], y[test_index]
X_train, X_val, y_train, y_val = X_train[:40000], X_train[40000:], y_train[:40000], y_train[40000:]

## Train Individual Classifiers

### Random Forest

In [4]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier()

In [5]:
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [6]:
from sklearn.metrics import accuracy_score
y_rf = rf_clf.predict(X_test)
accuracy_score(y_test, y_rf)

0.94440000000000002

### Extra Trees

In [7]:
from sklearn.ensemble import ExtraTreesClassifier

ex_clf = ExtraTreesClassifier()

In [8]:
ex_clf.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [9]:
y_ex = ex_clf.predict(X_test)
accuracy_score(y_test, y_ex)

0.94850000000000001

In [10]:
ex_clf.predict_proba(X_test)

array([[ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  1. , ...,  0. ,  0. ,  0. ],
       ..., 
       [ 0.1,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0.9,  0. , ...,  0.1,  0. ,  0. ],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  0. ]])

### SVM classifier

LinearSVC cant return probability. So We wont try soft voting on LinearSVC.

In [29]:
from sklearn.svm import LinearSVC

sv_clf = LinearSVC()

TypeError: __init__() got an unexpected keyword argument 'probability'

In [21]:
sv_clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [22]:
y_sv = sv_clf.predict(X_test)
accuracy_score(y_test, y_sv)

0.84699999999999998

## Now Train the voting machine

### Hard Voting

In [25]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(estimators=[
        ("rf", rf_clf),
        ("ex", ex_clf),
        ("sgd", sgd_clf),
        ("sv", sv_clf),
    ],
    voting="hard"
)

In [26]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_we...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [28]:
for clf in (rf_clf, ex_clf, sv_clf, sgd_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

RandomForestClassifier 0.9431
ExtraTreesClassifier 0.9414
LinearSVC 0.8601
SGDClassifier 0.8515
VotingClassifier 0.9385


### Now soft voting without SVM, SGD

In [43]:
from sklearn.ensemble import VotingClassifier

soft_voting_clf = VotingClassifier(estimators=[
        ("rf", rf_clf),
        ("ex", ex_clf),
    ],
    voting="soft"
)

In [44]:
soft_voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_we...timators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [46]:
for clf in (rf_clf, ex_clf, soft_voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

RandomForestClassifier 0.9425
ExtraTreesClassifier 0.9486
VotingClassifier 0.9578


### Hard Voting without SGD

In [33]:
from sklearn.ensemble import VotingClassifier

voting_clf_2 = VotingClassifier(estimators=[
        ("rf", rf_clf),
        ("ex", ex_clf),
        ("sv", sv_clf),
    ],
    voting="hard"
)

In [107]:
for clf in (rf_clf, ex_clf, sv_clf, voting_clf_2):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

RandomForestClassifier 0.9398
ExtraTreesClassifier 0.943
LinearSVC 0.8482
VotingClassifier 0.945


The SGD regressor is draggin the accuracy down!

#### Overall, the soft voting machine with only EX and RF wins

# Qeustion 9

In [77]:
unique, counts = np.unique(y_train[:13333], return_counts=True)
dict(zip(unique, counts))

{0.0: 1292,
 1.0: 1509,
 2.0: 1340,
 3.0: 1378,
 4.0: 1333,
 5.0: 1197,
 6.0: 1272,
 7.0: 1409,
 8.0: 1279,
 9.0: 1324}

Seems the classes are well distributed

## Let's try train with split training set/ without split training set

### With split

In [79]:
sv_clf.fit(X_train[:13333], y_train[:13333])
rf_clf.fit(X_train[13333:26666], y_train[13333:26666])
ex_clf.fit(X_train[26666:40000], y_train[26666:40000])

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [85]:
import numpy as np
import pandas as pd


preds = np.array([sv_clf.predict(X_val), rf_clf.predict(X_val), ex_clf.predict(X_val)])
X_lay1 = preds.T
y_lay1 = y_val

In [86]:
preds_df = pd.DataFrame(preds.T, columns=["sv", "rf", "ex"])
corr_preds = preds_df.corr()
corr_preds

Unnamed: 0,sv,rf,ex
sv,1.0,0.865182,0.866702
rf,0.865182,1.0,0.904473
ex,0.866702,0.904473,1.0


In [88]:
rf_blender = RandomForestClassifier()

In [89]:
rf_blender.fit(X_lay1, y_lay1)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [96]:
X_test_blend = np.array([sv_clf.predict(X_test), rf_clf.predict(X_test), ex_clf.predict(X_test)]).T
y_blender = rf_blender.predict(X_test_blend)
accuracy_score(y_test, y_blender)

0.94120000000000004

In [99]:
for clf in (sv_clf, rf_clf, ex_clf):
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
print(rf_blender.__class__.__name__, accuracy_score(y_test, y_blender))


LinearSVC 0.8484
RandomForestClassifier 0.9337
ExtraTreesClassifier 0.9408
RandomForestClassifier 0.9412


It out performed other models. 

### Now let's try using whole data for training all predictors

In [100]:
sv_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
ex_clf.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [101]:
preds = np.array([sv_clf.predict(X_val), rf_clf.predict(X_val), ex_clf.predict(X_val)])
X_lay1 = preds.T
y_lay1 = y_val

In [102]:
preds_df = pd.DataFrame(preds.T, columns=["sv", "rf", "ex"])
corr_preds = preds_df.corr()
corr_preds

Unnamed: 0,sv,rf,ex
sv,1.0,0.868766,0.870622
rf,0.868766,1.0,0.930912
ex,0.870622,0.930912,1.0


In [103]:
rf_blender.fit(X_lay1, y_lay1)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [104]:
X_test_blend = np.array([sv_clf.predict(X_test), rf_clf.predict(X_test), ex_clf.predict(X_test)]).T
y_blender = rf_blender.predict(X_test_blend)
for clf in (sv_clf, rf_clf, ex_clf):
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
print(rf_blender.__class__.__name__, accuracy_score(y_test, y_blender))

LinearSVC 0.8643
RandomForestClassifier 0.9415
ExtraTreesClassifier 0.9484
RandomForestClassifier 0.9478


In [105]:
X_val_blend = np.array([sv_clf.predict(X_val), rf_clf.predict(X_val), ex_clf.predict(X_val)]).T
y_blender = rf_blender.predict(X_val_blend)
for clf in (sv_clf, rf_clf, ex_clf):
    y_pred = clf.predict(X_val)
    print(clf.__class__.__name__, accuracy_score(y_val, y_pred))
print(rf_blender.__class__.__name__, accuracy_score(y_val, y_blender))

LinearSVC 0.8622
RandomForestClassifier 0.9354
ExtraTreesClassifier 0.9436
RandomForestClassifier 0.9632


#### It's good performance on validation set, But the blender is overfitting the model. And performs even worse than ExtraTreesClassifier!

### Linear Regression(No longer needed)

In [40]:
from sklearn.linear_model import SGDClassifier, LinearRegression

sgd_clf = SGDClassifier(loss="log")
lr_clf = LinearRegression()



In [41]:
lr_clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [42]:
y_lr = lr_clf.predict(X_test)
accuracy_score(y_test, y_lr)

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [None]:
sgd_clf.fit(X_train, y_train)

In [17]:
y_sgd = sgd_clf.predict(X_test)
accuracy_score(y_test, y_sgd)

0.8649

In [19]:
sgd_clf.predict_proba(X_test)

  np.exp(prob, prob)
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


array([[ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. , ...,  0. ,  0.5,  0. ],
       [ 0. ,  0. ,  1. , ...,  0. ,  0. ,  0. ],
       ..., 
       [ 0. ,  0. ,  0. , ...,  0. ,  0.5,  0. ],
       [ 0. ,  1. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. , ...,  1. ,  0. ,  0. ]])