# Block 6 Exercise 1: Non-Linear Classification

## MNIST Data
We return to the MNIST data set on handwritten digits to compare non-linear classification algorithms ...   

In [1]:
#imports 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml

In [2]:
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)


In [3]:
#the full MNIST data set contains 70k samples of digits 0-9 as 28*28 gray scale images (represented as 784 dim vectors)
np.shape(X)

(70000, 784)

In [4]:
X.min()

0.0

In [5]:
#look at max/min value in the data
X.max()

255.0

### E1.1: Cross-Validation and Support Vector Machines
Train and optimize  C-SVM classifier on MNIST (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)
* use a RBF kernel
* use *random search* with cross-validation to find the best settings for *gamma* and *C* (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV)

In [6]:
from sklearn.model_selection import train_test_split

X_norm = X / X.max()
#y_norm = y / y.max()

X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=10000, random_state=42)


In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

In [7]:
%%time

#my_svc = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
my_svc = SVC(kernel='rbf', max_iter=100)

distributions = dict(C=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2],
                     gamma=['auto', 'scale'])

RSCV = RandomizedSearchCV(my_svc, distributions, random_state=0, n_jobs=-1)
RSCV.fit(X_train, y_train)

RSCV.best_params_



{'gamma': 'scale', 'C': 0.9}

In [8]:
%%time

my_svc = SVC(kernel='rbf', C=0.9, gamma='scale');
my_svc.fit(X_train, y_train)

Wall time: 5min 3s


SVC(C=0.9)

In [9]:
%%time

y_test_pred = my_svc.predict(X_test)

Wall time: 1min 22s


In [10]:
%%time

y_train_pred = my_svc.predict(X_train)

Wall time: 8min 17s


In [11]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_test_pred)

array([[ 973,    0,    3,    0,    1,    0,    1,    1,    3,    1],
       [   0, 1140,    4,    3,    0,    0,    0,    3,    1,    1],
       [   2,    2,  948,    1,    2,    1,    3,    3,    5,    0],
       [   0,    2,    8, 1001,    1,    7,    1,    6,    4,    4],
       [   1,    0,    2,    0,  886,    0,    3,    2,    2,   10],
       [   0,    0,    2,   12,    0,  912,    7,    1,    3,    0],
       [   1,    0,    0,    0,    3,    3,  951,    0,    3,    0],
       [   0,    4,    9,    0,    5,    0,    0, 1029,    0,    8],
       [   1,    4,    8,   10,    2,    5,    5,    5,  925,    4],
       [   5,    5,    1,    6,    9,    1,    0,    6,    4,  999]],
      dtype=int64)

In [12]:
confusion_matrix(y_train, y_train_pred)

array([[5894,    1,    2,    1,    2,    5,    5,    1,    5,    4],
       [   1, 6686,   14,    3,    2,    1,    1,   10,    3,    4],
       [   7,    4, 5967,    7,    9,    0,    1,   16,    9,    3],
       [   1,    3,   18, 6008,    1,   23,    0,   19,   23,   11],
       [   2,    9,    5,    0, 5852,    0,   12,    3,    1,   34],
       [   7,    2,    5,   19,    6, 5314,   13,    1,    4,    5],
       [   9,    5,    1,    0,    5,    7, 5884,    0,    4,    0],
       [   2,   21,   19,    1,   15,    0,    0, 6151,    3,   26],
       [   3,   17,    9,   11,    6,   13,    4,    3, 5787,    3],
       [   7,    6,    2,   18,   41,    6,    1,   28,    8, 5805]],
      dtype=int64)

### E1.2: Pipelines and simple Neural Networksconfusion_matrixMNIST data into  train- and test-sets and then train and evaluate a simple Multi Layer Perceptron (MLP) network. Since the non-linear activation functions of MLPs are sensitive to the scaling on the input (recall the *sigmoid* function), we need to scale all input values to [0,1] 

* combine all steps of your training in a SKL pipeline (https://scikit-learn.org/stable/modules/compose.html#pipeline)
* use a SKL-scaler to scale the data (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
* MLP Parameters: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier
    * use a *SGD* solver
    * use *tanh* as activation function
    * compare networks with 1, 2 and 3 layers, use different numbers of neurons per layer
    * adjust training parameters *alpha* (regularization) and *learning rate* - how sensitive is the model to these parameters?
    * Hint: do not change all parameters at the same time, split into several experiments
* How hard is it to find the best parameters? How many experiments would you need to find the best parameters?
    


In [26]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.model_selection import RandomizedSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10000, random_state=42)


from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from scipy.stats import uniform

In [31]:
%%time

myMLP = make_pipeline(StandardScaler(), MLPClassifier(random_state=42, max_iter=1000, solver='sgd', activation='tanh'))

distributions = dict(mlpclassifier__alpha = uniform(0.0001, 0.9),
                    mlpclassifier__learning_rate=['constant', 'invscaling', 'adaptive'])

RSCV = RandomizedSearchCV(myMLP, distributions, random_state=0, n_jobs=-1)

Wall time: 0 ns


In [32]:
%%time


RSCV.fit(X_train, y_train)
RSCV.best_params_

Wall time: 17min 46s


{'mlpclassifier__alpha': 0.34604353656342984,
 'mlpclassifier__hidden_layer_sizes': 3,
 'mlpclassifier__learning_rate': 'constant'}

In [39]:
%%time

alpha = 0.3451973669431999
learning_rate = 'adaptive'

myMLP = make_pipeline(StandardScaler(), MLPClassifier(random_state=42, max_iter=10000, alpha=0.3451973669431999, learning_rate='adaptive', solver='sgd', activation='tanh'))

myMLP.fit(X_train, y_train)

Wall time: 5min 30s


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', alpha=0.3451973669431999,
                               learning_rate='adaptive', max_iter=10000,
                               random_state=42, solver='sgd'))])

In [40]:
%%time

y_test_pred = myMLP.predict(X_test)

Wall time: 112 ms


In [41]:
%%time

y_train_pred = myMLP.predict(X_train)

Wall time: 535 ms


In [42]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_test_pred)

array([[ 963,    0,    4,    0,    1,    2,    6,    2,    4,    1],
       [   0, 1136,    3,    3,    1,    1,    0,    3,    4,    1],
       [   1,    6,  922,    2,    4,    4,    9,    8,   11,    0],
       [   1,    4,   12,  983,    1,    8,    3,   12,    5,    5],
       [   0,    1,    1,    0,  881,    2,    4,    4,    1,   12],
       [   1,    1,    1,   17,    3,  899,    8,    0,    5,    2],
       [   4,    1,    1,    0,    3,    5,  944,    0,    2,    1],
       [   2,    4,    9,    0,    7,    1,    0, 1025,    0,    7],
       [   3,    5,    5,   14,    4,    2,    6,    6,  918,    6],
       [   4,    5,    1,    6,   11,    2,    0,   11,    6,  990]],
      dtype=int64)

In [43]:
confusion_matrix(y_train, y_train_pred)

array([[5874,    1,    2,    3,    5,    7,   12,    2,   13,    1],
       [   1, 6676,   14,    4,    8,    4,    2,    7,    7,    2],
       [  14,    9, 5932,    7,   16,    0,    4,   20,   19,    2],
       [   3,    5,   17, 5990,    4,   26,    1,   17,   29,   15],
       [   1,   11,    6,    1, 5846,    0,   12,    3,    4,   34],
       [   9,    2,    4,   23,    5, 5303,   14,    0,    9,    7],
       [  15,    9,    1,    1,    8,   11, 5862,    0,    8,    0],
       [   4,   19,   15,    1,   20,    2,    1, 6154,    4,   18],
       [   8,   27,    4,   15,    6,   10,    4,    6, 5766,   10],
       [   6,   11,    1,   17,   32,    7,    2,   21,   12, 5813]],
      dtype=int64)

In [44]:
from sklearn.metrics import f1_score

f1_score(y_test, y_test_pred, average=None)

array([0.98165138, 0.98142549, 0.95742471, 0.95483244, 0.96706915,
       0.96511004, 0.97269449, 0.96425212, 0.95376623, 0.96069869])

In [45]:
f1_score(y_train, y_train_pred, average=None)

array([0.99097427, 0.98940348, 0.98710375, 0.98446873, 0.98517021,
       0.9869719 , 0.99112351, 0.98716715, 0.98337171, 0.9832544 ])