# Block 6 Exercise 1: Non-Linear Classification

## MNIST Data
We return to the MNIST data set on handwritten digits to compare non-linear classification algorithms ...   

In [1]:
#imports 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml

In [2]:
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)


In [3]:
#the full MNIST data set contains 70k samples of digits 0-9 as 28*28 gray scale images (represented as 784 dim vectors)
np.shape(X)

(70000, 784)

In [4]:
X.min()

0.0

In [5]:
#look at max/min value in the data
X.max()

255.0

### E1.1: Cross-Validation and Support Vector Machines
Train and optimize  C-SVM classifier on MNIST (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)
* use a RBF kernel
* use *random search* with cross-validation to find the best settings for *gamma* and *C* (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV)
* use max_iter in the SVM to avoid long training times 

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
#from scipy.stats import uniform

In [7]:
svc = SVC()
svc.fit(X_train,y_train)

SVC()

In [12]:
params=dict(C=[1,2,6,20,165.3,198],gamma=['scale', 'auto',6.1, 2.3, 17.2, 25.3])
randomSearch = RandomizedSearchCV(svc, params, n_jobs=3, random_state=0)
fitting=randomSearch.fit(X_train[:500,:],y_train[:500])
fitting.best_params_

{'gamma': 'scale', 'C': 198}

In [10]:
fitting.best_score_

0.89

In [11]:
fitting.cv_results_

{'mean_fit_time': array([0.23441167, 0.23266454, 0.23191113, 0.18724475, 0.23371444,
        0.23670321, 0.23735471, 0.23348002, 0.23099475, 0.2292912 ]),
 'std_fit_time': array([0.00182829, 0.0003886 , 0.00055271, 0.00179467, 0.00353201,
        0.00305152, 0.00698462, 0.00664705, 0.00023045, 0.00208637]),
 'mean_score_time': array([0.03087659, 0.03094697, 0.03090158, 0.02764964, 0.03100157,
        0.03051209, 0.0316278 , 0.03082538, 0.03055263, 0.03035846]),
 'std_score_time': array([9.68057413e-05, 2.76651944e-04, 2.34590730e-04, 3.28121797e-04,
        4.81436010e-04, 1.96206642e-04, 8.73167737e-04, 4.24531070e-04,
        1.59755606e-04, 6.92309828e-04]),
 'param_gamma': masked_array(data=['auto', 6.1, 17.2, 'scale', 17.2, 2.3, 17.2, 6.1, 25.3,
                    25.3],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_C': masked_array(data=[198, 20, 7, 198, 2

### E1.2: Pipelines and simple Neural Networks
Split the MNIST data into  train- and test-sets and then train and evaluate a simple Multi Layer Perceptron (MLP) network. Since the non-linear activation functions of MLPs are sensitive to the scaling on the input (recall the *sigmoid* function), we need to scale all input values to [0,1] 

* combine all steps of your training in a SKL pipeline (https://scikit-learn.org/stable/modules/compose.html#pipeline)
* use a SKL-scaler to scale the data (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
* MLP Parameters: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier
    * use a *SGD* solver
    * use *tanh* as activation function
    * compare networks with 1, 2 and 3 layers, use different numbers of neurons per layer
    * adjust training parameters *alpha* (regularization) and *learning rate* - how sensitive is the model to these parameters?
    * Hint: do not change all parameters at the same time, split into several experiments
* How hard is it to find the best parameters? How many experiments would you need to find the best parameters?
    


In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [13]:
#1 layer, alpha=0.0001, 100 neurons per layer
pipeline1 = make_pipeline(StandardScaler(), MLPClassifier(random_state=1, hidden_layer_sizes=(100,), activation='tanh', solver='sgd', alpha=0.0001, max_iter=100))

In [14]:
pipeline1.fit(X_train, y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', max_iter=100, random_state=1,
                               solver='sgd'))])

In [15]:
pipeline1.score(X_test, y_test)

0.9546285714285714

In [31]:
#1 layer, alpha=0.0001, 100 neurons per layer, learning rate init=0.1 instead of default 0.001
pipeline1_lr = make_pipeline(StandardScaler(), MLPClassifier(random_state=1, hidden_layer_sizes=(100,), activation='tanh', solver='sgd', alpha=0.0001, learning_rate_init=0.1, max_iter=100))

In [32]:
pipeline1_lr.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', learning_rate_init=0.1,
                               max_iter=100, random_state=1, solver='sgd'))])

In [33]:
pipeline1_lr.score(X_test, y_test)

0.9637714285714286

In [19]:
#1 layer, alpha=0.01, 100 neurons per layer
pipeline1_1 = make_pipeline(StandardScaler(), MLPClassifier(random_state=1, hidden_layer_sizes=(100,), activation='tanh', solver='sgd', alpha=0.01, max_iter=100))

In [20]:
pipeline1_1.fit(X_train, y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', alpha=0.01, max_iter=100,
                               random_state=1, solver='sgd'))])

In [21]:
pipeline1_1.score(X_test, y_test)

0.9548

In [24]:
#1 layer, alpha=0.01, 200 neurons per layer
pipeline1_1_200=make_pipeline(StandardScaler(), MLPClassifier(random_state=1, hidden_layer_sizes=(200,), activation='tanh', solver='sgd', alpha=0.01, max_iter=100))

In [25]:
pipeline1_1_200.fit(X_train,y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', alpha=0.01,
                               hidden_layer_sizes=(200,), max_iter=100,
                               random_state=1, solver='sgd'))])

In [27]:
pipeline1_1_200.score(X_test, y_test)

0.9568571428571429

In [22]:
#2 layers, alpha=0.01, 100 neurons per layer
pipeline2 = make_pipeline(StandardScaler(), MLPClassifier(random_state=1, hidden_layer_sizes=(100,100), activation='tanh', solver='sgd', alpha=0.01, max_iter=100))

In [23]:
pipeline2.fit(X_train, y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', alpha=0.01,
                               hidden_layer_sizes=(100, 100), max_iter=100,
                               random_state=1, solver='sgd'))])

In [26]:
pipeline2.score(X_test, y_test)

0.9572

In [28]:
#3 layers, alpha=0.01, 100 neurons per layer
pipeline3=make_pipeline(StandardScaler(), MLPClassifier(random_state=1, hidden_layer_sizes=(100,100,100), activation='tanh', solver='sgd', alpha=0.01, max_iter=100))

In [29]:
pipeline3.fit(X_train, y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', alpha=0.01,
                               hidden_layer_sizes=(100, 100, 100), max_iter=100,
                               random_state=1, solver='sgd'))])

In [30]:
pipeline3.score(X_test, y_test)

0.9600571428571428