# Block 6 Exercise 1: Non-Linear Classification

## MNIST Data
We return to the MNIST data set of handwritten digits to compare non-linear classification algorithms.

In [1]:
#imports 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml

In [2]:
# Load MNIST from https://www.openml.org/d/554
# as_frame=False requests plain NumPy arrays. Newer scikit-learn releases
# would otherwise return pandas objects for this data set, and the cells
# below (X.min(), X.max(), label .astype(...)) are written for arrays.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)


In [3]:
# Full MNIST: 70k samples of the digits 0-9, each a 28*28 gray-scale image
# flattened into a 784-dimensional vector.
X.shape

(70000, 784)

In [4]:
# Smallest pixel value in the data
X.min()

0.0

In [5]:
# Largest pixel value in the data (the minimum is shown in the previous cell)
X.max()

255.0

### E1.1: Cross-Validation and Support Vector Machines
Train and optimize a C-SVM classifier on MNIST (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)
* use a RBF kernel
* use *random search* with cross-validation to find the best settings for *gamma* and *C* (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV)
* use max_iter in the SVM to avoid long training times 

In [57]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split as tts
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.neural_network import MLPClassifier as MLPC
from sklearn.metrics import mean_squared_error as mse

# Hold out 65,000 of the 70,000 samples for testing, which leaves 5,000 for
# training (presumably to keep the SVM search time short).
# random_state pins the shuffle so that Restart & Run All reproduces the
# same split instead of a new random one on every run.
X_train,X_test,y_train,y_test = tts(X,y,test_size=65000,random_state=0)

In [19]:
# RBF-kernel C-SVM behind a standardisation step; max_iter caps the number of
# solver iterations so training time stays bounded.
pip_SVC = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', max_iter=100),
)

# Search space for the randomized search. Keys use the pipeline's
# '<step>__<param>' naming; pip_SVC.get_params().keys() lists all valid names.
distributions = {
    'svc__gamma': ['scale', 'auto'],
    'svc__C': uniform(loc=1, scale=10),  # C drawn uniformly from [1, 11]
}

# 10 random candidates, evaluated with cross-validation on all CPU cores.
RSCV = RandomizedSearchCV(
    pip_SVC,
    distributions,
    n_iter=10,
    n_jobs=-1,
    random_state=0,
)

In [20]:
%%time
# Fit the randomized search on the training split; %%time reports the cell's run time.
res_RSCV = RSCV.fit(X_train,y_train)

CPU times: user 5.36 s, sys: 72.1 ms, total: 5.43 s
Wall time: 1min 51s




In [21]:
# Hyperparameter setting selected by the randomized search
res_RSCV.best_params_

{'svc__C': 6.4881350392732475, 'svc__gamma': 'auto'}

### E1.2: Pipelines and simple Neural Networks
Split the MNIST data into train and test sets and then train and evaluate a simple Multi Layer Perceptron (MLP) network. Since the non-linear activation functions of MLPs are sensitive to the scaling of the input (recall the *sigmoid* function), we need to scale all input values to [0,1].

* combine all steps of your training in a SKL pipeline (https://scikit-learn.org/stable/modules/compose.html#pipeline)
* use a SKL-scaler to scale the data (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
* MLP Parameters: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier
    * use a *SGD* solver
    * use *tanh* as activation function
    * compare networks with 1, 2 and 3 layers, use different numbers of neurons per layer
    * adjust training parameters *alpha* (regularization) and *learning rate* - how sensitive is the model to these parameters?
    * Hint: do not change all parameters at the same time, split into several experiments
* How hard is it to find the best parameters? How many experiments would you need to find the best parameters?
    


In [33]:
# Map the pixel values from [0, 255] to [0, 1] and confirm the new maximum.
X_scaled = X /255
print(X_scaled.max())

# 5,000 training / 65,000 test samples. random_state pins the shuffle so the
# split, and every score computed from it, is reproducible across runs.
X_sc_train,X_sc_test,y_sc_train,y_sc_test = tts(X_scaled,y,test_size=65000,random_state=0)
np.shape(X_sc_train)

1.0


(5000, 784)

In [120]:
# Baseline network: three tanh hidden layers with 30 units each, trained with
# SGD at a constant learning rate.
mlpcm = MLPC(
    hidden_layer_sizes=(30, 30, 30),
    activation='tanh',
    solver='sgd',
    learning_rate='constant',
    alpha=0.0001,
    max_iter=200,
    random_state=1,
)
# Standardise the inputs before they reach the network.
NN_pip = make_pipeline(StandardScaler(), mlpcm)

In [121]:
# Train the scaler + MLP pipeline on the training split
NN_pip.fit(X_sc_train,y_sc_train)



Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', alpha=0.0001,
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=False, epsilon=1e-08,
                               hidden_layer_sizes=(30, 30, 30),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
                               max_iter=200, momentum=0.9, n_iter_no_change=10,
                               nesterovs_momentum=True, power_t=0.5,
                               random_state=1, shuffle=True, solver='sgd',
                               tol=0.0001, validation_fraction=0.1,
                               verbose=False, warm_start=False))],
         verbose=False)

In [122]:
# Predict digit labels for the held-out test split
pred_y_sc_test = NN_pip.predict(X_sc_test)

In [124]:
# Convert true and predicted labels to numbers so that MSE can be computed.
# The builtin `float` replaces `np.float`, which was deprecated in NumPy 1.20
# and removed in 1.24 (AttributeError); the resulting dtype is the same.
# NOTE(review): MSE treats the digit labels as quantities; for a classifier,
# accuracy is usually the more meaningful score.
mse(y_sc_test.astype(float),pred_y_sc_test.astype(float))

1.6755076923076924

***Variation 1: Variation of number of Layers***

In [126]:
# Experiment: vary the number of hidden layers (1 to 4) at a fixed width.
non = 40  # neurons per hidden layer
# Note: (non) is the plain int 40, not a 1-tuple; the recorded run below shows
# the single-layer case being trained and printed as "40".
Layer_poss = [(non),(non,non),(non,non,non),(non,non,non,non)]

for layers in Layer_poss:
    # Same settings as the baseline network except for the layer layout and a
    # larger iteration budget (max_iter=1000).
    mlpcm = MLPC(hidden_layer_sizes=layers,learning_rate = 'constant',alpha =0.0001,solver='sgd', activation='tanh',random_state=1,max_iter=1000)
    NN_pip = make_pipeline(StandardScaler(),mlpcm)
    NN_pip.fit(X_sc_train,y_sc_train)
    pred_y_sc_test = NN_pip.predict(X_sc_test)
    # Builtin float instead of np.float (removed in NumPy 1.24).
    res_mse = mse(y_sc_test.astype(float),pred_y_sc_test.astype(float))
    print('\n layers: ', layers, 'mse: ', res_mse)


 layers:  40 mse:  1.5421384615384615

 layers:  (40, 40) mse:  1.5603384615384615

 layers:  (40, 40, 40) mse:  1.5528923076923078

 layers:  (40, 40, 40, 40) mse:  1.641676923076923


***Variation 2: variation of number of neurons with 1 layer***

In [127]:
# Experiment: a single hidden layer with 10 to 60 neurons.
# Each entry is a plain int (the parentheses do not create tuples).
Layer_poss = [(10),(20),(30),(40),(50),(60)]

for layers in Layer_poss:
    # Same settings as the baseline network except for the layer width and a
    # larger iteration budget (max_iter=1000).
    mlpcm = MLPC(hidden_layer_sizes=layers,learning_rate = 'constant',alpha =0.0001,solver='sgd', activation='tanh',random_state=1,max_iter=1000)
    NN_pip = make_pipeline(StandardScaler(),mlpcm)
    NN_pip.fit(X_sc_train,y_sc_train)
    pred_y_sc_test = NN_pip.predict(X_sc_test)
    # Builtin float instead of np.float (removed in NumPy 1.24).
    res_mse = mse(y_sc_test.astype(float),pred_y_sc_test.astype(float))
    print('\n layers: ', layers, 'mse: ', res_mse)


 layers:  10 mse:  2.0679846153846153

 layers:  20 mse:  1.6497076923076923

 layers:  30 mse:  1.615

 layers:  40 mse:  1.5421384615384615

 layers:  50 mse:  1.5587692307692307

 layers:  60 mse:  1.5651846153846154


***Variation 3: variation of number of neurons with 3 layers***

In [128]:
# Experiment: three hidden layers of equal width, 10 to 60 neurons per layer.
Layer_poss = [(10,10,10),(20,20,20),(30,30,30),(40,40,40),(50,50,50),(60,60,60)]

for layers in Layer_poss:
    # Same settings as the baseline network except for the layer widths and a
    # larger iteration budget (max_iter=1000).
    mlpcm = MLPC(hidden_layer_sizes=layers,learning_rate = 'constant',alpha =0.0001,solver='sgd', activation='tanh',random_state=1,max_iter=1000)
    NN_pip = make_pipeline(StandardScaler(),mlpcm)
    NN_pip.fit(X_sc_train,y_sc_train)
    pred_y_sc_test = NN_pip.predict(X_sc_test)
    # Builtin float instead of np.float (removed in NumPy 1.24).
    res_mse = mse(y_sc_test.astype(float),pred_y_sc_test.astype(float))
    print('\n layers: ', layers, 'mse: ', res_mse)


 layers:  (10, 10, 10) mse:  2.453492307692308

 layers:  (20, 20, 20) mse:  1.876723076923077

 layers:  (30, 30, 30) mse:  1.6840461538461537

 layers:  (40, 40, 40) mse:  1.5528923076923078

 layers:  (50, 50, 50) mse:  1.5460923076923077

 layers:  (60, 60, 60) mse:  1.616046153846154


***Variation 3: variation of number of neurons with 3 layers***

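Does MSE even make sense here? Probably not: the digit labels are categories rather than quantities, so the squared difference between a true and a predicted digit carries no meaning (confusing 0 with 9 is not "worse" than confusing 0 with 1). Classification accuracy would be a more suitable metric.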