# Block 6 Exercise 1: Non-Linear Classification

## MNIST Data
We return to the MNIST data set on handwritten digits to compare non-linear classification algorithms ...   

In [0]:
#imports 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml

In [0]:
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)


In [4]:
#the full MNIST data set contains 70k samples of digits 0-9 as 28*28 gray scale images (represented as 784 dim vectors)
np.shape(X)

(70000, 784)

In [5]:
X.min()

0.0

In [6]:
#look at max/min value in the data
X.max()

255.0

### E1.1: Cross-Validation and Support Vector Machines
Train and optimize  C-SVM classifier on MNIST (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)
* use a RBF kernel
* use *random search* with cross-validation to find the best settings for *gamma* and *C* (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV)

In [0]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [0]:
svm = SVC() #rbf is default kernel
parameters = dict(C=[0.1,1,2,5,10,100],gamma=[10,5, 1,0.1,'scale']) #create search space
search = RandomizedSearchCV(svm, parameters, n_jobs=4,n_iter=20, cv=5,random_state=0)

In [10]:
svmClassifier = SVC(kernel='rbf')
distributions = dict(C=[0.1,0.5,1,2,5,10,50,100],gamma=[10,5,1,0.5,0.1, 'scale'])
clf = RandomizedSearchCV(svmClassifier, distributions, random_state=0,n_jobs=4,n_iter=20)
search = clf.fit(X_train[:1000,:],y_train[:1000])
search.best_params_

{'C': 5, 'gamma': 'scale'}

### E1.2: Pipelines and simple Neural Networks
Split the MNIST data into  train- and test-sets and then train and evaluate a simple Multi Layer Perceptron (MLP) network. Since the non-linear activation functions of MLPs are sensitive to the scaling on the input (recall the *sigmoid* function), we need to scale all input values to [0,1] 

* combine all steps of your training in a SKL pipeline (https://scikit-learn.org/stable/modules/compose.html#pipeline)
* use a SKL-scaler to scale the data (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
* MLP Parameters: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier
    * use a *SGD* solver
    * use *tanh* as activation function
    * compare networks with 1, 2 and 3 layers, use different numbers of neurons per layer
    * adjust training parameters *alpha* (regularization) and *learning rate* - how sensitive is the model to these parameters?
    * Hint: do not change all parameters at the same time, split into several experiments
* How hard is it to find the best parameters? How many experiments would you need to find the best parameters?
    


In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [0]:
##Standard
mlp1 = MLPClassifier(hidden_layer_sizes=(1, ), solver='sgd', activation='tanh', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant', max_iter=300)
mlp2 = MLPClassifier(hidden_layer_sizes=(1,2, ), solver='sgd', activation='tanh', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant', max_iter=300)
mlp3 = MLPClassifier(hidden_layer_sizes=(1,2,3, ), solver='sgd', activation='tanh', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant', max_iter=300)

pipe1 = make_pipeline(StandardScaler(), mlp1)
pipe2 = make_pipeline(StandardScaler(), mlp2)
pipe3 = make_pipeline(StandardScaler(), mlp3)

In [13]:
pipe1.fit(X_train, y_train)
print (pipe1.score(X_test, y_test))
pipe2.fit(X_train, y_train)
print(pipe2.score(X_test, y_test))
pipe3.fit(X_train, y_train)
print(pipe3.score(X_test, y_test))



0.40062857142857144




0.3356
0.4594285714285714




In [0]:
##More Neurons
mlp1 = MLPClassifier(hidden_layer_sizes=(10, ), solver='sgd', activation='tanh', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant', max_iter=300)
mlp2 = MLPClassifier(hidden_layer_sizes=(10,20, ), solver='sgd', activation='tanh', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant', max_iter=300)
mlp3 = MLPClassifier(hidden_layer_sizes=(10,20,30, ), solver='sgd', activation='tanh', alpha=0.0001, learning_rate_init=0.001, learning_rate='constant', max_iter=300)

pipe1 = make_pipeline(StandardScaler(), mlp1)
pipe2 = make_pipeline(StandardScaler(), mlp2)
pipe3 = make_pipeline(StandardScaler(), mlp3)

In [15]:
pipe1.fit(X_train, y_train)
print (pipe1.score(X_test, y_test))
pipe2.fit(X_train, y_train)
print(pipe2.score(X_test, y_test))
pipe3.fit(X_train, y_train)
print(pipe3.score(X_test, y_test))



0.9166285714285715




0.9226857142857143




0.9246285714285715


In [0]:
##Higher Alpha
mlp1 = MLPClassifier(hidden_layer_sizes=(10, ), solver='sgd', activation='tanh', alpha=0.01, learning_rate_init=0.001, learning_rate='constant', max_iter=300)
mlp2 = MLPClassifier(hidden_layer_sizes=(10,20, ), solver='sgd', activation='tanh', alpha=0.01, learning_rate_init=0.001, learning_rate='constant', max_iter=300)
mlp3 = MLPClassifier(hidden_layer_sizes=(10,20,30, ), solver='sgd', activation='tanh', alpha=0.01, learning_rate_init=0.001, learning_rate='constant', max_iter=300)

pipe1 = make_pipeline(StandardScaler(), mlp1)
pipe2 = make_pipeline(StandardScaler(), mlp2)
pipe3 = make_pipeline(StandardScaler(), mlp3)

In [17]:
pipe1.fit(X_train, y_train)
print (pipe1.score(X_test, y_test))
pipe2.fit(X_train, y_train)
print(pipe2.score(X_test, y_test))
pipe3.fit(X_train, y_train)
print(pipe3.score(X_test, y_test))



0.9139428571428572




0.926




0.9219428571428572


In [0]:
##Lower Alpha
mlp1 = MLPClassifier(hidden_layer_sizes=(10, ), solver='sgd', activation='tanh', alpha=0.00001, learning_rate_init=0.001, learning_rate='constant', max_iter=300)
mlp2 = MLPClassifier(hidden_layer_sizes=(10,20, ), solver='sgd', activation='tanh', alpha=0.00001, learning_rate_init=0.001, learning_rate='constant', max_iter=300)
mlp3 = MLPClassifier(hidden_layer_sizes=(10,20,30, ), solver='sgd', activation='tanh', alpha=0.00001, learning_rate_init=0.001, learning_rate='constant', max_iter=300)

pipe1 = make_pipeline(StandardScaler(), mlp1)
pipe2 = make_pipeline(StandardScaler(), mlp2)
pipe3 = make_pipeline(StandardScaler(), mlp3)

In [19]:
pipe1.fit(X_train, y_train)
print (pipe1.score(X_test, y_test))
pipe2.fit(X_train, y_train)
print(pipe2.score(X_test, y_test))
pipe3.fit(X_train, y_train)
print(pipe3.score(X_test, y_test))



0.9177714285714286




0.9220571428571429




0.9210857142857143


In [0]:
##Higher Learning Rate
mlp1 = MLPClassifier(hidden_layer_sizes=(10, ), solver='sgd', activation='tanh', alpha=0.0001, learning_rate_init=0.1, learning_rate='constant', max_iter=300)
mlp2 = MLPClassifier(hidden_layer_sizes=(10,20, ), solver='sgd', activation='tanh', alpha=0.0001, learning_rate_init=0.1, learning_rate='constant', max_iter=300)
mlp3 = MLPClassifier(hidden_layer_sizes=(10,20,30, ), solver='sgd', activation='tanh', alpha=0.0001, learning_rate_init=0.1, learning_rate='constant', max_iter=300)

pipe1 = make_pipeline(StandardScaler(), mlp1)
pipe2 = make_pipeline(StandardScaler(), mlp2)
pipe3 = make_pipeline(StandardScaler(), mlp3)

In [21]:
pipe1.fit(X_train, y_train)
print (pipe1.score(X_test, y_test))
pipe2.fit(X_train, y_train)
print(pipe2.score(X_test, y_test))
pipe3.fit(X_train, y_train)
print(pipe3.score(X_test, y_test))

0.8976571428571428
0.9096
0.9134285714285715


In [0]:
##Lower Learning Rate
mlp1 = MLPClassifier(hidden_layer_sizes=(10, ), solver='sgd', activation='tanh', alpha=0.0001, learning_rate_init=0.0001, learning_rate='constant', max_iter=300)
mlp2 = MLPClassifier(hidden_layer_sizes=(10,20, ), solver='sgd', activation='tanh', alpha=0.0001, learning_rate_init=0.0001, learning_rate='constant', max_iter=300)
mlp3 = MLPClassifier(hidden_layer_sizes=(10,20,30, ), solver='sgd', activation='tanh', alpha=0.0001, learning_rate_init=0.0001, learning_rate='constant', max_iter=300)

pipe1 = make_pipeline(StandardScaler(), mlp1)
pipe2 = make_pipeline(StandardScaler(), mlp2)
pipe3 = make_pipeline(StandardScaler(), mlp3)

In [23]:
pipe1.fit(X_train, y_train)
print (pipe1.score(X_test, y_test))
pipe2.fit(X_train, y_train)
print(pipe2.score(X_test, y_test))
pipe3.fit(X_train, y_train)
print(pipe3.score(X_test, y_test))



0.9097142857142857




0.9171428571428571




0.9138285714285714
