# Block 6 Exercise 1: Non-Linear Classification

## MNIST Data
We return to the MNIST data set on handwritten digits to compare non-linear classification algorithms ...   

In [1]:
#imports 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml

In [2]:
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)


In [3]:
#the full MNIST data set contains 70k samples of digits 0-9 as 28*28 gray scale images (represented as 784 dim vectors)
np.shape(X)

(70000, 784)

In [4]:
X.min()

0.0

In [5]:
#look at max/min value in the data
X.max()

255.0

### E1.1: Cross-Validation and Support Vector Machines
Train and optimize  C-SVM classifier on MNIST (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)
* use a RBF kernel
* use *random search* with cross-validation to find the best settings for *gamma* and *C* (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV)

In [6]:
from sklearn import datasets 
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [7]:
#welche Parameter gibt es
SVC().get_params().keys()

dict_keys(['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

In [8]:
#Gamma
param = {'C':[1, 10, 100, 1000], 
          'cache_size':[200], 
          'gamma': [0.01, 0.001, 0.0001], 
          'kernel':['rbf']}


In [9]:
from sklearn.model_selection import RandomizedSearchCV

svc_model = SVC(max_iter=20)
randm_search = RandomizedSearchCV(svc_model, param, cv=3, n_jobs=-1, pre_dispatch='n_jobs')

randm_search.fit(X,y)




RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='scale', kernel='rbf', max_iter=20,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=10, n_jobs=-1,
                   param_distributions={'C': [1, 10, 100, 1000],
                                        'cache_size': [200],
                                        'gamma': [0.01, 0.001, 0.0001],
                                        'kernel': ['rbf']},
                   pre_dispatch='n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=0)

In [10]:
print(randm_search.best_params_)
print(randm_search.best_score_)

{'kernel': 'rbf', 'gamma': 0.0001, 'cache_size': 200, 'C': 1}
0.7045575262004656


In [11]:
#Modell trainieren
#beste gamma wert laut schritt vorher: 0,0001
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

SVC_with_best = SVC(gamma=0.0001, max_iter=50)
scaled_best = make_pipeline(StandardScaler(), SVC_with_best)



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaled_best.fit(X_train, y_train)

predicted = scaled_best.predict(X_test)




In [12]:
print(predicted)
print(scaled_best.fit(X_train, y_train))


['9' '9' '4' ... '0' '9' '8']
Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma=0.0001, kernel='rbf', max_iter=50, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)




### E1.2: Pipelines and simple Neural Networks
Split the MNIST data into  train- and test-sets and then train and evaluate a simple Multi Layer Perceptron (MLP) network. Since the non-linear activation functions of MLPs are sensitive to the scaling on the input (recall the *sigmoid* function), we need to scale all input values to [0,1] 

* combine all steps of your training in a SKL pipeline (https://scikit-learn.org/stable/modules/compose.html#pipeline)
* use a SKL-scaler to scale the data (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
* MLP Parameters: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier
    * use a *SGD* solver
    * use *tanh* as activation function
    * compare networks with 1, 2 and 3 layers, use different numbers of neurons per layer
    * adjust training parameters *alpha* (regularization) and *learning rate* - how sensitive is the model to these parameters?
    * Hint: do not change all parameters at the same time, split into several experiments
* How hard is it to find the best parameters? How many experiments would you need to find the best parameters?
    


## Runde 1

In [13]:
from sklearn.pipeline import Pipeline

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

X, y = make_classification(random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=1)



model_pipe = make_pipeline(StandardScaler(), 
                           MLPClassifier(random_state=1, 
                                         activation='tanh',
                                         hidden_layer_sizes=(28,28), 
                                         solver='sgd', 
                                         alpha=0.1, 
                                         max_iter=300))

model_pipe.fit(X_train, y_train)

#print(model_pipe.predict_proba(X_test))

print(model_pipe.predict(X_test))

print(model_pipe.score(X_test, y_test))

[1 0 1 1 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1]
0.96




In [14]:
X, y = make_classification(random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=1)

model_pipe = make_pipeline(StandardScaler(), 
                           MLPClassifier(random_state=1, 
                                         activation='tanh',
                                         hidden_layer_sizes=(28,28), 
                                         solver='sgd', 
                                         alpha=0.01, 
                                         max_iter=50))

model_pipe.fit(X_train, y_train)

#print(model_pipe.predict_proba(X_test))

print(model_pipe.predict(X_test))

print(model_pipe.score(X_test, y_test))

[1 0 1 1 1 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0]
0.8




In [15]:


X, y = make_classification(random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=1)

model_pipe = make_pipeline(StandardScaler(), 
                           MLPClassifier(random_state=1, 
                                         activation='tanh',
                                         hidden_layer_sizes=(28,28,28, 28), 
                                         solver='sgd', 
                                         alpha=0.001, 
                                         max_iter=300))
model_pipe.fit(X_train, y_train)

#print(model_pipe.predict_proba(X_test))

print(model_pipe.predict(X_test))

print(model_pipe.score(X_test, y_test))

[1 0 1 1 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1]
0.96




In [16]:
X, y = make_classification(random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=1)

model_pipe = make_pipeline(StandardScaler(), 
                           MLPClassifier(random_state=1, 
                                         activation='tanh',
                                         hidden_layer_sizes=(64,64), 
                                         solver='sgd', 
                                         alpha=0.0001, 
                                         max_iter=300))

model_pipe.fit(X_train, y_train)

#print(model_pipe.predict_proba(X_test))

print(model_pipe.predict(X_test))

print(model_pipe.score(X_test, y_test))

[1 0 1 1 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 1]
0.92




In [17]:




X, y = make_classification(random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=1)

model_pipe = make_pipeline(StandardScaler(), 
                           MLPClassifier(random_state=1, 
                                         activation='tanh',
                                         hidden_layer_sizes=(64,64), 
                                         solver='sgd', 
                                         alpha=0.000001, 
                                         max_iter=300))

model_pipe.fit(X_train, y_train)

print(model_pipe.predict_proba(X_test))

print(model_pipe.predict(X_test))

print(model_pipe.score(X_test, y_test))

[[0.15688793 0.84311207]
 [0.97981397 0.02018603]
 [0.05585516 0.94414484]
 [0.31830517 0.68169483]
 [0.04259432 0.95740568]
 [0.94194757 0.05805243]
 [0.93741591 0.06258409]
 [0.08513987 0.91486013]
 [0.61300887 0.38699113]
 [0.96846751 0.03153249]
 [0.98503895 0.01496105]
 [0.06646362 0.93353638]
 [0.12536707 0.87463293]
 [0.91985209 0.08014791]
 [0.16789189 0.83210811]
 [0.91064496 0.08935504]
 [0.89067313 0.10932687]
 [0.86701097 0.13298903]
 [0.97761815 0.02238185]
 [0.44688805 0.55311195]
 [0.05457322 0.94542678]
 [0.08683231 0.91316769]
 [0.93596826 0.06403174]
 [0.90712652 0.09287348]
 [0.19409603 0.80590397]]
[1 0 1 1 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 1]
0.92




## Runde 2:

In [28]:
from sklearn.pipeline import Pipeline

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

X, y = make_classification(random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=1)



model_pipe = make_pipeline(StandardScaler(), 
                           MLPClassifier(random_state=1, 
                                         activation='tanh',
                                         hidden_layer_sizes=(56), 
                                         solver='sgd', 
                                         alpha=0.1, 
                                         max_iter=300))


model_pipe.fit(X_train, y_train)

#print(model_pipe.predict_proba(X_test))

print(model_pipe.predict(X_test))

print(model_pipe.score(X_test, y_test))

[1 0 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1]
0.92




In [32]:
from sklearn.pipeline import Pipeline

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

X, y = make_classification(random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=1)



model_pipe = make_pipeline(StandardScaler(), 
                           MLPClassifier(random_state=1, 
                                         activation='tanh',
                                         hidden_layer_sizes=(60,56), 
                                         solver='sgd', 
                                         alpha=0.1, 
                                         max_iter=300))

model_pipe.fit(X_train, y_train)

#print(model_pipe.predict_proba(X_test))

print(model_pipe.predict(X_test))

print(model_pipe.score(X_test, y_test))

[1 0 1 1 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1]
0.96




In [34]:
from sklearn.pipeline import Pipeline

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

X, y = make_classification(random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=1)



model_pipe = make_pipeline(StandardScaler(), 
                           MLPClassifier(random_state=1, 
                                         activation='tanh',
                                         hidden_layer_sizes=(64,56), 
                                         solver='sgd', 
                                         alpha=0.1, 
                                         max_iter=300))

model_pipe.fit(X_train, y_train)

#print(model_pipe.predict_proba(X_test))

print(model_pipe.predict(X_test))

print(model_pipe.score(X_test, y_test))

[1 0 1 1 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1]
0.96




## Test

In [35]:
from sklearn.pipeline import Pipeline

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

X, y = make_classification(random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=1)



model_pipe = make_pipeline(StandardScaler(), 
                           MLPClassifier(random_state=1, 
                                         activation='tanh',
                                         hidden_layer_sizes=(1,56), 
                                         solver='sgd', 
                                         alpha=0.1, 
                                         max_iter=300))

model_pipe.fit(X_train, y_train)

#print(model_pipe.predict_proba(X_test))

print(model_pipe.predict(X_test))

print(model_pipe.score(X_test, y_test))

[1 0 1 1 1 0 0 1 1 0 0 1 1 0 1 0 0 0 0 0 1 0 1 0 1]
0.92




In [36]:
from sklearn.pipeline import Pipeline

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

X, y = make_classification(random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=1)



model_pipe = make_pipeline(StandardScaler(), 
                           MLPClassifier(random_state=1, 
                                         activation='tanh',
                                         hidden_layer_sizes=(2,56), 
                                         solver='sgd', 
                                         alpha=0.1, 
                                         max_iter=300))

model_pipe.fit(X_train, y_train)

#print(model_pipe.predict_proba(X_test))

print(model_pipe.predict(X_test))

print(model_pipe.score(X_test, y_test))

[1 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0]
0.68




In [37]:
from sklearn.pipeline import Pipeline

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

X, y = make_classification(random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=1)



model_pipe = make_pipeline(StandardScaler(), 
                           MLPClassifier(random_state=1, 
                                         activation='tanh',
                                         hidden_layer_sizes=(3,56), 
                                         solver='sgd', 
                                         alpha=0.1, 
                                         max_iter=300))

model_pipe.fit(X_train, y_train)

#print(model_pipe.predict_proba(X_test))

print(model_pipe.predict(X_test))

print(model_pipe.score(X_test, y_test))

[1 0 1 0 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 1 1 1 0 1]
0.8


