# Questions 

## How many hidden layers are used?
##### 1

## Which activation function should be used in the hidden layer(s)?
#### A non-linear one, e.g. ReLU or sigmoid

## Which activation function should be used in the output layer?
#### Any, or none at all if a softmax layer is applied afterwards

## Which loss function is used?
#### cross-entropy

# First Experimentation

## Make a simple train-test split of the breast cancer data ( test_size=0.1 ) using random_state=42 

In [None]:
from sklearn.datasets import load_breast_cancer
import numpy as np
# Load the breast cancer data as pandas objects (as_frame=True) and unpack
# the (features, target) pair requested via return_X_y=True.
x, y = load_breast_cancer(
    return_X_y=True,
    as_frame=True,
)
# Last expression of the cell: previews the first rows of the features.
x.head()

## Implement a Multi-layer Perceptron ( MLPClassifier ) with one hidden layer consisting of 64 hidden nodes with a ReLU activation and a softmax output layer to compute the cross entropy loss. Use a batch size of 100 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix

# Hold out 10 % of the samples for testing (train_size=0.9 is equivalent to
# test_size=0.1); random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.9, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and then re-applied to the test split.
scaler = StandardScaler(copy=True)
xTrain_scaled = scaler.fit_transform(x_train, y_train)
xTest_scaled = scaler.transform(x_test)

# One hidden layer with 64 ReLU units. `(64,)` is a real one-element tuple,
# whereas `(64)` is just the integer 64.
clf = MLPClassifier(random_state=42, hidden_layer_sizes=(64,), learning_rate_init=0.0001,
                    batch_size=100, activation='relu')
# Author's note: clf.out_activation_ cannot be overwritten with 'softmax';
# fit() sets it again (to 'logistic' here). Inspect clf.out_activation_ and
# clf.n_outputs_ after fitting to verify.
# Train and evaluate on the *scaled* features; previously the scaled arrays
# were computed but the raw data was passed to fit/predict/score.
clf.fit(xTrain_scaled, y_train)
y_pred = clf.predict(xTest_scaled)

print(clf.score(xTest_scaled, y_test))
print(classification_report(y_test, y_pred))
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(clf, xTest_scaled, y_test, cmap='Blues')

## Train the MLPClassifier with SGD optimizer and the default L2 regularization. No momentum (needs to be turned off).
## Print/plot the classification report and the confusion matrix for the test data.
## Plot the training loss curve

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


def plot_learning_curve(df_loss_curve, title='Loss curve - Breast Cancer Data'):
    """Draw the loss curves of `df_loss_curve` as a line plot and show it.

    Args:
        df_loss_curve: Data passed to seaborn's lineplot; each column is
            drawn as one solid line.
        title: Title placed above the plot.
    """
    # With no `ax` given, lineplot draws on (and returns) the current axes,
    # so setting the title on the returned axes matches plt.title().
    axes = sns.lineplot(data=df_loss_curve, dashes=False)
    axes.set_title(title)
    plt.show()

# Upper bound on training epochs; also the length the loss curve is padded to.
max_iter = 200
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.9, random_state=42)

# Standardize using statistics of the training split only.
scaler = StandardScaler(copy=True)
xTrain_scaled = scaler.fit_transform(x_train, y_train)
xTest_scaled = scaler.transform(x_test)

# Plain SGD with the default L2 penalty. momentum=0 switches momentum off
# explicitly (the MLPClassifier default is 0.9), as the task requires.
clf = MLPClassifier(random_state=42, max_iter=max_iter, hidden_layer_sizes=[64],
                    learning_rate_init=0.01, batch_size=100, activation='relu',
                    solver="sgd", momentum=0)
# Fit and evaluate on the scaled features; previously the scaled arrays were
# computed but the raw data was used.
clf.fit(xTrain_scaled, y_train)
y_pred = clf.predict(xTest_scaled)

print(clf.score(xTest_scaled, y_test))
print(classification_report(y_test, y_pred))
#plot_confusion_matrix(clf, xTest_scaled, y_test, cmap ='Blues')

# Pad the curve with NaN up to max_iter so the x-axis always spans the full
# epoch budget even when training stops earlier. np.nan is used because the
# np.NaN alias was removed in NumPy 2.0.
loss_curve = clf.loss_curve_ + [np.nan] * (max_iter - len(clf.loss_curve_))
loss_curve_df = pd.DataFrame(loss_curve, columns=['SGD Baseline'])
plot_learning_curve(loss_curve_df)

## Reusing the same train-test split, experiment with different learning rates (by changing the value of learning_rate_init in the range between 0.0001 and 0.1 ) and interpret the changes in the loss curve

In [None]:
def plot_learning_curve(df_loss_curve, learningRate, ax):
    """Draw the loss curves of `df_loss_curve` on the given axes.

    Args:
        df_loss_curve: Data passed to seaborn's lineplot; each column is
            drawn as one solid line.
        learningRate: Learning rate shown in the subplot title.
        ax: Matplotlib axes to draw on.
    """
    sns.lineplot(data=df_loss_curve, dashes=False, ax=ax)
    ax.set_title(f"Loss curve with learning rate = {learningRate}")

def myclassifer(learning_rate, xTrain, xTest, yTrain, yTest, max_iter=200):
    """Train an SGD-based MLPClassifier with the given initial learning rate.

    The network has one hidden layer of 64 ReLU units, uses mini-batches of
    100 samples and a fixed random_state of 42.

    Args:
        learning_rate: Value passed as `learning_rate_init`.
        xTrain: Training features.
        xTest: Unused; kept so existing callers keep working.
        yTrain: Training targets.
        yTest: Unused; kept so existing callers keep working.
        max_iter: Maximum number of training epochs (previously hard-coded
            to 200, which remains the default).

    Returns:
        The fitted MLPClassifier.
    """
    clf = MLPClassifier(random_state=42, max_iter=max_iter, hidden_layer_sizes=[64],
                        learning_rate_init=learning_rate, batch_size=100,
                        activation='relu', solver="sgd")
    clf.fit(xTrain, yTrain)
    # The former predict() call on xTest was dropped: its result was never used.
    return clf

# Epoch budget used for padding the loss curves. Defined here so the cell no
# longer depends on a `max_iter` global left over from an earlier cell; it
# must match the limit myclassifer trains with (200).
max_iter = 200

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.9, random_state=42)
scaler = StandardScaler(copy=True)
xTrain_scaled = scaler.fit_transform(x_train, y_train)
xTest_scaled = scaler.transform(x_test)

learningrates = [0.0001, 0.001, 0.01, 0.1]

fig, axs = plt.subplots(2, 2, figsize=(15, 20))

# axs.flat walks the 2x2 grid row by row, the same order the nested
# row/column loops with a manual counter produced.
for learning_rate, ax in zip(learningrates, axs.flat):
    # Train on the scaled features; previously the scaled arrays were
    # computed but the raw data was passed in.
    clf = myclassifer(learning_rate, xTrain_scaled, xTest_scaled, y_train, y_test)
    print("learning rate = ", learning_rate)
    # Print the full shape of each weight matrix (the task asks to inspect
    # the shape), not only its number of rows.
    print("weight of first layer = ", clf.coefs_[0].shape)
    print("weight of second layer = ", clf.coefs_[1].shape)
    # Pad with NaN so every subplot spans the same number of epochs.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    loss_curve = clf.loss_curve_ + [np.nan] * (max_iter - len(clf.loss_curve_))
    loss_curve_df = pd.DataFrame(loss_curve, columns=['SGD Baseline'])
    plot_learning_curve(loss_curve_df, learning_rate, ax)
    print("----------------------------------------------------------")


### Inspect the shape of the classifier's attribute coefs_ (the weights of the network) and interpret it.
#### clf.coefs_[0] holds the weights between the input nodes and the hidden nodes, and clf.coefs_[1] holds the weights between the hidden nodes and the output node

# Different Optimizers

## Next, experiment with different optimizers and settings. Use an initial learning rate of 0.0001 and alpha=0.001 throughout. Make sure to implement the following: 
#### * 'SGD Baseline' : SGD optimizer with default L2 regularization.
#### * 'SGD with momentum' : SGD optimizer with default L2 regularization and a momentum of 0.9.
#### * 'SGD with decreasing lr' : SGD optimizer with automatically decreasing learning rate.
#### * 'Adam' : Adam optimizer with default L2 regularization and decay rates.

#### Plot all training loss curves in a single plot and note the differences.

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.9, random_state=42)
scaler = StandardScaler(copy=True)
xTrain_scaled = scaler.fit_transform(x_train, y_train)
xTest_scaled = scaler.transform(x_test)

maxIter = 2000

# Settings shared by every run. alpha is the L2 regularization strength;
# 0.0001 is the MLPClassifier default.
# NOTE(review): the task heading asks for alpha=0.001 while the bullet list
# asks for the default L2 regularization - confirm which one is intended.
common_settings = dict(random_state=42, alpha=0.0001, max_iter=maxIter,
                       hidden_layer_sizes=[64], learning_rate_init=0.0001,
                       batch_size=100, activation='relu')

# Optimizer-specific settings, keyed by the label used in the plot legend.
optimizer_settings = {
    # SGD optimizer with default L2 regularization, momentum switched off.
    'SGD Baseline': dict(solver="sgd", momentum=0),
    # SGD optimizer with default L2 regularization and a momentum of 0.9.
    'SGD with momentum': dict(solver="sgd", momentum=0.9),
    # SGD optimizer with automatically decreasing learning rate.
    'SGD with decreasing lr': dict(solver="sgd", momentum=0.9, learning_rate="invscaling"),
    # Adam optimizer with default L2 regularization and decay rates.
    'Adam': dict(solver="adam"),
}

loss_curves = {}
for label, settings in optimizer_settings.items():
    clf = MLPClassifier(**common_settings, **settings)
    clf.fit(xTrain_scaled, y_train)
    loss_curves[label] = pd.Series(clf.loss_curve_)

# Curves have different lengths; building the frame from Series pads the
# shorter ones with NaN, and reindexing stretches every column to maxIter
# rows. The previous code padded with the stale `max_iter` (200) from an
# earlier cell, so curves longer than the baseline were silently cut off
# when assigned as new columns.
loss_curve_df = pd.DataFrame(loss_curves).reindex(range(maxIter))

title = "Loss curves"
sns.lineplot(data=loss_curve_df, dashes=False)
plt.title(title)


## Early Stopping Strategy
#### Adapt the different classifiers/optimizer settings from above such that they stop early if the validation loss is increasing. This is used to prevent overfitting. Use 10% of the data for validation. Interpret the results.

In [None]:
scaler = StandardScaler(copy=True)
xTrain_scaled = scaler.fit_transform(x, y)

maxIter = 2000

# Settings shared by every run. alpha=0.0001 is the default L2 penalty.
# early_stopping=True makes the classifier hold out `validation_fraction`
# of the training data (10 %, stated explicitly; it is also the default)
# and stop once the validation score has not improved for
# n_iter_no_change consecutive epochs.
common_settings = dict(random_state=42, alpha=0.0001, max_iter=maxIter,
                       hidden_layer_sizes=[64], learning_rate_init=0.0001,
                       batch_size=100, activation='relu',
                       early_stopping=True, validation_fraction=0.1,
                       n_iter_no_change=10)

# Optimizer-specific settings, keyed by the label used in the plot legend.
optimizer_settings = {
    # SGD optimizer with default L2 regularization, momentum switched off.
    'SGD Baseline': dict(solver="sgd", momentum=0),
    # SGD optimizer with default L2 regularization and a momentum of 0.9.
    'SGD with momentum': dict(solver="sgd", momentum=0.9),
    # SGD optimizer with automatically decreasing learning rate.
    'SGD with decreasing lr': dict(solver="sgd", momentum=0.9, learning_rate="invscaling"),
    # Adam optimizer with default L2 regularization and decay rates.
    'Adam': dict(solver="adam"),
}

loss_curves = {}
for label, settings in optimizer_settings.items():
    clf = MLPClassifier(**common_settings, **settings)
    clf.fit(xTrain_scaled, y)
    loss_curves[label] = pd.Series(clf.loss_curve_)

# Pad every curve with NaN up to maxIter rows so all runs share one x-axis.
# This replaces the manual `[np.NaN] * ...` padding; the np.NaN alias was
# removed in NumPy 2.0.
loss_curve_df = pd.DataFrame(loss_curves).reindex(range(maxIter))

title = "Loss curves"
sns.lineplot(data=loss_curve_df, dashes=False)
plt.title(title)

#### This could mean that the classifiers start to overfit very quickly

## Grid Search
#### Implement a grid search with 5-fold cross validation using the GridSearchCV to find the best hyperparameter settings for the MLPClassifier . Make sure to include both the Adam optimizer and the SGD variant with momentum in the grid.

In [None]:
scaler = StandardScaler(copy=True)
xTrain_scaled = scaler.fit_transform(x, y)

from sklearn.model_selection import GridSearchCV

# Each hidden_layer_sizes entry lists the number of units per hidden layer,
# so (50,) is one layer of 50 units and (50, 50) is two layers of 50 units.
# The previous entries such as [50, 1] described a 50-unit layer followed
# by a layer with a single unit, not "50 units, 1 layer".
parameters = {
    "hidden_layer_sizes": [(50,), (64,), (100,), (50, 50), (64, 64), (100, 100)],
    "learning_rate_init": [0.0001, 0.001, 0.01],
    "activation": ['identity', 'logistic', 'tanh', 'relu'],
    "random_state": [42],
    "early_stopping": [True],
    "alpha": [0.0001, 0.001, 0.01],
    # "sgd" runs with MLPClassifier's default momentum of 0.9.
    "solver": ["sgd", "adam", "lbfgs"],
}

mlp = MLPClassifier()
# cv=5 states the required 5-fold cross validation explicitly.
clf = GridSearchCV(mlp, parameters, cv=5)
clf.fit(xTrain_scaled, y)

# Report the winning configuration and its mean cross-validated score.
# The previous print(clf.best_estimator_.score) only printed the bound
# method object, not a score.
print("best parameters: ", clf.best_params_)
print("best cross-validation score: ", clf.best_score_)
# Accuracy of the refitted best estimator on the data it was trained on;
# this is an optimistic figure, not a generalization estimate.
print("best score: ", clf.score(xTrain_scaled, y))