# Data set preprocessing

In [None]:
# Importing libraries
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score, make_scorer

In [None]:
# Column names assigned to the CSV: one leading column that is discarded,
# nine input attributes a1..a9, and the two regression targets.
feature_names = ['c_delete'] + [f'a{i}' for i in range(1, 10)] + ['target1', 'target2']

# The first 7 rows of the file are skipped; the leading column is dropped
# right after loading.
data = pd.read_csv(
    'data/ml_cup_tr.csv',
    skiprows=7,
    names=feature_names,
).drop(columns=['c_delete'])

In [None]:
# Split the frame into the input attributes (a1..a9) and the two targets.
# Copies are taken so later modifications do not touch `data`.
X = data[[f'a{i}' for i in range(1, 10)]].copy()
y = data.loc[:, ['target1', 'target2']].copy()

In [None]:
# Hold out 20% of the patterns as an internal test set; the fixed seed keeps
# the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=32,
)

In [None]:
# Convert every split from a pandas object to a plain NumPy array.
x_train, y_train, x_test, y_test = (
    split.to_numpy() for split in (x_train, y_train, x_test, y_test)
)

In [None]:
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Must provide the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params' (the layout of
        ``GridSearchCV.cv_results_``).
    n_top : int, default 3
        Highest rank to print. Every candidate whose rank is in
        ``1..n_top`` is printed, so ties produce several entries per rank.
    """
    for rank in range(1, n_top + 1):
        # Indices of all candidates sharing this rank.
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print(f"Model with rank: {rank}")
            print(f"Mean validation score: {mean_score:.4f} (std: {std_score:.4f})")
            print(f"Parameters: {results['params'][idx]}")
            print("")

# Mean Euclidean Error

In [None]:
def MEE(y_true, y_pred):
    """Return the Mean Euclidean Error between targets and predictions.

    For each pattern the Euclidean norm of the error vector is taken along
    the last axis; the result is the mean of those norms.

    Parameters
    ----------
    y_true, y_pred : array-like
        Target and predicted values with matching (broadcastable) shapes.
        Anything ``np.asarray`` accepts works: NumPy arrays, nested lists,
        pandas objects.

    Returns
    -------
    float
        Mean of the per-pattern Euclidean error norms.
    """
    # Coerce first so plain Python sequences can be subtracted element-wise.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # error[i] is the error vector of pattern i
    error = y_pred - y_true
    return np.mean(np.linalg.norm(error, axis=-1))

In [None]:
# Scorer wrapper around MEE for scikit-learn model selection. MEE is an error
# (lower is better), hence greater_is_better=False.
MEE_score = make_scorer(MEE, greater_is_better=False)

In [None]:
import keras.backend as K
def MEE_tf(y_true, y_pred):
    """Mean Euclidean Error expressed with Keras backend ops.

    Computes, along the last axis, the Euclidean norm of ``y_pred - y_true``
    and returns the mean of those norms, so it can be used as a Keras loss
    or metric.
    """
    squared_error = K.square(y_pred - y_true)
    # Norm of each error vector (sum over the last axis, then square root).
    error_norm = K.sqrt(K.sum(squared_error, axis=-1))
    return K.mean(error_norm)

# Scikeras

In [None]:
from scikeras.wrappers import KerasRegressor

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras import regularizers, initializers
 
def create_model(hidden_layers_sizes, lambda_reg, activation, weight_init,
                 n_inputs=9, n_outputs=2):
    """Build an uncompiled fully connected feed-forward network.

    Parameters
    ----------
    hidden_layers_sizes : iterable of int
        Number of units of each hidden layer, in order.
    lambda_reg : float
        Coefficient of the L2 penalty attached to every hidden layer.
    activation : str or callable
        Activation function of the hidden layers.
    weight_init : str or initializer
        Kernel initializer of the hidden layers.
    n_inputs : int, default 9
        Number of input features (previously hard-coded to 9).
    n_outputs : int, default 2
        Number of linear output units (previously hard-coded to 2).

    Returns
    -------
    Sequential
        The assembled model; it is not compiled here.
    """
    model = Sequential()
    model.add(Input(shape=(n_inputs,)))
    for hidden_layer_size in hidden_layers_sizes:
        # NOTE(review): the L2 penalty is applied through
        # `activity_regularizer`, i.e. to the layer outputs. Classic L2 weight
        # decay would use `kernel_regularizer` instead -- confirm which one is
        # intended before changing it, since it alters the trained models.
        model.add(Dense(hidden_layer_size,
                        activity_regularizer=regularizers.L2(lambda_reg),
                        activation=activation,
                        kernel_initializer=weight_init))
    # Linear output layer; it keeps the default kernel initializer.
    model.add(Dense(n_outputs, activation='linear'))
    return model

In [None]:
# Baseline estimator. The hyper-parameters below are starting values; the grid
# search overrides the ones listed in its parameter grid.
clf = KerasRegressor(
    # Network architecture: names match the parameters of create_model.
    model=create_model,
    hidden_layers_sizes=(30, 30),
    activation='tanh',
    weight_init=tf.keras.initializers.glorot_uniform,
    lambda_reg=0.01,
    # Loss and monitored metric: both are the Mean Euclidean Error.
    loss=MEE_tf,
    metrics=MEE_tf,
    # Optimizer and its routed (`optimizer__*`) settings.
    optimizer='sgd',
    optimizer__learning_rate=0.5,
    optimizer__momentum=0.5,
    # Training loop; 10% of the fitted data is held out for validation.
    epochs=500,
    batch_size=None,
    validation_split=0.1,
    # Early stopping on the validation loss, configured via `callbacks__*`.
    callbacks=tf.keras.callbacks.EarlyStopping,
    callbacks__monitor="val_loss",
    callbacks__patience=50,
    callbacks__min_delta=0.001,
    callbacks__restore_best_weights=True,
)

In [None]:
# Display the parameters currently set on the baseline estimator.
clf.get_params()

In [None]:
# perf_counter is monotonic, so the measured duration cannot be distorted by
# system clock adjustments during the (long) search.
start = time.perf_counter()

# Exhaustive grid: 8 architectures x 2 initializers x 4 batch sizes
# x 12 learning rates x 10 momenta x 4 regularization strengths
# = 30,720 candidates, i.e. 153,600 fits with 5-fold CV (plus the final refit).
param_list = {
    'hidden_layers_sizes': [(10, 10), (30, 30), (50, 50), (100, 100),
                            (10, 10, 10), (30, 30, 30), (50, 50, 50), (100, 100, 100)],
    'weight_init': [tf.keras.initializers.glorot_uniform, tf.keras.initializers.random_uniform],
    'batch_size': [None, 1, 50, 100],
    'activation': ['tanh'],
    'epochs': [500],
    'optimizer': ['sgd'],
    'optimizer__learning_rate': [0.00001, 0.001, 0.01] + [x/10 for x in range(1, 10)],
    'optimizer__momentum': [0] + [x/10 for x in range(1, 10)],
    'lambda_reg': [0.0001, 0.001, 0.01, 0.1],
    'shuffle': [True],
    'loss': [MEE_tf]
}

# Candidates are ranked with the MEE scorer on 5 shuffled folds; refit=True
# retrains the best configuration on the whole training set afterwards.
grid_search = GridSearchCV(estimator=clf, param_grid=param_list,
                           n_jobs=-1, verbose=5, scoring=MEE_score, refit=True,
                           cv=KFold(n_splits=5, shuffle=True, random_state=32))

grid_search.fit(x_train, y_train)
res = grid_search.cv_results_

duration = time.perf_counter() - start
print(f'Executed in {duration // 3600:.0f} hours {(duration % 3600)//60:.0f} minutes {duration % 60:.6f} seconds.')

In [None]:
# Print the 10 best-ranked configurations. The search is scored with
# MEE_score (built with greater_is_better = False).
report(res, n_top = 10)

In [None]:
# Display the full parameter set of the best estimator found by the search.
grid_search.best_estimator_.get_params()

In [None]:
# Error of the selected model on the data used for the search.
y_pred = grid_search.predict(x_train)
print('Training MEE:', MEE(y_true=y_train, y_pred=y_pred))

In [None]:
# Error of the selected model on the held-out internal test split.
y_pred = grid_search.predict(x_test)
# Label formatted like the 'Training MEE:' line (the colon was missing).
print('Test MEE:', MEE(y_test, y_pred))

In [None]:
# Training history of the best estimator; its 'loss' and 'val_loss' entries
# are plotted below.
history = grid_search.best_estimator_.history_

In [None]:
# Learning curves of the best estimator: loss (MEE) per epoch.
# 'val_loss' comes from the validation_split held out during fitting, not from
# the internal test set, so the dashed curve is labelled VALIDATION.
plt.plot(history['loss'], label='TRAINING')
plt.plot(history['val_loss'], linestyle="dashed", color='red', label='VALIDATION')
plt.ylabel('MEE', fontsize=15)
plt.xlabel('Epoch', fontsize=15)
# Fixed axis ranges: only the first 100 epochs and MEE values in [1, 30] are shown.
plt.xlim(0, 100)
plt.ylim(1, 30)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# Labels are bound to the curves above, so the legend cannot drift out of order.
plt.legend(loc='upper right')
plt.show()