# Data set preprocessing

In [None]:
# Importing libraries
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score, make_scorer

In [None]:
# Column names assigned to the CSV, which is read without its own header row.
feature_names = [
    'c_delete',
    'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9',
    'target1', 'target2',
]
# The first 7 lines of the file are skipped (presumably a comment preamble --
# verify against the data file).
data = pd.read_csv('data/ml_cup_tr.csv', names=feature_names, skiprows=7)
# The first column is not used as a feature or a target, so it is dropped.
data = data.drop(columns=['c_delete'])

In [None]:
# Separate the nine input attributes from the two regression targets;
# .copy() detaches both frames from `data`.
X = data.loc[:, ['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9']].copy()
y = data.loc[:, ['target1', 'target2']].copy()

In [None]:
# Hold out 20% of the patterns as an internal test set; the fixed seed keeps
# the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=32,
)

In [None]:
# Convert every split from a pandas object to a plain NumPy array.
x_train, y_train, x_test, y_test = (
    split.to_numpy() for split in (x_train, y_train, x_test, y_test)
)

In [None]:
def report(results, n_top=3):
    """Print the best-ranked parameter settings from a cross-validation result.

    Args:
        results: Mapping with the keys 'rank_test_score', 'mean_test_score',
            'std_test_score' and 'params', each an index-aligned sequence
            (for example ``GridSearchCV.cv_results_``).
        n_top: Highest rank to print; ranks 1..n_top are reported.

    Returns:
        None. The report is written to standard output.
    """
    # Coerce to an array so the element-wise comparison below also works when
    # the ranks arrive as a plain list (``list == int`` is a single False,
    # which would silently print nothing).
    ranks = np.asarray(results['rank_test_score'])
    for i in range(1, n_top + 1):
        # Tied candidates share a rank, so one rank may yield several entries.
        candidates = np.flatnonzero(ranks == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.4f} (std: {1:.4f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

# Mean Euclidean Error

In [None]:
def MEE(y_true, y_pred):
    """Return the Mean Euclidean Error between targets and predictions.

    The Euclidean norm of each error vector is taken along the last axis and
    the resulting norms are averaged.

    Args:
        y_true: Array-like of ground-truth target vectors.
        y_pred: Array-like of predicted target vectors, broadcast-compatible
            with ``y_true``.

    Returns:
        The mean of the per-pattern Euclidean error norms.
    """
    # Coerce to float arrays so plain lists are accepted and unsigned-integer
    # inputs cannot wrap around in the subtraction below.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # error[i] is the error vector of pattern i
    error = y_pred - y_true
    return np.mean(np.linalg.norm(error, axis=-1))

In [None]:
# MEE is an error (lower is better), so the scorer is built with
# greater_is_better=False for use in model selection.
MEE_score = make_scorer(score_func=MEE, greater_is_better=False)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Base KNN regressor with default settings; it is passed as the estimator to
# the grid search below, which supplies the hyper-parameter candidates.
clf = KNeighborsRegressor()

In [None]:
# Exhaustive grid search over the KNN hyper-parameters with shuffled 5-fold
# cross-validation, scored with MEE_score; refit=True retrains the best
# configuration on the whole training split afterwards.
# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments during the search.
start = time.perf_counter()
param_list = {
    'n_neighbors': list(range(1, 201)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_list,
    n_jobs=-1,
    verbose=5,
    scoring=MEE_score,
    refit=True,
    cv=KFold(n_splits=5, shuffle=True, random_state=32),
)
grid_search.fit(x_train, y_train)
res = grid_search.cv_results_

duration = time.perf_counter() - start
print(f'Executed in {duration // 3600:.0f} hours {(duration % 3600)//60:.0f} minutes {duration % 60:.6f} seconds.')

In [None]:
# Show the three best-ranked hyper-parameter settings found by the search.
report(results=res, n_top=3)

In [None]:
# Bare expression so the notebook displays the estimator refit with the
# best parameter combination found by the search.
grid_search.best_estimator_

In [None]:
# MEE on the training split (the refit model was fitted on these samples).
y_pred = grid_search.predict(x_train)
print(MEE(y_true=y_train, y_pred=y_pred))

In [None]:
# MEE on the held-out test split, which was not used during the search.
y_pred = grid_search.predict(x_test)
print(MEE(y_true=y_test, y_pred=y_pred))