# ML CUP 2022

## Regression based on k-nearest neighbors

This notebook creates a k-nearest neighbors for regression (KNR) model to generalize the problem of the ML cup 2022. It searches the best combination of hyperparameters performing a grid searches over a given range of values.

Hyperparameters considered for the grid search:

1. n_neighbors
2. algorithm

### Loading libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy import linalg as LA

from sklearn.metrics import make_scorer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

import joblib

import math
import random

In [2]:
# choosing a seed for reproducibility
seed = 1
random.seed(seed)
np.random.seed(seed)

### Definition of the Mean Euclidean Distance

In [3]:
def my_mean_euclidean_distance(y_true, y_pred):
    points = len(y_true)
    tot_sum = 0
    for i in range (points):
        tot_sum += LA.norm(y_true[i] - y_pred[i])
    
    return tot_sum / points

In [4]:
mean_euclidean_distance = make_scorer(my_mean_euclidean_distance, greater_is_better=False)

### Loading data

In [5]:
colnames = ['id', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'target1', 'target2']
mlcup_tr = pd.read_csv("./dataset/ml_cup22/ML-CUP22-TR.csv", sep = ",", names=colnames)
mlcup_tr = mlcup_tr.iloc[1:, :]
mlcup_tr = mlcup_tr.drop('id', axis=1)

In [6]:
x_mlcup_tr = mlcup_tr.iloc[:, 0:9].values
y_mlcup_tr = mlcup_tr.iloc[:, 9:11].values

We used the function below to normalize our training set (both input and target) according to a min-max normalization

In [7]:
x_cols = len(x_mlcup_tr[0])

max_col_value_x = [None]*x_cols
max_vl = None

min_col_value_x = [None]*x_cols
min_vl = None

for i in range(x_cols):
    col = x_mlcup_tr[:, i]
    max_vl = np.amax(col)
    min_vl = np.amin(col)
    
    x_mlcup_tr[:, i] = (x_mlcup_tr[:, i] - min_vl) / (max_vl - min_vl)
    
    max_col_value_x[i] = max_vl
    min_col_value_x[i] = min_vl
    
    
    
y_cols = len(y_mlcup_tr[0])

max_col_value_y = [None]*y_cols

min_col_value_y = [None]*y_cols

for i in range(y_cols):
    col = y_mlcup_tr[:, i]
    max_vl = np.amax(col)
    min_vl = np.amin(col)
    
    y_mlcup_tr[:, i] = (y_mlcup_tr[:, i] - min_vl) / (max_vl - min_vl)
    
    max_col_value_y[i] = max_vl
    min_col_value_y[i] = min_vl 

y1_mlcup_tr = y_mlcup_tr[:, 0]
y2_mlcup_tr = y_mlcup_tr[:, 1]

We create a validation set, splitting the training set in 80% and 20%.

In [8]:
subset_size = int(0.2 * len(x_mlcup_tr))

index = np.random.choice(len(x_mlcup_tr), subset_size, replace=False)
index = np.sort(index)

x_val_set = x_mlcup_tr[index]
x_tr_set_1 = np.delete(x_mlcup_tr, index, 0)

y_val_set = y_mlcup_tr[index]
y_tr_set_1 = np.delete(y_mlcup_tr, index, 0)

### Grid search

In [9]:
n_neighbors = np.arange(1, 50)
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']

param_grid = dict(
    n_neighbors = n_neighbors,
    algorithm = algorithm
)

grid = GridSearchCV(
    KNeighborsRegressor(),
    param_grid = param_grid,
    cv = 5,
    scoring = mean_euclidean_distance,
    verbose = 4,
    n_jobs = -1
)

grid.fit(x_mlcup_tr, y_mlcup_tr)

print(
    "The best parameters are %s with a score of %0.5f"
    % (grid.best_params_, grid.best_score_)
)

Fitting 5 folds for each of 196 candidates, totalling 980 fits
[CV 1/5] END ....algorithm=auto, n_neighbors=1;, score=-0.105 total time=   0.0s
[CV 5/5] END ....algorithm=auto, n_neighbors=2;, score=-0.082 total time=   0.0s
[CV 3/5] END ....algorithm=auto, n_neighbors=3;, score=-0.079 total time=   0.0s
[CV 1/5] END ....algorithm=auto, n_neighbors=4;, score=-0.083 total time=   0.0s
[CV 4/5] END ....algorithm=auto, n_neighbors=4;, score=-0.080 total time=   0.0s
[CV 2/5] END ....algorithm=auto, n_neighbors=5;, score=-0.079 total time=   0.0s
[CV 1/5] END ....algorithm=auto, n_neighbors=6;, score=-0.081 total time=   0.0s
[CV 1/5] END ....algorithm=auto, n_neighbors=7;, score=-0.080 total time=   0.0s
[CV 3/5] END ....algorithm=auto, n_neighbors=8;, score=-0.075 total time=   0.0s
[CV 1/5] END ....algorithm=auto, n_neighbors=9;, score=-0.080 total time=   0.0s
[CV 2/5] END ....algorithm=auto, n_neighbors=9;, score=-0.075 total time=   0.0s
[CV 5/5] END ...algorithm=auto, n_neighbors=11

[CV 3/5] END ....algorithm=auto, n_neighbors=2;, score=-0.084 total time=   0.0s
[CV 1/5] END ...algorithm=auto, n_neighbors=13;, score=-0.077 total time=   0.0s
[CV 2/5] END ...algorithm=auto, n_neighbors=13;, score=-0.075 total time=   0.0s
[CV 1/5] END ...algorithm=auto, n_neighbors=17;, score=-0.076 total time=   0.0s
[CV 2/5] END ...algorithm=auto, n_neighbors=17;, score=-0.074 total time=   0.0s
[CV 3/5] END ...algorithm=auto, n_neighbors=17;, score=-0.072 total time=   0.0s
[CV 4/5] END ...algorithm=auto, n_neighbors=17;, score=-0.074 total time=   0.0s
[CV 4/5] END ...algorithm=auto, n_neighbors=22;, score=-0.073 total time=   0.0s
[CV 5/5] END ...algorithm=auto, n_neighbors=22;, score=-0.069 total time=   0.0s
[CV 1/5] END ...algorithm=auto, n_neighbors=23;, score=-0.078 total time=   0.0s
[CV 2/5] END ...algorithm=auto, n_neighbors=23;, score=-0.073 total time=   0.0s
[CV 5/5] END ...algorithm=auto, n_neighbors=29;, score=-0.070 total time=   0.0s
[CV 1/5] END ...algorithm=au

The best parameters are {'algorithm': 'auto', 'n_neighbors': 23} with a score of -0.07283


In [10]:
knr = KNeighborsRegressor(algorithm = grid.best_params_['algorithm'],
                          n_neighbors = grid.best_params_['n_neighbors']
                         )
knr.fit(x_tr_set_1, y_tr_set_1)

KNeighborsRegressor(n_neighbors=23)

In [11]:
pred_label_knr = knr.predict(x_val_set)

In [12]:
pred_label_knr_tr = knr.predict(x_tr_set_1)

After we train the model, we denormalize the data and we evaluate the error.

In [13]:
#Denormalization
def deNormalizer(pred_labels, max_col_value_y, min_col_value_y):
    
    y_cols = len(pred_labels[0])
    
    for i in range(y_cols):
        
        pred_labels[:, i] = pred_labels[:, i] * (max_col_value_y[i] - min_col_value_y[i]) + min_col_value_y[i]
    
    return pred_labels

In [14]:
pred_label_knr_tr = deNormalizer(pred_label_knr_tr, max_col_value_y, min_col_value_y)
y_tr_set = deNormalizer(y_tr_set_1, max_col_value_y, min_col_value_y)

In [15]:
# Mean euclidean distance
points = y_tr_set.shape[0]
tot_sum = 0
for i in range (points):
    tot_sum += math.sqrt(math.pow((y_tr_set[i][0] - pred_label_knr_tr[i][0]), 2)
                         + math.pow((y_tr_set[i][1] - pred_label_knr_tr[i][1]), 2))
    
print('MEE on the training set:', tot_sum / points)

MEE on the training set: 1.3781088468580778


In [16]:
pred_label_knr

array([[0.37987719, 0.17685152],
       [0.93990764, 0.39060937],
       [0.88080673, 0.35804888],
       [0.35758162, 0.19948018],
       [0.74362188, 0.2856674 ],
       [0.79355306, 0.30744053],
       [0.40587826, 0.18358569],
       [0.28420717, 0.27783784],
       [0.78993427, 0.31122679],
       [0.112643  , 0.75732543],
       [0.15552388, 0.44926625],
       [0.40087494, 0.15914776],
       [0.76662316, 0.28973743],
       [0.91002785, 0.37505166],
       [0.12108578, 0.62239325],
       [0.13066939, 0.83493572],
       [0.1310542 , 0.69479609],
       [0.16707611, 0.41153356],
       [0.76468853, 0.3015651 ],
       [0.11661514, 0.68516266],
       [0.14709936, 0.45967736],
       [0.16051346, 0.4005277 ],
       [0.74575319, 0.29117079],
       [0.12618976, 0.76575467],
       [0.41905836, 0.16350461],
       [0.13064521, 0.52004162],
       [0.77120143, 0.30261521],
       [0.40886038, 0.17709666],
       [0.1180526 , 0.62921257],
       [0.13378148, 0.51123134],
       [0.

In [17]:
pred_label_knr = deNormalizer(pred_label_knr, max_col_value_y, min_col_value_y)
y_val_set = deNormalizer(y_val_set, max_col_value_y, min_col_value_y)

In [18]:
# Mean euclidean distance
points = y_val_set.shape[0]
tot_sum = 0
for i in range (points):
    tot_sum += math.sqrt(math.pow((y_val_set[i][0] - pred_label_knr[i][0]), 2)
                         + math.pow((y_val_set[i][1] - pred_label_knr[i][1]), 2))
    
print('MEE on the validation set:', tot_sum / points)

MEE on the validation set: 1.4329183586114744


### Saving the model

In [19]:
joblib.dump(knr, './results/ml_cup/KNR/knr.z')

['./results/ml_cup/KNR/knr.z']