# ML CUP 2022

## Kernel Ridge Regression

This notebook creates two Kernel Ridge Regression (KRR) models to generalize the problem of the ML cup 2022. It searches the best combination of hyperparameters performing a grid searches over a given range of values. Two different models are given as output in this phase, one for each target, and for both models there is a tuning phase based on the same hyperparameters.

Hyperparameters considered for the grid search:

1. kernel
1. alpha
2. gamma (only for rbf and poly kernels)
3. degree (only for poly kernel)

Model selection performed using a cross validation.\
The model assessment phase is not included in this notebook.

### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy import linalg as LA

from sklearn.metrics import make_scorer
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV

import joblib

import math

### Definition of the Mean Euclidean Distance

In [2]:
def my_mean_euclidean_distance(y_true, y_pred):
    points = len(y_true)
    tot_sum = 0
    for i in range (points):
        tot_sum += LA.norm(y_true[i] - y_pred[i])
    
    return tot_sum / points

In [3]:
mean_euclidean_distance = make_scorer(my_mean_euclidean_distance, greater_is_better=False)

### Loading data

In [4]:
colnames = ['id', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'target1', 'target2']
mlcup_tr = pd.read_csv("./dataset/ml_cup22/ML-CUP22-TR.csv", sep = ",", names=colnames)
mlcup_tr = mlcup_tr.iloc[1:, :]
mlcup_tr = mlcup_tr.drop('id', axis=1)

In [5]:
x_mlcup_tr = mlcup_tr.iloc[:, 0:9].values
y_mlcup_tr = mlcup_tr.iloc[:, 9:11].values

We used the function below to normalize our training set (both input and target) according to a min-max normalization

In [6]:
x_cols = len(x_mlcup_tr[0])

max_col_value_x = [None]*x_cols
max_vl = None

min_col_value_x = [None]*x_cols
min_vl = None

for i in range(x_cols):
    col = x_mlcup_tr[:, i]
    max_vl = np.amax(col)
    min_vl = np.amin(col)
    
    x_mlcup_tr[:, i] = (x_mlcup_tr[:, i] - min_vl) / (max_vl - min_vl)
    
    max_col_value_x[i] = max_vl
    min_col_value_x[i] = min_vl
    
    
    
y_cols = len(y_mlcup_tr[0])

max_col_value_y = [None]*y_cols

min_col_value_y = [None]*y_cols

for i in range(y_cols):
    col = y_mlcup_tr[:, i]
    max_vl = np.amax(col)
    min_vl = np.amin(col)
    
    y_mlcup_tr[:, i] = (y_mlcup_tr[:, i] - min_vl) / (max_vl - min_vl)
    
    max_col_value_y[i] = max_vl
    min_col_value_y[i] = min_vl 

We create a validation set, splitting the training set in 80% and 20%.

In [7]:
subset_size = int(0.2 * len(x_mlcup_tr))

index = np.random.choice(len(x_mlcup_tr), subset_size, replace=False)
index = np.sort(index)

x_val_set = x_mlcup_tr[index]
x_tr_set_1 = np.delete(x_mlcup_tr, index, 0)

y_val_set = y_mlcup_tr[index]
y_tr_set_1 = np.delete(y_mlcup_tr, index, 0)

y1_val_set = y_val_set[:, 0]
y2_val_set = y_val_set[:, 1]

y1_tr_set_1 = y_tr_set_1[:, 0]
y2_tr_set_1 = y_tr_set_1[:, 1]

In [8]:
y1_mlcup_tr = y_mlcup_tr[:, 0]
y2_mlcup_tr = y_mlcup_tr[:, 1]

### Grid search for target 1
#### rbf kernel

In [9]:
alpha_range = np.logspace(-9, 0, 30, base = 2)
gamma_range = np.logspace(-9, 3, 10, base = 2)

param_grid = [
    {'alpha': alpha_range, 'gamma': gamma_range},
    {'alpha': alpha_range}
]

kr = GridSearchCV(
    KernelRidge(kernel="rbf"),
    param_grid = param_grid,
    cv = 4,
    scoring = mean_euclidean_distance,
    n_jobs = -1
)

kr.fit(x_mlcup_tr, y1_mlcup_tr)

print(
    "The best parameters are %s with a score of %0.5f"
    % (kr.best_params_, kr.best_score_)
)

The best parameters are {'alpha': 0.04921262949370557, 'gamma': 1.259921049894872} with a score of -0.03563


In [10]:
rbf_krr_1 = KernelRidge(
    kernel = 'rbf',
    alpha = kr.best_params_['alpha'],
    gamma = kr.best_params_['gamma']
)

rbf_krr_1.fit(x_tr_set_1, y1_tr_set_1)

#### linear kernel

In [11]:
alpha_range = np.logspace(-9, 0, 30, base = 2)

param_grid = dict(
    alpha = alpha_range
)

kr = GridSearchCV(
    KernelRidge(kernel="linear"),
    param_grid = param_grid,
    cv = 4,
    scoring = mean_euclidean_distance,
    n_jobs = -1
)

kr.fit(x_mlcup_tr, y1_mlcup_tr)

print(
    "The best parameters are %s with a score of %0.5f"
    % (kr.best_params_, kr.best_score_)
)

The best parameters are {'alpha': 1.0} with a score of -0.06531


In [12]:
linear_krr_1 = KernelRidge(
    kernel = 'linear',
    alpha = kr.best_params_['alpha']
)

linear_krr_1.fit(x_tr_set_1, y1_tr_set_1)

#### polynomial kernel

In [13]:
degree_range = np.arange(2, 8, 1)
alpha_range = np.logspace(-9, 0, 10, base = 2)
gamma_range = np.logspace(-9, 3, 10, base = 2)

param_grid = dict(
    degree = degree_range,
    alpha = alpha_range,
    gamma = gamma_range
)

param_grid = [
    {'alpha': alpha_range, 'gamma': gamma_range, 'degree': degree_range},
    {'alpha': alpha_range, 'degree': degree_range}
]

kr = GridSearchCV(
    KernelRidge(kernel = 'poly'),
    param_grid = param_grid,
    cv = 4,
    scoring = mean_euclidean_distance,
    n_jobs = -1
)

kr.fit(x_mlcup_tr, y1_mlcup_tr)

print(
    "The best parameters are %s with a score of %0.5f"
    % (kr.best_params_, kr.best_score_)
)

The best parameters are {'alpha': 0.03125, 'degree': 7, 'gamma': 0.19842513149602486} with a score of -0.03726


In [14]:
poly_krr_1 = KernelRidge(
    kernel = 'poly',
    alpha = kr.best_params_['alpha'],
    gamma = kr.best_params_['gamma'],
    degree = kr. best_params_['degree']
)

poly_krr_1.fit(x_tr_set_1, y1_tr_set_1)

### Grid search for target 2
#### rbf kernel

In [15]:
alpha_range = np.logspace(-9, 0, 30, base = 2)
gamma_range = np.logspace(-9, 3, 10, base = 2)

param_grid = [
    {'alpha': alpha_range, 'gamma': gamma_range},
    {'alpha': alpha_range}
]

kr = GridSearchCV(
    KernelRidge(kernel="rbf"),
    param_grid = param_grid,
    cv = 4,
    scoring = mean_euclidean_distance,
    n_jobs = -1
)

kr.fit(x_mlcup_tr, y2_mlcup_tr)

print(
    "The best parameters are %s with a score of %0.5f"
    % (kr.best_params_, kr.best_score_)
)

The best parameters are {'alpha': 0.14427533157324834, 'gamma': 1.259921049894872} with a score of -0.05701


In [16]:
rbf_krr_2 = KernelRidge(
    kernel = 'rbf',
    alpha = kr.best_params_['alpha'],
    gamma = kr.best_params_['gamma']
)

rbf_krr_2.fit(x_tr_set_1, y2_tr_set_1)

#### linear kernel

In [17]:
alpha_range = np.logspace(-9, 0, 30, base = 2)

param_grid = dict(
    alpha = alpha_range
)

kr = GridSearchCV(
    KernelRidge(kernel="linear"),
    param_grid = param_grid,
    cv = 4,
    scoring = mean_euclidean_distance,
    n_jobs = -1
)

kr.fit(x_mlcup_tr, y1_mlcup_tr)

print(
    "The best parameters are %s with a score of %0.5f"
    % (kr.best_params_, kr.best_score_)
)

The best parameters are {'alpha': 1.0} with a score of -0.06531


In [18]:
linear_krr_2 = KernelRidge(
    kernel = 'linear',
    alpha = kr.best_params_['alpha']
)

linear_krr_2.fit(x_tr_set_1, y2_tr_set_1)

#### polynomial kernel

In [19]:
degree_range = np.arange(2, 8, 1)
alpha_range = np.logspace(-9, 0, 10, base = 2)
gamma_range = np.logspace(-9, 3, 10, base = 2)

param_grid = [
    {'alpha': alpha_range, 'gamma': gamma_range, 'degree': degree_range},
    {'alpha': alpha_range, 'degree': degree_range}
]

kr = GridSearchCV(
    KernelRidge(kernel = 'poly'),
    param_grid = param_grid,
    cv = 4,
    scoring = mean_euclidean_distance,
    n_jobs = -1
)

kr.fit(x_mlcup_tr, y1_mlcup_tr)

print(
    "The best parameters are %s with a score of %0.5f"
    % (kr.best_params_, kr.best_score_)
)

The best parameters are {'alpha': 0.03125, 'degree': 7, 'gamma': 0.19842513149602486} with a score of -0.03726


In [20]:
poly_krr_2 = KernelRidge(
    kernel = 'poly',
    alpha = kr.best_params_['alpha'],
    gamma = kr.best_params_['gamma'],
    degree = kr. best_params_['degree']
)

poly_krr_2.fit(x_tr_set_1, y2_tr_set_1)

## Model selection
### Target 1

In [21]:
mee_1 = my_mean_euclidean_distance(y1_val_set, rbf_krr_1.predict(x_val_set))
print("MEE score: %0.5f" %(mee_1))

MEE score: 0.03589


In [22]:
mee_1 = my_mean_euclidean_distance(y1_val_set, linear_krr_1.predict(x_val_set))
print("MEE score: %0.5f" %(mee_1))

MEE score: 0.06392


In [23]:
mee_1 = my_mean_euclidean_distance(y1_val_set, poly_krr_1.predict(x_val_set))
print("MEE score: %0.5f" %(mee_1))

MEE score: 0.03746


### Target 2

In [24]:
mee_2 = my_mean_euclidean_distance(y2_val_set, rbf_krr_2.predict(x_val_set))
print("MEE score: %0.5f" %(mee_2))

MEE score: 0.05794


In [25]:
mee_2 = my_mean_euclidean_distance(y2_val_set, linear_krr_2.predict(x_val_set))
print("MEE score: %0.5f" %(mee_2))

MEE score: 0.12996


In [26]:
mee_2 = my_mean_euclidean_distance(y2_val_set, poly_krr_2.predict(x_val_set))
print("MEE score: %0.5f" %(mee_2))

MEE score: 0.06035


Since the best results are given by the Kernel ridge regression classifier with the rbf kernel on both first and second target, we choose these two models.

In [27]:
joblib.dump(rbf_krr_1, './results/ml_cup/KRR/rbf_krr_1.z')
joblib.dump(rbf_krr_2, './results/ml_cup/KRR/rbf_krr_2.z')

['./results/ml_cup/KRR/rbf_krr_2.z']

### MEE on both targets on the training set

In [28]:
krr1 = joblib.load('./results/ml_cup/KRR/rbf_krr_1.z')
krr2 = joblib.load('./results/ml_cup/KRR/rbf_krr_2.z')

In [29]:
pred_label_krr_1 = krr1.predict(x_tr_set_1)
pred_label_krr_2 = krr2.predict(x_tr_set_1)
pred_label_krr = np.vstack((pred_label_krr_1, pred_label_krr_2)).T

In [30]:
#Denormalization
def deNormalizer(pred_labels, max_col_value_y, min_col_value_y):
    
    y_cols = len(pred_labels[0])
    
    for i in range(y_cols):
        
        pred_labels[:, i] = pred_labels[:, i] * (max_col_value_y[i] - min_col_value_y[i]) + min_col_value_y[i]
    
    return pred_labels

In [31]:
pred_label_krr = deNormalizer(pred_label_krr, max_col_value_y, min_col_value_y)
y_tr_set_1 = deNormalizer(y_tr_set_1, max_col_value_y, min_col_value_y)

In [32]:
# Mean euclidean distance
points = y_tr_set_1.shape[0]
tot_sum = 0
for i in range (points):
    tot_sum += math.sqrt(math.pow((y_tr_set_1[i][0] - pred_label_krr[i][0]), 2)
                         + math.pow((y_tr_set_1[i][1] - pred_label_krr[i][1]), 2))
    
print('MEE on the training set:', tot_sum / points)

MEE on the training set: 1.3289122644526825


### MEE on both targets on the validation set

In [33]:
pred_label_krr_1 = krr1.predict(x_val_set)
pred_label_krr_2 = krr2.predict(x_val_set)
pred_label_krr = np.vstack((pred_label_krr_1, pred_label_krr_2)).T

In [34]:
pred_label_krr = deNormalizer(pred_label_krr, max_col_value_y, min_col_value_y)
y_val_set = deNormalizer(y_val_set, max_col_value_y, min_col_value_y)

In [35]:
# Mean euclidean distance
points = y_val_set.shape[0]
tot_sum = 0
for i in range (points):
    tot_sum += math.sqrt(math.pow((y_val_set[i][0] - pred_label_krr[i][0]), 2)
                         + math.pow((y_val_set[i][1] - pred_label_krr[i][1]), 2))
    
print('MEE on the validation set:', tot_sum / points)

MEE on the validation set: 1.5024676670674562
