In [3]:
# Standard library
import pickle  # to save models
import statistics
from pathlib import Path

# Third-party
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle


# Definition of the MEE (Mean Euclidean Error)
def mee(y_true, y_pred):
    """Return the Mean Euclidean Error between targets and predictions.

    For every sample the Euclidean norm of the residual vector
    ``y_true[i] - y_pred[i]`` is computed; the result is the average of
    those norms over all samples.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_outputs) or (n_samples,)
        Ground-truth target values.
    y_pred : array-like of shape (n_samples, n_outputs) or (n_samples,)
        Predicted target values.

    Returns
    -------
    float
        Mean of the per-sample Euclidean distances (lower is better).
    """
    # Coerce to float arrays so lists / DataFrames are accepted and the
    # subtraction is always element-wise and positional.
    residual = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    # Single-output targets arrive as 1-D vectors; view them as one column so
    # the per-sample reduction over axis=1 is valid instead of raising.
    if residual.ndim == 1:
        residual = residual.reshape(-1, 1)
    return np.mean(np.sqrt(np.sum(np.square(residual), axis=1)))
# Wrap mee as a scikit-learn scorer for use in model selection.
# greater_is_better=False declares it a loss, so scores reported by the
# search are the negated MEE (hence the negative values in the CV log).
MEE = make_scorer(mee, greater_is_better=False)


## Dataset
# Column layout passed to read_csv: a row id, nine inputs (a1..a9) and two
# target columns (class1, class2).
colnames = ['id', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'a9', 'class1', 'class2']

# Load the training split and discard the row identifier in one chained
# expression (no in-place mutation of the loaded frame).
mlcup_tr = pd.read_csv(
    "../data/ML-CUP22-INTERNAL-TR.csv", sep=",", names=colnames
).drop(columns="id")

# After dropping 'id', positions 0-8 are the inputs and 9-10 the targets.
x_mlcup_tr = mlcup_tr.iloc[:, :9].to_numpy()
y_mlcup_tr = mlcup_tr.iloc[:, 9:11].to_numpy()

## KNR

In [4]:

# Search space: every k from 2 to 89 crossed with each neighbour-search backend.
# NOTE(review): the listed algorithms all look like exact neighbour searches,
# so this axis presumably affects speed rather than predictions - confirm
# before keeping the 4x larger grid.
param_grid = dict(
    n_neighbors=np.arange(2, 90),
    algorithm=["auto", "brute", "kd_tree", "ball_tree"],
)

# Untuned template estimator handed to the search.
knr = KNeighborsRegressor()

# Exhaustive 5-fold search scored with the MEE scorer; refit=True retrains
# the best configuration on the full training data once the search ends.
grid = GridSearchCV(
    estimator=knr,
    param_grid=param_grid,
    scoring=MEE,
    cv=5,
    refit=True,
    verbose=4,
)
grid.fit(x_mlcup_tr, y_mlcup_tr)

Fitting 5 folds for each of 352 candidates, totalling 1760 fits
[CV 1/5] END ....algorithm=auto, n_neighbors=2;, score=-1.806 total time=   0.0s
[CV 2/5] END ....algorithm=auto, n_neighbors=2;, score=-1.668 total time=   0.0s
[CV 3/5] END ....algorithm=auto, n_neighbors=2;, score=-1.663 total time=   0.0s
[CV 4/5] END ....algorithm=auto, n_neighbors=2;, score=-1.755 total time=   0.0s
[CV 5/5] END ....algorithm=auto, n_neighbors=2;, score=-1.656 total time=   0.0s
[CV 1/5] END ....algorithm=auto, n_neighbors=3;, score=-1.711 total time=   0.0s
[CV 2/5] END ....algorithm=auto, n_neighbors=3;, score=-1.567 total time=   0.0s
[CV 3/5] END ....algorithm=auto, n_neighbors=3;, score=-1.588 total time=   0.0s
[CV 4/5] END ....algorithm=auto, n_neighbors=3;, score=-1.682 total time=   0.0s
[CV 5/5] END ....algorithm=auto, n_neighbors=3;, score=-1.532 total time=   0.0s
[CV 1/5] END ....algorithm=auto, n_neighbors=4;, score=-1.639 total time=   0.0s
[CV 2/5] END ....algorithm=auto, n_neighbors=

## Validation score

In [5]:
# Report the winning configuration. The score is negative because the MEE
# scorer was created with greater_is_better=False.
best_params, best_score = grid.best_params_, grid.best_score_
print("The best parameters are %s with a score of %0.5f" % (best_params, best_score))

The best parameters are {'algorithm': 'auto', 'n_neighbors': 18} with a score of -1.44924


In [6]:
# The search was run with refit=True, so best_estimator_ is already fitted;
# rebinding knr replaces the unfitted template created before the search.
knr = grid.best_estimator_
# Bare expression: display the selected estimator as the cell output.
knr

In [7]:
## Check error on TR
# Predict on the same data the model was fitted on (resubstitution error).
y_pred = knr.predict(x_mlcup_tr)

# Compute both metrics first, then report them together.
meeTR = mee(y_mlcup_tr, y_pred)
mseTR = mean_squared_error(y_mlcup_tr, y_pred)

print("MEE on the training set: %0.5f" % meeTR)
print("MSE on the training set: %0.5f" % mseTR)


MEE on the training set: 1.36982
MSE on the training set: 1.60983


In [8]:
# Persist the tuned regressor. The target directory is created on demand so
# the save cannot fail with FileNotFoundError (throwing away the result of the
# grid search) merely because the folder is absent from a fresh checkout.
# The ".z" suffix presumably makes joblib compress the file - confirm.
model_path = './../savedModels/knr_mlcup.z'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(knr, model_path)

['savedModels/knr_mlcup.z']