### Libraries

In [1]:
import pandas as pd
from IPython.core.display import display
from src.utils.preprocessing import standardize, dummy_code, oversample, remove_correlated_features
from src.utils.get_data import import_data
from src.utils.train import hyperparameter_tuning_cv
from src.utils.config import *
from sklearn.metrics import f1_score


In [2]:
%load_ext autoreload
%autoreload 1

### Load Data

In [3]:
DATA_PATH = '../../data'

# Load (features, labels) once per segmentation type, in the order
# coarse -> fine -> no, always including the user features.
(coarse_data, coarse_labels), (fine_data, fine_labels), (no_data, no_labels) = (
    import_data(DATA_PATH, segmentation_type=segmentation, is_user_features=True)
    for segmentation in ('coarse', 'fine', 'no')
)

# Name the index of the unsegmented frame 'subject' for later processing.
no_data.index = no_data.index.set_names('subject')

### Preprocessing

In [4]:
# Bind the loaded frames to feature-matrix names. These are additional names
# for the same objects, not copies.
X_coarse, X_fine, X_no = coarse_data, fine_data, no_data

#### Normalisation

In [5]:
# Arguments used for the case where is_user_features=True.
# NOTE(review): 0 and -3 are presumably the start/end column bounds to
# standardize, leaving the last three (user-feature) columns untouched --
# confirm against src.utils.preprocessing.standardize.
X_coarse, X_fine, X_no = (
    standardize(features, 0, -3) for features in (X_coarse, X_fine, X_no)
)

#### Dummy code categorical features

In [6]:
# Dummy-code the same three categorical user features in every feature frame.
# A fresh column list is built for each call, as in the per-frame original.
X_coarse, X_fine, X_no = (
    dummy_code(features, columns=['Gender', 'Resp_Condition', 'Symptoms'])
    for features in (X_coarse, X_fine, X_no)
)

#### Drop correlated features

In [7]:
# Drop features with a Pearson correlation > 0.95 to limit multicollinearity.
# The threshold is named so that the comment and the value actually passed
# cannot drift apart again; 0.95 is the value the recorded results were run with.
# Only the coarse-segmentation features are filtered in this cell.
MAX_FEATURE_CORRELATION = 0.95
X_coarse = remove_correlated_features(X_coarse, MAX_FEATURE_CORRELATION)

### Grid search

In [8]:
# Cross-validated (5-fold) hyperparameter search for the kNN model on the
# coarse-segmentation features, scored with F1.
knn_results = hyperparameter_tuning_cv(model='knn', data=X_coarse, labels=coarse_labels.Label, cv_k=5,
                                       params=KNN_PARAMS, metric=f1_score)

# Full results table: one row per hyperparameter combination.
display(knn_results)

# Show the best-scoring combination. The score column is named after the
# metric function. idxmax() returns an index *label*, so the row must be
# selected with .loc; positional .iloc only matches on a default RangeIndex.
score_column = f1_score.__name__
best_row_label = knn_results[score_column].idxmax()
display(knn_results.loc[best_row_label])


Unnamed: 0,n_neighbors,oversampling,f1_score
0,3,True,0.454197
1,3,False,0.348949
2,4,True,0.443779
3,4,False,0.221902
4,5,True,0.45273
5,5,False,0.291581
6,6,True,0.441285
7,6,False,0.19287
8,7,True,0.440777
9,7,False,0.262932


n_neighbors            3
oversampling        True
f1_score        0.454197
Name: 0, dtype: object