In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold, cross_validate

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Analysis
Once the data is clean and ready, we should:
- select appropriate methods
- select appropriate parameters for them
- set up grid search and cross-validation
- run the analysis

In [None]:
# Read the feature matrix and the target vector from the .npy files in the working directory.
X, y = np.load("X.npy"), np.load("y.npy")

## Selection of methods
In general, before applying large and expensive models, it is good to try simple linear models. More than one model should be used :)

In this case, we are performing multiclass classification, so every model below is scored with the weighted one-vs-rest ROC AUC (`roc_auc_ovr_weighted`).

In [None]:
# Nested cross-validation splitters: `inner_cv` is used by the grid searches to
# pick hyperparameters, `outer_cv` is used by `cross_validate` to score the tuned
# model.
# An integer random_state makes the shuffled folds identical on every run and on
# every call to `split`, so repeated `cross_validate` calls on `outer_cv` see the
# same folds and all models are compared on the same partitions.
inner_cv = KFold(n_splits=3, shuffle=True, random_state=0)
outer_cv = KFold(n_splits=10, shuffle=True, random_state=0)

## KNN

In [None]:
# Candidate KNN settings explored by the inner grid search.
hyperparams_KNN = dict(
    n_neighbors=[2, 5, 10, 25],
    weights=["uniform", "distance"],
)

# Inner loop: choose the best hyperparameter combination on the inner_cv folds.
grid_KNN = GridSearchCV(
    KNeighborsClassifier(),
    hyperparams_KNN,
    scoring="roc_auc_ovr_weighted",
    cv=inner_cv,
)

# Outer loop: score the tuned search object on the outer_cv folds,
# keeping the train scores alongside the test scores.
scores_KNN = cross_validate(
    grid_KNN,
    X,
    y,
    scoring="roc_auc_ovr_weighted",
    cv=outer_cv,
    return_train_score=True,
)

## DT

In [None]:
# Candidate decision-tree settings explored by the inner grid search.
hyperparams_DT = {
    'criterion' : ['gini', 'entropy'],
    'splitter' : ['best', 'random'],
    'max_depth' : [None, 5, 10, 20],
}

# Inner loop: choose the best hyperparameter combination on the inner_cv folds.
# random_state is fixed because the tree (notably with splitter='random') is
# stochastic; without it the scores change from run to run.
grid_DT = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=hyperparams_DT,
    scoring="roc_auc_ovr_weighted",
    cv = inner_cv,
)

# Outer loop: score the tuned search object on the outer_cv folds. Train scores
# are requested because the results table built later reads "train_score".
scores_DT = cross_validate(grid_DT, X=X, y=y, cv=outer_cv, scoring="roc_auc_ovr_weighted", return_train_score=True)

## RF

In [None]:
# Candidate random-forest settings explored by the inner grid search.
hyperparams_RF = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [None, 5, 10, 20],
    'n_estimators' : [20,50,100]
}

# Inner loop: choose the best hyperparameter combination on the inner_cv folds.
# random_state is fixed because the forest's bootstrapping and feature
# sub-sampling are stochastic; without it the scores change from run to run.
grid_RF = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=hyperparams_RF,
    scoring="roc_auc_ovr_weighted",
    cv = inner_cv,
)

# Outer loop: score the tuned search object on the outer_cv folds. Train scores
# are requested because the results table built later reads "train_score".
scores_RF = cross_validate(grid_RF, X=X, y=y, cv=outer_cv, scoring="roc_auc_ovr_weighted", return_train_score=True)

## MLP

In [None]:
# Candidate MLP architectures and activations explored by the inner grid search.
hyperparams_NN = {
    'hidden_layer_sizes' : [(50,50,), (10,10,), (100,), (100,100)],
    'activation' : ['relu', 'logistic'],
}

# Inner loop: choose the best hyperparameter combination on the inner_cv folds.
# random_state is fixed because weight initialisation and batch shuffling are
# stochastic; without it the scores change from run to run.
# NOTE(review): the default max_iter may emit ConvergenceWarning on this data;
# raise max_iter if that happens.
grid_NN = GridSearchCV(
    estimator=MLPClassifier(random_state=0),
    param_grid=hyperparams_NN,
    scoring="roc_auc_ovr_weighted",
    cv = inner_cv,
)

# Outer loop: score the tuned search object on the outer_cv folds. Train scores
# are requested because the results table built later reads "train_score".
scores_NN = cross_validate(grid_NN, X=X, y=y, cv=outer_cv, scoring="roc_auc_ovr_weighted", return_train_score=True)

In [None]:
def cross_val_to_df(scores, model_name):
    """Turn a `cross_validate` result into a tidy per-fold DataFrame.

    Parameters
    ----------
    scores : mapping
        Must provide the keys "train_score", "test_score" and "fit_time",
        each holding one value per fold.
    model_name : str
        Label written into the "model" column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns "cv" (1-based fold index),
        "train_auc", "test_auc", "fit_time" and "model".
    """
    n_folds = len(scores["train_score"])
    # Output column name -> key in the cross_validate result.
    column_sources = (
        ("train_auc", "train_score"),
        ("test_auc", "test_score"),
        ("fit_time", "fit_time"),
    )
    per_fold = {column: scores[key] for column, key in column_sources}
    return pd.DataFrame({
        "cv": range(1, n_folds + 1),
        **per_fold,
        "model": model_name,
    })

# Stack the per-model fold tables into one long DataFrame (RF, DT, KNN, MLP order).
per_model_frames = [
    cross_val_to_df(model_scores, label)
    for model_scores, label in (
        (scores_RF, "RF"),
        (scores_DT, "DT"),
        (scores_KNN, "KNN"),
        (scores_NN, "MLP"),
    )
]
df_all = pd.concat(per_model_frames, ignore_index=True)

df_all

Use seaborn's `boxplot` to plot the train and test performance, as well as the fit time for each model.

# Additional considerations
We may then be interested in analysing:
- the feature importance of the best model
- the decision boundary (if applicable)


In this case, note that `cross_validate` can return the fitted estimator of each fold when called with `return_estimator=True`. We can then pick the estimators fold by fold and calculate their permutation feature importance.

In [None]:
# Repeat the nested CV for KNN, this time also keeping the fitted search object
# of every outer fold (return_estimator=True).
scores_KNN = cross_validate(
    grid_KNN,
    X,
    y,
    scoring="roc_auc_ovr_weighted",
    cv=outer_cv,
    return_train_score=True,
    return_estimator=True,
)

# Fitted estimator from the first outer fold.
knn = scores_KNN["estimator"][0]

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of each feature for the fold-0 KNN search object:
# every feature column is shuffled 20 times and the drop in score is recorded.
# random_state pins those shuffles so the importances are reproducible.
# NOTE(review): X, y are the full dataset, which presumably includes the samples
# `knn` was trained on; consider restricting to that fold's held-out samples.
imp = permutation_importance(knn, X, y, n_repeats=20, random_state=0)

In [None]:
# NOTE(review): `plt` is not imported in any visible cell; the imports cell needs
# `import matplotlib.pyplot as plt` for this cell to run on a fresh kernel.
# NOTE(review): the labels are hard-coded iris feature names; this assumes X
# holds exactly these four columns in this order (TODO confirm).
feature_names = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]

# One horizontal box per feature, summarising its importance over the repeats.
plt.boxplot(imp.importances.T, vert=False)
plt.yticks(range(1, len(feature_names) + 1), feature_names)
plt.xlabel("Permutation importance")
plt.show()