# Model Selection
Nested cross-validation with stratified 10-fold cross-validation and grid search is used for model selection.

## Library Imports

In [None]:
import os

import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Seed passed as `random_state` to every estimator and to the StratifiedKFold
# splitter in this notebook.
RAND_STATE = 0

## Importing the Train Set

In [None]:
# Absolute path of the "datasets" folder located in the parent of the current
# working directory. The last component is appended with os.path.join so the
# platform's own separator is used instead of a hard-coded "/".
datasets_folder = os.path.join(
    os.path.abspath(os.path.join(os.getcwd(), os.pardir)),
    "datasets",
)

In [None]:
# Read the training features from the datasets folder; the first CSV column
# becomes the row index.
X_train = pd.read_csv(
    os.path.join(datasets_folder, "obesity_X_train.csv"),
    index_col=0,
)
X_train  # bare expression: the notebook renders the frame

In [None]:
# Read the training labels (first CSV column as the row index), then keep only
# the "Obese" column as the target series.
y_train = pd.read_csv(
    os.path.join(datasets_folder, "obesity_y_train.csv"),
    index_col=0,
)
y_train = y_train["Obese"]
y_train  # bare expression: the notebook renders the series

### Dropping the `Weight` Column
The obesity level is highly correlated with weight. This makes it fairly easy for models to achieve high accuracy, but it also makes their explanations uninformative, because it is already well known that obesity depends on weight. Hence, we create a variant of `X_train` without the `Weight` column:

In [None]:
# Copy of the training features without the "Weight" column; X_train itself
# is left unchanged because drop() returns a new frame.
X_train_no_weight = X_train.drop(columns="Weight")
X_train_no_weight  # bare expression: the notebook renders the frame

We will cross-validate for `X_train` and `X_train_no_weight` separately.

## Models

Logistic regression, random forests, decision trees and support vector machines are selected for comparison.

In [None]:
# Candidate estimators, keyed by the display name that is also used to look up
# each model's hyperparameter grid. All share RAND_STATE as their seed.
models = {
    # Solver iteration cap set explicitly to 1000.
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        random_state=RAND_STATE,
    ),
    "Random Forest": RandomForestClassifier(random_state=RAND_STATE),
    "Decision Tree": DecisionTreeClassifier(random_state=RAND_STATE),
    # probability=True turns on probability estimates; accuracy scoring does
    # not need them -- presumably used by later steps, TODO confirm.
    "SVM": SVC(probability=True, random_state=RAND_STATE),
}

## Hyperparameter Grids

We will be using a grid search, so we define the hyperparameter grids for tuning each model.

In [None]:
# Hyperparameter grids for the grid search, keyed by the same display names
# as the `models` dictionary. Every combination within a grid is tried.
param_grids = {
    "Logistic Regression": {
        "C": [0.001, 0.01, 0.1, 1, 10, 100],
    },
    "Random Forest": {
        "n_estimators": [50, 100, 200, 300, 400],
        "max_depth": [None, 5, 10, 20, 30],
    },
    "Decision Tree": {
        "max_depth": [None, 5, 10, 20, 30],
        "min_samples_split": [2, 5, 10, 20],
    },
    "SVM": {
        "C": [0.01, 0.1, 1, 10, 100],
        "kernel": ["linear", "rbf", "poly"],
    },
}

## Performing Nested Cross-Validation

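We tune each model with a grid search that is scored by accuracy under stratified 10-fold cross-validation. Note that the code below runs only this single cross-validation loop; there is no outer loop around the grid search. The reported best score is therefore the mean accuracy of the selected hyperparameters on the same folds that were used to select them, which makes it an optimistic estimate compared with a full nested cross-validation.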

In [None]:
def cross_validate(model_name, model, X):
    """Grid-search one model's hyperparameters with stratified 10-fold CV.

    The grid is taken from ``param_grids[model_name]`` and the targets from
    the notebook-level ``y_train``. Candidates are compared by accuracy.

    Args:
        model_name: Key identifying the model's grid in ``param_grids``.
        model: Estimator instance to tune.
        X: Feature frame to fit on.

    Returns:
        The tuple ``(model_name, fitted GridSearchCV)``.

    NOTE(review): only one cross-validation loop is run here; there is no
    outer loop around the grid search.
    """
    # Shuffled, seeded folds so repeated runs use the same splits.
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=RAND_STATE)

    tuner = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        scoring='accuracy',
        cv=folds,
        n_jobs=-3,  # all CPUs but two
    )
    tuner.fit(X, y_train)

    return model_name, tuner

# Tune every model on each feature set (with and without `Weight`) and print
# the best cross-validated score and the hyperparameters that achieved it.
for X_name, X in (("X_train", X_train), ("X_train_no_weight", X_train_no_weight)):
    print(f"Dataframe: {X_name}\n")
    for model_name, model in models.items():
        model_name, search_cv = cross_validate(model_name, model, X)
        print(f"Model: {model_name}")
        print(f"Best Score: {search_cv.best_score_:.3f}")
        print(f"Best Parameters: {search_cv.best_params_}\n\n")

With their best parameters, all models except logistic regression achieved high accuracy on both `X_train` and `X_train_no_weight`.