# Data Training

## Library Imports

In [None]:
import os

import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np

## Importing the Dataset

In [None]:
# The dataset lives in a "datasets" folder next to (not inside) the current
# working directory, so resolve the parent directory to an absolute path first.
csv_path = os.path.abspath(os.pardir) + "/datasets/obesityData.csv"

# ob_df --> obesity dataframe
ob_df = pd.read_csv(csv_path)

## Preprocessing

### Splitting the Dataset
The dataset is split into a training set (70% of the rows) and a test set (30% of the rows). The split is stratified according to the obesity level, so both sets keep the same class proportions as the full dataset.

In [None]:
# Fixed seed so the split (and everything seeded with it) is reproducible.
RAND_STATE = 0

# Features are every column except the target; the target is the obesity level.
X, y = ob_df.drop(columns=["NObeyesdad"]), ob_df["NObeyesdad"]

# 70/30 split, stratified on the target so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=RAND_STATE,
    stratify=y,
)

In [None]:
# Display the training features produced by the split.
X_train

In [None]:
# Display the test features produced by the split.
X_test

In [None]:
# Display the training targets produced by the split.
y_train

In [None]:
# Display the test targets produced by the split.
y_test

### Feature Scaling
Standardisation is used to scale the numerical features. Note that the scaler is fitted on the training set only, not the test set, so that no information from the test set "leaks" into the preprocessing.

In [None]:
# Columns that hold numeric values and therefore get standardised.
numerical_features = [
    "Age",
    "Height",
    "Weight",
    "FCVC",
    "NCP",
    "CH2O",
    "FAF",
    "TUE",
]

scaler = StandardScaler()

# Learn the scaling statistics from the training rows only and scale them in
# the same step; the test rows are then scaled with those training statistics.
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [None]:
# Display the training features after scaling the numerical columns.
X_train

In [None]:
# Display the test features after scaling the numerical columns.
X_test

### Categorical Feature Encoding
The categorical features need to be encoded, using either label encoding or one-hot encoding.

#### Target Feature
The target feature (obesity level) is encoded using label encoding.

In [None]:
# Fit the encoder on the full target column of the dataset.
le = LabelEncoder()
le.fit(ob_df["NObeyesdad"])

# Rebuild the y Series from the integer codes instead of writing the codes into
# the existing Series in place. An in-place `y.loc[:] = codes` keeps the
# original (string/object) dtype, so the labels would not be stored as integers,
# and the assignment is rejected when the column uses a pandas string dtype.
# The original index and name are kept so the rows still line up with
# X_train / X_test.
y_train = pd.Series(le.transform(y_train), index=y_train.index, name=y_train.name)
y_test = pd.Series(le.transform(y_test), index=y_test.index, name=y_test.name)

In [None]:
# Display the training targets after label encoding.
y_train

In [None]:
# Display the test targets after label encoding.
y_test

#### Non-Target Features
We now encode the other categorical features. In this case, they all happen to be nominal, hence one-hot encoding is used.
The first column for each feature after encoding is dropped (`drop="first"`), since its value is implied by the remaining columns of that feature.
First, we create the one-hot encoder:

In [None]:
# Categorical columns that are one-hot encoded.
nominal_features = [
    "Gender", "family_history_with_overweight", "FAVC", "CAEC",
    "SMOKE", "SCC", "CALC", "MTRANS",
]

# Dense output (sparse_output=False) so the result can be wrapped in a
# dataframe directly; drop="first" removes the first category of each feature.
# The encoder is fitted on the nominal columns of the whole dataset.
ohe = OneHotEncoder(sparse_output=False, drop="first").fit(ob_df[nominal_features])

Next, we transform the categorical features' values in `X_train` and `X_test` using the encoder.
The original `X_train` and `X_test` dataframes are then modified to use the columns from the one-hot encoding. The old non-encoded features are dropped.

In [None]:
# One-hot encode the nominal columns of the training set.
ohe_train_transformed = ohe.transform(X_train[nominal_features])

# Wrap the encoded array in a dataframe that shares X_train's index, so the
# rows line up when the two frames are concatenated column-wise.
pd_train_transformed = pd.DataFrame(
    data=ohe_train_transformed,
    index=X_train.index,
    columns=ohe.get_feature_names_out(),
)

# Swap the raw nominal columns for their encoded counterparts.
X_train = pd.concat(
    (X_train.drop(columns=nominal_features), pd_train_transformed),
    axis="columns",
)

In [None]:
# One-hot encode the nominal columns of the test set with the same encoder.
# The intermediates get test-specific names so they neither mislabel test data
# as "train" nor overwrite the training-set intermediates.
ohe_test_transformed = ohe.transform(X_test[nominal_features])

# Convert back into a dataframe, reusing X_test's index so the rows line up
# when the frames are concatenated column-wise.
pd_test_transformed = pd.DataFrame(ohe_test_transformed, columns=ohe.get_feature_names_out(), index=X_test.index)

# Replace the raw nominal columns with their encoded counterparts.
X_test = pd.concat([X_test.drop(nominal_features, axis=1), pd_test_transformed], axis=1)

In [None]:
# Display the training features after one-hot encoding.
X_train

In [None]:
# Display the test features after one-hot encoding.
X_test

## Model Selection
Nested cross-validation with stratified 10-fold cross-validation and grid search is used for model selection.

### Imports

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import multiprocessing as mp

# Seed passed to the models and the CV splitter below; same value as the
# RAND_STATE used for the train/test split.
RAND_STATE = 0

### Models

Logistic regression, random forests, decision trees and support vector machines are selected for comparison.

In [None]:
# Candidate classifiers keyed by display name. The same names are used as keys
# of the hyperparameter grids, and every estimator is seeded with RAND_STATE.
models = {
    "Logistic Regression": LogisticRegression(random_state=RAND_STATE, max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=RAND_STATE),
    "Decision Tree": DecisionTreeClassifier(random_state=RAND_STATE),
    "SVM": SVC(random_state=RAND_STATE, kernel="rbf", probability=True),
}

### Hyperparameter Grids

We will be using a grid search, so we define the hyperparameter grids for tuning each model.

In [None]:
# Hyperparameter values to try for each model, keyed by the model's display
# name. Every combination within a model's grid is evaluated by the search.
param_grids = {
    "Logistic Regression": {
        "C": [0.001, 0.01, 0.1, 1, 10, 100],
    },
    "Random Forest": {
        "n_estimators": [50, 100, 200, 300, 400],
        "max_depth": [None, 5, 10, 20, 30],
    },
    "Decision Tree": {
        "max_depth": [None, 5, 10, 20, 30],
        "min_samples_split": [2, 5, 10, 20],
    },
    "SVM": {
        "C": [0.01, 0.1, 1, 10, 100],
        "kernel": ["linear", "rbf", "poly"],
    },
}

### Performing Nested Cross-Validation

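We tune each model with a grid search, scoring every hyperparameter combination by stratified 10-fold cross-validation on the training set. Note that the code below runs a single cross-validation loop per model (the one inside the grid search); it does not wrap the search in an outer cross-validation loop.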

In [None]:
# Tunes one model's hyperparameters with a cross-validated grid search.
def cross_validate(model_name, model):
    """Grid-search the hyperparameters of ``model`` on the training data.

    The grid is looked up in ``param_grids`` under ``model_name``. Each
    combination is scored by accuracy using stratified, shuffled 10-fold
    cross-validation seeded with ``RAND_STATE``. The search is fitted on the
    notebook-level ``X_train`` and ``y_train`` (targets cast to int).

    NOTE(review): only this single CV loop is run; there is no outer
    cross-validation loop around the search.

    Returns:
        A ``(model_name, fitted GridSearchCV)`` tuple.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        scoring="accuracy",
        # Stratified k-fold with k = 10.
        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=RAND_STATE),
        # Use 2 less than the number of CPUs.
        n_jobs=-3,
    )
    grid_search.fit(X_train, y_train.astype("int"))

    return model_name, grid_search

# Tune each candidate model in turn and report its best cross-validated
# accuracy together with the hyperparameters that achieved it.
for model_name in models:
    model_name, search_cv = cross_validate(model_name, models[model_name])
    print(
        f"Model: {model_name}",
        f"Best Score: {search_cv.best_score_:.3f}",
        f"Best Parameters: {search_cv.best_params_}",
        sep="\n",
    )
    print()