# Obesity Classification Model Training and Evaluation Notebook
This notebook handles:
- Model Training
- Hyper-Parameter Tuning
- Model Evaluation

In [533]:
import pandas as pd
from narwhals import Boolean

def _load_labels(csv_path):
    """Read a header-less label CSV and flatten it into a 1-D NumPy array."""
    label_frame = pd.read_csv(csv_path, header=None)
    return label_frame.to_numpy().ravel()


# Feature matrices for the train / validation / test splits.
X_train = pd.read_csv("data/preprocessed data/X_train.csv")
X_val = pd.read_csv("data/preprocessed data/X_val.csv")
X_test = pd.read_csv("data/preprocessed data/X_test.csv")

# Label vectors for the same three splits (files are read without a header row).
y_train = _load_labels("data/preprocessed data/y_train.csv")
y_val = _load_labels("data/preprocessed data/y_val.csv")
y_test = _load_labels("data/preprocessed data/y_test.csv")

# Display the shape of every split as a quick sanity check.
tuple(split.shape for split in (X_train, X_val, X_test, y_train, y_val, y_test))

((1251, 33), (418, 33), (418, 33), (1251,), (418,), (418,))

## Model Evaluation

In [534]:
# Results table: one row per model (index named "Model"), one column per metric.
_metric_columns = [
    "Train_Accuracy",
    "Val_Accuracy",
    "Test_Accuracy",
    "Balanced_Accuracy",
    "Precision",
    "Recall",
    "F1_Score",
]
model_evaluations = pd.DataFrame(columns=_metric_columns)
model_evaluations.index.name = "Model"
model_evaluations.head()

Unnamed: 0_level_0,Train_Accuracy,Val_Accuracy,Test_Accuracy,Balanced_Accuracy,Precision,Recall,F1_Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [535]:
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score
import joblib


def evaluate_model(model_name: str, model: BaseEstimator) -> pd.Series:
    """Fit ``model`` on the training split, record its metrics, and save it to disk.

    The model is (re)fitted on ``X_train``/``y_train`` and used to predict the
    train, validation and test splits. Accuracy is recorded for all three
    splits; balanced accuracy and the weighted precision / recall / F1 scores
    are computed on the test split only. The metrics are written into the
    ``model_evaluations`` table under the row ``model_name``, and the fitted
    model is pickled to ``models/<model_name>.pkl``.

    Args:
        model_name: Row label in ``model_evaluations`` and stem of the pickle file.
        model: Estimator exposing ``fit`` and ``predict``.

    Returns:
        The ``model_evaluations`` row holding this model's metrics.
    """
    import os  # local import: only needed to make sure the output directory exists

    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)
    test_pred = model.predict(X_test)

    # Shared settings for the multi-class scores: weight each class by its
    # support and report 0 (instead of warning) for classes never predicted.
    weighted = {"average": "weighted", "zero_division": 0}
    metrics = {
        "Train_Accuracy": accuracy_score(y_train, train_pred),
        "Val_Accuracy": accuracy_score(y_val, val_pred),
        "Test_Accuracy": accuracy_score(y_test, test_pred),
        "Balanced_Accuracy": balanced_accuracy_score(y_test, test_pred),
        "Precision": precision_score(y_test, test_pred, **weighted),
        "Recall": recall_score(y_test, test_pred, **weighted),
        "F1_Score": f1_score(y_test, test_pred, **weighted),
    }
    # Assign cell by cell so a new row is created on first write and an
    # existing row (re-run of the same model) is simply overwritten.
    for column, value in metrics.items():
        model_evaluations.loc[model_name, column] = value

    # joblib.dump does not create missing parent directories.
    os.makedirs("models", exist_ok=True)
    joblib.dump(model, f"models/{model_name}.pkl")
    return model_evaluations.loc[model_name]

## GridSearch Cross-Validation Hyper-Parameter Tuning

In [536]:
from sklearn.model_selection import GridSearchCV


def hyper_parameter_tune(model: BaseEstimator, param_grid: dict, verbose: bool = True) -> BaseEstimator:
    """Grid-search ``param_grid`` for ``model`` on the training split.

    Runs ``GridSearchCV`` (default cross-validation and scoring settings,
    ``n_jobs=-1``) on ``X_train``/``y_train``.

    Args:
        model: Estimator to tune.
        param_grid: Mapping of parameter names to the candidate values to try.
        verbose: When true, print the best parameter combination found.

    Returns:
        The ``best_estimator_`` of the fitted search.
    """
    # fit() returns the search object itself, so the call can be chained.
    searcher = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1).fit(X_train, y_train)
    if verbose:
        print(searcher.best_params_)
    return searcher.best_estimator_

## Model Training

### Gaussian Naive Bayes Classifier

In [537]:
from sklearn.naive_bayes import GaussianNB

# Tune the variance-smoothing term, then score and persist the best model.
params = dict(var_smoothing=[1e-9, 1e-6, 1e-3, 1e-2, 1e-1])
GaussianNB_model = hyper_parameter_tune(GaussianNB(), params)
evaluate_model("GaussianNB", GaussianNB_model)

{'var_smoothing': 0.01}


Train_Accuracy       0.776179
Val_Accuracy         0.746411
Test_Accuracy        0.760766
Balanced_Accuracy    0.759609
Precision            0.752953
Recall               0.760766
F1_Score             0.752344
Name: GaussianNB, dtype: object

### Logistic Regression Classifier

In [538]:
from sklearn.linear_model import LogisticRegression

# Tune the inverse regularisation strength C, then score and persist the best model.
params = dict(C=[1.0, 0.1, 10])
LogisticRegression_model = hyper_parameter_tune(
    LogisticRegression(random_state=42, n_jobs=-1),
    params,
)
evaluate_model("LogisticRegression", LogisticRegression_model)

{'C': 10}


Train_Accuracy       0.973621
Val_Accuracy         0.954545
Test_Accuracy        0.942584
Balanced_Accuracy    0.941219
Precision            0.942584
Recall               0.942584
F1_Score             0.942245
Name: LogisticRegression, dtype: object

### K-Nearest Neighbors Classifier

In [539]:
from sklearn.neighbors import KNeighborsClassifier

# Tune the neighbourhood size, then score and persist the best model.
params = dict(n_neighbors=[3, 5, 7])
KNeighborsClassifier_model = hyper_parameter_tune(
    KNeighborsClassifier(n_jobs=-1),
    params,
)
evaluate_model("KNeighborsClassifier", KNeighborsClassifier_model)

{'n_neighbors': 3}


Train_Accuracy       0.936851
Val_Accuracy         0.866029
Test_Accuracy        0.870813
Balanced_Accuracy    0.865684
Precision            0.867696
Recall               0.870813
F1_Score             0.864696
Name: KNeighborsClassifier, dtype: object

### Decision Tree Classifier

In [540]:
from sklearn.tree import DecisionTreeClassifier

# Tune the maximum tree depth, then score and persist the best model.
params = dict(max_depth=[3, 5, 7])
DecisionTreeClassifier_model = hyper_parameter_tune(
    DecisionTreeClassifier(random_state=42),
    params,
)
evaluate_model("DecisionTreeClassifier", DecisionTreeClassifier_model)

{'max_depth': 3}


Train_Accuracy       0.973621
Val_Accuracy         0.966507
Test_Accuracy        0.966507
Balanced_Accuracy    0.965671
Precision            0.966969
Recall               0.966507
F1_Score             0.966333
Name: DecisionTreeClassifier, dtype: object

### Support Vector Classifier

In [541]:
from sklearn.svm import SVC

# Tune the kernel type together with the penalty C, then score and persist the best model.
params = dict(
    kernel=["linear", "poly", "rbf"],
    C=[1.0, 0.5, 1.5],
)
SVC_model = hyper_parameter_tune(SVC(random_state=42), params)
evaluate_model("SVC", SVC_model)

{'C': 1.5, 'kernel': 'linear'}


Train_Accuracy       0.979217
Val_Accuracy         0.964115
Test_Accuracy         0.95933
Balanced_Accuracy    0.958815
Precision            0.960101
Recall                0.95933
F1_Score             0.959346
Name: SVC, dtype: object

### Random Forest Classifier

In [542]:
from sklearn.ensemble import RandomForestClassifier

# Tune the cost-complexity pruning strength of a depth-limited forest,
# then score and persist the best model.
# NOTE(review): 0.011 may be a typo for 0.11 (the gradient-boosting grid in
# this notebook uses [0.01, 0.09, 0.11]) — confirm before changing.
params = dict(ccp_alpha=[0.01, 0.09, 0.011])
RandomForestClassifier_model = hyper_parameter_tune(
    RandomForestClassifier(max_depth=5, random_state=42, n_jobs=-1),
    params,
)
evaluate_model("RandomForestClassifier", RandomForestClassifier_model)

{'ccp_alpha': 0.01}


Train_Accuracy       0.960032
Val_Accuracy         0.947368
Test_Accuracy        0.954545
Balanced_Accuracy    0.952911
Precision            0.954612
Recall               0.954545
F1_Score             0.954223
Name: RandomForestClassifier, dtype: object

### Gradient Boosting Classifier

In [543]:
from sklearn.ensemble import GradientBoostingClassifier

# Tune the learning rate together with the pruning strength,
# then score and persist the best model.
params = dict(
    learning_rate=[0.1, 0.09, 0.11],
    ccp_alpha=[0.01, 0.09, 0.11],
)
GradientBoostingClassifier_model = hyper_parameter_tune(
    GradientBoostingClassifier(random_state=42),
    params,
)
evaluate_model("GradientBoostingClassifier", GradientBoostingClassifier_model)

{'ccp_alpha': 0.01, 'learning_rate': 0.1}


Train_Accuracy       0.966427
Val_Accuracy         0.961722
Test_Accuracy        0.956938
Balanced_Accuracy    0.956722
Precision             0.95876
Recall               0.956938
F1_Score             0.956869
Name: GradientBoostingClassifier, dtype: object

### Multilayer Perceptron Classifier

In [544]:
from sklearn.neural_network import MLPClassifier

# Tune the hidden-layer layout of an early-stopping MLP,
# then score and persist the best model.
params = dict(hidden_layer_sizes=[(100,), (50, 50), (25, 10, 5)])
MLPClassifier_model = hyper_parameter_tune(
    MLPClassifier(early_stopping=True, random_state=42),
    params,
)
evaluate_model("MLPClassifier", MLPClassifier_model)

{'hidden_layer_sizes': (50, 50)}


Train_Accuracy        0.97442
Val_Accuracy         0.949761
Test_Accuracy        0.952153
Balanced_Accuracy    0.950776
Precision            0.952637
Recall               0.952153
F1_Score             0.952018
Name: MLPClassifier, dtype: object

In [545]:
# Final comparison table: one row per model evaluated above, one column per metric.
model_evaluations

Unnamed: 0_level_0,Train_Accuracy,Val_Accuracy,Test_Accuracy,Balanced_Accuracy,Precision,Recall,F1_Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GaussianNB,0.776179,0.746411,0.760766,0.759609,0.752953,0.760766,0.752344
LogisticRegression,0.973621,0.954545,0.942584,0.941219,0.942584,0.942584,0.942245
KNeighborsClassifier,0.936851,0.866029,0.870813,0.865684,0.867696,0.870813,0.864696
DecisionTreeClassifier,0.973621,0.966507,0.966507,0.965671,0.966969,0.966507,0.966333
SVC,0.979217,0.964115,0.95933,0.958815,0.960101,0.95933,0.959346
RandomForestClassifier,0.960032,0.947368,0.954545,0.952911,0.954612,0.954545,0.954223
GradientBoostingClassifier,0.966427,0.961722,0.956938,0.956722,0.95876,0.956938,0.956869
MLPClassifier,0.97442,0.949761,0.952153,0.950776,0.952637,0.952153,0.952018
