# Obesity Classification Model Training and Evaluation Notebook
This notebook handles:
- Model Training
- Hyper-Parameter Tuning
- Model Evaluation

In [1]:
import pandas as pd

# Feature matrices for the three splits, parsed with pandas' default header handling.
X_train, X_val, X_test = (
    pd.read_csv(features_path)
    for features_path in (
        "data/preprocessed data/X_train.csv",
        "data/preprocessed data/X_val.csv",
        "data/preprocessed data/X_test.csv",
    )
)
# Targets are read with header=None and flattened into 1-D NumPy arrays.
y_train, y_val, y_test = (
    pd.read_csv(labels_path, header=None).to_numpy().ravel()
    for labels_path in (
        "data/preprocessed data/y_train.csv",
        "data/preprocessed data/y_val.csv",
        "data/preprocessed data/y_test.csv",
    )
)
# Show the shape of every split as a quick sanity check (features first, then targets).
tuple(split.shape for split in (X_train, X_val, X_test, y_train, y_val, y_test))

((1251, 21), (418, 21), (418, 21), (1251,), (418,), (418,))

## Model Evaluation

In [2]:
from Model_Evaluation import ModelsEvaluator

# Single evaluator instance shared by every model cell below: each cell calls
# models_evaluator.evaluate(...), and the combined table is shown later via
# models_evaluator.get_all_evaluations().
models_evaluator = ModelsEvaluator()

## GridSearch Cross-Validation Hyper-Parameter Tuning

In [3]:
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV

# Registry keyed by model name. Each model cell stores its GridSearchCV search
# space here, and hyper_parameter_tune adds the winning combination under the
# "best_params" key of the same entry.
param_grid = {}


def hyper_parameter_tune(model_name: str, model: BaseEstimator, verbose: bool = True) -> BaseEstimator:
    """Grid-search ``model`` over its registered search space and return the winner.

    The search space is looked up in the notebook-level ``param_grid`` under
    ``model_name`` and the search is fitted on the notebook-level ``X_train`` /
    ``y_train``. The winning combination is written back to
    ``param_grid[model_name]["best_params"]``.

    Args:
        model_name: Key of the search space inside ``param_grid``.
        model: Estimator used as the template for the grid search.
        verbose: When true, print the winning parameter combination.

    Returns:
        ``grid_search.best_estimator_`` from the fitted ``GridSearchCV``.

    Raises:
        KeyError: If no search space is registered under ``model_name``.
    """
    # "best_params" is bookkeeping written by a previous call, not a
    # hyper-parameter. Build the search space without it so that (a) the dict
    # handed to GridSearchCV is never the one mutated below and (b) calling this
    # function again for the same model does not fail on a non-list grid entry.
    search_space = {
        name: candidates
        for name, candidates in param_grid[model_name].items()
        if name != "best_params"
    }
    grid_search = GridSearchCV(model, search_space, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    # Keep recording the winner next to the grid; the pickled param_grid relies on it.
    param_grid[model_name]["best_params"] = grid_search.best_params_
    if verbose:
        print(grid_search.best_params_)
    return grid_search.best_estimator_

## Model Training

### Gaussian Naive Bayes Classifier

In [4]:
from sklearn.naive_bayes import GaussianNB

# Register the search space first; hyper_parameter_tune looks it up by name.
param_grid["GaussianNB"] = {
    "var_smoothing": [1e-9, 1e-6, 1e-3, 1e-2, 1e-1],
}
# Bind the name once, directly to the tuned estimator returned by the search.
GaussianNB_model = hyper_parameter_tune("GaussianNB", GaussianNB())
models_evaluator.evaluate("GaussianNB", GaussianNB_model, save_model=True)

{'var_smoothing': 0.01}


Train_Accuracy       0.821743
Val_Accuracy         0.794258
Test_Accuracy        0.811005
Balanced_Accuracy    0.810421
Precision            0.806936
F1_Score              0.80523
Name: GaussianNB, dtype: object

### Logistic Regression Classifier

In [5]:
from sklearn.linear_model import LogisticRegression

# Register the search space first; hyper_parameter_tune looks it up by name.
param_grid["LogisticRegression"] = {
    "C": [1.0, 0.1, 10],
}
# Bind the name once, directly to the tuned estimator returned by the search.
LogisticRegression_model = hyper_parameter_tune(
    "LogisticRegression",
    LogisticRegression(random_state=42, n_jobs=-1),
)
models_evaluator.evaluate("LogisticRegression", LogisticRegression_model, save_model=True)

{'C': 10}


Train_Accuracy        0.96243
Val_Accuracy         0.949761
Test_Accuracy        0.956938
Balanced_Accuracy    0.955753
Precision            0.957775
F1_Score             0.956974
Name: LogisticRegression, dtype: object

### K-Nearest Neighbors Classifier

In [6]:
from sklearn.neighbors import KNeighborsClassifier

# Register the search space first; hyper_parameter_tune looks it up by name.
param_grid["KNeighbors"] = {
    "n_neighbors": [3, 5, 7],
}
# Bind the name once, directly to the tuned estimator returned by the search.
KNeighbors_model = hyper_parameter_tune("KNeighbors", KNeighborsClassifier(n_jobs=-1))
models_evaluator.evaluate("KNeighbors", KNeighbors_model, save_model=True)

{'n_neighbors': 3}


Train_Accuracy       0.930456
Val_Accuracy         0.892344
Test_Accuracy         0.84689
Balanced_Accuracy    0.840845
Precision            0.848875
F1_Score             0.838273
Name: KNeighbors, dtype: object

### Decision Tree Classifier

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Register the search space first; hyper_parameter_tune looks it up by name.
param_grid["DecisionTree"] = {
    "max_depth": [3, 5, 7],
}
# Bind the name once, directly to the tuned estimator returned by the search.
DecisionTree_model = hyper_parameter_tune(
    "DecisionTree",
    DecisionTreeClassifier(random_state=42),
)
models_evaluator.evaluate("DecisionTree", DecisionTree_model, save_model=True)

{'max_depth': 7}


Train_Accuracy            1.0
Val_Accuracy         0.976077
Test_Accuracy        0.978469
Balanced_Accuracy    0.976967
Precision            0.978897
F1_Score             0.978374
Name: DecisionTree, dtype: object

### Support Vector Classifier

In [8]:
from sklearn.svm import SVC

# Register the search space first; hyper_parameter_tune looks it up by name.
param_grid["SVC"] = {
    "kernel": ["linear", "poly", "rbf"],
    "C": [1.0, 0.5, 1.5],
}
# Bind the name once, directly to the tuned estimator returned by the search.
SVC_model = hyper_parameter_tune("SVC", SVC(random_state=42))
models_evaluator.evaluate("SVC", SVC_model, save_model=True)

{'C': 1.5, 'kernel': 'linear'}


Train_Accuracy       0.972022
Val_Accuracy         0.954545
Test_Accuracy        0.961722
Balanced_Accuracy    0.960664
Precision            0.962115
F1_Score             0.961684
Name: SVC, dtype: object

### Random Forest Classifier

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Register the search space first; hyper_parameter_tune looks it up by name.
# NOTE(review): 0.011 breaks the 0.01 / 0.09 / 0.11 pattern used for the
# GradientBoosting grid in this notebook — confirm it is intentional and not a
# typo for 0.11.
param_grid["RandomForest"] = {
    "ccp_alpha": [0.01, 0.09, 0.011],
}
# max_depth is fixed at 5 here; only ccp_alpha is searched.
RandomForest_model = hyper_parameter_tune(
    "RandomForest",
    RandomForestClassifier(max_depth=5, random_state=42, n_jobs=-1),
)
models_evaluator.evaluate("RandomForest", RandomForest_model, save_model=True)

{'ccp_alpha': 0.011}


Train_Accuracy       0.960831
Val_Accuracy         0.956938
Test_Accuracy        0.947368
Balanced_Accuracy    0.944758
Precision            0.947893
F1_Score             0.946929
Name: RandomForest, dtype: object

### Gradient Boosting Classifier

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

# Register the search space first; hyper_parameter_tune looks it up by name.
param_grid["GradientBoosting"] = {
    "learning_rate": [0.1, 0.09, 0.11],
    "ccp_alpha": [0.01, 0.09, 0.11],
}
# Bind the name once, directly to the tuned estimator returned by the search.
GradientBoosting_model = hyper_parameter_tune(
    "GradientBoosting",
    GradientBoostingClassifier(random_state=42),
)
models_evaluator.evaluate("GradientBoosting", GradientBoosting_model, save_model=True)

{'ccp_alpha': 0.01, 'learning_rate': 0.1}


Train_Accuracy       0.965627
Val_Accuracy         0.961722
Test_Accuracy         0.95933
Balanced_Accuracy    0.958763
Precision             0.96065
F1_Score             0.959117
Name: GradientBoosting, dtype: object

### Multilayer Perceptron Classifier

In [11]:
from sklearn.neural_network import MLPClassifier

# Register the search space first; hyper_parameter_tune looks it up by name.
param_grid["MLP"] = {
    "hidden_layer_sizes": [(100,), (50, 50), (25, 10, 5)],
}
# Bind the name once, directly to the tuned estimator returned by the search.
MLP_model = hyper_parameter_tune(
    "MLP",
    MLPClassifier(early_stopping=True, random_state=42),
)
models_evaluator.evaluate("MLP", MLP_model, save_model=True)

{'hidden_layer_sizes': (50, 50)}


Train_Accuracy       0.964828
Val_Accuracy         0.913876
Test_Accuracy        0.937799
Balanced_Accuracy      0.9355
Precision            0.937798
F1_Score             0.937682
Name: MLP, dtype: object

In [12]:
# Display the combined metrics table for every model evaluated above.
models_evaluator.get_all_evaluations()

Unnamed: 0_level_0,Train_Accuracy,Val_Accuracy,Test_Accuracy,Balanced_Accuracy,Precision,F1_Score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GaussianNB,0.821743,0.794258,0.811005,0.810421,0.806936,0.80523
LogisticRegression,0.96243,0.949761,0.956938,0.955753,0.957775,0.956974
KNeighbors,0.930456,0.892344,0.84689,0.840845,0.848875,0.838273
DecisionTree,1.0,0.976077,0.978469,0.976967,0.978897,0.978374
SVC,0.972022,0.954545,0.961722,0.960664,0.962115,0.961684
RandomForest,0.960831,0.956938,0.947368,0.944758,0.947893,0.946929
GradientBoosting,0.965627,0.961722,0.95933,0.958763,0.96065,0.959117
MLP,0.964828,0.913876,0.937799,0.9355,0.937798,0.937682


In [13]:
import joblib

# Persist the search-space registry; each entry also carries the "best_params"
# recorded by hyper_parameter_tune.
joblib.dump(param_grid, "utils/param_grid.pkl")
# Persist the evaluator object itself. This call is the cell's last expression,
# so its return value is what the notebook displays.
joblib.dump(models_evaluator, "utils/model_evaluator.pkl")

['utils/model_evaluator.pkl']