In [8]:
import os
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
# NOTE(review): StandardScaler is normally imported from sklearn.preprocessing; confirm this module path is intended.
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [9]:
# Read the obesity dataset CSV (path is relative to the working directory)
# and show the resulting frame as the cell output.
rawData = pd.read_csv(filepath_or_buffer='ObesityDataSet_raw_and_data_sinthetic.csv')
rawData

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.000000,1.620000,64.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation,Normal_Weight
1,Female,21.000000,1.520000,56.000000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.000000,1.800000,77.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation,Normal_Weight
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking,Overweight_Level_I
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III


In [10]:
# Quick profile of the raw table: row count, column labels, and the number of
# rows that contain at least one missing value.
has_missing = rawData.isnull().any(axis=1)  # one boolean per row
trainDataSize = rawData.shape[0]
features = rawData.columns
missingTrainDataCount = int(has_missing.sum())
print('Training data size: ', trainDataSize)
print('Features: ', features.to_list())
print('Missing data count: ', missingTrainDataCount)

Training data size:  2111
Features:  ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad']
Missing data count:  0


In [11]:
RANDOM_STATE = 42  # single seed shared by the split and the seeded estimators below

# Drop incomplete *rows* (axis=0). With axis=1, dropna removes every column that
# contains a missing value, which could silently discard a feature or the target.
cleanedData = rawData.dropna(axis=0)
X = cleanedData.drop('NObeyesdad',axis=1)
y = cleanedData['NObeyesdad']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numeric_features = X.select_dtypes(include=['int64','float64']).columns.tolist()

# Integer-encoded copies of the targets; the encoder is fitted on the training
# labels only and then applied to the validation labels.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_val_enc   = label_encoder.transform(y_val)

print("Categorical:", categorical_features)
print("Numeric:", numeric_features)
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numeric_transformer = StandardScaler()


def build_preprocessor():
    """Return a new, unfitted ColumnTransformer for the feature columns.

    Categorical columns are one-hot encoded and numeric columns are
    standardised. ``sparse_threshold=0`` forces a dense output regardless of
    how sparse the one-hot block is, so estimators that do not accept sparse
    input (GaussianNB) keep working if the feature mix changes.
    """
    return ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_features),
            ('num', numeric_transformer, numeric_features)
        ],
        sparse_threshold=0,
    )


def build_pipeline(classifier):
    """Wrap ``classifier`` in a Pipeline that owns its own preprocessing step.

    A Pipeline fits its steps in place, so handing the same ColumnTransformer
    object to several pipelines would make them share (and overwrite) one
    fitted state. Building a fresh preprocessor per pipeline avoids that.
    """
    return Pipeline(steps=[
        ('preprocessor', build_preprocessor()),
        ('classifier', classifier)
    ])


# Standalone, unfitted template kept under the original name; every pipeline
# below gets its own separate instance.
preprocessor = build_preprocessor()

pip_lr = build_pipeline(LogisticRegression(max_iter=150))
# Tree-based models are seeded so that re-running the notebook reproduces the metrics.
pip_dt = build_pipeline(DecisionTreeClassifier(random_state=RANDOM_STATE))
pip_knn = build_pipeline(KNeighborsClassifier())
pip_gnb = build_pipeline(GaussianNB())
pip_rf = build_pipeline(RandomForestClassifier(random_state=RANDOM_STATE))
pip_xgb = build_pipeline(XGBClassifier())

Categorical: ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
Numeric: ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']


In [12]:
def getMetrics(pipeline, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Args:
        pipeline: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature rows to predict on.
        y_test: True labels for ``X_test``, in the same label space the
            estimator was trained on.

    Returns:
        dict mapping metric name to its value rounded to 4 decimals:
        accuracy, one-vs-rest ROC AUC (computed from the predicted class
        probabilities), weighted precision / recall / F1, and the Matthews
        correlation coefficient.
    """
    predicted_labels = pipeline.predict(X_test)
    class_probabilities = pipeline.predict_proba(X_test)

    # Raw scores first; rounding is applied uniformly in the return statement.
    raw_scores = {
        'Accuracy': accuracy_score(y_test, predicted_labels),
        'AUC': roc_auc_score(y_test, class_probabilities, multi_class='ovr'),
        'Precision': precision_score(y_test, predicted_labels, average='weighted'),
        'Recall': recall_score(y_test, predicted_labels, average='weighted'),
        'F1-Score': f1_score(y_test, predicted_labels, average='weighted'),
        'Matthews Correlation Coefficient': matthews_corrcoef(y_test, predicted_labels),
    }
    return {metric_name: round(score, 4) for metric_name, score in raw_scores.items()}

In [13]:
# Train logistic regression, score it on the validation split, and persist the fitted pipeline.
# joblib.dump does not create missing directories, so make sure the target folder exists.
os.makedirs('saved_models', exist_ok=True)

pip_lr.fit(X_train, y_train)

lr_metrics = getMetrics(pip_lr, X_val, y_val)

joblib.dump(pip_lr, 'saved_models/lr.pkl')

['saved_models/lr.pkl']

In [14]:
# Train the decision tree, score it on the validation split, and persist the fitted pipeline.
# joblib.dump does not create missing directories, so make sure the target folder exists.
os.makedirs('saved_models', exist_ok=True)

pip_dt.fit(X_train, y_train)

dt_metrics = getMetrics(pip_dt, X_val, y_val)

joblib.dump(pip_dt, 'saved_models/dt.pkl')

['saved_models/dt.pkl']

In [15]:
# Train k-nearest neighbours, score it on the validation split, and persist the fitted pipeline.
# joblib.dump does not create missing directories, so make sure the target folder exists.
os.makedirs('saved_models', exist_ok=True)

pip_knn.fit(X_train, y_train)

knn_metrics = getMetrics(pip_knn, X_val, y_val)

joblib.dump(pip_knn, 'saved_models/knn.pkl')

['saved_models/knn.pkl']

In [16]:
# Train Gaussian naive Bayes, score it on the validation split, and persist the fitted pipeline.
# joblib.dump does not create missing directories, so make sure the target folder exists.
os.makedirs('saved_models', exist_ok=True)

pip_gnb.fit(X_train, y_train)

gnb_metrics = getMetrics(pip_gnb, X_val, y_val)

joblib.dump(pip_gnb, 'saved_models/gnb.pkl')

['saved_models/gnb.pkl']

In [17]:
# Train the random forest, score it on the validation split, and persist the fitted pipeline.
# joblib.dump does not create missing directories, so make sure the target folder exists.
os.makedirs('saved_models', exist_ok=True)

pip_rf.fit(X_train, y_train)

rf_metrics = getMetrics(pip_rf, X_val, y_val)

joblib.dump(pip_rf, 'saved_models/rf.pkl')

['saved_models/rf.pkl']

In [18]:
# Train XGBoost on the integer-encoded labels and score it against the encoded validation labels.
# The label encoder is saved next to the model so predictions can be mapped back to class names.
# joblib.dump does not create missing directories, so make sure the target folder exists.
os.makedirs('saved_models', exist_ok=True)

pip_xgb.fit(X_train, y_train_enc)

xgb_metrics = getMetrics(pip_xgb, X_val, y_val_enc)

joblib.dump(pip_xgb, 'saved_models/xgb.pkl')
joblib.dump(label_encoder, 'saved_models/label_encoder.pkl')

['saved_models/label_encoder.pkl']

In [24]:
# Side-by-side comparison: the dict-of-dicts is laid out with one column per
# model, then transposed so each model becomes a row and each metric a column.
metrics_by_model = {
    'Logistic Regression': lr_metrics,
    'Decision Tree Classifier': dt_metrics,
    'K-Nearest Neighbor Classifier': knn_metrics,
    'Naive Bayes Classifier - Gaussian': gnb_metrics,
    'Ensemble Model - Random Forest': rf_metrics,
    'Ensemble Model - XGBoost': xgb_metrics,
}
pd.DataFrame(metrics_by_model).T

Unnamed: 0,Accuracy,AUC,Precision,Recall,F1-Score,Matthews Correlation Coefficient
Logistic Regression,0.8723,0.9881,0.8769,0.8723,0.8699,0.8522
Decision Tree Classifier,0.9362,0.9628,0.9367,0.9362,0.9362,0.9254
K-Nearest Neighbor Classifier,0.8203,0.9579,0.8118,0.8203,0.802,0.7938
Naive Bayes Classifier - Gaussian,0.5154,0.8911,0.5315,0.5154,0.4625,0.4473
Ensemble Model - Random Forest,0.9409,0.9959,0.9431,0.9409,0.9414,0.9311
Ensemble Model - XGBoost,0.9551,0.9985,0.9564,0.9551,0.955,0.9477
