## Import Libraries

In [145]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler

## Data Exploration

In [146]:
df = pd.read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")

In [147]:
df.shape

(2111, 17)

In [148]:
df.head(5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [149]:
df.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.315964,1.70162,86.586035,2.418986,2.685651,2.008053,1.010313,0.657861
std,6.357078,0.093368,26.191163,0.533996,0.778079,0.61295,0.850613,0.608926
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,20.0,1.63,65.47,2.0,2.66,1.585,0.125,0.0
50%,23.0,1.7,83.0,2.39,3.0,2.0,1.0,0.625
75%,26.0,1.77,107.43,3.0,3.0,2.48,1.67,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [150]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   int64  
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [151]:
df.isnull().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [152]:
df["NObeyesdad"].value_counts()

NObeyesdad
Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: count, dtype: int64

## Preprocessing

In [153]:
nominal_cats = [
    "Gender", 
    "family_history_with_overweight", 
    "FAVC", 
    "CAEC", 
    "SMOKE", 
    "SCC", 
    "CALC", 
    "MTRANS", 
]

ordinal_cats = ["FCVC", "NCP", "CH2O", "FAF", "TUE"]

In [154]:
cat_encoder = OneHotEncoder()
df_encoded = pd.DataFrame(cat_encoder.fit_transform(df[nominal_cats]).toarray(),
                            columns=cat_encoder.get_feature_names_out(),
                            index=df.index)
df = df.drop(columns=nominal_cats).join(df_encoded)
df

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,NObeyesdad,Gender_Female,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,21,1.62,64.00,2.0,3.0,2.00,0.00,1.000,Normal_Weight,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,21,1.52,56.00,3.0,3.0,3.00,3.00,0.000,Normal_Weight,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,23,1.80,77.00,2.0,3.0,2.00,2.00,1.000,Normal_Weight,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,27,1.80,87.00,3.0,3.0,2.00,2.00,0.000,Overweight_Level_I,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,22,1.78,89.80,2.0,1.0,2.00,0.00,0.000,Overweight_Level_II,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,21,1.71,131.41,3.0,3.0,1.73,1.68,0.906,Obesity_Type_III,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2107,22,1.75,133.74,3.0,3.0,2.01,1.34,0.599,Obesity_Type_III,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2108,23,1.75,133.69,3.0,3.0,2.05,1.41,0.646,Obesity_Type_III,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2109,24,1.74,133.35,3.0,3.0,2.85,1.14,0.586,Obesity_Type_III,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [155]:
for col in ordinal_cats:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
df

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,NObeyesdad,Gender_Female,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,21,1.62,64.00,80,176,100,0,567,Normal_Weight,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,21,1.52,56.00,179,176,200,256,0,Normal_Weight,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,23,1.80,77.00,80,176,100,197,567,Normal_Weight,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,27,1.80,87.00,179,176,100,197,0,Overweight_Level_I,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,22,1.78,89.80,80,0,100,0,0,Overweight_Level_II,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,21,1.71,131.41,179,176,73,167,512,Obesity_Type_III,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2107,22,1.75,133.74,179,176,101,133,323,Obesity_Type_III,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2108,23,1.75,133.69,179,176,105,140,344,Obesity_Type_III,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2109,24,1.74,133.35,179,176,185,113,317,Obesity_Type_III,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [156]:
num_cols = df.select_dtypes(include=['number']).columns

min_max_scaler = MinMaxScaler(feature_range=(0, 1))
df[num_cols] = min_max_scaler.fit_transform(df[num_cols])
df

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,NObeyesdad,Gender_Female,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,0.148936,0.320755,0.186567,0.446927,0.690196,0.500,0.000000,0.698276,Normal_Weight,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.148936,0.132075,0.126866,1.000000,0.690196,1.000,1.000000,0.000000,Normal_Weight,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.191489,0.660377,0.283582,0.446927,0.690196,0.500,0.769531,0.698276,Normal_Weight,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.276596,0.660377,0.358209,1.000000,0.690196,0.500,0.769531,0.000000,Overweight_Level_I,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.170213,0.622642,0.379104,0.446927,0.000000,0.500,0.000000,0.000000,Overweight_Level_II,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0.148936,0.490566,0.689627,1.000000,0.690196,0.365,0.652344,0.630542,Obesity_Type_III,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2107,0.170213,0.566038,0.707015,1.000000,0.690196,0.505,0.519531,0.397783,Obesity_Type_III,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2108,0.191489,0.566038,0.706642,1.000000,0.690196,0.525,0.546875,0.423645,Obesity_Type_III,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2109,0.212766,0.547170,0.704104,1.000000,0.690196,0.925,0.441406,0.390394,Obesity_Type_III,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Model Training

In [157]:
X = df.drop(columns=["NObeyesdad"])  # Features
y = df["NObeyesdad"]  # Target

model = LogisticRegression(max_iter=1000, multi_class="multinomial", solver="lbfgs")
model.fit(X, y)
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)



## Performance Metrics

In [158]:
score = accuracy_score(y_test, y_pred)
print(score)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Classification Report
print(classification_report(y_test, y_pred))

0.8652482269503546
[[56  0  0  0  0  0  0]
 [ 9 38  0  0  0 12  3]
 [ 0  0 69  7  0  0  2]
 [ 0  0  2 56  0  0  0]
 [ 0  0  0  0 63  0  0]
 [ 0  4  0  0  0 41 11]
 [ 0  0  3  0  0  4 43]]
                     precision    recall  f1-score   support

Insufficient_Weight       0.86      1.00      0.93        56
      Normal_Weight       0.90      0.61      0.73        62
     Obesity_Type_I       0.93      0.88      0.91        78
    Obesity_Type_II       0.89      0.97      0.93        58
   Obesity_Type_III       1.00      1.00      1.00        63
 Overweight_Level_I       0.72      0.73      0.73        56
Overweight_Level_II       0.73      0.86      0.79        50

           accuracy                           0.87       423
          macro avg       0.86      0.87      0.86       423
       weighted avg       0.87      0.87      0.86       423

