In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import pickle


In [13]:
# Load the raw survey data; adjust the path below to match your file name.
csv_path = "Dhaka_Obesity.csv"
df = pd.read_csv(csv_path)

# Preview the first rows to confirm the file parsed as expected.
df.head()


Unnamed: 0,Gender,Age,Height (m),Weight (kg),Family history of overweight,High caloric food consumption,Vegetable consumption frequency,Daily main meals frequency,Between-meal food consumption frequency,Smoking,Alcohol intake,Daily water intake,Monitor calories,Physical exercise,Daily device usage duration,Mode of transportation,Obesity level
0,Male,29,1.65,101.0,Yes,Yes,Sometimes,Three,Frequently,No,I do not drink,Between 1 and 2 L,No,I do not have,More than 5 hours,Private Car,Obesity_Type_II
1,Female,25,1.65,53.0,No,No,Always,Three,Always,No,I do not drink,Between 1 and 2 L,No,Almost Everyday,More than 5 hours,Public Transportation,Normal_Weight
2,Male,23,1.7,70.0,No,No,Always,Between 1-2,Sometimes,Yes,Sometimes,Less than a liter,No,I do not have,More than 5 hours,Public Transportation,Normal_Weight
3,Male,22,1.68,112.0,Yes,Yes,Sometimes,Three,Frequently,No,I do not drink,Between 1 and 2 L,No,I do not have,More than 5 hours,Public Transportation,Obesity_Type_II
4,Male,19,1.75,67.0,No,Yes,Always,Three,Sometimes,Yes,I do not drink,Between 1 and 2 L,No,2 or 4 days,3-5 hours,Bike,Normal_Weight


In [14]:
# Replace the verbose survey headers with short identifiers.
# The assignment is positional: the names below must stay in the same order
# as the columns of the loaded file, and "Obesity" (last) is the target.
short_names = [
    # demographics / body measurements
    "Gender", "Age", "Height", "Weight",
    # family history and eating habits
    "FamilyHistory", "HighCaloricFood", "Vegetables", "MainMeals", "Snacks",
    # lifestyle
    "Smoking", "Alcohol", "Water", "Monitor", "Exercise", "Devices", "Transport",
    # target label
    "Obesity",
]
df.columns = short_names


In [15]:
# Label-encode every text column of `df` in place (features and the "Obesity"
# target alike) and keep each fitted encoder in `le_dict`, keyed by column
# name, so the integer codes can be mapped back to the original labels later.
#
# NOTE: this cell mutates `df` and resets `le_dict`. Re-running it on the
# already-encoded frame finds no text columns and leaves `le_dict` empty, so
# reload the data before running it a second time.
le_dict = {}

# Detect text columns with the dtype helpers instead of `dtype == "object"`:
# pandas versions that infer a dedicated string dtype for text would make the
# equality check match nothing, silently skipping the encoding altogether.
text_columns = [
    col
    for col in df.columns
    if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])
]

for col in text_columns:
    le = LabelEncoder()
    # Cast to str so the encoder always receives uniformly typed values.
    df[col] = le.fit_transform(df[col].astype(str))
    le_dict[col] = le

# Check the result
df.head()


Unnamed: 0,Gender,Age,Height,Weight,FamilyHistory,HighCaloricFood,Vegetables,MainMeals,Snacks,Smoking,Alcohol,Water,Monitor,Exercise,Devices,Transport,Obesity
0,1,29,1.65,101.0,1,1,2,2,1,0,2,0,0,4,2,1,3
1,0,25,1.65,53.0,0,0,0,2,0,0,2,0,0,3,2,2,1
2,1,23,1.7,70.0,0,0,0,0,3,1,3,1,0,4,2,2,1
3,1,22,1.68,112.0,1,1,2,2,1,0,2,0,0,4,2,2,3
4,1,19,1.75,67.0,0,1,0,2,3,1,2,0,0,1,1,0,1


In [16]:
# Build the train/test matrices: split first, then oversample and scale using
# statistics from the training portion only, so the test set stays untouched.
RANDOM_STATE = 42  # shared seed so the split and the resampling are repeatable

X = df.drop("Obesity", axis=1)
y = df["Obesity"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# SMOTE: oversample the training split only. The sampler is seeded because an
# unseeded SMOTE yields a different synthetic training set (and therefore
# different model metrics) on every run.
# NOTE(review): X contains label-encoded categorical columns; SMOTE
# interpolates between neighbours, so synthetic rows can carry fractional
# category codes. Consider SMOTENC if that matters here - confirm.
sm = SMOTE(random_state=RANDOM_STATE)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

# Scaling: fit on the resampled training data, then apply the same transform
# to the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sm)
X_test_scaled = scaler.transform(X_test)


In [17]:
# Train logistic regression on the resampled, scaled training data.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression(max_iter=2000).fit(X_train_scaled, y_train_sm)

# Evaluate on the held-out test split (scaled with the training statistics).
y_pred = model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.8878718535469108
              precision    recall  f1-score   support

           0       0.87      0.96      0.91        70
           1       0.90      0.84      0.87       111
           2       0.89      0.87      0.88        78
           3       0.89      0.89      0.89        54
           4       0.94      1.00      0.97        51
           5       0.84      0.84      0.84        73

    accuracy                           0.89       437
   macro avg       0.89      0.90      0.89       437
weighted avg       0.89      0.89      0.89       437



In [18]:
# Persist everything needed at inference time: the fitted model, the scaler
# fitted on the training data, and the per-column label encoders.
artifacts = {
    "logistic_model.pkl": model,
    "scaler.pkl": scaler,
    "labelencoders.pkl": le_dict,
}

# Write each object to its own pickle file in the working directory.
for filename, obj in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(obj, f)