# Alfredo Winston - 2702297776

In [63]:
from sklearn import set_config
set_config(display="diagram")

from warnings import filterwarnings
filterwarnings('ignore')

# Read Data

In [64]:
import pandas as pd

In [65]:
df = pd.read_csv("ObesityDataSet2.csv")

df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,24,1.58,65.29,yes,no,2.03,2.74,Sometimes,no,2.0,no,1.28,1.019,no,Public_Transportation,Overweight_Level_II
1,Male,23,1.65,66.0,no,no,3.0,3.0,Sometimes,no,2.0,no,3.0,0.0,no,Public_Transportation,Normal_Weight
2,Female,21,1.69,51.26,yes,yes,3.0,3.18,Frequently,no,1.91,no,0.48,0.625,no,Public_Transportation,Insufficient_Weight
3,Female,22,1.69,65.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Sometimes,Public_Transportation,Normal_Weight
4,Female,23,1.61,82.64,yes,yes,2.96,1.0,Sometimes,no,2.98,no,0.74,2.0,Sometimes,Public_Transportation,Obesity_Type_I


In [66]:
# Checking Data Info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056 entries, 0 to 1055
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          1056 non-null   object 
 1   Age                             1056 non-null   object 
 2   Height                          1056 non-null   float64
 3   Weight                          1056 non-null   float64
 4   family_history_with_overweight  1056 non-null   object 
 5   FAVC                            1056 non-null   object 
 6   FCVC                            1019 non-null   float64
 7   NCP                             1056 non-null   float64
 8   CAEC                            1056 non-null   object 
 9   SMOKE                           1056 non-null   object 
 10  CH2O                            1056 non-null   float64
 11  SCC                             1056 non-null   object 
 12  FAF                             10

In [67]:
# Checking Duplicate Data
duplicates = df.duplicated().sum()
duplicates

10

In [68]:
# Checking Data with Missing Value
df.isnull().sum()

Gender                             0
Age                                0
Height                             0
Weight                             0
family_history_with_overweight     0
FAVC                               0
FCVC                              37
NCP                                0
CAEC                               0
SMOKE                              0
CH2O                               0
SCC                                0
FAF                                0
TUE                                0
CALC                               0
MTRANS                            26
NObeyesdad                         0
dtype: int64

In [69]:
# Checking All Value per Column
for i in df.columns:
    print(f"Value counts untuk {i}:")
    print(df[i].value_counts())
    print()

Value counts untuk Gender:
Gender
Male      532
Female    524
Name: count, dtype: int64

Value counts untuk Age:
Age
21          128
23          118
18          107
26          102
19           79
22           73
20           69
24           42
17           36
25           35
30           29
31           29
38           23
27           21
33           19
34           15
32           14
29           13
16           12
37           11
39           11
28            9
41            9
35            9
40            8
21 years      4
55            3
44            3
36            2
46            2
19 years      2
45            2
23 years      2
22 years      2
42            2
33 years      1
48            1
51            1
43            1
17 years      1
15            1
26 years      1
36 years      1
20 years      1
25 years      1
14            1
Name: count, dtype: int64

Value counts untuk Height:
Height
1.75    63
1.70    62
1.65    51
1.76    50
1.62    44
1.63    43
1.60    41
1.71    3

In [70]:
# Drop Duplicate Data
df = df.drop_duplicates()

In [71]:
# Change value column age
df['Age'] = df['Age'].replace("20 years", "20")
df['Age'] = df['Age'].replace("21 years", "21")
df['Age'] = df['Age'].replace("19 years", "19")
df['Age'] = df['Age'].replace("23 years", "23")
df['Age'] = df['Age'].replace("22 years", "22")
df['Age'] = df['Age'].replace("33 years", "33")
df['Age'] = df['Age'].replace("17 years", "17")
df['Age'] = df['Age'].replace("26 years", "26")
df['Age'] = df['Age'].replace("36 years", "36")
df['Age'] = df['Age'].replace("25 years", "25")

In [72]:
df['Age'] = df['Age'].astype('float')

# Split Data

In [73]:
from sklearn.model_selection import train_test_split

In [74]:
X = df.drop('NObeyesdad', axis = 1)
y = df['NObeyesdad']

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [76]:
num_columns = ['Height','Weight','FCVC','NCP','CH2O','FAF','TUE']
cat_columns = ['Gender','Age','family_history_with_overweight','FAVC','SMOKE','SCC','MTRANS']
ordinal_columns = ['CAEC','CALC']

# Pre-Processing

In [77]:
from sklearn.pipeline import Pipeline
import pickle

In [78]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder

LabelEncode = LabelEncoder()

num_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
 ])

ordinal_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

In [79]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transform, num_columns),
        ('cat', cat_transform, cat_columns),
        ('ordinal', ordinal_transform, ordinal_columns)
    ]
)

In [80]:
preprocessor

In [81]:
preprocessor.fit(X_train)

with open("preprocessor.pkl", "wb") as f:
    pickle.dump(preprocessor, f)

# Model XGBoost

In [82]:
import xgboost as xgb

XGBestimator = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(random_state=42))
])

XGBestimator

In [83]:
y_trained_encoded = LabelEncode.fit_transform(y_train)
y_test_encoded = LabelEncode.fit_transform(y_test)

In [84]:
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(LabelEncode, f)

In [85]:
XGBestimator.fit(X_train, y_trained_encoded)

In [86]:
y_pred_XGB = XGBestimator.predict(X_test)

In [87]:
from sklearn.metrics import classification_report

report = classification_report(y_test_encoded, y_pred_XGB, target_names = ['Insufficient_Weight','Normal_Weight','Obesity_Type_I','Obesity_Type_II','Obesity_Type_III','Overweight_Level_I','Overweight_Level_II'])
print(report)

                     precision    recall  f1-score   support

Insufficient_Weight       1.00      0.96      0.98        26
      Normal_Weight       0.93      0.98      0.95        41
     Obesity_Type_I       0.92      0.97      0.95        36
    Obesity_Type_II       1.00      1.00      1.00        27
   Obesity_Type_III       1.00      1.00      1.00        28
 Overweight_Level_I       0.81      0.77      0.79        22
Overweight_Level_II       0.86      0.80      0.83        30

           accuracy                           0.93       210
          macro avg       0.93      0.93      0.93       210
       weighted avg       0.93      0.93      0.93       210



# Model Random Forest

In [88]:
from sklearn.ensemble import RandomForestClassifier

In [89]:
RFestimator = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

RFestimator

In [90]:
RFestimator.fit(X_train, y_trained_encoded)

In [91]:
y_pred_RF = RFestimator.predict(X_test)

In [92]:
from sklearn.metrics import classification_report

report = classification_report(y_test_encoded, y_pred_RF, target_names = ['Insufficient_Weight','Normal_Weight','Obesity_Type_I','Obesity_Type_II','Obesity_Type_III','Overweight_Level_I','Overweight_Level_II'])
print(report)

                     precision    recall  f1-score   support

Insufficient_Weight       0.93      0.96      0.94        26
      Normal_Weight       0.89      0.80      0.85        41
     Obesity_Type_I       0.82      0.89      0.85        36
    Obesity_Type_II       1.00      0.96      0.98        27
   Obesity_Type_III       1.00      1.00      1.00        28
 Overweight_Level_I       0.69      0.82      0.75        22
Overweight_Level_II       0.78      0.70      0.74        30

           accuracy                           0.87       210
          macro avg       0.87      0.88      0.87       210
       weighted avg       0.87      0.87      0.87       210



Berdasarkan hasil evaluasi model, XGBoost menunjukkan performa yang lebih unggul dibandingkan Random Forest. XGBoost mencapai akurasi 93%, sementara Random Forest hanya 87%. Selain itu, macro avg dan weighted avg F1-score XGBoost mencapai 0.93, sementara Random Forest hanya 0.87. Hal ini menunjukkan XGBoost merupakan model yang lebih baik dibandingkan Random Forest untuk dataset kali ini.

In [93]:
xgb_model = XGBestimator.named_steps['classifier']

with open("xgb_model.pkl", "wb") as f:
    pickle.dump(xgb_model, f)