In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Lokasi file
file_path = "data/obesity.csv"
# Memuat dataset
df = pd.read_csv(file_path)

print("--- 1. Import & Load Dataset ---")
print("\n5 Baris Pertama Dataset:")
print(df.head())
print("\nInformasi Dataset (Tipe Data dan Non-Null Counts):")
df.info()

--- 1. Import & Load Dataset ---

5 Baris Pertama Dataset:
   Gender   Age  Height  Weight family_history_with_overweight FAVC  FCVC  \
0  Female  21.0    1.62    64.0                            yes   no   2.0   
1  Female  21.0    1.52    56.0                            yes   no   3.0   
2    Male  23.0    1.80    77.0                            yes   no   2.0   
3    Male  27.0    1.80    87.0                             no   no   3.0   
4    Male  22.0    1.78    89.8                             no   no   2.0   

   NCP       CAEC SMOKE  CH2O  SCC  FAF  TUE        CALC  \
0  3.0  Sometimes    no   2.0   no  0.0  1.0          no   
1  3.0  Sometimes   yes   3.0  yes  3.0  0.0   Sometimes   
2  3.0  Sometimes    no   2.0   no  2.0  1.0  Frequently   
3  3.0  Sometimes    no   2.0   no  2.0  0.0  Frequently   
4  1.0  Sometimes    no   2.0   no  0.0  0.0   Sometimes   

                  MTRANS           NObeyesdad  
0  Public_Transportation        Normal_Weight  
1  Public_Transportat

In [2]:
# Cek bentuk dataset
print("\nBentuk Dataset (Baris, Kolom):", df.shape)

# Cek deskripsi statistik
print("\nDeskripsi Statistik (Semua Fitur):")
print(df.describe(include='all'))

# Cek duplikasi data
duplicates = df.duplicated().sum()
print(f"\nJumlah Duplikasi Data: {duplicates}")
if duplicates > 0:
    df = df.drop_duplicates().reset_index(drop=True)
    print("Duplikasi telah dihapus.")
    print("Bentuk Dataset Setelah Duplikasi Dihapus:", df.shape)

# Identifikasi fitur numerik dan kategorikal
numeric_features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
categorical_features = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
target_column = 'NObeyesdad'

# Visualisasi 1: Histogram untuk semua fitur numerik
# Grafik telah disimpan sebagai 'histograms_numeric_features.png'
plt.figure(figsize=(15, 10))
for i, col in enumerate(numeric_features):
    plt.subplot(3, 3, i + 1)
    sns.histplot(df[col], kde=True)
    plt.title(f'Histogram Fitur: {col}')
    plt.xlabel(col)
    plt.ylabel('Frekuensi')
plt.tight_layout()
plt.savefig('histograms_numeric_features.png')
plt.close()

# Visualisasi 2: Countplot untuk target
# Grafik telah disimpan sebagai 'countplot_target.png'
plt.figure(figsize=(10, 6))
sns.countplot(y=df[target_column], order=df[target_column].value_counts().index, palette='viridis')
plt.title(f'Countplot untuk Target: {target_column}')
plt.xlabel('Jumlah')
plt.ylabel('Tingkat Obesitas')
plt.tight_layout()
plt.savefig('countplot_target.png')
plt.close()

# Cek korelasi antar fitur numerik (heatmap)
# Grafik telah disimpan sebagai 'heatmap_correlation_numeric.png'
correlation_matrix = df[numeric_features].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Korelasi Antar Fitur Numerik')
plt.tight_layout()
plt.savefig('heatmap_correlation_numeric.png')
plt.close()


Bentuk Dataset (Baris, Kolom): (2111, 17)

Deskripsi Statistik (Semua Fitur):
       Gender          Age       Height       Weight  \
count    2111  2111.000000  2111.000000  2111.000000   
unique      2          NaN          NaN          NaN   
top      Male          NaN          NaN          NaN   
freq     1068          NaN          NaN          NaN   
mean      NaN    24.312600     1.701677    86.586058   
std       NaN     6.345968     0.093305    26.191172   
min       NaN    14.000000     1.450000    39.000000   
25%       NaN    19.947192     1.630000    65.473343   
50%       NaN    22.777890     1.700499    83.000000   
75%       NaN    26.000000     1.768464   107.430682   
max       NaN    61.000000     1.980000   173.000000   

       family_history_with_overweight  FAVC         FCVC          NCP  \
count                            2111  2111  2111.000000  2111.000000   
unique                              2     2          NaN          NaN   
top                          


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(y=df[target_column], order=df[target_column].value_counts().index, palette='viridis')


In [3]:
# Pisahkan fitur (X) dan target (y)
X = df.drop(columns=[target_column])
y = df[target_column]

# Split dataset menjadi data training dan testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

# Preprocessor untuk fitur numerik: Normalisasi (StandardScaler)
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Preprocessor untuk fitur kategorikal: Encoding (OneHotEncoder)
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Gabungkan preprocessors menggunakan ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False # Untuk output nama kolom yang lebih bersih
)
preprocessor.set_output(transform="pandas")

X_train shape: (1669, 16), y_train shape: (1669,)
X_test shape: (418, 16), y_test shape: (418,)


0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [4]:
# Definisikan model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Buat Pipeline: Preprocessing + Model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# Train model pada data training
print("Melatih model...")
pipeline.fit(X_train, y_train)
print("Model selesai dilatih.")

# Prediksi pada data test
y_pred = pipeline.predict(X_test)

Melatih model...
Model selesai dilatih.


In [5]:
print("\n--- 5. Evaluasi Model ---")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Model (RandomForest): {accuracy:.4f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix (visualisasi heatmap)
# Grafik telah disimpan sebagai 'confusion_matrix_heatmap.png'
cm = confusion_matrix(y_test, y_pred)
class_names = pipeline.classes_
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Prediksi Kelas')
plt.ylabel('Kelas Aktual')
plt.tight_layout()
plt.savefig('confusion_matrix_heatmap.png')
plt.close()

# Feature Importance
feature_importances = pipeline['classifier'].feature_importances_
feature_names_processed = pipeline['preprocessor'].get_feature_names_out()
importance_series = pd.Series(feature_importances, index=feature_names_processed).sort_values(ascending=False)

# Tampilkan top 10 feature importance
print("\nFeature Importance (Top 10):")
print(importance_series.head(10))

# Visualisasi Feature Importance
# Grafik telah disimpan sebagai 'feature_importance.png'
plt.figure(figsize=(12, 6))
sns.barplot(x=importance_series.head(10).values, y=importance_series.head(10).index, palette='viridis')
plt.title('Top 10 Feature Importance (RandomForest)')
plt.xlabel('Kepentingan Fitur')
plt.ylabel('Fitur')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()


--- 5. Evaluasi Model ---
Accuracy Model (RandomForest): 0.9378

Classification Report:
                     precision    recall  f1-score   support

Insufficient_Weight       1.00      0.94      0.97        53
      Normal_Weight       0.78      0.91      0.84        57
     Obesity_Type_I       0.99      0.97      0.98        70
    Obesity_Type_II       1.00      1.00      1.00        60
   Obesity_Type_III       1.00      0.98      0.99        65
 Overweight_Level_I       0.91      0.87      0.89        55
Overweight_Level_II       0.91      0.86      0.88        58

           accuracy                           0.94       418
          macro avg       0.94      0.94      0.94       418
       weighted avg       0.94      0.94      0.94       418


Feature Importance (Top 10):
Weight           0.273263
Age              0.094382
FCVC             0.085312
Height           0.077979
NCP              0.051711
FAF              0.049592
TUE              0.046541
CH2O             0.044848


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=importance_series.head(10).values, y=importance_series.head(10).index, palette='viridis')


In [6]:
# Simpan pipeline model ke file menggunakan joblib.dump
model_filename = 'model_obesity.pkl'
joblib.dump(pipeline, model_filename)
print(f"\n--- 6. Simpan Model ---")
print(f"Model Pipeline (RandomForestClassifier) telah disimpan ke file: {model_filename}")


--- 6. Simpan Model ---
Model Pipeline (RandomForestClassifier) telah disimpan ke file: model_obesity.pkl
