In [None]:
# FINAL - Klasifikasi Diabetes dengan Dataset Kaggle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# 1. Load the Kaggle dataset
url = "https://raw.githubusercontent.com/didtar/ml-dataset-diabetes/main/diabetes_binary_health_indicators_BRFSS2015.csv"
df = pd.read_csv(url)
print("Ukuran data:", df.shape)

# 2. Inspect and summarise the data
print(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
print(df['Diabetes_binary'].value_counts())

# 3. Separate features and label
X = df.drop('Diabetes_binary', axis=1)
y = df['Diabetes_binary']

# 4. Train-test split
# Split BEFORE scaling so that no statistic of the test rows influences the
# preprocessing. Stratify on the label so both partitions keep the same class
# ratio, which matters because the classes are rebalanced with SMOTE below.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Standardisation
# Fit the scaler on the training rows only, then apply the same transform to
# the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept for backward compatibility with any later code that reads X_scaled:
# the full feature matrix transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)

# 6. Balance the classes with SMOTE (training data only; the test set keeps
# its natural class distribution)
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# 7. Optuna: XGBoost hyperparameter tuning
# Cache for the tuning folds so the split and the SMOTE resampling run once
# rather than once per trial. Re-running this cell resets the cache.
_tuning_data = {}


def _get_tuning_data():
    """Return the folds used to score hyperparameter trials.

    The folds are carved out of the module-level ``X_train``/``y_train`` only.
    SMOTE is applied to the fitting fold alone, so the validation fold contains
    no synthetic samples and keeps the natural class distribution.

    Returns:
        dict with keys ``X_fit``, ``y_fit`` (SMOTE-balanced fitting fold) and
        ``X_val``, ``y_val`` (untouched validation fold).
    """
    if not _tuning_data:
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
        )
        X_fit_sm, y_fit_sm = SMOTE(random_state=42).fit_resample(X_fit, y_fit)
        _tuning_data.update(
            X_fit=X_fit_sm, y_fit=y_fit_sm, X_val=X_val, y_val=y_val
        )
    return _tuning_data


def objective(trial):
    """Optuna objective: validation accuracy of an XGBoost classifier.

    Scores each trial on a validation fold taken from the training data
    instead of on ``X_test``. Selecting hyperparameters on the test set would
    make the final test accuracy an optimistically biased estimate.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted model on the validation fold (to be maximised).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
    }

    data = _get_tuning_data()
    # use_label_encoder is dropped: the installed XGBoost reports it as an
    # unused parameter and warns on every fit.
    model = XGBClassifier(**params, eval_metric='logloss', random_state=42)
    model.fit(data['X_fit'], data['y_fit'])
    preds = model.predict(data['X_val'])
    return accuracy_score(data['y_val'], preds)

# Seed the sampler so the hyperparameter search is reproducible, in line with
# the fixed random_state used everywhere else in this script.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# 8. Evaluate the best model
print("Best Params:", study.best_params)
# use_label_encoder is dropped: the installed XGBoost reports it as an unused
# parameter and emits a warning on every fit.
best_model = XGBClassifier(**study.best_params, eval_metric='logloss', random_state=42)
# Refit on the full SMOTE-balanced training set, then score once on the
# held-out test set.
best_model.fit(X_train_sm, y_train_sm)
y_pred = best_model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("\nAkurasi Akhir:", acc)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Report whether the 85% accuracy target was reached.
if acc >= 0.85:
    print("✓ Model berhasil mencapai ≥ 85% akurasi 🎉")
else:
    print("✗ Masih di bawah 85%, coba lebih banyak tuning atau feature engineering.")


[I 2025-07-24 22:03:32,651] A new study created in memory with name: no-name-e5abe85b-aeb0-4ece-8497-0f3667748f96
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-24 22:03:32,882] Trial 0 finished with value: 0.7077922077922078 and parameters: {'n_estimators': 174, 'max_depth': 7, 'learning_rate': 0.07478313094530792, 'subsample': 0.8674511104623697, 'colsample_bytree': 0.6195358822482193}. Best is trial 0 with value: 0.7077922077922078.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-24 22:03:33,126] Trial 1 finished with value: 0.7402597402597403 and parameters: {'n_estimators': 375, 'max_depth': 6, 'learning_rate': 0.01008029600841594, 'subsample': 0.6386782251454993, 'colsample_bytree': 0.825307119778613}. Best is trial 1 with value: 0.7402597402597403.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-24 22:03:33


Akurasi Akhir: 0.7468

Best Params: {'n_estimators': 202, 'max_depth': 8, 'learning_rate': 0.1993936081371597, 'subsample': 0.9432130427529863, 'colsample_bytree': 0.751398280639586}

Confusion Matrix:
 [[75 24]
 [15 40]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.76      0.79        99
           1       0.62      0.73      0.67        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154

✗ Akurasi masih di bawah 85%, tuning lebih lanjut diperlukan.
