In [5]:
# FINAL OPTIMIZED: SMOTE + Optuna + XGBoost (Akurasi target 85%)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# 1. Load Dataset
url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
df = pd.read_csv(url)

# 2. Bersihkan data: ubah 0 jadi NaN lalu isi median
cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in cols_with_zero:
    df[col] = df[col].replace(0, np.nan)
    df[col] = df[col].fillna(df[col].median())

# 3. Fitur dan label
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# 4. Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 5. Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 6. SMOTE
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# 7. Objective Function Optuna
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
    }

    model = XGBClassifier(
        **params,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    )
    model.fit(X_train_sm, y_train_sm)
    preds = model.predict(X_test)
    return accuracy_score(y_test, preds)

# 8. Jalankan Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# 9. Evaluasi Model Terbaik
print("Best Parameters:", study.best_params)
best_model = XGBClassifier(**study.best_params, use_label_encoder=False, eval_metric='logloss')
best_model.fit(X_train_sm, y_train_sm)
y_pred = best_model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("\nAkurasi Akhir:", acc)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

if acc >= 0.85:
    print("✓ Model berhasil mencapai ≥ 85% akurasi 🎉")
else:
    print("✗ Akurasi masih di bawah 85%, lanjutkan tuning.")


[I 2025-07-24 14:32:09,429] A new study created in memory with name: no-name-f2ed585a-b867-4b76-b7f5-67c380dda298
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-24 14:32:09,592] Trial 0 finished with value: 0.7207792207792207 and parameters: {'n_estimators': 279, 'max_depth': 9, 'learning_rate': 0.15296888273512782, 'subsample': 0.6776749995288943, 'colsample_bytree': 0.6628220537287431}. Best is trial 0 with value: 0.7207792207792207.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-24 14:32:09,683] Trial 1 finished with value: 0.7272727272727273 and parameters: {'n_estimators': 288, 'max_depth': 3, 'learning_rate': 0.06024347160310355, 'subsample': 0.6511823089248472, 'colsample_bytree': 0.9472324578636556}. Best is trial 1 with value: 0.7272727272727273.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-24 14:32:0

Best Parameters: {'n_estimators': 373, 'max_depth': 8, 'learning_rate': 0.1350726261405996, 'subsample': 0.9966545959871098, 'colsample_bytree': 0.7883851624400517}

Akurasi Akhir: 0.7077922077922078

Confusion Matrix:
 [[70 29]
 [16 39]]

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.71      0.76        99
           1       0.57      0.71      0.63        55

    accuracy                           0.71       154
   macro avg       0.69      0.71      0.70       154
weighted avg       0.73      0.71      0.71       154

✗ Akurasi masih di bawah 85%, lanjutkan tuning.
