In [None]:
# FINAL: Optuna + SMOTE + XGBoost for Diabetes Classification (target accuracy ≥ 85%)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import optuna

# 1. Load dataset
url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
df = pd.read_csv(url)

# 2. Treat zeros in these columns as missing values (imputed in step 4)
cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_zero] = df[cols_with_zero].replace(0, np.nan)
y = df['Outcome']

# 3. Choose the train/test rows BEFORE fitting any statistics, so the medians
#    and the scaler below never see the held-out rows (avoids data leakage).
#    Stratify so both partitions keep the same class ratio.
row_positions = np.arange(len(df))
train_pos, test_pos = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=y
)

# 4. Impute missing values with medians computed on the training rows only
train_medians = df.iloc[train_pos][cols_with_zero].median()
df[cols_with_zero] = df[cols_with_zero].fillna(train_medians)

# 5. Features and target
X = df.drop('Outcome', axis=1)

# 6. Scaling: fit on the training rows only, then transform every row.
#    X_scaled is kept as the scaled version of the full feature matrix.
scaler = StandardScaler()
scaler.fit(X.iloc[train_pos])
X_scaled = scaler.transform(X)

X_train, X_test = X_scaled[train_pos], X_scaled[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# 7. SMOTE: oversample the training partition only; the test set is untouched
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# 7. Optuna Optimization
def objective(trial):
    """Optuna objective: cross-validated accuracy of an XGBoost classifier.

    Samples one hyperparameter configuration from ``trial`` and scores it
    with stratified 5-fold cross-validation on the training partition
    (module-level ``X_train`` / ``y_train``). The held-out ``X_test`` is
    deliberately not used here: tuning against it would make the final test
    accuracy an optimistically biased estimate.

    SMOTE runs inside an imbalanced-learn ``Pipeline`` so that, within each
    fold, synthetic samples are generated from that fold's training part
    only and the validation part contains real samples only.

    Args:
        trial: The ``optuna.trial.Trial`` used to sample hyperparameters.

    Returns:
        Mean accuracy across the cross-validation folds, as a float.
    """
    # Local imports: these names are not among the file-level imports, and
    # both packages are already dependencies of this script.
    from imblearn.pipeline import Pipeline
    from sklearn.model_selection import StratifiedKFold, cross_val_score

    # 'use_label_encoder' is intentionally not passed: XGBoost reports it as
    # an unused parameter and emits a warning on every fit.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'eval_metric': 'logloss',
        'random_state': 42,
    }

    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('xgb', XGBClassifier(**params)),
    ])
    # Fixed seed so every trial is scored on the same folds.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy')
    return float(scores.mean())

# 8. Run the optimization (n_trials can be increased for a more thorough search).
#    The sampler is seeded, like every other random component in this script,
#    so repeated runs explore the same sequence of hyperparameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# 9. Evaluate the best model
# The attribute is `best_params` (no trailing underscore); `best_params_`
# raises AttributeError on an Optuna Study.
best_params = study.best_params
print("Best Parameters:", best_params)

# Re-create the model with the same fixed settings used during tuning
# (eval_metric, random_state) so the final fit matches the tuned configuration.
# 'use_label_encoder' is omitted because XGBoost ignores it with a warning.
best_model = XGBClassifier(**best_params, eval_metric='logloss', random_state=42)
best_model.fit(X_train_sm, y_train_sm)

# Score on the held-out test partition
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Akurasi Akhir:", acc)

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Report whether the 85% accuracy target was reached
if acc >= 0.85:
    print("✓ Model berhasil capai akurasi ≥ 85% ✅")
else:
    print("✗ Masih di bawah 85%, coba tuning lebih lanjut.")


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-07-24 14:19:25,055] A new study created in memory with name: no-name-d1fc3e52-5062-4d8c-b3f2-e10c661e99b1
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-24 14:19:25,220] Trial 0 finished with value: 0.7467532467532467 and parameters: {'n_estimators': 221, 'max_depth': 3, 'learning_rate': 0.19934060066277917, 'subsample': 0.716071487835042, 'colsample_bytree': 0.6101802191598891, 'gamma': 3.4254630565247606, 'reg_alpha': 3.124266763228278, 'reg_lambda': 4.043710309081582}. Best is trial 0 with value: 0.7467532467532467.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-24 14:19:25,324] Trial 1 finished with value: 0.7272727272727273 and parameters: {'n_estimators': 397, 'max_depth': 3, 'learning_rate': 0.016443281874645676, 'subsample': 0.8408706239621341, 'colsample_bytree': 0.9442347641376332, 'gamma': 1.9206158630

AttributeError: 'Study' object has no attribute 'best_params_'