In [1]:
# FINAL OPTIMIZED: SMOTE + Optuna + XGBoost (Akurasi target 85%)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# 1. Load Dataset
url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
df = pd.read_csv(url)

# 2. Bersihkan data: ubah 0 jadi NaN lalu isi median
cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in cols_with_zero:
    df[col] = df[col].replace(0, np.nan)
    df[col] = df[col].fillna(df[col].median())

# 3. Fitur dan label
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# 4. Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 5. Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 6. SMOTE
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# 7. Objective Function Optuna
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 400),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
    }

    model = XGBClassifier(
        **params,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    )
    model.fit(X_train_sm, y_train_sm)
    preds = model.predict(X_test)
    return accuracy_score(y_test, preds)

# 8. Jalankan Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# 9. Evaluasi Model Terbaik
print("Best Parameters:", study.best_params)
best_model = XGBClassifier(**study.best_params, use_label_encoder=False, eval_metric='logloss')
best_model.fit(X_train_sm, y_train_sm)
y_pred = best_model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("\nAkurasi Akhir:", acc)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

if acc >= 0.85:
    print("✓ Model berhasil mencapai ≥ 85% akurasi 🎉")
else:
    print("✗ Akurasi masih di bawah 85%, lanjutkan tuning.")


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-07-24 21:57:39,050] A new study created in memory with name: no-name-7a263f51-b9db-4af8-857d-0acf36608757
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-24 21:57:39,325] Trial 0 finished with value: 0.7207792207792207 and parameters: {'n_estimators': 140, 'max_depth': 4, 'learning_rate': 0.06078249685731696, 'subsample': 0.924205707863728, 'colsample_bytree': 0.9425200694537555}. Best is trial 0 with value: 0.7207792207792207.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-24 21:57:39,512] Trial 1 finished with value: 0.7402597402597403 and parameters: {'n_estimators': 167, 'max_depth': 10, 'learning_rate': 0.014814455445553498, 'subsample': 0.8616027029276205, 'colsample_bytree': 0.9482110307591304}. Best is trial 1 with value: 0.7402597402597403.
Parameters: { "use_label_encoder" } are not used.

  bst.update(d

Best Parameters: {'n_estimators': 276, 'max_depth': 10, 'learning_rate': 0.045039131152442555, 'subsample': 0.9825797873314692, 'colsample_bytree': 0.9987384708574336}

Akurasi Akhir: 0.7467532467532467

Confusion Matrix:
 [[73 26]
 [13 42]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.74      0.79        99
           1       0.62      0.76      0.68        55

    accuracy                           0.75       154
   macro avg       0.73      0.75      0.74       154
weighted avg       0.77      0.75      0.75       154

✗ Akurasi masih di bawah 85%, lanjutkan tuning.
