In [None]:
# Final Project ML - Diabetes Classification (target: accuracy >= 85%)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# 1. Load the dataset
url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
df = pd.read_csv(url)

# 2. Treat 0 in these columns as a missing measurement and mark it as NaN.
#    The median fill itself is deferred to the pipeline below so that the
#    medians are computed from training rows only (no test-set leakage).
cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_zero] = df[cols_with_zero].replace(0, np.nan)

# 3. Inspect the data. The missing-value counts printed here are the zeros
#    marked in step 2; they are imputed inside the pipeline.
print("Jumlah duplikat:", df.duplicated().sum())
print("Missing values:\n", df.isnull().sum())

# 4. Features and target
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# 5. Train-test split on the raw features. Stratifying keeps the class ratio
#    of the held-out set the same as the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 6. Preprocessing + resampling + model in a single imblearn Pipeline.
#    Every step is re-fitted on the training portion of each CV fold, and the
#    SMOTE step is applied during fit only, so validation folds and the test
#    set are never imputed/scaled with their own statistics nor oversampled.
scaler = StandardScaler()
smote = SMOTE(random_state=42)
# `use_label_encoder` was dropped: the installed XGBoost reports it as unused.
xgb = XGBClassifier(random_state=42, eval_metric='logloss')
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', scaler),
    ('smote', smote),
    ('xgb', xgb),
])

# 7. Hyperparameter tuning with RandomizedSearchCV.
#    The `xgb__` prefix routes each parameter to the classifier step.
param_dist = {
    'xgb__n_estimators': [100, 200, 300, 400],
    'xgb__max_depth': [3, 4, 5, 6],
    'xgb__learning_rate': [0.01, 0.05, 0.1, 0.15],
    'xgb__subsample': [0.6, 0.8, 1.0],
    'xgb__colsample_bytree': [0.6, 0.8, 1.0],
}

search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=30, cv=3,
                            scoring='accuracy', n_jobs=-1, verbose=1, random_state=42)
search.fit(X_train, y_train)

# 8. Evaluation on the untouched test set. `best_model` is the whole fitted
#    pipeline, so it accepts the raw (NaN-marked, unscaled) test features.
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("\nBest Parameters:", search.best_params_)
print("Akurasi Akhir:", acc)
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", report)

# 9. Conclusion: compare the test accuracy with the 85% threshold.
if acc >= 0.85:
    print("✓ Akurasi ≥ 85% — Lulus syarat tugas!")
else:
    print("✗ Akurasi masih di bawah 85%, pertimbangkan tuning lebih lanjut.")


Jumlah duplikat: 0
Missing values:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
Fitting 3 folds for each of 27 candidates, totalling 81 fits

Best Parameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100}
Akurasi XGBoost (Tuned): 0.7272727272727273

=== Confusion Matrix ===
[[70 29]
 [13 42]]

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.84      0.71      0.77        99
           1       0.59      0.76      0.67        55

    accuracy                           0.73       154
   macro avg       0.72      0.74      0.72       154
weighted avg       0.75      0.73      0.73       154


=== Kesimpulan ===
✗ Masih di bawah 85%, coba lebih banyak parameter/tuning lanjut.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
