In [None]:
# FINAL - Capai ≥ 85% akurasi dengan SMOTE + GridSearchCV + XGBoost
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# End-to-end script: load the diabetes CSV, hold out a stratified test set,
# tune an impute -> scale -> SMOTE -> XGBoost pipeline with GridSearchCV and
# report hold-out accuracy, confusion matrix and classification report.
#
# Every data-dependent preprocessing step lives inside the pipeline so that it
# is re-fitted on the training part of each CV fold only. Fitting the imputer,
# the scaler or SMOTE before splitting would leak validation/test information
# into training and inflate the reported scores.

# 1. Load dataset
url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
df = pd.read_csv(url)

# 2. Treat zeros in these columns as missing values (NaN).
#    This is a fixed, row-wise rule, so applying it before the split is safe;
#    the actual median imputation happens later, inside the pipeline.
cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_zero] = df[cols_with_zero].replace(0, np.nan)

# 3. Initial checks (missing counts are reported BEFORE imputation so the
#    numbers reflect how much data is actually missing)
print("Jumlah duplikat:", df.duplicated().sum())
print("Missing values:\n", df.isnull().sum())

# 4. Features and target
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# 5. Split first; stratify so train and test keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 6. Pipeline steps. SMOTE is a sampler: the imblearn Pipeline applies it only
#    during fit, never when predicting, so validation/test rows stay untouched.
scaler = StandardScaler()
smote = SMOTE(random_state=42)
# `use_label_encoder` is deprecated/ignored by current XGBoost releases, so it
# is no longer passed.
xgb = XGBClassifier(random_state=42, eval_metric='logloss')

pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', scaler),
    ('smote', smote),
    ('xgb', xgb),
])

# 7. Grid search over the XGBoost step (keys are prefixed with the step name)
params = {
    'xgb__n_estimators': [100, 200, 300],
    'xgb__max_depth': [4, 5, 6],
    'xgb__learning_rate': [0.01, 0.05, 0.1],
}

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Strip the "xgb__" step prefix so the printed dict shows plain XGBoost names
best_params = {name.split('__', 1)[-1]: value for name, value in grid.best_params_.items()}
print("\nBest Parameters:", best_params)

# 8. Evaluate the refitted best pipeline on the untouched hold-out set
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Akurasi XGBoost (Tuned):", acc)

print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

# 9. Conclusion against the 85% accuracy target
print("\n=== Kesimpulan ===")
if acc >= 0.85:
    print("✓ Model berhasil mencapai ≥ 85% akurasi — Syarat tugas terpenuhi!")
else:
    print("✗ Masih di bawah 85%, coba lebih banyak parameter/tuning lanjut.")


Jumlah duplikat: 0
Missing values:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
Fitting 3 folds for each of 27 candidates, totalling 81 fits
