# In [None]:  (Jupyter notebook cell marker, kept as a comment so the file parses as Python)
# FINAL PROJECT - Optimized All Models for >85% Accuracy
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Read the BRFSS 2015 diabetes indicators CSV from the local kagglehub cache
# and discard exact duplicate rows in one chained expression.
path = r"C:\Users\ASUS\.cache\kagglehub\datasets\alexteboul\diabetes-health-indicators-dataset\versions\1\diabetes_binary_health_indicators_BRFSS2015.csv"
df = pd.read_csv(path).drop_duplicates()

# Target is the "Diabetes_binary" column; every remaining column is a feature.
y = df["Diabetes_binary"]
X = df.drop("Diabetes_binary", axis=1)

# Split -> scale -> resample, in that order, so that no information from the
# test rows reaches anything that is fitted.
#
# Split the *unscaled* features first. The split depends only on the number
# of rows, `y` (used for stratification) and `random_state`, so the same rows
# end up in the train and test sets as when the pre-scaled matrix was split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows. Fitting it on the full dataset would let test-set statistics
# leak into preprocessing and bias the reported scores.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix transformed with the train-fitted scaler. The model
# code in this script does not use it; it is kept only so that any other code
# that still refers to `X_scaled` keeps working.
X_scaled = scaler.transform(X)

# Oversample with SMOTE on the training split only; the test split is left
# untouched so it keeps its original class distribution.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Logistic Regression (C=0.1, max_iter=1000): fit on the SMOTE-resampled
# training data, then score accuracy on the held-out test split.
lr = LogisticRegression(max_iter=1000, C=0.1).fit(X_train_sm, y_train_sm)
lr_test_pred = lr.predict(X_test)
acc_lr = accuracy_score(y_true=y_test, y_pred=lr_test_pred)

# Random Forest: hyperparameters collected in one place, model fit on the
# SMOTE-resampled training data and scored on the held-out test split.
rf_params = {
    "n_estimators": 300,
    "max_depth": 20,
    "min_samples_split": 5,
    "random_state": 42,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_sm, y_train_sm)
rf_test_pred = rf.predict(X_test)
acc_rf = accuracy_score(y_true=y_test, y_pred=rf_test_pred)

# XGBoost: hyperparameters collected in one place, model fit on the
# SMOTE-resampled training data and scored on the held-out test split.
# NOTE(review): `use_label_encoder` is reported as deprecated in recent
# XGBoost releases -- confirm against the installed version before removing.
xgb_params = {
    "n_estimators": 200,
    "learning_rate": 0.1,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 1,
    "use_label_encoder": False,
    "eval_metric": 'logloss',
    "random_state": 42,
}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train_sm, y_train_sm)
xgb_test_pred = xgb.predict(X_test)
acc_xgb = accuracy_score(y_true=y_test, y_pred=xgb_test_pred)

# Report the test-set accuracy of each model, one line per model.
for report_line in (
    f"Akurasi Logistic Regression: {acc_lr}",
    f"Akurasi Random Forest: {acc_rf}",
    f"Akurasi XGBoost: {acc_xgb}",
):
    print(report_line)
