In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [49]:
# 1️⃣ Load the dataset
# The path is relative, so it resolves against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer="datasets/diabetes_clean.csv")

In [50]:
# 2️⃣ Separate the label from the features
target = "diabetes"
y = df[target]                 # label column
X = df.drop(columns=target)    # every remaining column is a feature

In [51]:
# 3️⃣ Quick EDA focused on modeling decisions
# - Check imbalance: share of rows per target value (normalize=True yields fractions, not counts)
imbalance = y.value_counts(normalize=True)
# sep="\n" puts the Series on its own line. Passing "...:\n" together with the Series
# under the default separator inserted a space after the newline, which indented the
# first line of the printed Series.
print("Target distribution:", imbalance, sep="\n")

Target distribution:
 diabetes
0    0.651042
1    0.348958
Name: proportion, dtype: float64


In [52]:
# - Names of the numeric-dtype feature columns (the ones handed to the scaler)
numerical_cols = (
    X.select_dtypes(include=[np.number])
    .columns
    .tolist()
)

In [53]:
# 4️⃣ Hold out 20% of the rows as a test set; stratifying on y keeps the class
#    proportions the same in both parts (important for imbalance)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [54]:
# 5️⃣ Feature scaling (optional for a tree-based RandomForest; kept as a good habit
#    for other models)
# The scaler learns its statistics from the training rows only and is then applied
# unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numerical_cols])
X_test_scaled = scaler.transform(X_test[numerical_cols])

# Wrap the scaled arrays back into frames that carry the original row index and
# column names, so they line up with the unscaled frames on assignment.
X_train_scaled_df = pd.DataFrame(
    data=X_train_scaled, index=X_train.index, columns=numerical_cols
)
X_test_scaled_df = pd.DataFrame(
    data=X_test_scaled, index=X_test.index, columns=numerical_cols
)

# Work on copies so X_train / X_test keep their raw values; only the numerical
# columns are overwritten with their scaled versions.
X_train_final = X_train.copy()
X_train_final[numerical_cols] = X_train_scaled_df

X_test_final = X_test.copy()
X_test_final[numerical_cols] = X_test_scaled_df

In [55]:
# 6️⃣ Baseline model: RandomForest with a few hand-picked settings
#    - n_estimators=200      : more trees for stability
#    - max_depth=5           : shallow trees to limit overfitting
#    - class_weight=balanced : compensate for the target imbalance
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    class_weight='balanced',
    random_state=42,
).fit(X_train_final, y_train)

# 7️⃣ Predict labels for the held-out rows
y_pred = model.predict(X_test_final)

# 8️⃣ Evaluate against the true test labels
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.87      0.78      0.82       100
           1       0.66      0.78      0.71        54

    accuracy                           0.78       154
   macro avg       0.76      0.78      0.77       154
weighted avg       0.79      0.78      0.78       154

=== Confusion Matrix ===
[[78 22]
 [12 42]]


In [56]:
# 9️⃣ Optional: stratified 5-fold cross-validation as a robustness check (F1 per fold)
# NOTE(review): scoring runs on the full, unscaled X and y (hold-out rows included),
# whereas the hold-out evaluation used X_train_final / X_test_final — confirm this
# difference is intended.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(estimator=model, X=X, y=y, scoring='f1', cv=cv)
print("5-fold CV F1 scores:", cv_scores)
print("Mean CV F1:", np.mean(cv_scores))


5-fold CV F1 scores: [0.71929825 0.69724771 0.67226891 0.69026549 0.64516129]
Mean CV F1: 0.6848483273294647
