In [1]:
import pandas as pd
import joblib
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE # <-- This is the new library

print("Loading dataset...")
df = pd.read_csv('diabetes_012_health_indicators_BRFSS2015.csv')

# All columns are already numbers, no get_dummies() needed!
# 1. Define Features (X) and Target (y)
X = df.drop('Diabetes_012', axis=1)
y = df['Diabetes_012']

Loading dataset...


In [2]:
MODEL_COLUMNS = X.columns.tolist()
with open('diabetes_model_columns.json', 'w') as f:
    json.dump(MODEL_COLUMNS, f)

In [3]:
# 2. Split data BEFORE balancing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# 3. Scale the data (important for many models)
# This standardizes all features to have a similar scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# 4. --- Handle Class Imbalance with SMOTE ---
print(f"Balancing data with SMOTE...")
smote = SMOTE(random_state=42)
# We ONLY fit SMOTE on the training data
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

Balancing data with SMOTE...


[WinError 2] The system cannot find the file specified
  File "C:\Users\mhdro\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\mhdro\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\mhdro\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                   

In [6]:
#5. Train the Classifier
print("Training Random Forest Classifier...")
# Use RandomForestClassifier this time
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1) 
model.fit(X_train_balanced, y_train_balanced)

# 6. Evaluate the model
print("Evaluating model...")
y_pred = model.predict(X_test_scaled)

Training Random Forest Classifier...
Evaluating model...


In [7]:
# A classification report is better than just "accuracy"
# It shows precision/recall for each class (0, 1, and 2)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.88      0.93      0.90     42795
         1.0       0.02      0.00      0.01       944
         2.0       0.42      0.35      0.38      6997

    accuracy                           0.83     50736
   macro avg       0.44      0.43      0.43     50736
weighted avg       0.80      0.83      0.81     50736

