In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [5]:
# Load the raw diabetes dataset from disk.
df = pd.read_csv(filepath_or_buffer="data/diabetes.csv")

# Plain-text preview of the first five rows (head() defaults to n=5).
print(df.head(n=5))

   Diabetes_012  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0           0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1           0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2           0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3           0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4           0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0        4.0   
1     

In [6]:
# Separate the label column (y) from the predictor columns (X).
target_col = "Diabetes_012"
y = df[target_col]
X = df.drop(columns=[target_col])

In [7]:
# Column groups used by the preprocessor. Any column that appears in neither
# list is forwarded unchanged (remainder="passthrough").
categorical_cols = [
    "CholCheck", "Smoker", "HeartDiseaseorAttack", "PhysActivity",
    "Fruits", "Veggies", "HvyAlcoholConsump", "AnyHealthcare",
    "NoDocbcCost", "GenHlth", "Sex", "Education", "Income",
]
numeric_cols = ["BMI", "MentHlth", "PhysHlth", "Age"]

# One-hot encode the categorical group (dropping each feature's first level)
# and standardise the numeric group.
one_hot = OneHotEncoder(drop="first")
standardise = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", one_hot, categorical_cols),
        ("num", standardise, numeric_cols),
    ],
    remainder="passthrough",
)

In [8]:
# Define the machine learning pipeline: column-wise preprocessing followed by a
# random forest classifier.
#
# There is deliberately no pipeline-wide StandardScaler after the preprocessor:
#   * the numeric columns are already standardised inside `preprocessor`;
#   * a second scaler would also re-centre the one-hot indicator columns;
#   * random forests split on thresholds, so feature rescaling does not help them;
#   * StandardScaler (with its default with_mean=True) raises a ValueError on
#     sparse input, which the ColumnTransformer may emit when its stacked
#     output is sparse enough.
#
# NOTE(review): the recorded evaluation shows a heavily imbalanced target
# (class 1.0 is never predicted correctly) - consider class_weight="balanced"
# or resampling; left unchanged here because it alters the model's results.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(random_state=42, n_estimators=100))
])

In [9]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions aligned between the two partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [10]:
# Fit the preprocessing steps and the classifier on the training partition.
pipeline.fit(X_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [11]:
# Score the held-out partition with the fitted pipeline.
y_pred = pipeline.predict(X_test)

# Compute each metric once, then report them in a fixed order.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)

Accuracy: 0.8411778618732261

Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.97      0.91     42741
         1.0       0.00      0.00      0.00       926
         2.0       0.48      0.18      0.27      7069

    accuracy                           0.84     50736
   macro avg       0.45      0.38      0.39     50736
weighted avg       0.79      0.84      0.81     50736


Confusion Matrix:
 [[41383    47  1311]
 [  837     0    89]
 [ 5767     7  1295]]
