In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [42]:
import pandas as pd
import sqlite3

# Load the raw CSV, then use an in-memory SQLite table to keep only the
# modelling columns and drop every row that has a NULL in any of them.
csv_path = 'data/diabetes.csv'
df = pd.read_csv(csv_path)

# Columns kept for modelling: the target first, then the predictors.
# This single list drives both the SELECT list and the NOT NULL filter,
# so the two clauses cannot drift apart.
selected_cols = [
    'Diabetes_012', 'CholCheck', 'Smoker', 'HeartDiseaseorAttack', 'PhysActivity',
    'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
    'Sex', 'Education', 'Income', 'BMI', 'MentHlth', 'PhysHlth', 'Age',
]

# Build the query from the column list (names are hard-coded above, not user input)
select_list = ', '.join(selected_cols)
not_null_filter = '\n  AND '.join(f'{col} IS NOT NULL' for col in selected_cols)
query = f'''
SELECT {select_list}
FROM diabetes_data
WHERE {not_null_filter};
'''

# SQLite in-memory database; closed in `finally` so the copy of the data held
# by SQLite is released even if the load or the query fails.
conn = sqlite3.connect(':memory:')
try:
    # df into SQL
    df.to_sql('diabetes_data', conn, if_exists='replace', index=False)

    # load cleaned data into new df
    clean_df = pd.read_sql_query(query, conn)
finally:
    conn.close()

clean_df.head()

Unnamed: 0,Diabetes_012,CholCheck,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,Sex,Education,Income,BMI,MentHlth,PhysHlth,Age
0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,0.0,4.0,3.0,40.0,18.0,15.0,9.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,6.0,1.0,25.0,0.0,0.0,7.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,0.0,4.0,8.0,28.0,30.0,30.0,9.0
3,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,3.0,6.0,27.0,0.0,0.0,11.0
4,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,5.0,4.0,24.0,3.0,0.0,11.0


In [43]:
# Separate the label column (y) from the predictor columns (X)
target_name = "Diabetes_012"
y = clean_df[target_name]
X = clean_df.drop(columns=[target_name])

In [44]:
# Column groups used by the preprocessing step
categorical_cols = [
    "CholCheck", "Smoker", "HeartDiseaseorAttack", "PhysActivity",
    "Fruits", "Veggies", "HvyAlcoholConsump", "AnyHealthcare",
    "NoDocbcCost", "GenHlth", "Sex", "Education", "Income",
]
numeric_cols = ["BMI", "MentHlth", "PhysHlth", "Age"]

# One-hot encode the categorical group (dropping the first level of each
# feature) and standardise the numeric group.
one_hot = OneHotEncoder(drop="first")
standardizer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", one_hot, categorical_cols),
        ("num", standardizer, numeric_cols),
    ],
    # columns not named in either group are forwarded unchanged
    remainder="passthrough",
)

In [45]:
# Define the machine learning pipeline: preprocessing followed by a random forest.
# The "scaler" slot is kept so step names and positions stay the same, but it is
# set to "passthrough" (a no-op step):
#   - the numeric columns are already standardised inside `preprocessor`;
#   - tree splits do not depend on the scale of a feature, so re-scaling the
#     one-hot columns adds work without helping the model;
#   - a second StandardScaler would raise if the ColumnTransformer ever returned
#     a sparse matrix, because mean-centering is not supported for sparse input.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("scaler", "passthrough"),
    ("model", RandomForestClassifier(random_state=42, n_estimators=100))
])

In [46]:
# Hold out 20% of the rows for testing, stratified on the target labels;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [47]:
# Fit the preprocessing steps and the classifier on the training split
pipeline.fit(X_train, y=y_train)

In [48]:
# Predict labels for the held-out split
y_pred = pipeline.predict(X_test)

# Compute the evaluation metrics first, then report them
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)

Accuracy: 0.8346341847997477

Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.96      0.91     42741
         1.0       0.02      0.00      0.00       926
         2.0       0.43      0.17      0.25      7069

    accuracy                           0.83     50736
   macro avg       0.44      0.38      0.39     50736
weighted avg       0.79      0.83      0.80     50736


Confusion Matrix:
 [[41111    83  1547]
 [  824     2   100]
 [ 5823    13  1233]]
