# Import Necessary Packages

In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Import Dataset

In [3]:
# Read the cleaned dataset (with composite columns) into a DataFrame.
# low_memory=False stops pandas from parsing the file in chunks, which avoids
# mixed-type inference within a column.
dataset_csv = "dataset/final_2024_dataset/brfss2024_clean_readable_with_state_plus_composites.csv"
df = pd.read_csv(dataset_csv, low_memory=False)

# Sanity check: (rows, columns) together with the column labels.
(df.shape, df.columns)

((453241, 36),
 Index(['Physical_Activity', 'Physically_Unhealthy_Days',
        'Ever_Smoked_100_Cigarettes', 'Current_Smoking_Frequency',
        'Smoking_Status_Category', 'At_Risk_Smoking', 'At_Risk_Smoking.1',
        'Alcohol_Use_Frequency', 'Average_Drinks_Per_Day',
        'Heavy_Drinking_Flag', 'Any_Alcohol_Past_30Days', 'Self_Rated_Health',
        'Physically_Unhealthy_Days_Count', 'Mentally_Unhealthy_Days_Count',
        'Limited_Health_Days_Due_To_Health', 'Body_Mass_Index', 'BMI_Category',
        'At_Risk_BMI', 'At_Risk_BMI.1', 'Weight_Pounds', 'Biological_Sex',
        'Age_Code', 'Race_Ethnicity_Group', 'Education_Level',
        'Household_Income_Category', 'At_Risk_BMI.2', 'At_Risk_BMI.3',
        'At_Risk_Smoking.2', 'At_Risk_Smoking.3', 'State_Code', 'State_Name',
        'Unhealthy_Days_Total', 'Alcohol_Risk_Level', 'SES_Score',
        'target_3class', 'SEQNO'],
       dtype='object'))

# Define target and feature sets (Model-2: behavioral + demographic + anthropometric)

In [4]:
# The 3-class diabetes status is the label; cast it to integers.
y = df["target_3class"].astype(int)

# Columns that must not end up in the predictor matrix.
drop_cols = [
    "target_3class",           # the label itself
    "Diabetes_Status_Binary",  # removed only if the column is present
    "State_Code",              # state columns are not used in the model
    "State_Name",
]

# errors="ignore" skips any listed column that does not exist in df.
X = df.drop(columns=drop_cols, errors="ignore")
X.shape

(453241, 33)

# Define numeric vs categorical features from composite columns

In [5]:
# Candidate numeric predictors (routed to the numeric transformer).
numeric_features = [
    # day counts
    "Physically_Unhealthy_Days",
    "Physically_Unhealthy_Days_Count",
    "Mentally_Unhealthy_Days_Count",
    "Limited_Health_Days_Due_To_Health",
    "Unhealthy_Days_Total",
    # anthropometric measures
    "Body_Mass_Index",
    "Weight_Pounds",
    # alcohol quantity and socio-economic score
    "Average_Drinks_Per_Day",
    "SES_Score",
]

# Candidate categorical predictors (routed to the categorical transformer).
categorical_features = [
    # physical activity, smoking and alcohol behaviour
    "Physical_Activity",
    "Ever_Smoked_100_Cigarettes",
    "Current_Smoking_Frequency",
    "Smoking_Status_Category",
    "At_Risk_Smoking",
    "At_Risk_Smoking.1",
    "At_Risk_Smoking.2",
    "At_Risk_Smoking.3",
    "Alcohol_Use_Frequency",
    "Heavy_Drinking_Flag",
    "Any_Alcohol_Past_30Days",
    "Alcohol_Risk_Level",
    # self-rated health
    "Self_Rated_Health",
    # BMI category and risk flags
    "BMI_Category",
    "At_Risk_BMI",
    "At_Risk_BMI.1",
    "At_Risk_BMI.2",
    "At_Risk_BMI.3",
    # demographics
    "Biological_Sex",
    "Age_Code",
    "Race_Ethnicity_Group",
    "Education_Level",
    "Household_Income_Category",
]

# Keep only the names that are real columns of X; anything else is dropped
# from the lists without raising.
available_columns = set(X.columns)
numeric_features = [name for name in numeric_features if name in available_columns]
categorical_features = [name for name in categorical_features if name in available_columns]

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)
print("X shape:", X.shape)

Numeric features: ['Physically_Unhealthy_Days', 'Physically_Unhealthy_Days_Count', 'Mentally_Unhealthy_Days_Count', 'Limited_Health_Days_Due_To_Health', 'Unhealthy_Days_Total', 'Body_Mass_Index', 'Weight_Pounds', 'Average_Drinks_Per_Day', 'SES_Score']
Categorical features: ['Physical_Activity', 'Ever_Smoked_100_Cigarettes', 'Current_Smoking_Frequency', 'Smoking_Status_Category', 'At_Risk_Smoking', 'At_Risk_Smoking.1', 'At_Risk_Smoking.2', 'At_Risk_Smoking.3', 'Alcohol_Use_Frequency', 'Heavy_Drinking_Flag', 'Any_Alcohol_Past_30Days', 'Alcohol_Risk_Level', 'Self_Rated_Health', 'BMI_Category', 'At_Risk_BMI', 'At_Risk_BMI.1', 'At_Risk_BMI.2', 'At_Risk_BMI.3', 'Biological_Sex', 'Age_Code', 'Race_Ethnicity_Group', 'Education_Level', 'Household_Income_Category']
X shape: (453241, 33)


# Train-test split (stratified)

In [6]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.20, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

(X_train.shape, X_test.shape)

((362592, 33), (90649, 33))

# Build the preprocessing transformer

In [7]:
# Numeric: median impute + standardize
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical: most-frequent impute + one-hot.
# handle_unknown="ignore" encodes a category unseen during fit as all zeros
# instead of raising at transform time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Route each feature list to its transformer; columns in neither list are
# dropped (ColumnTransformer's default remainder="drop").
#
# sparse_threshold=0 forces a dense ndarray as output. With the default (0.3)
# the mostly one-hot output is returned as a scipy sparse matrix, and XGBoost's
# tree booster treats entries absent from a sparse matrix as *missing* rather
# than 0. The classifier in this notebook is trained on dense arrays, so a
# pipeline that feeds it sparse rows at inference time could predict
# differently from the evaluated model. Always emitting dense output keeps
# training, evaluation and the saved inference pipeline on the same
# representation.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

preprocessor

# Fit preprocessor and transform train/test

In [8]:
# Learn the imputation, scaling and encoding parameters from the training
# split alone ...
X_train_proc = preprocessor.fit_transform(X_train)
# ... and apply those already-learned parameters to the held-out split.
X_test_proc = preprocessor.transform(X_test)

(X_train_proc.shape, X_test_proc.shape)

((362592, 214), (90649, 214))

In [9]:
# Make sure both processed matrices are plain NumPy arrays: anything that is
# not already an ndarray is converted with .toarray().
X_train_proc = X_train_proc if isinstance(X_train_proc, np.ndarray) else X_train_proc.toarray()
X_test_proc = X_test_proc if isinstance(X_test_proc, np.ndarray) else X_test_proc.toarray()

# Apply SMOTE on the processed training data

In [10]:
# Oversample the training split only (fixed seed for repeatability); the test
# split is left untouched.
# NOTE(review): plain SMOTE interpolates every column, including the one-hot
# indicators, so synthetic rows may carry fractional category values —
# consider whether SMOTENC or class weighting would suit this data better.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_proc, y_train)
X_train_sm, y_train_sm = resampled

# Class counts before and after resampling.
for caption, labels in (("Before SMOTE:", y_train), ("After  SMOTE:", y_train_sm)):
    print(caption, Counter(labels))

Before SMOTE: Counter({0: 300899, 2: 52647, 1: 9046})
After  SMOTE: Counter({0: 300899, 1: 300899, 2: 300899})


# Define and train XGBoost (Model-2)

In [11]:
# Hyperparameters for the 3-class gradient-boosted model.
xgb2_params = dict(
    objective="multi:softprob",   # per-class probabilities
    num_class=3,
    eval_metric="mlogloss",
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,                    # use all available cores
    random_state=42,
)
xgb2 = XGBClassifier(**xgb2_params)

# Train on the SMOTE-resampled training set; the fitted estimator is the cell output.
xgb2.fit(X_train_sm, y_train_sm)

# Model Evaluation

In [12]:
# Score the held-out split: class probabilities and hard labels.
y_proba = xgb2.predict_proba(X_test_proc)
y_pred = xgb2.predict(X_test_proc)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

per_class_report = classification_report(y_test, y_pred)
print(per_class_report)

# One-vs-one, macro-averaged ROC-AUC. A failure is printed rather than raised
# so the rest of the notebook keeps running.
try:
    macro_auc = roc_auc_score(y_test, y_proba, multi_class="ovo", average="macro")
except Exception as e:
    print("ROC-AUC error:", e)
else:
    print("ROC-AUC (macro):", macro_auc)


Accuracy: 0.8073227503888626
              precision    recall  f1-score   support

           0       0.87      0.91      0.89     75226
           1       0.21      0.00      0.00      2261
           2       0.39      0.36      0.38     13162

    accuracy                           0.81     90649
   macro avg       0.49      0.42      0.42     90649
weighted avg       0.79      0.81      0.79     90649

ROC-AUC (macro): 0.673848204253722


# Build a single pipeline for inference

In [13]:

# Chain the already-fitted preprocessor and the already-fitted classifier
# (trained on SMOTE-augmented data) so raw rows can be scored in one call.
# Neither step is refitted here.
# NOTE(review): the classifier was trained on dense arrays; confirm the
# preprocessor also yields dense output inside this pipeline, because XGBoost
# treats entries absent from a sparse matrix as missing values.
inference_steps = [
    ("preprocessor", preprocessor),
    ("model", xgb2),
]
xgb_pipeline_model2 = Pipeline(steps=inference_steps)

xgb_pipeline_model2

# Save the pipeline for Streamlit

In [14]:
# Persist the inference pipeline to disk; joblib.dump returns the list of
# file names it wrote, which is shown as the cell output.
joblib.dump(value=xgb_pipeline_model2, filename="xgb_pipeline_model2.joblib")

['xgb_pipeline_model2.joblib']