# Baseline Lifestyle Risk Models

Train interpretable baseline classifiers on the processed BRFSS dataset to estimate high-risk probabilities.



In [34]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

DATA_PATH = Path("../data/processed/brfss_lifestyle_risk.parquet")
STAGED_DIR = Path("../data/staged/brfss/2021/llcp2021_dataset")

CHRONIC_GROUPS = {
    "cardio": ["CVDINFR4", "CVDCRHD4", "CVDSTRK3"],
    "metabolic": ["DIABETE4"],
    "respiratory": ["ASTHMA3"],
    "renal": ["CHCKDNY2"],
}

ESSENTIAL_COLS = ["GENHLTH", "_SMOKER3", "DRNKANY5", "EXERANY2", "_AGEG5YR"]
NUMERICAL_IMPUTE_COLS = [
    "_BMI5",
    "AVEDRNK3",
    "DRNK3GE5",
    "MAXDRNKS",
    "WEIGHT2",
    "HEIGHT3",
    "PHYSHLTH",
    "MENTHLTH",
    "_DRNKWK1",
]
CATEGORICAL_FILL_COLS = ["POORHLTH", "SMOKE100", "SMOKDAY2", "ECIGNOW1", "INCOME3"]
FEATURE_COLS = [
    "_STATE",
    "FMONTH",
    "SEXVAR",
    "_AGEG5YR",
    "GENHLTH",
    "PHYSHLTH",
    "MENTHLTH",
    "POORHLTH",
    "EXERANY2",
    "SMOKE100",
    "SMOKDAY2",
    "_SMOKER3",
    "_RFSMOK3",
    "ECIGNOW1",
    "DRNKANY5",
    "AVEDRNK3",
    "DRNK3GE5",
    "MAXDRNKS",
    "_DRNKWK1",
    "_BMI5",
    "_BMI5CAT",
    "WEIGHT2",
    "HEIGHT3",
    "INCOME3",
]


def build_processed_dataset() -> pd.DataFrame:
    parts = sorted(STAGED_DIR.glob("part-*.parquet"))
    if not parts:
        raise FileNotFoundError(f"No staged parquet parts found in {STAGED_DIR}")
    staged_df = pd.concat(pd.read_parquet(part) for part in parts)

    for group, cols in CHRONIC_GROUPS.items():
        staged_df[f"{group.upper()}_FLAG"] = staged_df[cols].eq(1.0).any(axis=1)

    flag_cols = [col for col in staged_df.columns if col.endswith("_FLAG")]
    staged_df["HIGH_RISK"] = staged_df[flag_cols].any(axis=1)

    dataset = staged_df[FEATURE_COLS + ["HIGH_RISK"]].copy()
    dataset = dataset.dropna(subset=ESSENTIAL_COLS)
    dataset[NUMERICAL_IMPUTE_COLS] = dataset[NUMERICAL_IMPUTE_COLS].fillna(
        dataset[NUMERICAL_IMPUTE_COLS].median()
    )
    dataset[CATEGORICAL_FILL_COLS] = (
        dataset[CATEGORICAL_FILL_COLS].fillna("Unknown").astype(str)
    )

    DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
    dataset.to_parquet(DATA_PATH, index=False)
    return dataset


def load_dataset() -> pd.DataFrame:
    if DATA_PATH.exists():
        existing = pd.read_parquet(DATA_PATH)
        if existing.isna().any().any():
            print("Detected missing values in processed dataset. Rebuilding...")
            return build_processed_dataset()
        return existing
    print(f"Processed dataset not found at {DATA_PATH}. Rebuilding from staged data...")
    return build_processed_dataset()


df = load_dataset()
print(df.shape)
df.head()



(429083, 25)


Unnamed: 0,_STATE,FMONTH,SEXVAR,_AGEG5YR,GENHLTH,PHYSHLTH,MENTHLTH,POORHLTH,EXERANY2,SMOKE100,...,AVEDRNK3,DRNK3GE5,MAXDRNKS,_DRNKWK1,_BMI5,_BMI5CAT,WEIGHT2,HEIGHT3,INCOME3,HIGH_RISK
0,1.0,1.0,2.0,70-74,Poor,20.0,10.0,88.0,No,1.0,...,2.0,88.0,2.0,0.0,1454.0,Underweight,72.0,411.0,5.0,True
1,1.0,1.0,2.0,65-69,Good,88.0,88.0,Unknown,Yes,2.0,...,2.0,88.0,2.0,0.0,2744.0,Unknown,7777.0,506.0,77.0,True
2,1.0,1.0,2.0,70-74,Very good,88.0,88.0,Unknown,No,2.0,...,2.0,88.0,2.0,0.0,2829.0,Overweight,170.0,505.0,3.0,True
3,1.0,1.0,2.0,60-64,Very good,88.0,10.0,88.0,Yes,2.0,...,3.0,1.0,6.0,300.0,3347.0,Obese,195.0,504.0,7.0,True
4,1.0,1.0,1.0,75-79,Poor,30.0,88.0,30.0,Yes,2.0,...,2.0,88.0,2.0,0.0,2873.0,Overweight,206.0,511.0,4.0,True


In [35]:
target_col = "HIGH_RISK"
feature_cols = [col for col in df.columns if col != target_col]

categorical_cols = [
    "_STATE",
    "FMONTH",
    "SEXVAR",
    "_AGEG5YR",
    "GENHLTH",
    "POORHLTH",
    "EXERANY2",
    "SMOKE100",
    "SMOKDAY2",
    "_SMOKER3",
    "_RFSMOK3",
    "ECIGNOW1",
    "DRNKANY5",
    "_BMI5CAT",
    "INCOME3",
]
numeric_cols = [col for col in feature_cols if col not in categorical_cols]

X = df[feature_cols]
y = df[target_col].astype(int)

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape)
print("Valid shape:", X_valid.shape)



Train shape: (343266, 24)
Valid shape: (85817, 24)


In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

categorical_transformer = OneHotEncoder(handle_unknown="ignore")
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_transformer, categorical_cols),
        ("numeric", numeric_transformer, numeric_cols),
    ]
)

log_reg_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", LogisticRegression(max_iter=1000, class_weight="balanced")),
    ]
)

log_reg_pipeline.fit(X_train, y_train)



In [37]:
y_pred_proba_lr = log_reg_pipeline.predict_proba(X_valid)[:, 1]
y_pred_lr = (y_pred_proba_lr >= 0.5).astype(int)

print("Logistic Regression Metrics")
print("ROC AUC:", roc_auc_score(y_valid, y_pred_proba_lr))
print("Accuracy:", accuracy_score(y_valid, y_pred_lr))
print(classification_report(y_valid, y_pred_lr))



Logistic Regression Metrics
ROC AUC: 0.7376213756832115
Accuracy: 0.6856217299602643
              precision    recall  f1-score   support

           0       0.81      0.70      0.75     58068
           1       0.51      0.66      0.58     27749

    accuracy                           0.69     85817
   macro avg       0.66      0.68      0.66     85817
weighted avg       0.71      0.69      0.69     85817



In [38]:
from xgboost import XGBClassifier

xgb_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        (
            "model",
            XGBClassifier(
                n_estimators=300,
                max_depth=4,
                learning_rate=0.05,
                subsample=0.8,
                colsample_bytree=0.8,
                objective="binary:logistic",
                eval_metric="auc",
                random_state=42,
            ),
        ),
    ]
)

xgb_pipeline.fit(X_train, y_train)



In [39]:
y_pred_proba_xgb = xgb_pipeline.predict_proba(X_valid)[:, 1]
y_pred_xgb = (y_pred_proba_xgb >= 0.5).astype(int)

print("XGBoost Metrics")
print("ROC AUC:", roc_auc_score(y_valid, y_pred_proba_xgb))
print("Accuracy:", accuracy_score(y_valid, y_pred_xgb))
print(classification_report(y_valid, y_pred_xgb))



XGBoost Metrics
ROC AUC: 0.7425304565312676
Accuracy: 0.7305428994255218
              precision    recall  f1-score   support

           0       0.75      0.90      0.82     58068
           1       0.64      0.38      0.48     27749

    accuracy                           0.73     85817
   macro avg       0.70      0.64      0.65     85817
weighted avg       0.72      0.73      0.71     85817



In [40]:
metrics = pd.DataFrame(
    [
        {
            "model": "logistic_regression",
            "roc_auc": roc_auc_score(y_valid, y_pred_proba_lr),
            "accuracy": accuracy_score(y_valid, y_pred_lr),
        },
        {
            "model": "xgboost",
            "roc_auc": roc_auc_score(y_valid, y_pred_proba_xgb),
            "accuracy": accuracy_score(y_valid, y_pred_xgb),
        },
    ]
)
metrics


Unnamed: 0,model,roc_auc,accuracy
0,logistic_regression,0.737621,0.685622
1,xgboost,0.74253,0.730543


In [41]:
feature_names = log_reg_pipeline.named_steps["preprocess"].get_feature_names_out()
coef = log_reg_pipeline.named_steps["model"].coef_[0]
log_reg_importance = (
    pd.Series(coef, index=feature_names)
    .sort_values(ascending=False)
    .head(15)
)
log_reg_importance


categorical__GENHLTH_Poor      0.935560
categorical___AGEG5YR_80+      0.728180
categorical___AGEG5YR_75-79    0.667531
categorical__GENHLTH_Fair      0.561276
categorical___AGEG5YR_70-74    0.557212
categorical___AGEG5YR_65-69    0.338129
categorical__INCOME3_1.0       0.259279
numeric___BMI5                 0.181584
categorical___AGEG5YR_60-64    0.179319
categorical___STATE_72.0       0.175938
categorical__INCOME3_2.0       0.167150
categorical___BMI5CAT_Obese    0.149518
categorical___STATE_44.0       0.135506
categorical__ECIGNOW1_2.0      0.125689
categorical___STATE_47.0       0.120047
dtype: float64

In [42]:
xgb_importance = (
    pd.Series(xgb_pipeline.named_steps["model"].feature_importances_, index=feature_names)
    .sort_values(ascending=False)
    .head(15)
)
xgb_importance


categorical__GENHLTH_Fair         0.178565
categorical__GENHLTH_Poor         0.139799
categorical__GENHLTH_Good         0.077105
categorical__GENHLTH_Excellent    0.071871
categorical__GENHLTH_Very good    0.062826
numeric__PHYSHLTH                 0.037874
categorical___BMI5CAT_Obese       0.021160
categorical__EXERANY2_Yes         0.020324
categorical___AGEG5YR_75-79       0.019483
categorical___AGEG5YR_70-74       0.019410
categorical__DRNKANY5_No          0.018505
categorical___AGEG5YR_80+         0.018230
categorical__SMOKDAY2_3.0         0.014712
numeric___DRNKWK1                 0.011908
categorical___AGEG5YR_30-34       0.011691
dtype: float32