# Cross-Validation & Model Selection

## Setup

In [1]:
# Setup

import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold, cross_validate, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.dummy import DummyRegressor

## Load Data

In [2]:
# load the clean datasets

df = pd.read_csv("data_train.csv")
df.sample()
df.columns

Index(['census_tract', 'StCoFIPS2019', 'StAbbr', 'walkability_index',
       'Pop2018', 'HU2018', 'HH2018', 'employment_mix',
       'employment_residential_mix', 'intersection_density',
       'transit_accessibility', 'employment_mix_ranked',
       'employment_residential_mix_ranked', 'intersection_density_ranked',
       'transit_accessibility_ranked', 'median_income', 'percent_unemployed',
       'percent_below_poverty', 'percent_bachelor_and_higher',
       'percent_over_65', 'percent_commute_car', 'percent_commute_transit',
       'percent_white', 'percent_black', 'percent_native_american',
       'percent_asian', 'percent_pacific_islander', 'statedesc', 'countyname',
       'total_population', 'arthritis_crudeprev', 'arthritis_crude95ci',
       'high_blood_pressure_prevalence', 'high_blood_pressure_95ci',
       'cancer_prevalence', 'cancer_95ci', 'current_asthma_prevalence',
       'current_asthma_95ci', 'coronary_heart_disease_prevalence',
       'coronary_heart_disease_95ci'

## Model cross-validation

In [4]:
# 1) Config
features = [
    "employment_mix_ranked",
    "employment_residential_mix_ranked",
    "intersection_density_ranked",
    "transit_accessibility_ranked",
]

outcomes = [
    "high_cholesterol_prevalence",
    "depression_prevalence",
    "high_blood_pressure_prevalence",
    "obesity_prevalence",
    "coronary_heart_disease_prevalence",
    "cancer_prevalence",
]

group_col = "StCoFIPS2019"
n_splits = 5

# Preprocessing (numeric)
pre = ColumnTransformer([
    ("num", Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler()),
    ]), features)
], remainder="drop")

# Base estimators / pipelines
pipe_linear = Pipeline([("pre", pre), ("lr", LinearRegression())])
pipe_pca_linear = Pipeline([("pre", pre), ("pca", PCA()), ("lr", LinearRegression())])
pipe_ridge  = Pipeline([("pre", pre), ("ridge", Ridge())])
pipe_lasso  = Pipeline([("pre", pre), ("lasso", Lasso(max_iter=10000))])

# Small grids
grid_pca   = {"pca__n_components": [2, 3, 4]}                # you have 4 features
grid_ridge = {"ridge__alpha": [0.1, 1.0, 10.0, 100.0]}
grid_lasso = {"lasso__alpha": [0.001, 0.01, 0.1, 1.0]}

# CV splitter (grouped by county)
gkf = GroupKFold(n_splits=n_splits)

# Helper: convert neg MSE scores to RMSE mean & sd
def mean_rmse_from_neg_mse(scores):
    mses = -np.array(scores, dtype=float)
    return float(np.sqrt(mses).mean()), float(np.sqrt(mses).std())

rows = []

for target in outcomes:
    cols_needed = features + [target, group_col]
    d = df.dropna(subset=cols_needed).copy()
    if d.empty:
        rows.append({"target": target, "model": "N/A", "rmse_mean": np.nan, "rmse_sd": np.nan, "details": "No rows after dropna"})
        continue

    X = d[features]
    y = d[target]
    groups = d[group_col]

    # 0) Dummy baseline (predicts mean per train fold)
    cv_dummy = cross_validate(
        DummyRegressor(strategy="mean"),
        X, y,
        cv=gkf.split(X, y, groups=groups),
        scoring="neg_mean_squared_error",
        n_jobs=-1, return_train_score=False
    )
    rmse_mean, rmse_sd = mean_rmse_from_neg_mse(cv_dummy["test_score"])
    rows.append({"target": target, "model": "Dummy(mean)", "rmse_mean": rmse_mean, "rmse_sd": rmse_sd, "details": ""})

    # 1) Linear (no tuning)
    cv_lin = cross_validate(
        pipe_linear, X, y,
        cv=gkf.split(X, y, groups=groups),
        scoring="neg_mean_squared_error",
        n_jobs=-1, return_train_score=False
    )
    rmse_mean, rmse_sd = mean_rmse_from_neg_mse(cv_lin["test_score"])
    rows.append({"target": target, "model": "Linear", "rmse_mean": rmse_mean, "rmse_sd": rmse_sd, "details": ""})

    # 2) PCA -> Linear (tune n_components)
    gs_pca = GridSearchCV(
        pipe_pca_linear, grid_pca,
        scoring="neg_mean_squared_error",
        cv=gkf.split(X, y, groups=groups),
        n_jobs=-1
    )
    gs_pca.fit(X, y)
    rows.append({
        "target": target, "model": "PCA+Linear",
        "rmse_mean": float(np.sqrt(-gs_pca.best_score_)), "rmse_sd": np.nan,
        "details": f"best n_components={gs_pca.best_params_.get('pca__n_components')}"
    })

    # 3) Ridge (tune alpha)
    gs_ridge = GridSearchCV(
        pipe_ridge, grid_ridge,
        scoring="neg_mean_squared_error",
        cv=gkf.split(X, y, groups=groups),
        n_jobs=-1
    )
    gs_ridge.fit(X, y)
    rows.append({
        "target": target, "model": "Ridge",
        "rmse_mean": float(np.sqrt(-gs_ridge.best_score_)), "rmse_sd": np.nan,
        "details": f"best alpha={gs_ridge.best_params_.get('ridge__alpha')}"
    })

    # 4) Lasso (tune alpha)
    gs_lasso = GridSearchCV(
        pipe_lasso, grid_lasso,
        scoring="neg_mean_squared_error",
        cv=gkf.split(X, y, groups=groups),
        n_jobs=-1
    )
    gs_lasso.fit(X, y)
    rows.append({
        "target": target, "model": "Lasso",
        "rmse_mean": float(np.sqrt(-gs_lasso.best_score_)), "rmse_sd": np.nan,
        "details": f"best alpha={gs_lasso.best_params_.get('lasso__alpha')}"
    })

# Summarize & save
results = pd.DataFrame(rows).sort_values(["target", "rmse_mean"])
print(results)
results.to_csv("results_cv_by_outcome.csv", index=False)
print("\nSaved results to results_cv_by_outcome.csv")


                               target        model  rmse_mean   rmse_sd  \
26                  cancer_prevalence       Linear   1.721985  0.035342   
28                  cancer_prevalence        Ridge   1.722342       NaN   
29                  cancer_prevalence        Lasso   1.722347       NaN   
27                  cancer_prevalence   PCA+Linear   1.722347       NaN   
25                  cancer_prevalence  Dummy(mean)   1.867657  0.023959   
21  coronary_heart_disease_prevalence       Linear   1.939565  0.043248   
22  coronary_heart_disease_prevalence   PCA+Linear   1.940020       NaN   
23  coronary_heart_disease_prevalence        Ridge   1.940044       NaN   
24  coronary_heart_disease_prevalence        Lasso   1.940049       NaN   
20  coronary_heart_disease_prevalence  Dummy(mean)   2.033882  0.031109   
6               depression_prevalence       Linear   3.259604  0.111994   
7               depression_prevalence   PCA+Linear   3.260497       NaN   
8               depressio