In [None]:
"""
Ridge Regression Forecasting for State-Month Health Outcomes

Notes:
- 'state', 'month', 'year' are ID-like fields (identifiers), not predictive features.
- We only use true numeric environmental / meteorological / urban features as inputs.
"""

import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ==============================
# 0. Load data
# ==============================
IN_CSV = "./Final_Master.csv"
OUT_CSV = "./Final_Master_with_preds.csv"

# Read the master table and coerce both identifier columns to text in one step
# ("month" is expected as "YYYY-MM", "state" as the full state name).
df = pd.read_csv(IN_CSV).astype({"month": str, "state": str})

# Derive calendar parts from the first day of each month. They are kept for
# convenience only and are treated as identifiers, not as model features.
dt = pd.to_datetime(df["month"] + "-01")
df = df.assign(
    year=dt.dt.year.astype(int),
    month_num=dt.dt.month.astype(int),
)

print("Total rows:", len(df))
print("Columns:", df.columns.tolist())

# ==============================
# 1. Targets and feature set
# ==============================
# Monthly health outcomes (counts per state-month)
target_cols = ["ihd_deaths", "copd_deaths", "asthma_deaths"]

# Column that receives the forecast for each target ("ihd_deaths" -> "ihd_pred", ...)
pred_cols = [t.replace("_deaths", "_pred") for t in target_cols]

# ID-like columns (NOT used as model features)
id_cols = ["state", "month", "year", "month_num"]

# Automatically select numeric feature columns, excluding:
# - targets
# - ID-like columns (in practice only the numeric ones, 'year' and 'month_num', can match)
# - prediction columns: if the input table already carries them they are model
#   outputs, and being mostly NaN they would also make dropna() discard every
#   observed row once used as features
num_all = df.select_dtypes(include=[np.number]).columns.tolist()
exclude = set(target_cols) | set(id_cols) | set(pred_cols)
num_features = [c for c in num_all if c not in exclude]

print("\nNumber of numeric features used:", len(num_features))
print("Example features:", num_features[:10])

# Initialize prediction columns (existing ones are left untouched)
for pred_col in pred_cols:
    if pred_col not in df.columns:
        df[pred_col] = np.nan

# ==============================
# 2. Time-based split configuration
# ==============================
# All boundaries are "YYYY-MM" strings and are compared lexicographically with
# df["month"], which is only chronological for zero-padded "YYYY-MM" values.
TRAIN_END     = "2023-12"  # last month used for training during validation
VAL_START     = "2024-01"  # first validation month
VAL_END       = "2025-07"  # last validation month; also the end of the final training window
FUTURE_START  = "2025-08"  # first month that gets a forecast

# ==============================
# 3. Train / evaluate / forecast per target
# ==============================
def _fit_log1p_ridge(X, y_counts):
    """Fit a Ridge regressor on log(count + 1) targets and return the fitted model.

    NOTE(review): features are passed to Ridge unscaled; the L2 penalty is
    scale-dependent, so confirm that this is intended.
    """
    model = Ridge(alpha=1.0, random_state=42)
    model.fit(X, np.log1p(y_counts))
    return model


def _predict_counts(model, X):
    """Predict in log space, back-transform with expm1 and clip at zero."""
    return np.clip(np.expm1(model.predict(X)), 0, None)


# The rows to forecast and their feature matrix do not depend on the target,
# so they are selected once here instead of on every loop iteration.
df_future = df[df["month"] >= FUTURE_START]
df_future_feat = df_future.dropna(subset=num_features)
X_future = df_future_feat[num_features]

for target_col in target_cols:
    print(f"\n================= Target: {target_col} =================")
    pred_col = target_col.replace("_deaths", "_pred")

    # 3-1. Filter to valid observed rows (non-null, non-negative target, complete features)
    df_obs = df[df[target_col].notna()].copy()
    df_obs = df_obs[df_obs[target_col] >= 0]
    df_obs = df_obs.dropna(subset=num_features)

    if df_obs.empty:
        print(f"  -> No valid observed data for {target_col}. Skipping.")
        continue

    # Time-based train / validation split (string comparison on "YYYY-MM")
    train_mask = df_obs["month"] <= TRAIN_END
    val_mask   = (df_obs["month"] >= VAL_START) & (df_obs["month"] <= VAL_END)

    train = df_obs[train_mask]
    val   = df_obs[val_mask]

    print(f"  Observed samples: {len(df_obs)}")
    print(f"  → Train: {len(train)}, Val: {len(val)}")

    # ----------------------
    # 3-2. Validation (if available)
    # ----------------------
    if len(train) > 0 and len(val) > 0:
        model = _fit_log1p_ridge(train[num_features], train[target_col].values)

        y_val = val[target_col].values                     # original counts
        val_pred = _predict_counts(model, val[num_features])

        # Metrics are reported on the count scale, not the log scale
        mae  = mean_absolute_error(y_val, val_pred)
        mse  = mean_squared_error(y_val, val_pred)
        rmse = np.sqrt(mse)
        r2   = r2_score(y_val, val_pred)

        print("  [Validation performance (count scale)]")
        print(f"    MAE : {mae:,.3f}")
        print(f"    RMSE: {rmse:,.3f}")
        print(f"    R^2 : {r2:,.4f}")
    else:
        print("  → Not enough train/validation data; skipping formal validation and training on all observed data only.")

    # ----------------------
    # 3-3. Final model: train on all observed data up to VAL_END
    # ----------------------
    full_obs = df_obs[df_obs["month"] <= VAL_END]
    if full_obs.empty:
        print("  → No observed data in the training window. Skipping forecasting.")
        continue

    final_model = _fit_log1p_ridge(full_obs[num_features], full_obs[target_col].values)

    # ----------------------
    # 3-4. Forecast future window (FUTURE_START onwards)
    # ----------------------
    if df_future_feat.empty:
        print("  → No usable future rows for features. Skipping forecasting.")
        continue

    y_future_pred = _predict_counts(final_model, X_future)

    # Write predictions back into the master frame, aligned on the row index
    df.loc[df_future_feat.index, pred_col] = y_future_pred

    print(f"  → Future forecast complete: {len(df_future_feat)} rows updated in '{pred_col}'")

# ==============================
# 4. Save results
# ==============================
# Preview the first ten forecast rows from 2030 onwards before writing the file.
preview_cols = ["state", "month", "ihd_pred", "copd_pred", "asthma_pred"]
late_rows = df["month"] >= "2030-01"

print("\nSample future forecasts (>= 2030-01):")
print(df.loc[late_rows, preview_cols].head(10))

# Persist the whole table (inputs plus prediction columns) without the index.
df.to_csv(OUT_CSV, index=False)
print(f"\n>>> Saved with predictions to: {OUT_CSV}")


### Model Comparison

In [None]:
"""
Model comparison for state-month health outcomes (IHD / COPD / Asthma)

We compare three regression models, each trained on a log(deaths + 1) target:
- Ridge Regression
- Random Forest Regressor
- Gradient Boosting Regressor

Setup:
- Input:  Final_Master.csv
- Targets: ihd_deaths, copd_deaths, asthma_deaths
- Features: numeric environmental / urban / meteorological variables
           (exclude state/month/year/month_num and the targets)
- Time-based split:
    * Train:     <= 2023-12
    * Validation: 2024-01 ~ 2025-07
- Metrics (on validation set, in count space):
    * MAE, RMSE, R^2
- Outputs:
    * CSV with model performances
    * Figures comparing models across targets
"""

import os
import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# ==============================
# 0. Paths & basic setup
# ==============================
IN_CSV = "./Final_Master.csv"
METRIC_CSV = "./model_metrics_health.csv"
FIGDIR = "./figures"
os.makedirs(FIGDIR, exist_ok=True)  # figure directory is created on demand

# ==============================
# 1. Load data
# ==============================
# Read the master table and coerce both ID-like columns to text in one step
# ("month" is expected as "YYYY-MM", "state" as the full state name).
df = pd.read_csv(IN_CSV).astype({"month": str, "state": str})

# (Optional) rebuild year and month_num from the first day of each month
dt = pd.to_datetime(df["month"] + "-01")
df = df.assign(
    year=dt.dt.year.astype(int),
    month_num=dt.dt.month.astype(int),
)

print("Total rows:", len(df))
print("Columns:", df.columns.tolist())

# ==============================
# 2. Targets and feature set
# ==============================
target_cols = ["ihd_deaths", "copd_deaths", "asthma_deaths"]

# ID-like columns (not used as predictive features)
id_cols = ["state", "month", "year", "month_num"]

# Columns named "<cause>_pred" hold model forecasts (the forecasting step
# writes them). If the input table carries them they must never be fed back
# in as features.
pred_cols = [t.replace("_deaths", "_pred") for t in target_cols]

# All numeric columns
num_all = df.select_dtypes(include=[np.number]).columns.tolist()

# Exclude targets, ID-like columns and prediction columns from the feature set
exclude = set(target_cols) | set(id_cols) | set(pred_cols)
num_features = [c for c in num_all if c not in exclude]

print("\nNumber of numeric features used:", len(num_features))
print("Example features:", num_features[:10])

# ==============================
# 3. Time-based split configuration
# ==============================
# Boundaries are "YYYY-MM" strings compared lexicographically with df["month"],
# which is only chronological for zero-padded "YYYY-MM" values.
TRAIN_END     = "2023-12"  # training up to this month
VAL_START     = "2024-01"  # validation range start
VAL_END       = "2025-07"  # validation range end

# (We are focusing here on model comparison on validation.
# Forecasting to future months can be added later if needed.)

# ==============================
# 4. Model definitions
# ==============================
def get_models(random_state=42):
    """
    Build a fresh, unfitted set of candidate regressors.

    Parameters
    ----------
    random_state : int, default 42
        Seed handed to every estimator so repeated runs are reproducible.
        Ridge receives it as well, for consistency with the forecasting
        model; it only matters there for the stochastic solvers.

    Returns
    -------
    dict
        Maps model name ("Ridge", "RandomForest", "GradientBoosting") to an
        sklearn estimator. New instances are created on every call, so each
        target can be fitted independently. The caller trains every model on
        log(deaths + 1).
    """
    return {
        "Ridge": Ridge(alpha=1.0, random_state=random_state),
        "RandomForest": RandomForestRegressor(
            n_estimators=300,
            max_depth=None,
            min_samples_leaf=2,
            n_jobs=-1,  # use all available cores
            random_state=random_state,
        ),
        "GradientBoosting": GradientBoostingRegressor(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=3,
            random_state=random_state,
        ),
    }

# ==============================
# 5. Training & evaluation loop
# ==============================
metrics_records = []  # one dict per (target, model): target, model, MAE, RMSE, R2

for target_col in target_cols:
    print(f"\n================= Target: {target_col} =================")

    # 5-1. Keep rows with a known, non-negative target and complete features
    target_values = df[target_col]
    usable = target_values.notna() & (target_values >= 0)
    df_obs = df.loc[usable].dropna(subset=num_features).copy()

    if df_obs.empty:
        print(f"  -> No valid observed data for {target_col}. Skipping.")
        continue

    # Models are fitted on log(deaths + 1) to stabilize variance
    log_col = f"{target_col}_log1p"
    df_obs[log_col] = np.log1p(df_obs[target_col].values)

    # 5-2. Time-based train / validation split (both validation bounds inclusive)
    in_train = df_obs["month"] <= TRAIN_END
    in_val = df_obs["month"].between(VAL_START, VAL_END)
    train, val = df_obs[in_train].copy(), df_obs[in_val].copy()

    print(f"  Observed samples in total: {len(df_obs)}")
    print(f"  → Train: {len(train)}, Val: {len(val)}")

    if train.empty or val.empty:
        print("  → Not enough train or validation data; skipping comparison.")
        continue

    X_train, y_train_log = train[num_features], train[log_col].values
    X_val, y_val = val[num_features], val[target_col].values  # y_val stays in count space

    # A fresh set of estimators is built for every target
    for model_name, model in get_models().items():
        print(f"    >> Fitting model: {model_name}")
        model.fit(X_train, y_train_log)

        # Back-transform the log-space prediction and floor it at zero
        val_pred = np.expm1(model.predict(X_val)).clip(min=0)

        scores = {
            "MAE": mean_absolute_error(y_val, val_pred),
            "RMSE": np.sqrt(mean_squared_error(y_val, val_pred)),
            "R2": r2_score(y_val, val_pred),
        }

        print(f"       MAE : {scores['MAE']:,.3f}")
        print(f"       RMSE: {scores['RMSE']:,.3f}")
        print(f"       R^2 : {scores['R2']:,.4f}")

        metrics_records.append({"target": target_col, "model": model_name, **scores})

# ==============================
# 6. Save metrics to CSV
# ==============================
if not metrics_records:
    print("\nNo metrics were computed (possibly no valid train/val data).")
else:
    metrics_df = pd.DataFrame(metrics_records)
    metrics_df.to_csv(METRIC_CSV, index=False)
    print(f"\n>>> Metrics saved to: {METRIC_CSV}")
    print(metrics_df)

    # ==============================
    # 7. Plot comparison figures
    # ==============================
    # We will create one figure per metric (MAE, RMSE, R2),
    # with one subplot per target and bars for each model.

    metrics_to_plot = ["MAE", "RMSE", "R2"]

    for metric in metrics_to_plot:
        fig, axes = plt.subplots(
            1,
            len(target_cols),
            figsize=(5 * len(target_cols), 4),
            squeeze=False  # always a 2D array, even with a single target
        )
        axes = axes[0]  # unpack 1D

        for ax, target in zip(axes, target_cols):
            sub = metrics_df[metrics_df["target"] == target]
            if sub.empty:
                # Target was skipped during evaluation: leave a labelled blank panel
                ax.set_title(f"{target} (no data)")
                ax.axis("off")
                continue

            ax.bar(sub["model"], sub[metric])
            ax.set_title(f"{target} - {metric}")
            ax.set_xlabel("Model")
            ax.set_ylabel(metric)
            ax.tick_params(axis='x', rotation=30)

        fig.tight_layout()
        out_path = os.path.join(FIGDIR, f"model_compare_{metric}.png")
        # Save through the figure object instead of pyplot's implicit current figure
        fig.savefig(out_path, dpi=150)
        plt.close(fig)  # release the figure so repeated runs do not accumulate memory
        print(f">>> Figure saved: {out_path}")
