In [None]:
# Ensure dependencies are installed when running in hosted notebooks.
# NOTE(review): requirements.txt is listed in HELPER_FILES and is only downloaded by
# the data-setup cell below, so on a fresh hosted runtime this install presumably
# cannot find the file until that cell has run once — confirm the intended order.
%pip install -r requirements.txt

## Data setup

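Place `BP1234-ONSET-WCOND-NUMID.csv` (or `BP1234-ONSET.csv`) in the notebook's working directory before running; files that are missing are downloaded from GitHub by the setup cell below.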


In [1]:
import pandas as pd
import numpy as np
import os
from collections.abc import Iterable
from typing import List
# --- mount (safe) ---
import urllib.request
from pathlib import Path

# Remote folder that hosts the helper module, the requirements file and the datasets.
BASE_URL = "https://raw.githubusercontent.com/FelixZhan/AtyAN/main/"
HELPER_FILES = [
    "analysis_utils.py",
    "requirements.txt",
    "BP1234-ONSET-WCOND-NUMID.csv"
]

DATA_FILE_PREFERRED = "BP1234-ONSET-WCOND-NUMID.csv"
DATA_FILE_FALLBACK = "BP1234-ONSET.csv"


def _fetch(filename: str) -> None:
    """Download ``filename`` from BASE_URL into the working directory.

    The payload is written to ``<filename>.part`` and renamed only once the
    transfer has finished, so an interrupted download never leaves a truncated
    file that a later run would accept as "already present".

    Raises whatever ``urllib.request.urlretrieve`` raises on failure.
    """
    dest = Path(filename)
    partial = dest.with_name(dest.name + ".part")
    try:
        urllib.request.urlretrieve(f"{BASE_URL}{filename}", partial)
        partial.replace(dest)
    finally:
        # Only present when the transfer or the rename failed.
        if partial.exists():
            partial.unlink()


for filename in HELPER_FILES:
    if Path(filename).exists():
        print(f"{filename} already present, skipping download.")
        continue
    print(f"Downloading {filename}...")
    try:
        _fetch(filename)
    except Exception as exc:
        # A missing preferred dataset is recoverable: the selection logic below can
        # still use (or download) the fallback dataset. Any other helper is required.
        if filename != DATA_FILE_PREFERRED:
            raise
        print(f"Download of {filename} failed ({exc}); trying the fallback dataset instead.")

# Prefer the dataset with condition labels, then the plain one.
data_candidates = [DATA_FILE_PREFERRED, DATA_FILE_FALLBACK]
data_file = next((cand for cand in data_candidates if Path(cand).exists()), None)

if data_file is None:
    data_file = DATA_FILE_FALLBACK
    try:
        print(f"{DATA_FILE_PREFERRED} and {DATA_FILE_FALLBACK} not found locally; attempting to download {data_file} from GitHub.")
        _fetch(data_file)
    except Exception as exc:
        raise RuntimeError(
            f"Could not find {DATA_FILE_PREFERRED} locally and download of {data_file} failed; "
            "place the dataset in this folder and re-run."
        ) from exc

print(f"Using dataset: {data_file}")
df = pd.read_csv(data_file, low_memory=False)
ID_COL = "id"

def cols_exist(df: pd.DataFrame, cols: Iterable[str]) -> List[str]:
    """Return the entries of ``cols`` that are column labels of ``df``, in the order given."""
    present: List[str] = []
    for name in cols:
        if name in df.columns:
            present.append(name)
    return present
# When the default ID column is absent, switch to the first alternative spelling
# that is present; ID_COL keeps its current value if none of them is found.
if ID_COL not in df.columns:
    ID_COL = next((cand for cand in ("id", "ID") if cand in df.columns), ID_COL)

!pip install -q -r requirements.txt


analysis_utils.py already present, skipping download.
requirements.txt already present, skipping download.
BP1234-ONSET-WCOND-NUMID.csv already present, skipping download.
Using dataset: BP1234-ONSET-WCOND-NUMID.csv
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m554.0/554.0 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.5/14.5 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.3/487.3 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.1/225.1 kB[0m [31m

## Imports and shared setup


In [2]:
import re
import numpy as np
import pandas as pd
from pathlib import Path

# Toggle BRF components on/off.
# Later cells re-read this flag via globals() and only import imblearn's
# BalancedRandomForestClassifier (and build the BRF pipeline) when it is True.
RUN_BRF = False

# from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm


In [3]:
# Study condition cleaning and dummy coding (BP vs Control vs Healthy Weight)
# Keys are the raw labels after strip() + lower(); values are the collapsed arm.
# Both "clincian delivered" and "clinician delivered" are listed, presumably because
# the misspelling occurs in the raw data — verify before removing either key.
COND_MAP = {
    "peer delivered": "BP",
    "ebody": "BP",
    "clincian delivered": "BP",
    "clinician delivered": "BP",
    "diss. (bp)": "BP",
    "exp writing": "BP",
    "control/video control": "Control",
    "healthy weight": "Healthy Weight",
}

# Canonical spelling of each collapsed arm (currently an identity mapping).
COND_CANONICAL = {
    "BP": "BP",
    "Control": "Control",
    "Healthy Weight": "Healthy Weight",
}

def clean_and_encode_condition(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean 'study_cond' into three levels and add two dummy vectors:
      - cond_bp: 1 = BP trial condition, 0 = Control/Healthy Weight
      - cond_hw: 1 = Healthy Weight, 0 = Control/BP
    Control is the reference (0,0).

    Returns a copy of ``df`` with 'cond_clean' (categorical), 'cond_bp' and
    'cond_hw' added; the input frame is not modified.

    Raises:
        KeyError: if 'study_cond' is not a column of ``df``.
        ValueError: if 'study_cond' has missing/blank entries or labels that are
            not keys of COND_MAP (after stripping and lower-casing).
    """
    if "study_cond" not in df.columns:
        raise KeyError("Missing 'study_cond' column; use dataset with condition labels.")

    cond_raw = df["study_cond"]
    if cond_raw.isna().any() or cond_raw.astype(str).str.strip().eq("").any():
        raise ValueError("Found missing/blank entries in 'study_cond'; expected none.")

    cond_norm = cond_raw.astype(str).str.strip().str.lower()
    cond_clean = cond_norm.map(COND_MAP)

    if cond_clean.isna().any():
        # key=str keeps the ordering well-defined when the raw column mixes types
        # (e.g. numbers and strings); a plain sorted() would raise TypeError here
        # and hide the intended ValueError.
        bad_vals = sorted(cond_raw.loc[cond_clean.isna()].unique(), key=str)
        raise ValueError(f"Unmapped 'study_cond' values: {bad_vals}")

    df = df.copy()
    df["cond_clean"] = cond_clean.map(COND_CANONICAL).astype("category")
    df["cond_bp"] = (df["cond_clean"] == "BP").astype(int)
    df["cond_hw"] = (df["cond_clean"] == "Healthy Weight").astype(int)

    return df


In [4]:
# Helper functions and feature construction (no analysis_utils dependency)
# Suffixes of the wave-1 risk-factor columns; build_predictors looks them up as "w1<suffix>".
RISK_SUFFIXES = ["tii", "bs", "dres", "socf", "dep", "intbmi"]
# Labels of the engineered per-wave features ("<LABEL>_w<wave>") created by
# engineer_wave_prodromals. Not referenced by the cells visible here.
PRODROMAL_FEATURE_NAMES = ["BE", "CB", "WSO", "FEAR", "FAT", "LEB"]
# Default label used by build_onset_dataset to select "w<k>ONSET-FULL-<label>" columns.
ONSET_WEIGHT_LABEL = "mBMI"
TRUTHY_STRINGS = {"TRUE", "T", "YES", "Y", "1", "PRESENT"}
FALSY_STRINGS = {"FALSE", "F", "NO", "N", "0", "ABSENT"}

def _coerce_numeric(series: pd.Series) -> pd.Series:
    return pd.to_numeric(series, errors="coerce")

def _coerce_boolean(series: pd.Series) -> pd.Series:
    if series is None:
        return pd.Series(pd.NA, dtype="boolean")
    as_str = series.astype(str).str.strip().str.upper()
    out = pd.Series(pd.NA, index=series.index, dtype="boolean")
    out[as_str.isin(TRUTHY_STRINGS)] = True
    out[as_str.isin(FALSY_STRINGS)] = False
    numeric = pd.to_numeric(series, errors="coerce")
    out[numeric.notna()] = numeric[numeric.notna()] != 0
    return out

def _match_prefix_columns(df: pd.DataFrame, prefixes):
    cols = []
    for prefix in prefixes:
        pattern = re.compile(rf"^{re.escape(prefix)}[._-]?", flags=re.IGNORECASE)
        cols.extend([c for c in df.columns if pattern.match(c)])
    return cols

def has_cols(df: pd.DataFrame, prefixes) -> pd.Series:
    """Flag rows where any column matching ``prefixes`` holds a positive indicator.

    A cell counts as positive when _coerce_boolean reads it as True or when it
    parses as a non-zero number. Returns a boolean Series on ``df.index``; all
    False when no column matches.
    """
    hits = _match_prefix_columns(df, prefixes)
    if len(hits) == 0:
        return pd.Series(False, index=df.index)

    block = df[hits]
    flagged_true = block.apply(_coerce_boolean).fillna(False)
    as_numbers = block.apply(_coerce_numeric)
    nonzero = (as_numbers.notna() & as_numbers.ne(0)).fillna(False)
    return (flagged_true | nonzero).any(axis=1)

def engineer_wave_prodromals(work: pd.DataFrame, wave: int) -> list:
    """Add the six engineered features for ``wave`` to ``work`` (modified in place).

    Columns written, in this order: BE, CB, WSO, FEAR, FAT, LEB, each suffixed
    with ``_w<wave>``. A source column that is absent yields an all-NaN feature.
      - BE / WSO / FEAR / FAT: numeric copy of items ede1a / ed15a / ed17a / ed19a.
      - CB: row-wise maximum over items ed8a, ed9a, ed10a, ed11a (those present).
      - LEB: max(90 - mbmi_pct, 0) / 90.

    Returns the list of the six new column names.
    """
    tag = f"w{wave}"
    missing = pd.Series(np.nan, index=work.index)

    def _numeric_item(item: str) -> pd.Series:
        # Numeric view of "<tag><item>", or all-NaN when the column is absent.
        return _coerce_numeric(work.get(f"{tag}{item}", missing))

    work[f"BE_{tag}"] = _numeric_item("ede1a")

    cb_sources = [f"{tag}{item}" for item in ("ed8a", "ed9a", "ed10a", "ed11a")]
    cb_sources = [name for name in cb_sources if name in work.columns]
    if len(cb_sources) > 0:
        cb_values = work[cb_sources].apply(_coerce_numeric)
        work[f"CB_{tag}"] = cb_values.max(axis=1, skipna=True)
        # Rows without any usable item stay missing.
        work.loc[cb_values.isna().all(axis=1), f"CB_{tag}"] = np.nan
    else:
        work[f"CB_{tag}"] = np.nan

    for label, item in (("WSO", "ed15a"), ("FEAR", "ed17a"), ("FAT", "ed19a")):
        work[f"{label}_{tag}"] = _numeric_item(item)

    mbmi_pct = _numeric_item("mbmi_pct")
    work[f"LEB_{tag}"] = np.clip(90.0 - mbmi_pct, 0, None) / 90.0

    return [f"{label}_{tag}" for label in ("BE", "CB", "WSO", "FEAR", "FAT", "LEB")]

def build_predictors(df: pd.DataFrame):
    """Assemble the wave-1 predictor frame.

    Works on a copy of ``df``: coerces the available "w1<suffix>" risk columns
    (RISK_SUFFIXES) to numeric, adds the wave-1 engineered features, and appends
    the condition dummies cond_bp / cond_hw when they exist.

    Returns ``(work, features)`` where ``features`` is the de-duplicated,
    order-preserving list of predictor column names.
    """
    work = df.copy()

    risk_cols = [f"w1{suffix}" for suffix in RISK_SUFFIXES if f"w1{suffix}" in work.columns]
    for name in risk_cols:
        work[name] = _coerce_numeric(work[name])

    prod_cols = engineer_wave_prodromals(work, wave=1)
    cond_cols = [name for name in ("cond_bp", "cond_hw") if name in work.columns]

    # dict.fromkeys drops repeats while keeping first-seen order.
    features = list(dict.fromkeys(risk_cols + prod_cols + cond_cols))
    return work, features

def build_onset_dataset(df: pd.DataFrame, onset_weight_label: str = ONSET_WEIGHT_LABEL) -> pd.DataFrame:
    """Build the analysis sample and its binary outcome.

    Rows are dropped when any column with prefix "fan" or "pan" is positive
    (see has_cols) or when "w1ONSET-FULL" reads as True. The outcome
    'aan_onset_anywave' is 1 when any "w<1-6>ONSET-FULL-<onset_weight_label>"
    column is > 0 (missing values count as 0), else 0.

    Returns a copy of the retained rows with the outcome column added.

    Raises:
        ValueError: if no matching onset column exists.

    NOTE(review): the outcome pattern also accepts wave 1, while the exclusion
    only looks at "w1ONSET-FULL" — confirm that is intended.
    """
    w1_default = pd.Series(np.nan, index=df.index)
    already_onset = _coerce_boolean(df.get("w1ONSET-FULL", w1_default)).fillna(False)
    excluded = has_cols(df, ["fan"]) | has_cols(df, ["pan"]) | already_onset
    subset = df.loc[~excluded].copy()

    onset_pattern = re.compile(rf"^w([1-6])ONSET-FULL-{re.escape(onset_weight_label)}$", re.IGNORECASE)
    onset_cols = list(filter(onset_pattern.match, subset.columns))
    if len(onset_cols) == 0:
        raise ValueError(f"No ONSET-FULL-{onset_weight_label} columns found in the dataset.")

    onset_values = subset[onset_cols].apply(_coerce_numeric).fillna(0)
    subset["aan_onset_anywave"] = onset_values.gt(0).any(axis=1).astype(int)
    return subset


In [5]:
# Locate the dataset: prefer the file with condition labels, else the plain one.
DATA_FILE_PREFERRED = Path("BP1234-ONSET-WCOND-NUMID.csv")
DATA_FILE_FALLBACK = Path("BP1234-ONSET.csv")
if DATA_FILE_PREFERRED.exists():
    raw_df = pd.read_csv(DATA_FILE_PREFERRED, low_memory=False)
    print(f"Using dataset with condition labels: {DATA_FILE_PREFERRED.name}")
elif DATA_FILE_FALLBACK.exists():
    raw_df = pd.read_csv(DATA_FILE_FALLBACK, low_memory=False)
    print(f"Using fallback dataset: {DATA_FILE_FALLBACK.name}")
else:
    raise FileNotFoundError("BP1234-ONSET-WCOND-NUMID.csv or BP1234-ONSET.csv is required.")

# Clean and encode study condition dummies.
# clean_and_encode_condition raises KeyError without 'study_cond', which would make
# the fallback branch above unusable. Everything downstream already treats
# cond_bp / cond_hw as optional, so skip the encoding when the column is absent.
if "study_cond" in raw_df.columns:
    df_with_cond = clean_and_encode_condition(raw_df)
else:
    print("No 'study_cond' column in this dataset; continuing without condition dummies.")
    df_with_cond = raw_df.copy()

# Build predictors and outcome
feature_df, base_features = build_predictors(df_with_cond)
dataset = build_onset_dataset(feature_df)
predictor_cols = [c for c in base_features if c in dataset.columns and not c.endswith("-persistence")]
# Ensure condition dummies are evaluated individually in univariate models
for cond_col in ["cond_bp", "cond_hw"]:
    if cond_col in dataset.columns and cond_col not in predictor_cols:
        predictor_cols.append(cond_col)
X = dataset[predictor_cols]
y = dataset["aan_onset_anywave"]
print("Predictors:", predictor_cols)
print("Class balance:", y.value_counts().to_dict())
print(f"Design matrix shape: {X.shape}")


Using dataset with condition labels: BP1234-ONSET-WCOND-NUMID.csv
Predictors: ['w1tii', 'w1bs', 'w1dres', 'w1socf', 'w1dep', 'w1intbmi', 'BE_w1', 'CB_w1', 'WSO_w1', 'FEAR_w1', 'FAT_w1', 'LEB_w1', 'cond_bp', 'cond_hw']
Class balance: {0: 1821, 1: 59}
Design matrix shape: (1880, 14)


## Univariate predictor grid searches

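Evaluate each predictor on its own with a class-weighted logistic regression and, when `RUN_BRF = True`, a Balanced Random Forest. Models are compared by AUROC from stratified 5-fold cross-validation; decision thresholds are tuned for balanced accuracy in a later section.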


In [6]:
# Shared, seeded 5-fold splitter so every model is scored on the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Respect the global toggle if it exists
RUN_BRF = bool(globals().get("RUN_BRF", False))

# --- Logistic (always runs) ---
# random_state is fixed because the liblinear and saga solvers shuffle the data;
# without it repeated runs can return slightly different scores and parameters.
log_pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        (
            "model",
            LogisticRegression(max_iter=3000, class_weight="balanced", random_state=42),
        ),
    ]
)

log_grid = {
    "model__C": [0.01, 0.1, 1, 3, 10],
    "model__penalty": ["l1", "l2"],
    "model__solver": ["liblinear", "saga"],
}

# --- Balanced RF (currently disabled) ---
# If you want to re-enable later, set RUN_BRF = True in the imports cell.
if RUN_BRF:
    # Imported lazily so imbalanced-learn is only required when BRF is enabled.
    from imblearn.ensemble import BalancedRandomForestClassifier
    brf_pipe = Pipeline(
        [
            ("imputer", SimpleImputer(strategy="median")),
            ("model", BalancedRandomForestClassifier(random_state=42, n_jobs=-1)),
        ]
    )

    brf_grid = {
        "model__n_estimators": [300, 500, 800],
        "model__max_depth": [None, 6, 10, 14],
        "model__min_samples_leaf": [1, 2, 4],
        "model__max_features": ["sqrt", "log2", 0.5],
    }

results = []
log_searches = []
brf_searches = []

# One grid search per predictor: each model sees a single-column design matrix.
for feature in predictor_cols:
    X_feat = X[[feature]]

    log_search = GridSearchCV(
        log_pipe,
        param_grid=log_grid,
        cv=cv,
        scoring="roc_auc",
        n_jobs=-1,
        error_score="raise",
    )
    log_search.fit(X_feat, y)
    log_searches.append((feature, log_search))
    results.append(
        {
            "feature": feature,
            "model": "logistic",
            "best_auc": log_search.best_score_,
            "best_params": log_search.best_params_,
        }
    )

    if RUN_BRF:
        brf_search = GridSearchCV(
            brf_pipe,
            param_grid=brf_grid,
            cv=cv,
            scoring="roc_auc",
            n_jobs=-1,
            error_score="raise",
        )
        brf_search.fit(X_feat, y)
        brf_searches.append((feature, brf_search))
        results.append(
            {
                "feature": feature,
                "model": "balanced_rf",
                "best_auc": brf_search.best_score_,
                "best_params": brf_search.best_params_,
            }
        )

# NOTE: best_auc is the mean CV score of the selected hyper-parameters on the same
# folds used to select them, so it is an optimistic estimate of performance.
results_df = pd.DataFrame(results).sort_values("best_auc", ascending=False)

print("Top univariate predictors (by AUROC):")
display(results_df.head(20))

best_log_feature, best_log_search = max(log_searches, key=lambda t: t[1].best_score_)
print(f"Best logistic feature: {best_log_feature} (AUROC {best_log_search.best_score_:.3f})")

if RUN_BRF and brf_searches:
    best_brf_feature, best_brf_search = max(brf_searches, key=lambda t: t[1].best_score_)
    print(f"Best balanced RF feature: {best_brf_feature} (AUROC {best_brf_search.best_score_:.3f})")
else:
    best_brf_feature, best_brf_search = None, None
    print("Balanced RF disabled (RUN_BRF=False).")




Top univariate predictors (by AUROC):


Unnamed: 0,feature,model,best_auc,best_params
4,w1dep,logistic,0.707526,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo..."
8,WSO_w1,logistic,0.703698,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo..."
9,FEAR_w1,logistic,0.698826,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo..."
10,FAT_w1,logistic,0.697791,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo..."
2,w1dres,logistic,0.678357,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo..."
1,w1bs,logistic,0.67814,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo..."
5,w1intbmi,logistic,0.614721,"{'model__C': 0.01, 'model__penalty': 'l2', 'mo..."
11,LEB_w1,logistic,0.612082,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo..."
7,CB_w1,logistic,0.605917,"{'model__C': 0.01, 'model__penalty': 'l2', 'mo..."
0,w1tii,logistic,0.597794,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo..."


Best logistic feature: w1dep (AUROC 0.708)
Balanced RF disabled (RUN_BRF=False).


## Recreate best estimators, store for inference, and compute logistic ORs/p-values


In [7]:
from copy import deepcopy

# Honour the notebook-wide BRF switch (off when it was never defined).
RUN_BRF = bool(globals().get("RUN_BRF", False))

# Keep the refit best pipelines, keyed by feature name, for later inference/SHAP.
log_best_estimators = dict((name, fitted.best_estimator_) for name, fitted in log_searches)
if RUN_BRF:
    brf_best_estimators = dict((name, fitted.best_estimator_) for name, fitted in brf_searches)
else:
    brf_best_estimators = {}


def compute_or_stats(feature: str):
    """Fit unpenalized statsmodels Logit on a single feature; return OR, CI, p-value.

    Missing values of the feature are replaced by its median and an intercept is
    added before fitting against the global outcome ``y``.

    Returns a dict with keys 'odds_ratio', 'p_value', 'ci_low', 'ci_high'
    (odds-ratio scale, bounds taken from ``conf_int()``). All four are NaN when
    the fit fails (e.g., separation); in that case a RuntimeWarning naming the
    feature and the error is emitted instead of failing silently.
    """
    import warnings

    nan_stats = {"odds_ratio": np.nan, "p_value": np.nan, "ci_low": np.nan, "ci_high": np.nan}

    X_feat = X[[feature]].copy()
    X_imp = X_feat.fillna(X_feat.median())
    X_sm = sm.add_constant(X_imp, has_constant="add")
    try:
        res = sm.Logit(y, X_sm).fit(disp=False)
        if feature not in res.params.index:
            return nan_stats
        coef = res.params[feature]
        # conf_int() gives one row per parameter: lower bound, then upper bound.
        ci_lower, ci_upper = res.conf_int().loc[feature]
        return {
            "odds_ratio": float(np.exp(coef)) if pd.notna(coef) else np.nan,
            "p_value": float(res.pvalues.get(feature, np.nan)),
            "ci_low": float(np.exp(ci_lower)) if pd.notna(ci_lower) else np.nan,
            "ci_high": float(np.exp(ci_upper)) if pd.notna(ci_upper) else np.nan,
        }
    except Exception as exc:
        # Best effort: keep the summary table going, but make the failure visible.
        warnings.warn(
            f"Logit fit failed for feature {feature!r} ({exc!r}); reporting NaN statistics.",
            RuntimeWarning,
            stacklevel=2,
        )
        return nan_stats


# One summary row per logistic search, enriched with the unpenalized OR statistics.
rows = [
    {
        "feature": name,
        "model": "logistic",
        "best_auc": fitted.best_score_,
        "best_params": fitted.best_params_,
        **compute_or_stats(name),
    }
    for name, fitted in log_searches
]

# BRF summary rows currently disabled
if RUN_BRF:
    # Odds-ratio statistics are left as NaN for the forest rows.
    rows.extend(
        {
            "feature": name,
            "model": "balanced_rf",
            "best_auc": fitted.best_score_,
            "best_params": fitted.best_params_,
            "odds_ratio": np.nan,
            "p_value": np.nan,
            "ci_low": np.nan,
            "ci_high": np.nan,
        }
        for name, fitted in brf_searches
    )

best_models_summary = pd.DataFrame(rows).sort_values("best_auc", ascending=False)

print("Stored fitted estimators for inference:")
print(f"  Logistic: {len(log_best_estimators)} features")
if RUN_BRF:
    print(f"  Balanced RF: {len(brf_best_estimators)} features")

print("Top 5 by AUROC (logistic):")
is_logistic = best_models_summary["model"] == "logistic"
display(best_models_summary[is_logistic].head(5))
if RUN_BRF:
    print("Top 5 by AUROC (balanced RF):")
    is_brf = best_models_summary["model"] == "balanced_rf"
    display(best_models_summary[is_brf].head(5))

print("Full summary with odds ratios (logistic only):")
display(best_models_summary)


Stored fitted estimators for inference:
  Logistic: 14 features
Top 5 by AUROC (logistic):


Unnamed: 0,feature,model,best_auc,best_params,odds_ratio,p_value,ci_low,ci_high
4,w1dep,logistic,0.707526,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",1.832031,8.195871e-08,1.468375,2.285749
8,WSO_w1,logistic,0.703698,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",1.834608,5.143673e-07,1.447657,2.324988
9,FEAR_w1,logistic,0.698826,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",1.340402,8.160794e-08,1.204316,1.491866
10,FAT_w1,logistic,0.697791,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",1.386836,4.705596e-07,1.221157,1.574993
2,w1dres,logistic,0.678357,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",2.051659,9.219007e-06,1.493371,2.818661


Full summary with odds ratios (logistic only):


Unnamed: 0,feature,model,best_auc,best_params,odds_ratio,p_value,ci_low,ci_high
4,w1dep,logistic,0.707526,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",1.832031,8.195871e-08,1.468375,2.285749
8,WSO_w1,logistic,0.703698,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",1.834608,5.143673e-07,1.447657,2.324988
9,FEAR_w1,logistic,0.698826,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",1.340402,8.160794e-08,1.204316,1.491866
10,FAT_w1,logistic,0.697791,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",1.386836,4.705596e-07,1.221157,1.574993
2,w1dres,logistic,0.678357,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",2.051659,9.219007e-06,1.493371,2.818661
1,w1bs,logistic,0.67814,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",2.595817,2.058632e-06,1.750844,3.84858
5,w1intbmi,logistic,0.614721,"{'model__C': 0.01, 'model__penalty': 'l2', 'mo...",1.056893,0.006772096,1.0154,1.100081
11,LEB_w1,logistic,0.612082,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",2.712065e-11,0.0102857,2.30392e-19,0.003193
7,CB_w1,logistic,0.605917,"{'model__C': 0.01, 'model__penalty': 'l2', 'mo...",1.032409,0.1514334,0.9883844,1.078394
0,w1tii,logistic,0.597794,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",2.152348,0.007636595,1.225529,3.780084


In [8]:
# Logistic-only summary (rounded to 0.001) before threshold selection
if "best_models_summary" not in globals():
    raise RuntimeError("best_models_summary not found; run the grid-search and summary cells first.")

# Compute accuracy/sensitivity/specificity at default 0.5 threshold for each logistic model

def _compute_metrics(feature: str):
    """Cross-validated accuracy, sensitivity and specificity at a 0.5 cut-off.

    Uses the stored best logistic pipeline for ``feature`` and out-of-fold
    probabilities from ``cross_val_predict`` on the shared ``cv`` splitter.
    Returns NaN for all three metrics when no estimator is stored.
    """
    fitted = log_best_estimators.get(feature)
    if fitted is None:
        return dict.fromkeys(("accuracy", "sensitivity", "specificity"), np.nan)

    proba = cross_val_predict(fitted, X[[feature]], y, cv=cv, method="predict_proba", n_jobs=-1)
    preds = (proba[:, 1] >= 0.5).astype(int)
    tn, fp, fn, tp = confusion_matrix(y, preds).ravel()

    positives = tp + fn
    negatives = tn + fp
    total = tp + tn + fp + fn
    return {
        "accuracy": (tp + tn) / total if total > 0 else np.nan,
        "sensitivity": tp / positives if positives > 0 else np.nan,
        "specificity": tn / negatives if negatives > 0 else np.nan,
    }

# Logistic rows only, with the default-threshold metrics appended column-wise.
log_only = best_models_summary[best_models_summary["model"] == "logistic"].copy()
metric_rows = [_compute_metrics(f) for f in log_only["feature"]]
log_only = pd.concat([log_only.reset_index(drop=True), pd.DataFrame(metric_rows)], axis=1)

# --- Derive SE from reported 95% CI (log-odds scale) ---
# If CI is for OR: OR = exp(beta), CI = exp(beta ± z*SE_beta)  =>
# SE_beta = (log(ci_high) - log(ci_low)) / (2*z)
if {"ci_low", "ci_high"}.issubset(log_only.columns):
    z = 1.959963984540054  # ~N(0,1) 97.5% quantile for 95% CI
    ci_low = pd.to_numeric(log_only["ci_low"], errors="coerce")
    ci_high = pd.to_numeric(log_only["ci_high"], errors="coerce")
    # log() is only defined for strictly positive bounds.
    ok = (ci_low > 0) & (ci_high > 0) & ci_low.notna() & ci_high.notna()
    se_beta = pd.Series(np.nan, index=log_only.index, dtype=float)
    se_beta.loc[ok] = (np.log(ci_high.loc[ok]) - np.log(ci_low.loc[ok])) / (2.0 * z)
    log_only["se_beta"] = se_beta
    # Delta-method SE for OR: Var(exp(beta)) ≈ exp(beta)^2 Var(beta)  => SE_or ≈ OR * SE_beta
    or_vals = pd.to_numeric(log_only.get("odds_ratio"), errors="coerce")
    log_only["se_or"] = or_vals * log_only["se_beta"]

# p_value is deliberately NOT rounded here: rounding first turns values in
# [0.0005, 0.001) into 0.001, which the "<0.001" check below would then miss.
# The formatter below already renders it with three decimals.
rounded_cols = {
    c: 3
    for c in [
        "best_auc",
        "odds_ratio",
        "se_beta",
        "se_or",
        "ci_low",
        "ci_high",
        "accuracy",
        "sensitivity",
        "specificity",
    ]
    if c in log_only
}
log_only = log_only.round(rounded_cols)

if "p_value" in log_only.columns:
    log_only["p_value"] = log_only["p_value"].apply(
        lambda v: "<0.001" if pd.notna(v) and v < 0.001 else (f"{v:.3f}" if pd.notna(v) else np.nan)
    )

if "model" in log_only.columns:
    log_only = log_only.drop(columns=["model"])

print("Logistic models (AUROC, ORs, SEs from 95% CI, default 0.5 threshold metrics; p<0.001 shown as <0.001):")
display(log_only)


Logistic models (AUROC, ORs, SEs from 95% CI, default 0.5 threshold metrics; p<0.001 shown as <0.001):


Unnamed: 0,feature,best_auc,best_params,odds_ratio,p_value,ci_low,ci_high,accuracy,sensitivity,specificity,se_beta,se_or
0,w1dep,0.708,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",1.832,<0.001,1.468,2.286,0.591,0.644,0.59,0.113,0.207
1,WSO_w1,0.704,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",1.835,<0.001,1.448,2.325,0.452,0.864,0.438,0.121,0.222
2,FEAR_w1,0.699,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",1.34,<0.001,1.204,1.492,0.604,0.78,0.598,0.055,0.073
3,FAT_w1,0.698,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",1.387,<0.001,1.221,1.575,0.53,0.814,0.521,0.065,0.09
4,w1dres,0.678,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",2.052,<0.001,1.493,2.819,0.507,0.814,0.498,0.162,0.332
5,w1bs,0.678,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",2.596,<0.001,1.751,3.849,0.493,0.797,0.483,0.201,0.522
6,w1intbmi,0.615,"{'model__C': 0.01, 'model__penalty': 'l2', 'mo...",1.057,0.007,1.015,1.1,0.683,0.458,0.69,0.02,0.022
7,LEB_w1,0.612,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",0.0,0.010,0.0,0.003,0.255,0.966,0.232,9.482,0.0
8,CB_w1,0.606,"{'model__C': 0.01, 'model__penalty': 'l2', 'mo...",1.032,0.151,0.988,1.078,0.734,0.407,0.744,0.022,0.023
9,w1tii,0.598,"{'model__C': 0.01, 'model__penalty': 'l1', 'mo...",2.152,0.008,1.226,3.78,0.495,0.644,0.49,0.287,0.618


## Threshold selection (balanced accuracy)

Pick the thresholds that maximize balanced accuracy for the logistic model with the best AUROC and, when `RUN_BRF = True`, for the best Balanced RF model.


In [9]:
# Candidate probability cut-offs: 0.05, 0.10, ..., 0.95.
thresholds = np.linspace(0.05, 0.95, 19)

# Honour the notebook-wide BRF switch (off when it was never defined).
RUN_BRF = bool(globals().get("RUN_BRF", False))


def evaluate_thresholds(name: str, search, X_subset):
    """Scan ``thresholds`` and report the cut-off with the best balanced accuracy.

    Out-of-fold probabilities come from ``cross_val_predict`` on
    ``search.best_estimator_`` with the shared ``cv`` splitter and global ``y``.
    Ties on balanced accuracy are broken by sensitivity, then specificity.

    Prints a one-line summary, displays the ten best rows, and returns the best
    row as a Series; returns None when no threshold has a finite balanced accuracy.
    """
    proba = cross_val_predict(search.best_estimator_, X_subset, y, cv=cv, method="predict_proba", n_jobs=-1)
    positive_prob = proba[:, 1]

    def _score(thr):
        # Confusion-matrix metrics for a single cut-off.
        preds = (positive_prob >= thr).astype(int)
        tn, fp, fn, tp = confusion_matrix(y, preds).ravel()
        sens = tp / (tp + fn) if (tp + fn) > 0 else np.nan
        spec = tn / (tn + fp) if (tn + fp) > 0 else np.nan
        acc = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else np.nan
        if np.isfinite(sens) and np.isfinite(spec):
            bal_acc = (sens + spec) / 2
        else:
            bal_acc = np.nan
        return {
            "threshold": thr,
            "sensitivity": sens,
            "specificity": spec,
            "accuracy": acc,
            "balanced_accuracy": bal_acc,
        }

    results = pd.DataFrame([_score(thr) for thr in thresholds])

    if results["balanced_accuracy"].notna().sum() == 0:
        print(f"{name}: no valid balanced accuracy values; showing top rows by sensitivity")
        display(results.sort_values("sensitivity", ascending=False).head(10))
        return None

    ranked = results.sort_values(["balanced_accuracy", "sensitivity", "specificity"], ascending=[False, False, False])
    best = ranked.iloc[0]
    print(
        f"{name}: best balanced accuracy {best.balanced_accuracy:.3f} at threshold {best.threshold:.2f} "
        f"(sens {best.sensitivity:.3f}, spec {best.specificity:.3f}, acc {best.accuracy:.3f})"
    )
    display(results.sort_values("balanced_accuracy", ascending=False).head(10))
    return best

# BRF threshold selection currently disabled
brf_best_threshold = None
if RUN_BRF and best_brf_search is not None and best_brf_feature is not None:
    brf_best_threshold = evaluate_thresholds("Balanced RF", best_brf_search, X[[best_brf_feature]])

log_best_threshold = evaluate_thresholds("Logistic", best_log_search, X[[best_log_feature]])


Logistic: best balanced accuracy 0.635 at threshold 0.45 (sens 0.898, spec 0.372, acc 0.388)


Unnamed: 0,threshold,sensitivity,specificity,accuracy,balanced_accuracy
8,0.45,0.898305,0.371774,0.388298,0.635039
10,0.55,0.491525,0.746842,0.73883,0.619184
9,0.5,0.644068,0.589786,0.591489,0.616927
11,0.6,0.372881,0.85777,0.842553,0.615326
12,0.65,0.237288,0.924767,0.903191,0.581027
7,0.4,1.0,0.124108,0.151596,0.562054
13,0.7,0.067797,0.969248,0.940957,0.518522
14,0.75,0.016949,0.989566,0.959043,0.503258
6,0.35,1.0,0.003295,0.034574,0.501647
0,0.05,1.0,0.0,0.031383,0.5


## Logistic odds ratios and p-values

Fit an unpenalized logistic model on the best logistic feature to report ORs and p-values.


In [10]:

# Unpenalized single-feature Logit for the top logistic predictor:
# median-impute the feature, add an intercept, then fit.
X_log = X[[best_log_feature]].copy()
X_log_imputed = X_log.fillna(X_log.median())
X_sm = sm.add_constant(X_log_imputed, has_constant='add')
logit_model = sm.Logit(y, X_sm)
logit_res = logit_model.fit(disp=False)

# Coefficient and its confidence bounds are exponentiated to the odds-ratio scale.
coef = logit_res.params[best_log_feature]
p_val = logit_res.pvalues[best_log_feature]
or_val = np.exp(coef)
ci_bounds = logit_res.conf_int().loc[best_log_feature]
ci_low, ci_high = np.exp(ci_bounds)

report_lines = [
    f"Best logistic feature: {best_log_feature}",
    f"Odds ratio: {or_val:.3f} (95% CI: {ci_low:.3f}, {ci_high:.3f})",
    f"p-value: {p_val:.3g}",
]
print("\n".join(report_lines))



Best logistic feature: w1dep
Odds ratio: 1.832 (95% CI: 1.468, 2.286)
p-value: 8.2e-08
