# Multivariate prediction models

This notebook evaluates the persistence cohort with five pipelines: balanced random forest, logistic regression, iBRF, TabPFN random forest, and AutoTabPFN. The deprecated tree heuristics remain removed.


In [None]:
import urllib.request
from pathlib import Path

# Raw-content root of the repository that hosts the helper module, the
# dependency list, and the dataset used by this notebook.
BASE_URL = "https://raw.githubusercontent.com/FelixZhan/AtyAN/main/"
HELPER_FILES = [
    "analysis_utils.py",
    "requirements.txt",
    "BP1234-ONSET.csv",
]

# Fetch every helper file that is not already in the working directory.
for filename in HELPER_FILES:
    dest = Path(filename)
    if dest.exists():
        print(f'{filename} already present, skipping download.')
        continue
    print(f'Downloading {filename}...')
    # Download to a temporary sibling and rename only on success. Writing
    # straight to `dest` would leave a truncated file behind if the transfer
    # fails, and the existence check above would then skip it on every rerun.
    partial = dest.with_name(dest.name + '.part')
    try:
        urllib.request.urlretrieve(f"{BASE_URL}{filename}", partial)
        partial.replace(dest)
    finally:
        # After a successful rename this is a no-op; after a failure it
        # removes the incomplete download.
        if partial.exists():
            partial.unlink()
print('Helper files are ready.')


In [None]:
!pip install -q -r requirements.txt


## Dataset and design matrix

We reuse the persistence filter so the multivariate models are aligned with the univariate regressions.


In [None]:
from analysis_utils import (
    load_base_dataset,
    engineer_baseline_features,
    prepare_persistence_dataset,
    evaluate_model_zoo,
)
from IPython.display import display


In [None]:
# Optional list of model names to evaluate. It is forwarded to
# evaluate_model_zoo as `model_names=model_subset or None`, so an empty list
# is passed as None (presumably "run every model" — confirm in analysis_utils).
model_subset = []  # e.g., ['tabpfn_random_forest']


In [None]:

# Build the design matrix in three steps: load the base dataset, engineer the
# baseline features (which also returns the named feature sets), then apply
# the persistence preparation using the "all_features" set.
raw_df = load_base_dataset()
feature_df, feature_sets = engineer_baseline_features(raw_df)
persistence_df = prepare_persistence_dataset(
    feature_df,
    feature_sets["all_features"],
)

# Quick sanity check on the size of the resulting frame.
print(f"Design matrix shape: {persistence_df.shape}")


## Model comparison summary

Each pipeline uses stratified repeated holdout (with a configurable number of repeats and test size) and reports ROC-AUC, PR-AUC, balanced accuracy, F1, and accuracy. Set `model_subset` to rerun only a subset of the models (e.g., just TabPFN) when GPU time is scarce.


In [None]:
# Evaluate the requested pipelines on the persistence design matrix. An empty
# `model_subset` is forwarded as None.
model_metrics, split_tables, feature_tables, model_errors = evaluate_model_zoo(
    persistence_df,
    feature_sets['all_features'],
    target_col='aan_persistence',
    model_names=model_subset or None,
)
display(model_metrics)

# For models whose `overfit_flag` is set, also show their per-split table.
# Iterating an empty selection is a no-op, so no separate emptiness check on
# the filtered frame is needed.
if not model_metrics.empty and 'overfit_flag' in model_metrics.columns:
    flagged = model_metrics[model_metrics['overfit_flag']]
    for name in flagged['model']:
        display(split_tables[name])

# Report any per-model errors returned alongside the metrics.
for name, err in model_errors.items():
    print(f'{name}: {err}')


## Feature importance snapshots

Tree-based models expose feature importances; logistic regression reports absolute coefficients instead. The top 20 features are shown for each model.


In [None]:
# Show the first 20 rows of each model's feature table under a short heading.
for model_name, table in feature_tables.items():
    print(f'Top features for {model_name}')
    top_rows = table.head(20)
    display(top_rows)
