In [None]:
# Download the helper module, requirements file, and data CSV when running in hosted notebooks.
# Dependencies are installed from requirements.txt in the next cell, once the file is present.

import urllib.request
from pathlib import Path

# Prefix of the raw files in the project repository; each name in HELPER_FILES
# is appended to it to form the download URL.
BASE_URL = "https://raw.githubusercontent.com/FelixZhan/AtyAN/main/"
# Files this notebook needs in the working directory.
HELPER_FILES = [
    "analysis_utils.py",
    "requirements.txt",
    "BP1234-ONSET.csv",
]

for filename in HELPER_FILES:
    dest = Path(filename)
    if dest.exists():
        print(f'{filename} already present, skipping download.')
        continue
    print(f'Downloading {filename}...')
    # Download to a temporary sibling first: an interrupted transfer must not
    # leave a truncated file at `dest`, because the exists() check above would
    # then skip it on every later run.
    partial = dest.with_name(dest.name + '.part')
    try:
        urllib.request.urlretrieve(f"{BASE_URL}{filename}", partial)
        # Only a fully retrieved file is moved into its final place.
        partial.replace(dest)
    finally:
        # Remove any leftover partial file after a failed download.
        if partial.exists():
            partial.unlink()
print('Helper files are ready.')


In [None]:
!pip install -q -r requirements.txt


## Dataset and design matrix

By default the multivariate models target future atypical AN onset (waves 1–6). Set `RUN_PERSISTENCE_MODELS = True` to reuse the persistence/remission cohort instead.

In [None]:
from analysis_utils import (
    load_base_dataset,
    engineer_baseline_features,
    prepare_univariate_prediction_dataset,
    prepare_persistence_dataset,
    evaluate_model_zoo,
)
from IPython.display import display

In [None]:
# Cohort switch: leave as False to target onset; set to True to target persistence instead.
RUN_PERSISTENCE_MODELS = False

In [None]:
# Load the base dataset and derive the engineered features plus the named feature sets.
raw_df = load_base_dataset()
feature_df, feature_sets = engineer_baseline_features(raw_df)

# Choose the design matrix together with its target column and cohort label,
# so the three always change as a unit. Only the selected branch is evaluated.
design_df, target_label, cohort_label = (
    (
        prepare_persistence_dataset(feature_df, feature_sets['all_features']),
        'aan_persistence',
        'persistence/remission',
    )
    if RUN_PERSISTENCE_MODELS
    else (
        prepare_univariate_prediction_dataset(feature_df, feature_sets['all_features']),
        'aan_onset_anywave',
        'onset prediction',
    )
)

# Report what was built so the reader can confirm the toggle took effect.
print(f"Design matrix shape: {design_df.shape}")
print(f"Target: {target_label} ({cohort_label})")

## Model comparison summary

Each pipeline uses stratified repeated holdout splits and now reports AUROC, accuracy, sensitivity, specificity, F-score, and G-mean alongside the previous diagnostics. All available models run by default.

In [None]:
# Evaluate every model (model_names=None) on the design matrix for the chosen target.
model_metrics, split_tables, feature_tables, model_errors = evaluate_model_zoo(
    design_df,
    feature_sets['all_features'],
    target_col=target_label,
    model_names=None,
)

if model_metrics.empty:
    print('No models could be evaluated.')
else:
    # Preferred column order for the summary; metrics absent from the results are dropped.
    metric_order = [
        'test_roc_auc_mean',
        'test_accuracy_mean',
        'test_sensitivity_mean',
        'test_specificity_mean',
        'test_f_score_mean',
        'test_g_mean_mean',
    ]
    present = ['model', *(metric for metric in metric_order if metric in model_metrics.columns)]
    display(model_metrics[present])

    # For models flagged in the 'overfit_flag' column, show their per-split tables.
    # Iterating an empty selection is a no-op, so no separate emptiness guard is needed.
    if 'overfit_flag' in model_metrics.columns:
        flagged = model_metrics[model_metrics['overfit_flag']]
        for name in flagged['model']:
            display(split_tables[name])

# Always list the errors returned for individual models, even when no metrics were produced.
for name, err in model_errors.items():
    print(f'{name}: {err}')

## Feature importance snapshots

Tree-based models expose feature importances; for logistic regression, the absolute coefficients are reported instead.

In [None]:
# Walk the per-model tables returned by evaluate_model_zoo and show the first
# 20 rows of each under a heading that names the model.
for model_name, table in feature_tables.items():
    print(f'Top features for {model_name}')
    display(table.head(n=20))
