In [None]:
# Fetch the helper module, requirements file, and dataset when running in hosted notebooks.
# Dependencies are installed in the next cell, after requirements.txt has been downloaded.

import urllib.request
from pathlib import Path

# Location of the raw files in the upstream repository and the files this
# notebook needs next to it in the working directory.
BASE_URL = "https://raw.githubusercontent.com/FelixZhan/AtyAN/main/"
HELPER_FILES = [
    "analysis_utils.py",
    "requirements.txt",
    "BP1234-ONSET.csv",
]

# Fetch each helper file unless a copy is already present locally.
for filename in HELPER_FILES:
    dest = Path(filename)
    if dest.exists():
        print(f'{filename} already present, skipping download.')
        continue
    print(f'Downloading {filename}...')
    # Download into a temporary sibling and rename only on success, so an
    # interrupted transfer never leaves a truncated file that the
    # `dest.exists()` check above would accept on the next run.
    partial = dest.with_name(dest.name + '.part')
    try:
        urllib.request.urlretrieve(f"{BASE_URL}{filename}", partial)
        partial.replace(dest)
    finally:
        # After a successful rename the temporary file no longer exists;
        # after a failure, remove whatever was partially written.
        if partial.exists():
            partial.unlink()
print('Helper files are ready.')


In [None]:
!pip install -q -r requirements.txt


## Imports and shared setup

In [None]:
from analysis_utils import (
    load_base_dataset,
    engineer_baseline_features,
    prepare_univariate_prediction_dataset,
    prepare_persistence_dataset,
    run_univariate_logistic_regressions,
    evaluate_model_zoo,
)
from IPython.display import display


In [None]:
# Feature flag for the optional persistence/remission cells further down:
# while it is False those cells only print a "disabled"/"skipping" message
# instead of building the persistence cohort and fitting models on it.
RUN_PERSISTENCE = False  # Set True to re-enable persistence/remission analyses

In [None]:
# Load the base dataset, then derive the engineered feature table along with
# `feature_sets`, which is indexed by group name (e.g. "all_features").
raw_df = load_base_dataset()
feature_df, feature_sets = engineer_baseline_features(raw_df)

# Report both shapes so a reader can sanity-check the inputs before modelling.
print(f'Dataset shape: {raw_df.shape}')
feature_matrix_shape = feature_df[feature_sets["all_features"]].shape
print(f'Feature matrix shape: {feature_matrix_shape}')


## Univariate prediction of future atypical AN onset

Participants with full AN diagnoses or baseline atypical AN onset are removed to mirror the original risk-prediction experiment. The target labels any mBMI-defined atypical AN onset across waves 1–6.

In [None]:
# Outcome column shared by the cohort summary and the regression call below.
onset_target = 'aan_onset_anywave'

# Build the cohort used for the univariate onset-prediction analysis.
prediction_df = prepare_univariate_prediction_dataset(feature_df, feature_sets['all_features'])

# Summarise cohort size and how the outcome values are distributed.
outcome_counts = prediction_df[onset_target].value_counts().to_dict()
print('Univariate prediction cohort size:', len(prediction_df))
print('Outcome counts:', outcome_counts)

# Run the univariate logistic-regression helper over the full feature list
# and render the resulting table.
onset_logistic = run_univariate_logistic_regressions(
    prediction_df,
    feature_sets['all_features'],
    target_col=onset_target,
)
display(onset_logistic)

## Univariate persistence vs. remission analyses

Set `RUN_PERSISTENCE = True` above to execute the persistence/remission cohort cells. The dataset retains participants with baseline or mBMI-defined onset who have complete onset data for waves 1–6, and labels cases that revisit onset after at least one remission wave.

In [None]:
# Stays None unless the persistence analyses are switched on, so later cells
# can test for it before using the cohort.
persistence_df = None
if not RUN_PERSISTENCE:
    print('Persistence/remission analyses are disabled. Set RUN_PERSISTENCE = True to enable them.')
else:
    # Build the persistence cohort from the engineered features.
    persistence_df = prepare_persistence_dataset(feature_df, feature_sets['all_features'])
    print('Persistence cohort size:', len(persistence_df))
    # Outcome tally, with the resulting series renamed to 'count' for display.
    print(
        persistence_df['aan_persistence']
        .value_counts()
        .rename('count')
    )

### Onset logistic summaries


In [None]:
# Show only the first 20 rows of the univariate onset regression table
# computed in the earlier onset-prediction cell.
display(onset_logistic.head(20))

### Onset model zoo


In [None]:
# Run the model-zoo evaluation on the onset cohort.
# The third return value is bound to a real name instead of `_`: assigning to
# `_` at the top level of a notebook overwrites IPython's "last output"
# variable and stops it from being updated. The name mirrors
# `persistence_feature_tables` in the persistence model-zoo cell.
# NOTE(review): `model_names=None` presumably selects the helper's default
# model list — confirm in analysis_utils.
onset_model_metrics, onset_split_tables, onset_feature_tables, onset_errors = evaluate_model_zoo(
    prediction_df,
    feature_sets['all_features'],
    target_col='aan_onset_anywave',
    model_names=None,
)
if onset_model_metrics.empty:
    print('No onset models could be evaluated.')
else:
    # Preferred column order for the summary; metrics missing from the
    # result frame are silently skipped.
    metric_order = [
        'test_roc_auc_mean',
        'test_accuracy_mean',
        'test_sensitivity_mean',
        'test_specificity_mean',
        'test_f_score_mean',
        'test_g_mean_mean',
    ]
    present = ['model'] + [c for c in metric_order if c in onset_model_metrics.columns]
    display(onset_model_metrics[present])
    # For models flagged via the `overfit_flag` column, also show their
    # per-split tables so the flag can be inspected.
    if 'overfit_flag' in onset_model_metrics.columns:
        flagged = onset_model_metrics[onset_model_metrics['overfit_flag']]
        for name in flagged['model']:
            display(onset_split_tables[name])
# Surface any per-model errors reported by the evaluation helper.
for name, err in onset_errors.items():
    print(f'{name}: {err}')

### Persistence logistic summaries


In [None]:
# Guard first: without the flag and a prepared cohort there is nothing to fit.
if not RUN_PERSISTENCE or persistence_df is None:
    print('Skipping persistence logistic summaries.')
else:
    # Run the univariate logistic-regression helper against the persistence
    # outcome and show the first 20 rows of the result.
    persistence_logistic = run_univariate_logistic_regressions(
        persistence_df, feature_sets['all_features'], target_col='aan_persistence'
    )
    display(persistence_logistic.head(20))

### Persistence model zoo


In [None]:
# Guard first: the model zoo only runs when the persistence analyses are
# enabled and the cohort has actually been built.
if not RUN_PERSISTENCE or persistence_df is None:
    print('Skipping persistence model zoo.')
else:
    (
        persistence_model_metrics,
        persistence_split_tables,
        persistence_feature_tables,
        persistence_errors,
    ) = evaluate_model_zoo(
        persistence_df,
        feature_sets['all_features'],
        target_col='aan_persistence',
        model_names=None,
    )
    # Preferred column order for the summary table; metrics absent from the
    # result frame are skipped.
    metric_order = [
        'test_roc_auc_mean',
        'test_accuracy_mean',
        'test_sensitivity_mean',
        'test_specificity_mean',
        'test_f_score_mean',
        'test_g_mean_mean',
    ]
    if persistence_model_metrics.empty:
        print('No persistence models could be evaluated.')
    else:
        available = set(persistence_model_metrics.columns)
        present = ['model'] + [metric for metric in metric_order if metric in available]
        display(persistence_model_metrics[present])
        # Show the per-split tables of every model whose `overfit_flag` is set.
        if 'overfit_flag' in persistence_model_metrics.columns:
            flagged = persistence_model_metrics[persistence_model_metrics['overfit_flag']]
            for name in flagged['model']:
                display(persistence_split_tables[name])
    # First 20 rows of each feature table returned by the helper.
    for label, table in persistence_feature_tables.items():
        display(table.head(20))
    # Surface any per-model errors reported by the evaluation helper.
    for name, err in persistence_errors.items():
        print(f'{name}: {err}')