In [3]:
import urllib.request
from pathlib import Path

# Root URL of the repository's raw files; each helper file name is appended to it.
BASE_URL = "https://raw.githubusercontent.com/FelixZhan/AtyAN/main/"
# Files fetched into the current working directory if they are not already there.
HELPER_FILES = [
    "analysis_utils.py",
    "requirements.txt",
    "BP1234-ONSET.csv",
]

for filename in HELPER_FILES:
    dest = Path(filename)
    if dest.exists():
        print(f"{filename} already present, skipping download.")
        continue
    print(f"Downloading {filename}...")
    # Download into a sibling ".part" file and rename only on success.
    # urlretrieve writes the target incrementally, so an interrupted transfer
    # would otherwise leave a truncated file that the exists() check above
    # silently accepts on every later run.
    partial = dest.with_name(dest.name + ".part")
    try:
        urllib.request.urlretrieve(f"{BASE_URL}{filename}", partial)
        partial.replace(dest)
    finally:
        # After a successful rename this is a no-op; after a failure it
        # removes the incomplete download.
        if partial.exists():
            partial.unlink()

print("Helper files are ready.")


Downloading analysis_utils.py...
Downloading requirements.txt...
Downloading BP1234-ONSET.csv...
Helper files are ready.


In [14]:
!pip install -q -r requirements.txt
!pip install -q imbalanced-learn



  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m551.9/551.9 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m83.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.5/14.5 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.3/487.3 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.1/225.1 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71

In [15]:
import numpy as np
import pandas as pd

from analysis_utils import (
    load_base_dataset,
    engineer_baseline_features,
)

from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_score,
    recall_score,
    confusion_matrix,
    make_scorer,
)

from imblearn.ensemble import BalancedRandomForestClassifier


In [16]:
# Load the raw table, then derive the engineered feature frame together with
# `feature_sets`, which is indexed by set name (here: 'all_features').
raw_df = load_base_dataset()
engineered = engineer_baseline_features(raw_df)
feature_df, feature_sets = engineered

# Report the dimensions of the raw data and of the 'all_features' column subset.
print(f"Raw dataset shape: {raw_df.shape}")
print(f"Feature matrix shape: {feature_df[feature_sets['all_features']].shape}")


Raw dataset shape: (1952, 3713)
Feature matrix shape: (1952, 24)


In [19]:
# Name of the unique participant identifier column.
# Change it if your data uses something else (e.g. "study_id").
ID_COL = "id"

# Wave number (1-6) -> name of the AAN presence column (0/1) for that wave.
# >>> EDIT THESE NAMES TO MATCH YOUR DATA <<<
AAN_PRESENCE_COLS = dict(
    zip(
        range(1, 7),
        (
            "w1ONSET-FULL",
            "w2ONSET-FULL-mBMI",
            "w3ONSET-FULL-mBMI",
            "w4ONSET-FULL-mBMI",
            "w5ONSET-FULL-mBMI",
            "w6ONSET-FULL-mBMI",
        ),
    )
)

# Column-name templates for the per-wave ed14 / ed16 items.
# Formatting with wave=4 yields "w4ed14" / "w4ed16", and so on.
ED14_TEMPLATE, ED16_TEMPLATE = "w{wave}ed14", "w{wave}ed16"

print("Using AAN presence columns:", AAN_PRESENCE_COLS)
print("ID column:", ID_COL)


Using AAN presence columns: {1: 'w1ONSET-FULL', 2: 'w2ONSET-FULL-mBMI', 3: 'w3ONSET-FULL-mBMI', 4: 'w4ONSET-FULL-mBMI', 5: 'w5ONSET-FULL-mBMI', 6: 'w6ONSET-FULL-mBMI'}
ID column: id


In [20]:
# Start the label table from the participant ID column only.
labels = raw_df[[ID_COL]].copy()

# Copy each wave's presence indicator, treating missing values as 0 and
# casting to int.
for col in AAN_PRESENCE_COLS.values():
    presence = raw_df[col].fillna(0)
    labels[col] = presence.astype(int)

# Onset at ANY wave (1–6): flag rows whose presence indicators sum above zero.
presence_cols_all = [AAN_PRESENCE_COLS[w] for w in sorted(AAN_PRESENCE_COLS)]
presence_total = labels[presence_cols_all].sum(axis=1)
labels["aan_onset_any_1_6"] = presence_total.gt(0).astype(int)

print("Onset (any wave 1–6) counts:")
print(labels["aan_onset_any_1_6"].value_counts().rename("count"))


Onset (any wave 1–6) counts:
aan_onset_any_1_6
0    1859
1      93
Name: count, dtype: int64


In [21]:
def first_onset_wave(row):
    """Return the lowest wave number whose presence column equals 1 in ``row``.

    Waves are scanned in ascending order of the keys of ``AAN_PRESENCE_COLS``.
    Returns ``np.nan`` when no wave's presence column equals 1.
    """
    onset_waves = (
        w for w in sorted(AAN_PRESENCE_COLS) if row[AAN_PRESENCE_COLS[w]] == 1
    )
    return next(onset_waves, np.nan)

# Row-wise scan of the label table; rows without any onset get NaN.
labels["first_onset_wave"] = labels.apply(first_onset_wave, axis=1)

def persistence_anchor_wave(first_w):
    """
    Translate a first-onset wave into the wave whose w{x}ed14 / w{x}ed16
    items are inspected for persistence.

    Intervals: 1–4, 4–5, 5–6.
    - first onset at wave 4 or earlier -> 4
    - first onset at wave 5            -> 5
    - first onset at wave 6            -> 6
    - missing input, or a wave above 6 -> NaN

    The input is truncated with ``int()`` before comparison, so any value
    that truncates to 4 or less maps to 4.
    """
    if pd.isna(first_w):
        return np.nan
    onset = int(first_w)
    if onset > 6:
        return np.nan
    # Waves up to 4 share anchor 4; waves 5 and 6 anchor on themselves.
    return max(onset, 4)

# Derive the anchor wave for every participant from their first onset wave.
anchor_waves = labels["first_onset_wave"].apply(persistence_anchor_wave)
labels["persistence_anchor_wave"] = anchor_waves

# Show both distributions, including the NaN bucket (rows with no onset).
for heading, column in (
    ("First onset wave distribution:", "first_onset_wave"),
    ("\nPersistence anchor wave distribution:", "persistence_anchor_wave"),
):
    print(heading)
    print(labels[column].value_counts(dropna=False).rename("count"))


First onset wave distribution:
first_onset_wave
NaN    1859
1.0      34
4.0      31
3.0      17
5.0       8
2.0       2
6.0       1
Name: count, dtype: int64

Persistence anchor wave distribution:
persistence_anchor_wave
NaN    1859
4.0      84
5.0       8
6.0       1
Name: count, dtype: int64


In [22]:
# Copy the ed14 / ed16 columns for waves 4, 5 and 6 from raw_df into labels.
# A column that raw_df lacks is reported and skipped, not treated as an error.
for wave in (4, 5, 6):
    for tmpl in (ED14_TEMPLATE, ED16_TEMPLATE):
        col = tmpl.format(wave=wave)
        if col not in raw_df.columns:
            print(f"WARNING: column {col} not found in raw_df; it will be treated as missing.")
            continue
        labels[col] = raw_df[col]

def is_persistent(row, missing_as_nonpersistent=True):
    """
    Classify one participant row as persistent (1) or not (0).

    The row's ``persistence_anchor_wave`` selects which wave's w{x}ed14 and
    w{x}ed16 values are read. The row is persistent only when BOTH values
    are >= 5.

    Returns 0 when the anchor wave is missing or not one of 4, 5, 6.
    When either item is missing (NaN or absent from the row), returns 0 if
    ``missing_as_nonpersistent`` is true, otherwise ``np.nan``.
    """
    anchor = row["persistence_anchor_wave"]
    if pd.isna(anchor):
        return 0
    anchor = int(anchor)
    if anchor not in (4, 5, 6):
        return 0

    # Read both items for the anchor wave; an absent column counts as missing.
    scores = [
        row.get(template.format(wave=anchor), np.nan)
        for template in (ED14_TEMPLATE, ED16_TEMPLATE)
    ]

    if any(pd.isna(score) for score in scores):
        return 0 if missing_as_nonpersistent else np.nan

    return int(all(score >= 5 for score in scores))

# Evaluate persistence row by row with the default missing-data handling.
# With the default, is_persistent only returns 0 or 1, so the int cast is safe.
persistence_flags = labels.apply(is_persistent, axis=1)
labels["aan_persistence_new"] = persistence_flags.astype(int)

print("New persistence label counts (aan_persistence_new):")
print(labels["aan_persistence_new"].value_counts().rename("count"))


New persistence label counts (aan_persistence_new):
aan_persistence_new
0    1925
1      27
Name: count, dtype: int64


In [23]:
# Boolean masks reused by the rules below.
had_onset = labels["aan_onset_any_1_6"].eq(1)
not_persistent = labels["aan_persistence_new"].eq(0)

# Base remission: onset at any wave (1–6), BUT not persistent.
labels["remission_base"] = (had_onset & not_persistent).astype(int)

# Extra remission rule: AAN in wave 2 or 3, but NOT in wave 4.
w2_col = AAN_PRESENCE_COLS[2]
w3_col = AAN_PRESENCE_COLS[3]
w4_col = AAN_PRESENCE_COLS[4]

early_without_w4 = (
    (labels[w2_col].eq(1) | labels[w3_col].eq(1)) & labels[w4_col].eq(0)
)
labels["remission_early_2_3_no_4"] = early_without_w4.astype(int)

# Final remission: must NOT be persistent, and satisfy base or early-remission rule.
# NOTE(review): presence in wave 2 or 3 presumably already implies onset at
# any wave, in which case the early rule adds no rows beyond remission_base.
# Confirm whether that is intended.
meets_any_rule = (
    labels["remission_base"].eq(1) | labels["remission_early_2_3_no_4"].eq(1)
)
labels["aan_remission_new"] = (not_persistent & meets_any_rule).astype(int)

print("New remission label counts (aan_remission_new):")
print(labels["aan_remission_new"].value_counts().rename("count"))

# Optional: combined course label
# 0 = no onset; 1 = remission; 2 = persistence
# Persistence is written last, so it takes precedence if both flags are set.
labels["aan_course_new"] = 0
for course_code, flag_col in ((1, "aan_remission_new"), (2, "aan_persistence_new")):
    labels.loc[labels[flag_col] == 1, "aan_course_new"] = course_code

print("\nCourse label counts (aan_course_new: 0=none,1=remission,2=persistence):")
print(labels["aan_course_new"].value_counts().rename("count"))


New remission label counts (aan_remission_new):
aan_remission_new
0    1886
1      66
Name: count, dtype: int64

Course label counts (aan_course_new: 0=none,1=remission,2=persistence):
aan_course_new
0    1859
1      66
2      27
Name: count, dtype: int64


In [24]:
# Attach the onset and persistence labels to the feature table, matching on
# participant ID. The inner join drops rows present on only one side.
label_subset = labels[[ID_COL, "aan_onset_any_1_6", "aan_persistence_new"]]
model_df = feature_df.merge(label_subset, on=ID_COL, how="inner")

# Keep only participants with AAN onset (1–6); copy so later edits do not
# touch the merged frame.
onset_mask = model_df["aan_onset_any_1_6"] == 1
model_df = model_df[onset_mask].copy()

# Binary outcome vector: 1 = persistent, 0 = not persistent.
y = model_df["aan_persistence_new"].astype(int).values

print("Modeling cohort (onset only) size:", len(model_df))
print("Persistence outcome counts:")
print(pd.Series(y).value_counts().rename("count"))

# Columns the labels were derived from must not be used as predictors (leakage):
# the per-wave presence columns plus the wave 4-6 ed14 / ed16 items.
target_related_cols = set(AAN_PRESENCE_COLS.values())
for wave in (4, 5, 6):
    target_related_cols.update(
        template.format(wave=wave) for template in (ED14_TEMPLATE, ED16_TEMPLATE)
    )

all_features = feature_sets["all_features"]
predictor_cols = [c for c in all_features if c not in target_related_cols]

print(f"\nUsing {len(predictor_cols)} predictors (after excluding target-related columns).")

# Simple median imputation over the whole onset cohort.
# NOTE(review): the medians use every row, so if X is cross-validated later
# the held-out folds influence the imputation. Consider imputing inside the
# model pipeline.
X = model_df[predictor_cols].copy()
column_medians = X.median(numeric_only=True)
X = X.fillna(column_medians)


Modeling cohort (onset only) size: 93
Persistence outcome counts:
0    66
1    27
Name: count, dtype: int64

Using 24 predictors (after excluding target-related columns).
