This notebook focuses on one algorithm (TabNet), one baseline (SVM), and one CNN feature extractor (pretrained ConvNeXt).
- Still running all of the same tasks.
- Running the 4 different ConvNeXt datasets (which differ in the number of features and in the SD info).




In [12]:
# STEP 1 — config & smoke test

from pathlib import Path

# Feature tables to evaluate, keyed by the short name used when printing.
DATASETS = {
    'ConvNext_128d': '/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_cleaned_patient_features_128d.csv',
    'ConvNext_256d': '/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_cleaned_patient_features_256d.csv',
    'ConvNext_Sep_128d': '/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_cleaned_patient_features_separate_128d.csv',
    'ConvNext_Sep_256d': '/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_cleaned_patient_features_separate_256d.csv',
}

# The text report is written into this folder; create it if it is missing.
REPORT_DIR = Path("/Users/joi263/Documents/MultimodalTabData/data/convnext_data")
REPORT_DIR.mkdir(parents=True, exist_ok=True)
REPORT_PATH = REPORT_DIR / "convnext_tabnet_svm_report.txt"

# Fail fast: stop at the first configured CSV that is not on disk.
for name, p in DATASETS.items():
    pth = Path(p)
    if pth.exists():
        continue
    raise FileNotFoundError(f"{name} not found at: {pth}")

# TabNet is optional: record whether it can be imported instead of failing.
try:
    from pytorch_tabnet.tab_model import TabNetClassifier  # noqa
except Exception:
    TABNET_AVAILABLE = False
    print("TabNet not available. You can still run SVM. To add TabNet: pip install pytorch-tabnet torch")
else:
    TABNET_AVAILABLE = True

# One print call, one line per message (same text as three separate prints).
print(
    "All dataset paths found.",
    f"Report will be written to: {REPORT_PATH}",
    f"TabNet available: {TABNET_AVAILABLE}",
    sep="\n",
)


All dataset paths found.
Report will be written to: /Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_tabnet_svm_report.txt
TabNet available: True


In [10]:
# STEP 2 — loaders, targets, and features

import pandas as pd
import numpy as np

# Horizons for the mortality tasks: build_targets creates one
# "Mortality_{m}mo" target per entry m and compares it against `survival`.
MORTALITY_WINDOWS = [6, 12, 24]

def load_dataset(path: str) -> pd.DataFrame:
    """Parse the CSV located at `path` with pandas and return the resulting frame."""
    frame = pd.read_csv(path)
    return frame

def build_targets(df: pd.DataFrame, windows=None):
    """
    Build every binary target that can be derived from the columns of `df`.

    Parameters
    ----------
    df : pd.DataFrame
        Patient table. Each task is only produced when its source column(s)
        are present; missing columns simply skip that task.
    windows : iterable of numbers, optional
        Mortality horizons, compared against `survival`. Defaults to the
        module-level MORTALITY_WINDOWS.

    Returns
    -------
    dict: task_name -> (row_index, y_array)
        `row_index` holds the labels of the rows eligible for the task and
        `y_array` is the matching 0/1 numpy array.

    Tasks
    -----
    - Mortality_{m}mo: uses survival (months) and patient_status
      (1 = alive, 2 = deceased with known date, 3 = deceased, date unknown).
    - HighGrade: keyword match on the methylation_class text.
    - IDH_mut: idh_1_r132h (1=neg, 2=mut).
    - MGMT_meth: mgmt (1=methylated, 2=unmethylated).
    """
    if windows is None:
        windows = MORTALITY_WINDOWS

    targets = {}

    # Mortality (landmark at each horizon). Everything that touches `status`
    # lives inside this guard so a frame without the survival columns skips
    # the mortality tasks instead of raising NameError.
    if {'survival', 'patient_status'}.issubset(df.columns):
        has_time = df['survival'].notna()
        status = df['patient_status']
        known_status = status.isin([1, 2])  # exclude 3 (death date unknown)

        for m in windows:
            # Eligible if: known time, known status, and either died (any time)
            # OR observed at least m months.
            eligible = has_time & known_status & ((status == 2) | (df['survival'] >= m))
            idx = df.index[eligible]

            # Positive = died by m months with known death date.
            y = ((status.loc[idx] == 2) & (df.loc[idx, 'survival'] <= m)).astype(int).to_numpy()
            targets[f"Mortality_{m}mo"] = (idx, y)

    # High-grade tumor via methylation_class.
    if 'methylation_class' in df.columns:
        # Take the missing-value mask BEFORE casting to str: astype(str) turns
        # NaN into the text 'nan', which would make every row look labelled
        # and count unlabelled patients as low-grade.
        hg_mask = df['methylation_class'].notna()
        hg_idx = df.index[hg_mask]
        cls = df.loc[hg_idx, 'methylation_class'].astype(str).str.lower()
        high_terms = ['glioblastoma', 'anaplastic', 'high grade', 'grade iv', 'gbm']
        y = cls.str.contains('|'.join(high_terms), na=False).astype(int).to_numpy()
        targets["HighGrade"] = (hg_idx, y)

    # IDH mutation: only rows coded 1 or 2 are usable.
    if 'idh_1_r132h' in df.columns:
        idh_mask = df['idh_1_r132h'].isin([1, 2])
        idh_idx = df.index[idh_mask]
        y = (df.loc[idh_idx, 'idh_1_r132h'] == 2).astype(int).to_numpy()
        targets["IDH_mut"] = (idh_idx, y)

    # MGMT methylation: only rows coded 1 or 2 are usable.
    if 'mgmt' in df.columns:
        mgmt_mask = df['mgmt'].isin([1, 2])
        mgmt_idx = df.index[mgmt_mask]
        y = (df.loc[mgmt_idx, 'mgmt'] == 1).astype(int).to_numpy()
        targets["MGMT_meth"] = (mgmt_idx, y)

    return targets

def select_features(df: pd.DataFrame):
    """
    List the candidate feature columns that are present in `df`.

    The result is ordered clinical, then molecular, then image columns.
    Columns that leak a task's target are NOT removed here; that happens
    per task later.
    """
    available = df.columns

    clinical_names = ('age', 'sex', 'race', 'ethnicity', 'gtr')
    molecular_names = ('mgmt_pyro', 'atrx', 'p53', 'braf_v600', 'h3k27m', 'gfap',
                       'idh_1_r132h', 'mgmt')

    clinical = [name for name in clinical_names if name in available]
    molecular = [name for name in molecular_names if name in available]

    # Two image-feature schemas exist: separate mean/std columns, or plain
    # 'feature_' columns. The presence of any 'mean_feature_' column decides.
    has_mean_schema = any(col.startswith('mean_feature_') for col in available)
    prefixes = ('mean_feature_', 'std_feature_') if has_mean_schema else ('feature_',)
    image = [col for col in available if col.startswith(prefixes)]

    return clinical + molecular + image

# Smoke test: exercise the loader, target builder and feature selector on the
# first configured dataset only.
_first_path = next(iter(DATASETS.values()))
_df0 = load_dataset(_first_path)
_tgts0, _feats0 = build_targets(_df0), select_features(_df0)

print("Sample dataset shape:", _df0.shape)
print("Num tasks found:", len(_tgts0))
# Per task: number of eligible rows and the fraction labelled positive.
for name in _tgts0:
    idx, y = _tgts0[name]
    print(f"{name:>14} -> n={len(idx)}, pos_rate={y.mean():.3f}")
print("Num features selected:", len(_feats0))


Sample dataset shape: (532, 232)
Num tasks found: 6
 Mortality_6mo -> n=86, pos_rate=0.221
Mortality_12mo -> n=84, pos_rate=0.452
Mortality_24mo -> n=83, pos_rate=0.843
     HighGrade -> n=532, pos_rate=0.242
       IDH_mut -> n=196, pos_rate=0.786
     MGMT_meth -> n=212, pos_rate=0.396
Num features selected: 141


In [13]:
# STEP 3 — evaluation loop + report (SVM + optional TabNet) with AP (PR-AUC)

from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score, average_precision_score
from sklearn.svm import SVC

# TabNet is an optional dependency: any failure while importing it just
# switches the flag off so the rest of the cell can still be used.
try:
    from pytorch_tabnet.tab_model import TabNetClassifier
except Exception:
    TABNET_AVAILABLE = False
else:
    TABNET_AVAILABLE = True

# -------- helpers

def drop_leaky_features(feat_cols, task_name):
    """
    Return the feature names that are safe to use for `task_name`.

    Parameters
    ----------
    feat_cols : iterable of str
        Candidate feature column names (list, tuple, Index, ...). Not modified.
    task_name : str
        Task identifier; matched case-insensitively.

    Returns
    -------
    list of str
        `feat_cols` in their original order, minus:
        - every column whose name contains 'idh' when the task name contains 'idh';
        - every column whose name contains 'mgmt' when the task name contains
          'mgmt' (this already covers 'mgmt_pyro');
        - the column 'methylation_class', for every task.
    """
    task = task_name.lower()
    # Substrings that mark a column as leaking the current task's label.
    banned = [key for key in ('idh', 'mgmt') if key in task]
    return [
        col for col in feat_cols
        if col != 'methylation_class'
        and not any(key in col.lower() for key in banned)
    ]

def prep_svm(df, feat_cols, idx, y):
    """
    Slice the feature frame for one task and build its (unfitted) preprocessor.

    Columns with object, category or bool dtype are treated as categorical
    (most-frequent imputation + one-hot encoding, unknown categories ignored);
    all remaining columns are treated as numeric (median imputation +
    standard scaling).

    Returns (Xdf, y, preprocessor): the copied feature slice `df.loc[idx, feat_cols]`,
    `y` unchanged, and the ColumnTransformer describing the steps above.
    """
    Xdf = df.loc[idx, feat_cols].copy()

    def _is_text_like(col):
        kind = Xdf[col].dtype
        return kind == 'object' or str(kind).startswith('category')

    # Text-like columns come first, bool columns after them, duplicates removed.
    text_like = [col for col in Xdf.columns if _is_text_like(col)]
    bool_like = [col for col in Xdf.columns
                 if Xdf[col].dtype == 'bool' and col not in text_like]
    cat_cols = list(dict.fromkeys(text_like + bool_like))
    num_cols = [col for col in Xdf.columns if col not in cat_cols]

    numeric_steps = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    categorical_steps = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_steps, num_cols),
            ("cat", categorical_steps, cat_cols),
        ],
        remainder="drop",
        n_jobs=None,
    )
    return Xdf, y, preprocessor

def train_eval_svm(Xdf, y, preprocessor, random_state=42):
    """
    Fit a class-balanced SVC behind `preprocessor` and score it.

    A stratified 75/25 split provides the hold-out metrics; 5-fold stratified
    cross-validation on the training part provides the CV ROC-AUC.

    Returns (test_auc, test_ap, test_acc, cv_auc_mean, cv_auc_std), where
    accuracy is computed by thresholding the positive-class probability at 0.5.
    """
    model = Pipeline(steps=[
        ("prep", preprocessor),
        ("clf", SVC(probability=True, class_weight="balanced", random_state=random_state)),
    ])

    X_tr, X_te, y_tr, y_te = train_test_split(
        Xdf, y, test_size=0.25, stratify=y, random_state=random_state
    )
    model.fit(X_tr, y_tr)

    # Probability of the positive class, and the hard labels derived from it.
    pos_scores = model.predict_proba(X_te)[:, 1]
    hard_labels = (pos_scores >= 0.5).astype(int)

    holdout = (
        roc_auc_score(y_te, pos_scores),
        average_precision_score(y_te, pos_scores),
        accuracy_score(y_te, hard_labels),
    )

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    fold_aucs = cross_val_score(model, X_tr, y_tr, cv=folds, scoring="roc_auc")

    return (*holdout, fold_aucs.mean(), fold_aucs.std())

def prep_tabnet(df, feat_cols, idx, y):
    """
    Turn the feature slice for one task into dense arrays for TabNet.

    Parameters
    ----------
    df : pd.DataFrame
        Source table.
    feat_cols : list of str
        Feature columns to use, in the order they should appear in the matrix.
    idx : index-like
        Row labels to select from `df`.
    y : array-like
        Labels aligned with `idx`.

    Returns
    -------
    X_np : np.ndarray, float32, shape (len(idx), len(feat_cols))
    y_np : np.ndarray of int
    cat_idxs : list of int
        Positions (within `feat_cols`) of the categorical columns.
    cat_dims : list of int
        Number of distinct codes for each categorical column, same order as `cat_idxs`.

    Notes
    -----
    - Columns with object, category or bool dtype are categorical: values are
      cast to str and integer-coded in sorted order.
    - Every other column is numeric: unparseable entries become NaN and NaNs
      are replaced by the column median. A column with no valid value at all
      has no median, so it is filled with 0.0 rather than leaving NaNs in the
      returned matrix.
    - Medians and category codes are computed over all rows in `idx`, i.e.
      before any train/test split performed by the caller.
    """
    X = df.loc[idx, feat_cols].copy()
    cat_cols = {
        c for c in X.columns
        if X[c].dtype == 'object' or str(X[c].dtype).startswith('category') or X[c].dtype == 'bool'
    }

    cat_idxs, cat_dims = [], []
    for pos, col in enumerate(X.columns):
        if col in cat_cols:
            codes, uniques = pd.factorize(X[col].astype(str), sort=True)
            X[col] = codes
            cat_idxs.append(pos)
            cat_dims.append(len(uniques))
        else:
            values = pd.to_numeric(X[col], errors="coerce").astype(float)
            fill_value = values.median()
            if pd.isna(fill_value):
                # No observed value in this column: fall back to a constant.
                fill_value = 0.0
            X[col] = values.fillna(fill_value)

    X_np = X.to_numpy().astype(np.float32)
    y_np = np.asarray(y).astype(int)
    return X_np, y_np, cat_idxs, cat_dims

def train_eval_tabnet(X, y, cat_idxs, cat_dims, random_state=42):
    X_train, X_test,


In [15]:
# DIAG 3a — file write + loop smoke test

from pathlib import Path

print("DIAG START")
print("REPORT_PATH:", REPORT_PATH)
print("Report folder exists?:", REPORT_PATH.parent.exists())

# Check that the report folder is writable using a throwaway sibling file.
# Opening REPORT_PATH itself with mode "w" would truncate a report that an
# earlier run already produced, so the real report is never touched here.
_probe_path = REPORT_PATH.with_name(REPORT_PATH.name + ".diag")
try:
    with open(_probe_path, "w") as f:
        f.write("diagnostic write\n")
    print("File write: OK")
except Exception as e:
    print("File write: FAILED ->", repr(e))
finally:
    try:
        _probe_path.unlink()
    except OSError:
        # Probe was never created, or cannot be removed; nothing more to do.
        pass

# Check the training loop would iterate: load every dataset and list its tasks.
# Errors are reported per dataset so one bad file does not hide the others.
for ds_name, ds_path in DATASETS.items():
    print("Will process:", ds_name)
    try:
        df = load_dataset(ds_path)
        tgts = build_targets(df)
        print("  rows/cols:", df.shape, "| tasks:", list(tgts))
    except Exception as e:
        print("  ERROR ->", repr(e))

print("DIAG END")


DIAG START
REPORT_PATH: /Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_tabnet_svm_report.txt
Report folder exists?: True
File write: OK
Will process: ConvNext_128d
  rows/cols: (532, 232) | tasks: ['Mortality_6mo', 'Mortality_12mo', 'Mortality_24mo', 'HighGrade', 'IDH_mut', 'MGMT_meth']
Will process: ConvNext_256d
  rows/cols: (532, 356) | tasks: ['Mortality_6mo', 'Mortality_12mo', 'Mortality_24mo', 'HighGrade', 'IDH_mut', 'MGMT_meth']
Will process: ConvNext_Sep_128d
  rows/cols: (532, 228) | tasks: ['Mortality_6mo', 'Mortality_12mo', 'Mortality_24mo', 'HighGrade', 'IDH_mut', 'MGMT_meth']
Will process: ConvNext_Sep_256d
  rows/cols: (532, 356) | tasks: ['Mortality_6mo', 'Mortality_12mo', 'Mortality_24mo', 'HighGrade', 'IDH_mut', 'MGMT_meth']
DIAG END
