okay so have to clean up the data a bit more (post talk w/ MJ)
1. need to make all categorical values binary, and one-hot the diagnosis data (permanent) so DLMs can read it properly
-one-hot encoding creates new columns—one per category—and fills them with 0/1 flags
-drop_first=True: you can drop one column (e.g. oligodendroglioma) to avoid perfect multicollinearity
2. also need to plot data to visualize it so we can see if there's an even spread and just see it
3. stratify the y-axis avoid performance artifacts and ensures your model is evaluated on a realistic class balance
-stratify=y tells train_test_split to preserve the same proportion of each class (0 vs. 1) in both the training and validation sets

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os

# ── 1) Paths ──
in_csv  = "/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_cleaned_master.csv"
out_csv = "/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_model_input.csv"

# ── 2) Load ──
df = pd.read_csv(in_csv)

# ── 3) Identify all object/categorical columns ──
cat_cols = df.select_dtypes(include=["object","category"]).columns.tolist()

# ── 4) Find the ones that are truly binary (2 unique non-null values) ──
binary_cols = [c for c in cat_cols if df[c].nunique(dropna=True)==2]

# ── 5) Label-encode those binaries to 0/1 ──
le = LabelEncoder()
for c in binary_cols:
    df[c] = le.fit_transform(df[c].astype(str))

# ── 6) Now one-hot encode ONLY the 'permanent' diagnosis column ──
#     (you can add more multi-class cols here if needed)
df = pd.get_dummies(
    df,
    columns=["permanent"],
    prefix="diag",
    prefix_sep="_",
    drop_first=True   #drop 1st column = k-1, if you want to keep all dummies set drop_first=False
)

# ── 7) (Optional) verify no remaining object cols ──
assert df.select_dtypes(include=["object","category"]).shape[1] == 0, "Still have non-numeric columns!"

# ── 8) Save the fully-numeric DataFrame ──
df.to_csv(out_csv, index=False)
print(f"✅ Saved model-ready CSV: {out_csv}")
print(f"   → shape = {df.shape}  (rows, columns)")


AssertionError: Still have non-numeric columns!

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os

# ── User-provided file path ──
in_csv  = "/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_cleaned_master.csv"
out_csv = "/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_model_input.csv"

# ── Load the DataFrame ──
df = pd.read_csv(in_csv)

# ── Identify all object/categorical columns ──
cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

# ── Process each categorical column ──
le = LabelEncoder()
for col in cat_cols:
    n_unique = df[col].nunique(dropna=True)
    if n_unique == 2:
        # Binary: encode to 0/1
        df[col] = le.fit_transform(df[col].astype(str))
    else:
        # Multi-class: one-hot encode, drop first to avoid collinearity
        df = pd.get_dummies(df, columns=[col], prefix=col, drop_first=True)

# ── Verify no remaining non-numeric columns ──
non_numeric = df.select_dtypes(include=["object", "category"]).columns.tolist()
if non_numeric:
    print("⚠️ Still non-numeric columns:", non_numeric)
else:
    print("✅ All columns are now numeric.")

# ── Save the numeric DataFrame ──
os.makedirs(os.path.dirname(out_csv), exist_ok=True)
df.to_csv(out_csv, index=False)
print(f"✅ Saved model-ready CSV to: {out_csv}")
print(f"   → Shape: {df.shape} (rows, columns)")


✅ All columns are now numeric.
✅ Saved model-ready CSV to: /Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_model_input.csv
   → Shape: (510, 1200) (rows, columns)


too many columns (1200+, excel can only output 1000 max)
subset first, then one-hot (only permanent column)

In [5]:
import pandas as pd

# 1. Load
df = pd.read_csv("/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_cleaned_master.csv")

# 2. Define what you actually want (≥ 40 % complete)
imaging_cols     = [c for c in df if c.startswith("feature_")]
keep_clinical    = ["age","sex","chemotherapy_any"]
keep_molecular   = ["mgmt_pyro","methylation_class","ki_67"]
diag_col         = "permanent"    # the one multi-class

wanted = imaging_cols + keep_clinical + keep_molecular + [diag_col, "mortality_1yr"]
df_sub = df[wanted].copy()

# 3. Encode 
#  a) Binary columns
binary_cols = [c for c in keep_clinical+keep_molecular if df_sub[c].nunique(dropna=True)==2]
for c in binary_cols:
    df_sub[c] = df_sub[c].map({df_sub[c].dropna().unique()[0]:0,
                                df_sub[c].dropna().unique()[1]:1})

#  b) One-hot the diagnosis
df_out = pd.get_dummies(df_sub, columns=[diag_col], drop_first=True)

# 4. Save
df_out.to_csv("convnext_model_input_trimmed.csv", index=False)
print("→ final shape", df_out.shape)


KeyError: "['chemotherapy_any', 'mortality_1yr'] not in index"

Created chemotherapy_any by OR’ing all chemotherapy___* columns

Derived mortality_1yr from survival and patient_status

Subselected only the imaging, clinical, molecular, diagnosis, and target columns

Label‐encoded binary features and one‐hot encoded permanent

Saved a trimmed, fully numeric CSV at /mnt/data/convnext_model_input_trimmed.csv (510 rows × 345 columns)

In [6]:
import pandas as pd
import numpy as np

# ── Paths ──
in_csv  = "/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_cleaned_master.csv"
out_csv = "/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_model_input_trimmed.csv"

# ── Load ──
df = pd.read_csv(in_csv)

# ── 1) Create 'chemotherapy_any' flag ──
chemo_cols = [c for c in df.columns if c.startswith("chemotherapy___")]
if chemo_cols:
    df["chemotherapy_any"] = (df[chemo_cols].sum(axis=1) > 0).astype(int)
else:
    df["chemotherapy_any"] = 0  # or drop if not present

# ── 2) Create 1-year mortality flag 'mortality_1yr' ──
# Assuming 'survival' (months) and 'patient_status' (2=dead) from earlier
df["mortality_1yr"] = np.where(
    (df["survival"] <= 12) & (df["patient_status"] == 2.0),
    1, 0
)

# ── 3) Define columns to keep ──
# Imaging features
imaging_cols   = [c for c in df.columns if c.startswith("feature_")]
# Clinical/demographic
keep_clinical  = ["age", "sex", "chemotherapy_any"]
# Molecular (≥40% complete as decided)
keep_molecular = ["mgmt_pyro", "methylation_class", "ki_67"]
# Diagnosis and target
diag_col       = "permanent"
target_col     = "mortality_1yr"

wanted = imaging_cols + keep_clinical + keep_molecular + [diag_col, target_col]
df_sub = df[wanted].copy()

# ── 4) Encode categorical ──
# Binary columns
binary_cols = [c for c in keep_clinical+keep_molecular if df_sub[c].nunique(dropna=True)==2]
for c in binary_cols:
    df_sub[c] = df_sub[c].astype(str).map(
        {df_sub[c].dropna().unique()[0]:0,
         df_sub[c].dropna().unique()[1]:1}
    )

# One-hot for diagnosis
df_out = pd.get_dummies(df_sub, columns=[diag_col], drop_first=True)

# ── 5) Save ──
df_out.to_csv(out_csv, index=False)
print("✅ Trimmed model input saved at:", out_csv)
print("   Final shape (rows, columns):", df_out.shape)


✅ Trimmed model input saved at: /Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_model_input_trimmed.csv
   Final shape (rows, columns): (510, 345)


trying to figure out 1 and 2 yr mortality rates
see a big class imbalance --> greater than the recommended 3:7 (97.4:100)
consider looking at 6 mo and 9 mo and overall survival

In [7]:
import pandas as pd
import numpy as np

# ── 1) Load your cleaned master CSV ──
df = pd.read_csv("/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_cleaned_master.csv")

# ── 2) Derive 1‑year and 2‑year raw labels with proper censoring ──
df["mortality_1yr_raw"] = np.where(
    (df["survival"] <= 12) & (df["patient_status"] == 2.0), 1,
    np.where((df["survival"] >= 12) & (df["patient_status"] == 1.0), 0, np.nan)
)
df["mortality_2yr_raw"] = np.where(
    (df["survival"] <= 24) & (df["patient_status"] == 2.0), 1,
    np.where((df["survival"] >= 24) & (df["patient_status"] == 1.0), 0, np.nan)
)

# ── 3) Subset to fully labeled cohorts ──
df_1yr = df[df["mortality_1yr_raw"].notna()].copy()
df_2yr = df[df["mortality_2yr_raw"].notna()].copy()
df_1yr["mortality_1yr"] = df_1yr["mortality_1yr_raw"].astype(int)
df_2yr["mortality_2yr"] = df_2yr["mortality_2yr_raw"].astype(int)

# ── 4) Calculate overall rates ──
n1, d1 = len(df_1yr), df_1yr["mortality_1yr"].sum()
n2, d2 = len(df_2yr), df_2yr["mortality_2yr"].sum()
print(f"1‑year mortality: {d1}/{n1} = {d1/n1*100:.1f}%")
print(f"2‑year mortality: {d2}/{n2} = {d2/n2*100:.1f}%")

# ── 5) Mortality rates by diagnosis ──
summary = pd.DataFrame({
    "1yr_n":     df_1yr.groupby("permanent").size(),
    "1yr_deaths":df_1yr.groupby("permanent")["mortality_1yr"].sum(),
    "1yr_rate":  df_1yr.groupby("permanent")["mortality_1yr"].mean()*100,
    "2yr_n":     df_2yr.groupby("permanent").size(),
    "2yr_deaths":df_2yr.groupby("permanent")["mortality_2yr"].sum(),
    "2yr_rate":  df_2yr.groupby("permanent")["mortality_2yr"].mean()*100
})
out_path = "/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_mortality_rates_by_diagnosis.csv"
summary.to_csv(out_path)
print("✅ Saved per‑diagnosis mortality rates to:", out_path)
print(summary.head(10))


1‑year mortality: 38/39 = 97.4%
2‑year mortality: 70/70 = 100.0%
✅ Saved per‑diagnosis mortality rates to: /Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_mortality_rates_by_diagnosis.csv
                                                   1yr_n  1yr_deaths  \
permanent                                                              
anaplastic astrocytoma                               1.0         1.0   
anaplastic astrocytoma who grade iii                 NaN         NaN   
anaplastic diffuse glioma, at least who grade iii    1.0         1.0   
astrocytoma, idh-mutant, cns who grade 4             1.0         1.0   
burkitt lymphoma                                     1.0         1.0   
glioblastoma                                         2.0         2.0   
glioblastoma who grade iv                           15.0        14.0   
glioblastoma, cns who grade 4                       13.0        13.0   
glioblastoma, cns who grade 4,                       1.0         1.0  

In [11]:
import pandas as pd
import numpy as np
import os

# 1) Load your cleaned master CSV
df = pd.read_csv("/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_cleaned_master.csv")

# 2) Derive 6- and 9-month raw labels, properly censoring those without enough follow-up
def make_binary_label(df, months, event_col="patient_status", time_col="survival"):
    # 1 = died on or before `months`; 0 = alive at or beyond `months`; NaN = censored before `months`
    return np.where(
        (df[time_col] <= months) & (df[event_col] == 2.0), 1,
    np.where(
        (df[time_col] >= months) & (df[event_col] == 1.0), 0,
        np.nan
    ))

df["mortality_6m_raw"] = make_binary_label(df, 6)
df["mortality_9m_raw"] = make_binary_label(df, 9)

# 3) Subset to fully labeled patients
df6 = df[df["mortality_6m_raw"].notna()].copy()
df9 = df[df["mortality_9m_raw"].notna()].copy()
df6["mortality_6m"] = df6["mortality_6m_raw"].astype(int)
df9["mortality_9m"] = df9["mortality_9m_raw"].astype(int)

# 4) Compute and save class‐balance summaries
for label, subdf in [("6m", df6), ("9m", df9)]:
    n = len(subdf)
    d = subdf[f"mortality_{label}"].sum()
    pct = d/n*100
    print(f"{label} mortality: {d}/{n} = {pct:.1f}% positive")
    # Optionally save per-diagnosis breakdown:
    summary = (
        subdf.groupby("permanent")[f"mortality_{label}"]
             .agg(n="size", deaths="sum")
             .assign(rate=lambda x: x["deaths"]/x["n"]*100)
    )
    out = f"/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_mortality_{label}_by_dx.csv"
    summary.to_csv(out)
    print(f"  → Saved per-diagnosis {label}-m summary to {out}")


6m mortality: 19/22 = 86.4% positive
  → Saved per-diagnosis 6m-m summary to /Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_mortality_6m_by_dx.csv
9m mortality: 28/31 = 90.3% positive
  → Saved per-diagnosis 9m-m summary to /Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_mortality_9m_by_dx.csv


In [14]:
# overall survival 

import pandas as pd
from sksurv.util import Surv
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored

# 1) Load your cleaned master CSV
df = pd.read_csv("/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_model_input_trimmed.csv")

# 2) Build the structured survival array via from_arrays
#    event = True if patient_status == 2.0 (dead), False otherwise
#    time  = survival time in months
event = df["patient_status"] == 2.0
time  = df["survival"]
surv  = Surv.from_arrays(event, time)

# 3) Select your feature matrix X
image_feats     = [c for c in df.columns if c.startswith("feature_")]
non_image_feats = ["age","sex","mgmt_pyro","methylation_class","ki_67"]
X = df[non_image_feats + image_feats]

# 4) Fit the Cox model
cox = CoxPHSurvivalAnalysis()
cox.fit(X, surv)

# 5) Evaluate via concordance index (higher ↑ is better; 0.5 = random)
cindex = concordance_index_censored(
    surv["event"], surv["time"], cox.predict(X)
)[0]
print(f"CoxPH concordance index (C-index): {cindex:.3f}")


KeyError: 'patient_status'

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import os

# ── Paths ──
in_csv  = "/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_cleaned_master.csv"
out_csv = "/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_model_input_trimmed.csv"

# ── Load original cleaned master ──
df = pd.read_csv(in_csv)

# ── 1) Preserve patient IDs ──
case_id_col = "case_number"

# ── 2) Create 'chemotherapy_any' flag ──
chemo_cols = [c for c in df.columns if c.startswith("chemotherapy___")]
df["chemotherapy_any"] = (df[chemo_cols].sum(axis=1) > 0).astype(int)

# ── 3) Create 1-year mortality flag 'mortality_1yr' with censoring ──
df["mortality_1yr"] = np.where(
    (df["survival"] <= 12) & (df["patient_status"] == 2.0), 1,
    np.where((df["survival"] >= 12) & (df["patient_status"] == 1.0), 0, np.nan)
)

# ── 4) Select desired columns ──
imaging_cols   = [c for c in df.columns if c.startswith("feature_")]
keep_clinical  = ["age", "sex", "chemotherapy_any"]
keep_molecular = ["mgmt_pyro", "methylation_class", "ki_67"]
diag_col       = "permanent"
target_col     = "mortality_1yr"

wanted = [case_id_col] + imaging_cols + keep_clinical + keep_molecular + [diag_col, target_col]
df_sub = df[wanted].copy()

# ── 5) Encode categorical columns ──
le = LabelEncoder()
cat_cols = df_sub.select_dtypes(include=["object", "category"]).columns.tolist()
for col in cat_cols:
    unique_vals = df_sub[col].dropna().unique()
    if len(unique_vals) == 2:
        # Binary: map two categories to 0/1
        df_sub[col] = le.fit_transform(df_sub[col].astype(str))
    else:
        # Multi-class: one-hot encode, drop first level
        df_sub = pd.get_dummies(df_sub, columns=[col], prefix=col, drop_first=True)

# ── 6) Drop rows still censored before 1 year ──
df_sub = df_sub[df_sub[target_col].notna()].copy()

# ── 7) Save trimmed with ID ──
os.makedirs(os.path.dirname(out_csv), exist_ok=True)
df_sub.to_csv(out_csv, index=False)
print(f"✅ Saved trimmed model input with IDs: {out_csv}")
print(f"   Shape: {df_sub.shape} (rows, columns)")


✅ Saved trimmed model input with IDs: /Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_model_input_trimmed.csv
   Shape: (39, 479) (rows, columns)


In [25]:
from sksurv.linear_model import CoxPHSurvivalAnalysis

# … after imputing X_imp and building surv …

# Use an alpha > 0 to penalize collinearity
cox = CoxPHSurvivalAnalysis(alpha=1.0)  
cox.fit(X_imp, surv)

cindex = concordance_index_censored(
    surv["event"], surv["time"], cox.predict(X_imp)
)[0]
print(f"CoxPH C-index (L2, α=1.0): {cindex:.3f}")


CoxPH C-index (L2, α=1.0): 0.820


In [26]:
import pandas as pd
from sksurv.util import Surv
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored

# ── 1) Load the two CSVs ──
df_master     = pd.read_csv("/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_cleaned_master.csv")                 # has survival & patient_status
df_preds      = pd.read_csv("/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_model_input_trimmed.csv")    # numeric predictors + case_number

# ── 2) Merge to bring back time/event ──
df = df_preds.merge(
    df_master[["case_number","patient_status","survival"]],
    on="case_number",
    how="left"
)

# ── 3) Build the survival array ──
#    event: True when patient_status == 2 (dead)
#    time:  months in df["survival"]
event = df["patient_status"] == 2.0
time  = df["survival"]
surv  = Surv.from_arrays(event, time)

# ── 4) Drop non-predictors to form X ──
#    Remove ID, the two survival columns, plus any explicit binary target if present
drop_cols = ["case_number", "patient_status", "survival"]
# If your trimmed file also has mortality_1yr, you can drop it here:
if "mortality_1yr" in df.columns:
    drop_cols.append("mortality_1yr")
X = df.drop(columns=drop_cols)

# Now X contains only numeric predictors: imaging features, clinical flags, one-hot columns, etc.

# ── 5) Fit CoxPH and compute C-index ──
cox = CoxPHSurvivalAnalysis()
cox.fit(X, surv)

cindex = concordance_index_censored(
    surv["event"], surv["time"], cox.predict(X)
)[0]
print(f"✅ CoxPH concordance index (C-index): {cindex:.3f}")

ValueError: Input X contains NaN.
CoxPHSurvivalAnalysis does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [33]:
import pandas as pd
from sksurv.util import Surv
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored
from sklearn.impute import SimpleImputer

# 1) Load your merged DataFrame (preds + survival)
df = pd.read_csv("/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_model_input_trimmed.csv") \
       .merge(
         pd.read_csv("/Users/joi263/Documents/MultimodalTabData/data/convnext_data/convnext_cleaned_master.csv")[["case_number","patient_status","survival"]],
         on="case_number", how="left"
       )

# 2) Build the survival array
event = df["patient_status"] == 2.0
time  = df["survival"]
surv  = Surv.from_arrays(event, time)

# 3) Prepare your feature matrix X
#    Drop ID, survival/time, event/status, and the explicit target if present
drop_cols = ["case_number","patient_status","survival"]
if "mortality_1yr" in df.columns:
    drop_cols.append("mortality_1yr")
X = df.drop(columns=drop_cols)

# 4) Median‐impute any remaining NaNs
imp = SimpleImputer(strategy="median")
X_imp = pd.DataFrame(
    imp.fit_transform(X),
    columns=X.columns,
    index=X.index
)

# 5) Fit CoxPH on the imputed data
cox = CoxPHSurvivalAnalysis()
cox.fit(X_imp, surv)

# 6) Evaluate via concordance index
cindex = concordance_index_censored(
    surv["event"], surv["time"], cox.predict(X_imp)
)[0]
print(f"CoxPH C-index after median imputation: {cindex:.3f}")


LinAlgError: Matrix is singular.