# Collect Stats

In this notebook, we collect evaluation statistics (concordance and Brier score) for Neural Fine Gray and the baseline models on the FRAMINGHAM data.

In [1]:
# Mount Google Drive at /content/drive; the following cells work from a folder under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/DLHC

/content/drive/MyDrive/DLHC


In [9]:
from getpass import getpass

# Enter token securely: it is typed at a prompt instead of being hardcoded in the notebook
token = getpass("Enter your GitHub token: ")

# Build the remote URL with the token embedded after the user name.
# NOTE(review): no cell shown in this notebook reads `remote_url` (the push cells use
# `origin`) — confirm whether it is still needed. It holds the token in clear text, so
# avoid printing it or saving it into the git configuration.
remote_url = f"https://DrEaston:{token}@github.com/DrEaston/DLHC.git"



Enter your GitHub token: ··········


In [10]:
# Push the local `main` branch to the `origin` remote.
! git push origin main

Everything up-to-date


In [11]:
# Set the global git identity (email and name) for this runtime.
!  git config --global user.email "curtis.easton@gmail.com"
!  git config --global user.name "DrEaston"

In [6]:

!pip install scikit-survival
!pip install pycox
!pip install lifelines

Collecting scikit-survival
  Downloading scikit_survival-0.24.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/48.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.9/48.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ecos (from scikit-survival)
  Downloading ecos-2.0.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.0 kB)
Collecting osqp<1.0.0,>=0.6.3 (from scikit-survival)
  Downloading osqp-0.6.7.post3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting qdldl (from osqp<1.0.0,>=0.6.3->scikit-survival)
  Downloading qdldl-0.1.7.post5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)
Downloading scikit_survival-0.24.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)
[2K  

In [7]:
import sys
# Make the project folder and its NeuralFineGray / DeepSurvivalMachines subfolders
# importable (appended in this order, after the existing search path).
sys.path.extend([
    '/content/drive/MyDrive/DLHC',
    '/content/drive/MyDrive/DLHC/NeuralFineGray',
    '/content/drive/MyDrive/DLHC/DeepSurvivalMachines',
])

In [8]:
# --- Setup
import os
import sys
import numpy as np
import pandas as pd

from nfg import datasets
from experiment import Experiment
from metrics import truncated_concordance_td, auc_td, brier_score as bs

from pycox.evaluation import EvalSurv
from sksurv.metrics import concordance_index_ipcw, brier_score, cumulative_dynamic_auc, integrated_brier_score
from metrics import truncated_concordance_td, auc_td, brier_score as bs

# --- Set your dataset (this name is also used to pick the prediction files below)
dataset = 'FRAMINGHAM'

# --- Folder holding the per-model prediction files for this dataset
path = f'/content/drive/MyDrive/DLHC/Results/{dataset}/'

# --- Load your data
# NOTE: path='./' is relative to the current working directory, so this depends on the
# earlier `cd` into the project folder having been run.
x, t, e, covariates = datasets.load_dataset(dataset, path='./', competing=True, normalize=False)

# --- Set evaluation times: quantiles of the times of rows with an event (e > 0)
horizons = [0.25, 0.5, 0.75]
times_eval = np.quantile(t[e > 0], horizons)

groups = None  # no subgroup labels here, so evaluate() computes no per-group metrics


### Utils: The evaluation metrics used
def evaluate(survival, e = e, t = t, groups = None, times_eval = []):
    """Score cross-validated survival predictions, per risk and per fold.

    Parameters
    ----------
    survival : pd.DataFrame
        One row per subject. The last column holds the fold each row belongs to;
        every other column carries a two-level label that is cast to float and
        read as (risk, time).
    e, t : array-like
        Event indicators and times, indexed with boolean masks built from the
        fold column. The defaults are the notebook-level ``e`` and ``t`` as they
        were when this function was defined; redefine the function (or pass
        them explicitly) after reloading the data.
    groups : pd.Series or None
        Optional subgroup label per row (``.unique()`` is called on it). When
        given, per-group metrics and their difference to the rest are added at
        each horizon.
    times_eval : sequence of float
        Horizons at which truncated metrics are computed. The default list is
        never mutated.

    Returns
    -------
    pd.DataFrame
        Index levels (Risk, Fold, Metric); one column for 'Overall' and one per
        horizon that could be evaluated.

    Notes
    -----
    Metric computations are best-effort: a failure is reported with
    ``warnings.warn`` and the corresponding entry is left out, instead of
    aborting the whole evaluation.
    """
    import warnings  # local import: only needed to report skipped metrics

    folds = survival.iloc[:, -1].values
    survival = survival.iloc[:, :-1]
    survival.columns = pd.MultiIndex.from_frame(pd.DataFrame(index=survival.columns).reset_index().astype(float))

    times = survival.columns.get_level_values(1).unique()
    results = {}

    # If multiple risk, compute cause specific metrics
    for r in survival.columns.get_level_values(0).unique():
        risk = int(r)
        for fold in np.arange(5):  # fold labels 0..4 are assumed
            res = {}
            train_mask, test_mask = folds != fold, folds == fold
            e_train, t_train = e[train_mask], t[train_mask]
            e_test,  t_test  = e[test_mask], t[test_mask]
            g_train, g_test = (None, None) if groups is None else (groups[train_mask], groups[test_mask])

            survival_train = survival[train_mask][r]
            survival_fold = survival[test_mask][r]

            # Evaluator built on the training rows (any non-zero event), handed to the
            # test evaluator as its censoring reference.
            censor_eval = EvalSurv(survival_train.T, t_train, e_train != 0, censor_surv = 'km')
            test_eval = EvalSurv(survival_fold.T, t_test, e_test == risk, censor_surv = censor_eval)

            res['Overall'] = {
                    "CIS": test_eval.concordance_td(),
                }
            try:
                res['Overall']['BRS'] = test_eval.integrated_brier_score(times.to_numpy())
            except Exception as err:
                warnings.warn(f"evaluate: overall BRS skipped for risk {r}, fold {fold}: {err!r}")

            # `km` starts as the raw training (events, times) pair and is replaced by
            # whatever truncated_concordance_td returns, so it is reused across horizons.
            km = (e_train, t_train)
            for te in times_eval:
                try:
                    ci, km = truncated_concordance_td(e_test, t_test, 1 - survival_fold.values, times, te, km = km, competing_risk = risk)
                    res[te] = {
                        "CIS": ci,
                        "BRS": bs(e_test, t_test, 1 - survival_fold.values, times, te, km = km, competing_risk = risk)[0]}
                except Exception as err:
                    warnings.warn(f"evaluate: metrics at horizon {te} skipped for risk {r}, fold {fold}: {err!r}")

                for group in groups.unique() if groups is not None else []:
                    try:
                        # Group-specific training pairs get their own names so that the
                        # overall `km` used for the next horizon is not overwritten.
                        in_train, in_test = g_train == group, g_test == group
                        km_in = (e_train[in_train], t_train[in_train])
                        res[te]["CIS_{}".format(group)] = truncated_concordance_td(e_test[in_test], t_test[in_test], 1 - survival_fold[in_test].values, times, te, km = km_in, competing_risk = risk)[0]
                        res[te]["BRS_{}".format(group)] = bs(e_test[in_test], t_test[in_test], 1 - survival_fold[in_test].values, times, te, km = km_in, competing_risk = risk)[0]

                        out_train, out_test = g_train != group, g_test != group
                        km_out = (e_train[out_train], t_train[out_train])
                        res[te]["Delta_CIS_{}".format(group)] = res[te]["CIS_{}".format(group)] - truncated_concordance_td(e_test[out_test], t_test[out_test], 1 - survival_fold[out_test].values, times, te, km = km_out, competing_risk = risk)[0]
                        res[te]["Delta_BRS_{}".format(group)] = res[te]["BRS_{}".format(group)] - bs(e_test[out_test], t_test[out_test], 1 - survival_fold[out_test].values, times, te, km = km_out, competing_risk = risk)[0]

                    except Exception as err:
                        warnings.warn(f"evaluate: metrics for group {group} at horizon {te} skipped for risk {r}, fold {fold}: {err!r}")
            results[(r, fold)] = pd.DataFrame.from_dict(res)
    results = pd.concat(results)
    results.index.set_names(['Risk', 'Fold', 'Metric'], inplace = True)

    return results




Opening: FRAMINGHAM_dh.csv  -  dh
Opening: FRAMINGHAM_ds.csv  -  ds
Opening: FRAMINGHAM_dsm.csv  -  dsm
Opening: FRAMINGHAM_nfg.csv  -  nfg
Metric                CIS                                               \
                  Overall        2153.75         4589.5        6620.75   
Risk Model                                                               
1.0  DSM    0.697 (0.033)  0.698 (0.056)  0.672 (0.035)  0.664 (0.021)   
     dh     0.683 (0.023)  0.666 (0.049)  0.645 (0.030)  0.651 (0.027)   
     ds     0.642 (0.039)  0.651 (0.050)  0.629 (0.056)  0.615 (0.037)   
     nfg    0.724 (0.016)  0.723 (0.048)  0.692 (0.028)  0.686 (0.017)   

Metric            BRS                                               
              Overall        2153.75         4589.5        6620.75  
Risk Model                                                          
1.0  DSM    nan (nan)  0.026 (0.004)  0.069 (0.003)  0.108 (0.008)  
     dh     nan (nan)  0.026 (0.004)  0.068 (0.004)  0.108 (0.007

  table = table.unstack(level=-1).stack(level=0).unstack(level=-1).loc[:, ['CIS', 'BRS']]


In [None]:
# --- Read every prediction file of this dataset and score it
predictions, results, models = {}, {}, {}
for file_name in os.listdir(path):
    # '.csv.gz' contains '.csv', so one substring test covers both extensions
    if dataset not in file_name or '.csv' not in file_name:
        continue

    # Model tag: text between the last '_' and the first '.' of the file name
    model = file_name[file_name.rindex('_') + 1: file_name.index('.')]
    print("Opening:", file_name, ' - ', model)

    predictions[model] = pd.read_csv(path + file_name, header=[0, 1], index_col=0)
    results[model] = evaluate(predictions[model], groups=groups, times_eval=times_eval)

# --- Stack the per-model results and give the models display names
dict_name = {'dsm': 'DSM'}  # keep simple for now

results = pd.concat(results).rename(dict_name)
results.index = results.index.set_names('Model', level=0)


def _mean_std(fold_scores):
    """Format each column of the per-fold scores as 'mean (std)' with 3 decimals."""
    means, stds = fold_scores.mean(), fold_scores.std()
    cells = ["{:.3f} ({:.3f})".format(mean, std) for mean, std in zip(means, stds)]
    return pd.Series(cells, index=fold_scores.columns)


# --- Aggregate over folds, then pivot so metrics/horizons are columns
table = results.groupby(['Model', 'Risk', 'Metric']).apply(_mean_std)
table = table.unstack(level=-1)
table = table.stack(level=0)
table = table.unstack(level=-1)
table = table.loc[:, ['CIS', 'BRS']]
table = table.reorder_levels(['Risk', 'Model'])
table = table.sort_index(level=0, sort_remaining=False)

# --- Show the summary
print(table)

In [None]:
import os
import pandas as pd
from metrics import evaluate  # assumes your metrics.py is accessible

base_path = '/content/drive/MyDrive/DLHC/NeuralFineGray/Results'
all_tables = []

# Loop through each dataset directory. os.listdir returns entries in arbitrary order,
# so sort them to keep the row order of the summary file reproducible.
for dataset in sorted(os.listdir(base_path)):
    path = os.path.join(base_path, dataset) + '/'
    if not os.path.isdir(path):
        continue

    print(f"\n🔍 Processing dataset: {dataset}")
    predictions, results = {}, {}

    # Load all model prediction CSVs for this dataset (files containing '_dh' are skipped)
    for file_name in sorted(os.listdir(path)):
        if dataset in file_name and file_name.endswith('.csv') and '_dh' not in file_name:
            # Model tag: text between the last '_' and the first '.' of the file name
            model = file_name[file_name.rindex('_') + 1: file_name.index('.')]
            print("  → Opening:", file_name, 'as model', model)
            df = pd.read_csv(path + file_name, header=[0, 1], index_col=0)
            predictions[model] = df
            # NOTE(review): `evaluate` here is the one bound by this cell's
            # `from metrics import evaluate`, which replaces the evaluate() defined
            # earlier in the notebook — confirm it accepts groups='all' and
            # times_eval='default'.
            results[model] = evaluate(df, groups='all', times_eval='default')  # modify args if needed

    if not results:
        print(f"⚠️ No usable results for {dataset}")
        continue

    # Rename models nicely
    dict_name = {'dsm': 'DSM', 'rfg': 'RFG', 'nfg': 'NFG', 'deephit': 'DeepHit'}
    results = pd.concat(results).rename(dict_name)
    results.index.set_names('Model', level=0, inplace=True)

    # Summarize results: 'mean (std)' over folds for every metric column
    table = results.groupby(['Model', 'Risk', 'Metric']).apply(
        lambda x: pd.Series(["{:.3f} ({:.3f})".format(mean, std) for mean, std in zip(x.mean(), x.std())], index=x.columns)
    )
    table = table.unstack(level=-1).stack(level=0).unstack(level=-1).loc[:, ['CIS', 'BRS']]
    table = table.reorder_levels(['Risk', 'Model']).sort_index(level=0, sort_remaining=False)

    # Add dataset label and collect
    table['Dataset'] = dataset
    all_tables.append(table.reset_index())

# Combine all into one big summary table.
# pd.concat raises ValueError on an empty list, so only write when something was collected.
if all_tables:
    summary_df = pd.concat(all_tables)
    summary_df.to_csv(os.path.join(base_path, 'summary_all_datasets.csv'), index=False)
    print("✅ Saved summary_all_datasets.csv")
else:
    summary_df = pd.DataFrame()
    print(f"⚠️ No results found under {base_path}; summary_all_datasets.csv not written")


In [None]:
# Push the local `main` branch to the `origin` remote.
! git push origin main

In [None]:
# Mark the repository's post-commit hook as executable.
! chmod +x .git/hooks/post-commit

In [None]:
rm .git/hooks/post-commit

In [None]:
ls -l .git/hooks/

In [None]:
pwd

In [None]:
# Keep a backup copy of this notebook next to the original.
# shutil.copy is a Python function; prefixed with `!` it was handed to the shell,
# which cannot run it.
import shutil

shutil.copy("CollectStats.ipynb", "CollectStats_backup.ipynb")