In [None]:
import json

import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from survhive._base import PCSurv
from survhive.cv import PCPHElasticNetCV, PCAFTElasticNetCV
from survhive.utils import transform_survival
from survhive.aft import AFT
from survhive.eh import EH

In [None]:
# Load the shared experiment settings from the JSON file one directory above
# the working directory. Later cells read keys such as "datasets",
# "pc_n_components", "n_alphas", "seed" and "shuffle_cv" from it.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the load
# does not depend on the platform's default text encoding.
with open("../config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

In [3]:
# Result containers, one (results, failures, sparsity) triple per method.
# Every name is bound to its own fresh, empty dict; the lasso containers are
# filled per cancer type by the benchmark loop further down.
results_efron_lasso, failures_efron_lasso, sparsity_efron_lasso = {}, {}, {}
results_efron_elastic_net, failures_efron_elastic_net, sparsity_efron_elastic_net = {}, {}, {}
results_efron_precond, failures_efron_precond, sparsity_efron_precond = {}, {}, {}

# Extra container for the preconditioning variant.
tau_precond = {}

import json

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.nonparametric import kaplan_meier_estimator

from survhive._base import PCSurv
from survhive.aft import AFT
from survhive.cv import PCAFTElasticNetCV, PCEHMultiTaskLassoCV, PCPHElasticNetCV
from survhive.eh import EH
from survhive.utils import transform_survival

In [145]:
# Stage handed to PCSurv as `pc_pipe`: variance filter -> standardisation ->
# PCA with the configured number of components -> Cox PH with Efron ties.
# (AFT() is the commented-out alternative for the final step.)
pc_stage = make_pipeline(
    VarianceThreshold(),
    StandardScaler(),
    PCA(n_components=config["pc_n_components"]),
    CoxPHSurvivalAnalysis(ties="efron"),
)

# Cross-validated elastic-net estimator used as the last step of `model_pipe`.
# `eps` is pinned to 0.01 here instead of being read from the config; the
# `tie_correction` and `l1_ratio` arguments are left at their defaults.
elastic_net_cv = PCPHElasticNetCV(
    eps=0.01,
    n_alphas=config["n_alphas"],
    cv=5,
    stratify_cv=True,
    seed=config["seed"],
    shuffle_cv=config["shuffle_cv"],
    cv_score_method="linear_predictor",
    n_jobs=-1,
    max_coef=np.inf,
    alpha_type="pcvl",
)

# The estimator sits at index 2 of `model_pipe`; later cells rely on that
# position when reading `pipe.model_pipe[2].coef_`.
pipe = PCSurv(
    pc_pipe=pc_stage,
    model_pipe=make_pipeline(VarianceThreshold(), StandardScaler(), elastic_net_cv),
)

In [146]:
from survhive.loss import eh_negative_likelihood

In [147]:




#2.1571926193706394 + 

In [148]:
# Secondary pipeline: standardise, project onto 16 principal components, then
# fit an EH model. AFT() and CoxPHSurvivalAnalysis(ties="efron") are the
# commented-out alternatives for the final step.
pipe_new = make_pipeline(
    StandardScaler(),
    PCA(n_components=16),
    EH(),
)

In [149]:
# Display the dataset identifiers listed in the config; the benchmark loop
# below refers to one such identifier as `cancer`.
config["datasets"]

['BLCA', 'BRCA', 'HNSC', 'KIRC', 'LGG', 'LIHC', 'LUAD', 'LUSC', 'OV', 'STAD']

In [150]:
np.random.seed(42)

# NOTE(review): machine-specific data location. Point this at a configurable
# data directory before sharing the notebook.
DATA_ROOT = "~/Downloads/20230518/survhive/paper/data"
N_SPLITS = 25

# Debug switches. The defaults reproduce the quick single-fit check: only the
# first BLCA split is fitted and any error propagates immediately.
# For the full benchmark set
#   CANCERS_TO_RUN = config["datasets"]
#   SPLITS_TO_RUN = range(N_SPLITS)
#   RAISE_ON_FAILURE = False
CANCERS_TO_RUN = ["BLCA"]
SPLITS_TO_RUN = range(1)
RAISE_ON_FAILURE = True

for cancer in CANCERS_TO_RUN:
    print(f"Starting: {cancer}")
    train_splits = pd.read_csv(f"{DATA_ROOT}/splits/TCGA/{cancer}_train_splits.csv")
    test_splits = pd.read_csv(f"{DATA_ROOT}/splits/TCGA/{cancer}_test_splits.csv")
    # The first CSV column is dropped (presumably a saved index - confirm).
    data = pd.read_csv(f"{DATA_ROOT}/processed/TCGA/{cancer}_data_preprocessed.csv").iloc[:, 1:]
    # Everything after the first three remaining columns is used as features;
    # the survival target is built from the "OS_days" and "OS" columns.
    X_ = data.iloc[:, 3:]
    y_ = transform_survival(time=data["OS_days"].values, event=data["OS"].values)

    # Fresh per-cancer containers, set up once regardless of which splits run.
    results_efron_lasso[cancer] = {}
    sparsity_efron_lasso[cancer] = {}
    failures_efron_lasso[cancer] = 0

    for split in SPLITS_TO_RUN:
        print(f"Starting split: {split+1} / {N_SPLITS}")
        # Each row of the split files holds the row positions of one split,
        # padded with NaN; drop the padding and cast to integer positions.
        train_ix = train_splits.iloc[split, :].dropna().to_numpy().astype(int)
        test_ix = test_splits.iloc[split, :].dropna().to_numpy().astype(int)
        X_train = X_.iloc[train_ix, :].copy().reset_index(drop=True)
        y_train = y_[train_ix].copy()
        y_test = y_[test_ix].copy()
        X_test = X_.iloc[test_ix, :].copy().reset_index(drop=True)
        try:
            pipe.fit(X_train.to_numpy(), y_train)
            sparsity_efron_lasso[cancer][split] = np.sum(pipe.model_pipe[2].coef_ != 0)
            # Predict on a plain array, matching how the pipeline was fitted.
            results_efron_lasso[cancer][split] = pipe.predict(X_test.to_numpy())
        except Exception:
            if RAISE_ON_FAILURE:
                # Debug mode: surface the original error with its traceback.
                raise
            # Benchmark mode: count the failure and record neutral placeholders
            # so the remaining splits can still be processed.
            failures_efron_lasso[cancer] += 1
            results_efron_lasso[cancer][split] = np.zeros(test_ix.shape[0])
            sparsity_efron_lasso[cancer][split] = 0

    # Export only when every split has a prediction vector; a partial debug run
    # therefore never writes (or overwrites) a results file.
    if all(i in results_efron_lasso[cancer] for i in range(N_SPLITS)):
        pd.concat(
            [pd.DataFrame(results_efron_lasso[cancer][i]) for i in range(N_SPLITS)], axis=1
        ).to_csv(f"../results/efron_lasso_{cancer}.csv", index=False)

#pd.DataFrame(sparsity_efron_lasso).to_csv(
#    f"../results/efron_lasso_sparsity.csv", index=False
#)
#pd.DataFrame(failures_efron_lasso).to_csv(
#    f"../results/efron_lasso_failures.csv", index=False
#)


Starting: BLCA
Starting split: 1 / 25




In [151]:
# Total of the "event" field over the current training split.
y_train["event"].sum()

142

In [152]:
# (1, 20237, 100)


In [153]:
# Count the non-zero coefficients of the fitted estimator (step 2 of model_pipe).
(pipe.model_pipe[2].coef_ != 0.0).sum()

39

In [154]:
# Evaluate the EH negative likelihood at an all-zero linear predictor with one
# row per training sample and two columns.
eh_negative_likelihood(
    linear_predictor=np.zeros((len(y_train["time"]), 2)),
    time=y_train["time"],
    event=y_train["event"].astype(int),
)

3.252880669939253

In [155]:
# Same evaluation with a random linear predictor drawn from N(-1, 3^2) using
# NumPy's global random state.
eh_negative_likelihood(
    linear_predictor=np.random.normal(loc=-1, scale=3, size=(len(y_train["time"]), 2)),
    time=y_train["time"],
    event=y_train["event"].astype(int),
)


4.479107284191074

In [156]:
# Shape of the training target for the current split (one entry per sample).
y_train.shape

(324,)

In [157]:
from sksurv.metrics import concordance_index_censored

In [158]:
# concordance_index_censored(event_indicator=y_[test_ix].copy()["event"], event_time=y_[test_ix].copy()["time"], estimate=results_efron_lasso["BLCA"][0])[0]

In [159]:
# Predictions of the fitted pipeline for the current test split.
# NOTE(review): this displays the full array; slice it (e.g. `[:5]`) if only a
# preview is needed.
pipe.predict(X_test.to_numpy())

array([-0.2776316 ,  0.14004612, -0.20370126,  0.39173723, -0.00970651,
        0.05771012,  0.05322547,  0.08202448,  0.18323983,  0.12665015,
       -0.11329681,  0.26955933,  0.18883693,  0.10770389, -0.3195352 ,
       -0.38250669, -0.20394125, -0.27703501, -0.02920769,  0.37790002,
       -0.91279247, -0.56168988,  0.05034542,  0.10015822, -0.12558386,
       -0.67776806, -0.4892578 , -0.23505737, -0.87715013,  0.15347796,
        0.08536307, -0.17711362, -0.10368805, -0.69452827, -0.048186  ,
        0.45322366,  0.53541929, -0.65959891,  0.30967634, -0.49299708,
       -0.52438952,  0.22887245,  0.23104268, -0.00908392, -0.05147059,
        0.18515864,  0.26547192,  0.2068358 , -0.34002777, -0.17503982,
       -0.18761047, -0.37667211,  0.2679389 ,  0.62525432, -0.4846417 ,
        0.56822971, -0.17281455, -0.13446605, -0.29406452, -0.44149803,
       -0.1932071 ,  0.07720283,  0.0746714 ,  0.00731313,  0.01936764,
        0.17431902, -0.1150658 ,  0.17154365,  0.32317844,  0.42

In [160]:
# Test-split targets: observed times and integer-coded event indicators.
durations_test = y_test["time"]
events_test = y_test["event"].astype(int)

# Predicted survival function for the test samples, evaluated at the observed
# test times.
surv = pipe.predict_survival_function(X_test.to_numpy(), durations_test)



In [161]:
from pycox.evaluation import EvalSurv

# Wrap the transposed survival predictions for evaluation, using a
# Kaplan-Meier ("km") censoring estimate, then report time-dependent concordance.
ev = EvalSurv(
    surv.transpose(),
    durations_test,
    events_test,
    censor_surv="km",
)
ev.concordance_td()


0.61800346220427

In [162]:
# 100 evenly spaced evaluation times spanning the observed test durations.
time_grid = np.linspace(
    start=durations_test.min(),
    stop=durations_test.max(),
    num=100,
)
# Optional plot of the Brier score curve: _ = ev.brier_score(time_grid).plot()
ev.integrated_brier_score(time_grid)


0.20214260133509115

In [None]:
# Scratch record of metrics from two runs.
# NOTE(review): the configuration behind this first triple is not recorded in
# the notebook; presumably it uses the same order as the second triple
# (concordance, integrated Brier score, non-zero coefficients) - TODO confirm.
0.6358915175995383
0.21127830030839226
219

# Second triple: equals the time-dependent concordance, the integrated Brier
# score and the non-zero coefficient count displayed by the cells above.
0.61800346220427
0.20214260133509115
39

In [None]:
# trust-ncg
# EH
# 
# 
# AFT
# 0.6568112133158125
# 0.19365856682123186