In [9]:
import json

import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from survhive.cv_models import CoxPHElasticNetCV, CoxPHPrecondCV
from survhive.utils import transform_survival, transform_preconditioning

In [10]:
# Load the shared experiment configuration; later cells read
# config["datasets"] and config["random_state"] from it.
with open("../config.json") as config_file:
    config = json.load(config_file)

In [11]:
# Per-model accumulators, each keyed by cancer type and filled in by the
# evaluation loops further down:
#   results_*  -> {cancer: {split: test-set predictions}}
#   failures_* -> {cancer: number of splits that raised ValueError}
#   sparsity_* -> {cancer: {split: count of non-zero coefficients}}

# Lasso (l1_ratios=[1.0]).
results_efron_lasso, failures_efron_lasso, sparsity_efron_lasso = {}, {}, {}

# Elastic net (several l1 ratios).
results_efron_elastic_net, failures_efron_elastic_net, sparsity_efron_elastic_net = {}, {}, {}

# Preconditioned model.
results_efron_precond, failures_efron_precond, sparsity_efron_precond = {}, {}, {}

In [14]:
# Cross-validated Cox PH elastic net with Efron tie correction and a single
# l1 ratio of 1.0 (a pure lasso penalty under the usual elastic-net convention).
efron_lasso_cv = CoxPHElasticNetCV(
    tie_correction="efron",
    eps=0.05,
    n_alphas=100,
    l1_ratios=[1.0],
    cv=5,
    n_jobs=-1,
    random_state=config["random_state"],
    n_irls_iter=5,
    tol=1e-4,
)

# Features are standardised before they reach the Cox model.
pipe = make_pipeline(StandardScaler(), efron_lasso_cv)

In [24]:
# Evaluate the Efron lasso pipeline (`pipe`) on every dataset and split.
#
# For each cancer type in config["datasets"] this fits `pipe` on each train
# split, records the number of non-zero coefficients and the test-set
# predictions, and writes the predictions to
# ../results/efron_lasso_{cancer}.csv (one column per split). After all
# datasets are processed, the sparsity and failure summaries are written too.
n_splits = 25  # rows of the split CSVs that are evaluated per dataset

for cancer in config["datasets"]:
    print(f"Starting: {cancer}")
    train_splits = pd.read_csv(f"../data/splits/TCGA/{cancer}_train_splits.csv")
    test_splits = pd.read_csv(f"../data/splits/TCGA/{cancer}_test_splits.csv")
    # The first CSV column is dropped (presumably a saved index - TODO confirm);
    # of the remaining columns, everything after the first three is a feature.
    data = pd.read_csv(f"../data/processed/TCGA/{cancer}_data_preprocessed.csv").iloc[:, 1:]
    X_ = data.iloc[:, 3:]
    y_ = transform_survival(time=data["OS_days"].values, event=data["OS"].values)

    # Fresh per-dataset accumulators (re-running the cell overwrites old entries).
    results_efron_lasso[cancer] = {}
    sparsity_efron_lasso[cancer] = {}
    failures_efron_lasso[cancer] = 0

    for split in range(n_splits):
        print(f"Starting split: {split+1} / {n_splits}")
        # Each row of the split files lists sample positions, NaN-padded.
        train_ix = train_splits.iloc[split, :].dropna().to_numpy().astype(int)
        test_ix = test_splits.iloc[split, :].dropna().to_numpy().astype(int)
        X_train = X_.iloc[train_ix, :].copy().reset_index(drop=True)
        y_train = y_[train_ix].copy()
        X_test = X_.iloc[test_ix, :].copy().reset_index(drop=True)
        try:
            pipe.fit(X_train, y_train)
            # pipe[1] is the Cox estimator (step 0 is the StandardScaler).
            sparsity_efron_lasso[cancer][split] = np.sum(pipe[1].coef_ != 0)
            results_efron_lasso[cancer][split] = pipe.predict(X_test)
        except ValueError as e:
            # Best-effort: a split that raises ValueError is counted as a
            # failure and scored with all-zero predictions and zero sparsity.
            print(f"Split {split+1} failed for {cancer}: {e}")
            failures_efron_lasso[cancer] += 1
            results_efron_lasso[cancer][split] = np.zeros(test_ix.shape[0])
            sparsity_efron_lasso[cancer][split] = 0

    pd.concat(
        [pd.DataFrame(results_efron_lasso[cancer][i]) for i in range(n_splits)], axis=1
    ).to_csv(f"../results/efron_lasso_{cancer}.csv", index=False)

pd.DataFrame(sparsity_efron_lasso).to_csv(
    "../results/efron_lasso_sparsity.csv", index=False
)
# failures maps cancer -> scalar count; wrapping the dict in a list yields a
# one-row frame (an all-scalar dict without an index raises ValueError).
pd.DataFrame([failures_efron_lasso]).to_csv(
    "../results/efron_lasso_failures.csv", index=False
)

Starting: BLCA
Starting split: 1 / 25
Starting split: 2 / 25
Starting split: 3 / 25
Starting split: 4 / 25


Exception ignored in: <Finalize object, dead>
Traceback (most recent call last):
  File "/Users/slightlier/miniforge3/envs/sparsesurv_dev/lib/python3.10/multiprocessing/util.py", line 218, in __call__
KeyboardInterrupt: 


KeyboardInterrupt: 

In [26]:
# Cross-validated Cox PH elastic net with Efron tie correction, searching
# over several l1 ratios instead of the single lasso setting.
efron_elastic_net_cv = CoxPHElasticNetCV(
    tie_correction="efron",
    eps=0.05,
    n_alphas=100,
    l1_ratios=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
    cv=5,
    n_jobs=-1,
    random_state=config["random_state"],
    n_irls_iter=5,
    tol=1e-4,
)

# Features are standardised before they reach the Cox model.
pipe = make_pipeline(StandardScaler(), efron_elastic_net_cv)

Unnamed: 0,BLCA
0,0
1,0
2,0


In [None]:
# Evaluate the Efron elastic-net pipeline (`pipe`) on every dataset and split.
#
# For each cancer type in config["datasets"] this fits `pipe` on each train
# split, records the number of non-zero coefficients and the test-set
# predictions, and writes the predictions to
# ../results/efron_elastic_net_{cancer}.csv (one column per split). After all
# datasets are processed, the sparsity and failure summaries are written too.
n_splits = 25  # rows of the split CSVs that are evaluated per dataset

for cancer in config["datasets"]:
    print(f"Starting: {cancer}")
    train_splits = pd.read_csv(f"../data/splits/TCGA/{cancer}_train_splits.csv")
    test_splits = pd.read_csv(f"../data/splits/TCGA/{cancer}_test_splits.csv")
    # The first CSV column is dropped (presumably a saved index - TODO confirm);
    # of the remaining columns, everything after the first three is a feature.
    data = pd.read_csv(f"../data/processed/TCGA/{cancer}_data_preprocessed.csv").iloc[:, 1:]
    X_ = data.iloc[:, 3:]
    y_ = transform_survival(time=data["OS_days"].values, event=data["OS"].values)

    # Fresh per-dataset accumulators (re-running the cell overwrites old entries).
    results_efron_elastic_net[cancer] = {}
    sparsity_efron_elastic_net[cancer] = {}
    failures_efron_elastic_net[cancer] = 0

    for split in range(n_splits):
        print(f"Starting split: {split+1} / {n_splits}")
        # Each row of the split files lists sample positions, NaN-padded.
        train_ix = train_splits.iloc[split, :].dropna().to_numpy().astype(int)
        test_ix = test_splits.iloc[split, :].dropna().to_numpy().astype(int)
        X_train = X_.iloc[train_ix, :].copy().reset_index(drop=True)
        y_train = y_[train_ix].copy()
        X_test = X_.iloc[test_ix, :].copy().reset_index(drop=True)
        try:
            pipe.fit(X_train, y_train)
            # pipe[1] is the Cox estimator (step 0 is the StandardScaler).
            sparsity_efron_elastic_net[cancer][split] = np.sum(pipe[1].coef_ != 0)
            results_efron_elastic_net[cancer][split] = pipe.predict(X_test)
        except ValueError as e:
            # Best-effort: a split that raises ValueError is counted as a
            # failure and scored with all-zero predictions and zero sparsity.
            print(f"Split {split+1} failed for {cancer}: {e}")
            failures_efron_elastic_net[cancer] += 1
            results_efron_elastic_net[cancer][split] = np.zeros(test_ix.shape[0])
            sparsity_efron_elastic_net[cancer][split] = 0

    pd.concat(
        [pd.DataFrame(results_efron_elastic_net[cancer][i]) for i in range(n_splits)], axis=1
    ).to_csv(f"../results/efron_elastic_net_{cancer}.csv", index=False)

pd.DataFrame(sparsity_efron_elastic_net).to_csv(
    "../results/efron_elastic_net_sparsity.csv", index=False
)
# failures maps cancer -> scalar count; wrapping the dict in a list yields a
# one-row frame (an all-scalar dict without an index raises ValueError).
pd.DataFrame([failures_efron_elastic_net]).to_csv(
    "../results/efron_elastic_net_failures.csv", index=False
)

In [None]:
# Cross-validated preconditioned Cox PH model with Efron tie correction.
#
# The targets fed to this pipeline are built with transform_preconditioning,
# and the keyword arguments below (taus, maxiter, rtol, default_step_size)
# differ from those passed to CoxPHElasticNetCV elsewhere in this notebook
# (l1_ratios, n_irls_iter, tol), so the preconditioning estimator imported at
# the top of the notebook is used here.
# NOTE(review): confirm that CoxPHPrecondCV accepts exactly these keyword
# arguments; its signature is not visible from this notebook.
efron_precond_cv = CoxPHPrecondCV(
    tie_correction="efron",
    eps=0.05,
    n_alphas=100,
    taus=[0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
    cv=5,
    n_jobs=-1,
    random_state=config["random_state"],
    maxiter=1000,
    rtol=1e-6,
    verbose=0,
    default_step_size=1.0,
)

# Features are standardised before they reach the Cox model.
pipe = make_pipeline(StandardScaler(), efron_precond_cv)

In [None]:
# Evaluate the Efron preconditioning pipeline (`pipe`) on every dataset and split.
#
# For each cancer type in config["datasets"] this builds the training target
# from the observed survival data plus the teacher predictions for that split,
# fits `pipe`, records the number of non-zero coefficients and the test-set
# predictions, and writes the predictions to
# ../results/efron_precond_{cancer}.csv (one column per split). After all
# datasets are processed, the sparsity and failure summaries are written too.
n_splits = 25  # rows of the split CSVs that are evaluated per dataset

for cancer in config["datasets"]:
    print(f"Starting: {cancer}")
    train_splits = pd.read_csv(f"../data/splits/TCGA/{cancer}_train_splits.csv")
    # Teacher predictions; column `split` is used as the teacher output for
    # the training samples of that split (NaN-padded).
    train_predictions = pd.read_csv(f"../results/teacher/efron_{cancer}.csv")
    test_splits = pd.read_csv(f"../data/splits/TCGA/{cancer}_test_splits.csv")
    # The first CSV column is dropped (presumably a saved index - TODO confirm);
    # of the remaining columns, everything after the first three is a feature.
    data = pd.read_csv(f"../data/processed/TCGA/{cancer}_data_preprocessed.csv").iloc[:, 1:]
    X_ = data.iloc[:, 3:]

    # Fresh per-dataset accumulators (re-running the cell overwrites old entries).
    results_efron_precond[cancer] = {}
    sparsity_efron_precond[cancer] = {}
    failures_efron_precond[cancer] = 0

    for split in range(n_splits):
        print(f"Starting split: {split+1} / {n_splits}")
        # Each row of the split files lists sample positions, NaN-padded.
        train_ix = train_splits.iloc[split, :].dropna().to_numpy().astype(int)
        test_ix = test_splits.iloc[split, :].dropna().to_numpy().astype(int)
        X_train = X_.iloc[train_ix, :].copy().reset_index(drop=True)
        y_train = transform_preconditioning(
            time=data["OS_days"].values[train_ix],
            event=data["OS"].values[train_ix],
            y_teacher=train_predictions.iloc[:, split].dropna().values,
        )
        X_test = X_.iloc[test_ix, :].copy().reset_index(drop=True)
        try:
            pipe.fit(X_train, y_train)
            # pipe[1] is the Cox estimator (step 0 is the StandardScaler).
            sparsity_efron_precond[cancer][split] = np.sum(pipe[1].coef_ != 0)
            results_efron_precond[cancer][split] = pipe.predict(X_test)
        except ValueError as e:
            # Best-effort: a split that raises ValueError is counted as a
            # failure and scored with all-zero predictions and zero sparsity.
            print(f"Split {split+1} failed for {cancer}: {e}")
            failures_efron_precond[cancer] += 1
            results_efron_precond[cancer][split] = np.zeros(test_ix.shape[0])
            sparsity_efron_precond[cancer][split] = 0

    pd.concat(
        [pd.DataFrame(results_efron_precond[cancer][i]) for i in range(n_splits)], axis=1
    ).to_csv(f"../results/efron_precond_{cancer}.csv", index=False)

pd.DataFrame(sparsity_efron_precond).to_csv(
    "../results/efron_precond_sparsity.csv", index=False
)
# failures maps cancer -> scalar count; wrapping the dict in a list yields a
# one-row frame (an all-scalar dict without an index raises ValueError).
pd.DataFrame([failures_efron_precond]).to_csv(
    "../results/efron_precond_failures.csv", index=False
)