In [16]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sksurv.ensemble import RandomSurvivalForest, GradientBoostingSurvivalAnalysis
from sksurv.metrics import concordance_index_censored,concordance_index_ipcw
from sklearn.model_selection import cross_validate, cross_val_score
import joblib

In [17]:
def c_index(clf, X, y_struct):
    """Score a fitted survival model with the censored concordance index.

    The signature ``(estimator, X, y)`` lets this function be passed as a
    ``scoring`` callable.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict(X)``.
    X : feature matrix handed to ``clf.predict``.
    y_struct : structured array with a ``'Status'`` field (event indicator)
        and a ``'Survival'`` field (observed time).

    Returns
    -------
    The first element of the result of ``concordance_index_censored``
    (the concordance index itself, per the scikit-survival documentation).
    """
    event_indicator = y_struct['Status']
    event_time = y_struct['Survival']
    predictions = clf.predict(X)
    result = concordance_index_censored(event_indicator, event_time, predictions)
    # Only the leading entry is kept; the remaining entries are discarded.
    return result[0]

In [18]:
def cx_pipelines(pipelines, X, y_struct, cv=5, scoring=c_index):
    """Cross-validate every pipeline and collect the per-fold scores.

    Parameters
    ----------
    pipelines : mapping of model name -> estimator/pipeline.
    X : feature matrix passed to ``cross_val_score``.
    y_struct : target passed to ``cross_val_score`` (and on to ``scoring``).
    cv : cross-validation setting forwarded to ``cross_val_score`` (default 5).
    scoring : scorer forwarded to ``cross_val_score`` (default ``c_index``).

    Returns
    -------
    dict mapping each model name to the array returned by ``cross_val_score``.

    Progress and the scores of each model are printed as a side effect.
    """
    fold_scores = {}
    divider = "=" * 8

    for name, estimator in pipelines.items():
        print(f"{name}: Cross validation...")
        fold_scores[name] = cross_val_score(
            estimator, X, y_struct, cv=cv, scoring=scoring
        )
        print(f"Score:{fold_scores[name]}")
        print("Done.")
        print(divider)

    return fold_scores

In [19]:
# Read the training split from disk.
training_df = pd.read_csv("../../data/train_test/training.csv")

# Feature matrix: every column except the four listed here
# (the two survival columns become the target below; 'recurrence' and
# 'metastasis' are presumably further outcome labels — TODO confirm).
X_train = training_df.drop(
    columns=['survival_status', 'survival_time', 'recurrence', 'metastasis']
)

# Target as a structured array with a boolean 'Status' field and a
# float64 'Survival' field, which is the layout c_index() indexes into.
y_train_struct = (
    training_df[['survival_status', 'survival_time']]
    .to_records(index=False)
    .astype([('Status', 'bool'), ('Survival', 'float64')])
)

In [20]:
# Load the previously saved pipelines; the cells below iterate over this object
# with .items() as a mapping of model name -> pipeline.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code, so only load this artifact from a trusted source.
pipelines = joblib.load("../../data/pipelines/002-003-basic-pipelines.pl")

p-value

In [21]:
# Only the column names of the selected-feature file are used, so read just
# the header row (nrows=0) instead of parsing the whole CSV.
features = list(
    pd.read_csv(
        "../../data/selected_features/p-value-recur-related-train.csv",
        nrows=0,
    ).columns
)
len(features)

5470

In [22]:
# Keep all rows, restricted to the columns named in `features`.
X_train_Selected = X_train.loc[:, features]

In [23]:
# Cross-validate every pipeline on the p-value-selected features.
# This cell used to repeat the body of cx_pipelines() inline; it now calls the
# shared helper, as the later feature-set cells do. The printed output and the
# resulting `scores` dict (model name -> per-fold scores) are unchanged.
scores = cx_pipelines(
    pipelines,
    X_train_Selected,
    y_train_struct,
    cv=5,
    scoring=c_index,
)

coxnet: Cross validation...
Score:[0.57467532 0.60335196 0.43884892 0.81725888 0.58450704]
Done.
rf: Cross validation...
Score:[0.52597403 0.56703911 0.58633094 0.56345178 0.41901408]
Done.
gb: Cross validation...
Score:[0.55844156 0.61731844 0.44604317 0.56852792 0.49295775]
Done.
fksvm: Cross validation...


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Score:[0.54545455 0.59078212 0.3794964  0.73604061 0.45422535]
Done.


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [24]:
# Save the per-fold scores: one column per model, one row per fold.
pd.DataFrame(scores, columns=list(scores)).to_csv(
    "../../data/results/002-004-pvalue_features_cx_cindex.csv",
    index=False,
)

correlation

In [25]:
# Only the column names of the selected-feature file are used, so read just
# the header row (nrows=0) instead of parsing the whole CSV.
features = list(
    pd.read_csv(
        "../../data/selected_features/corr-recur-related-train.csv",
        nrows=0,
    ).columns
)
# Restrict the training matrix to the selected columns.
X_train_Selected = X_train[features]
len(features)

4063

In [26]:
# Cross-validate every pipeline on the correlation-selected features ...
corr_scores = cx_pipelines(
    pipelines,
    X_train_Selected,
    y_train_struct,
)
# ... and save the per-fold scores, one column per model.
pd.DataFrame(corr_scores, columns=list(corr_scores)).to_csv(
    "../../data/results/002-004-corr_features_cx_cindex.csv",
    index=False,
)

coxnet: Cross validation...
Score:[0.54545455 0.56424581 0.59352518 0.59898477 0.51056338]
Done.
rf: Cross validation...
Score:[0.52597403 0.54748603 0.58992806 0.65989848 0.5       ]
Done.
gb: Cross validation...
Score:[0.50324675 0.5726257  0.61510791 0.50253807 0.40492958]
Done.
fksvm: Cross validation...


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Score:[0.56818182 0.61312849 0.58633094 0.72081218 0.48943662]
Done.


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


select from random forest (rf)

In [27]:
# Only the column names of the selected-feature file are used, so read just
# the header row (nrows=0) instead of parsing the whole CSV.
features = list(
    pd.read_csv(
        "../../data/selected_features/rf-recur-related-train.csv",
        nrows=0,
    ).columns
)
# Restrict the training matrix to the selected columns.
X_train_Selected = X_train[features]
len(features)

2001

In [28]:
# Cross-validate every pipeline on the rf-selected features ...
rf_scores = cx_pipelines(
    pipelines,
    X_train_Selected,
    y_train_struct,
)
# ... and save the per-fold scores, one column per model.
pd.DataFrame(rf_scores, columns=list(rf_scores)).to_csv(
    "../../data/results/002-004-rf_features_cx_cindex.csv",
    index=False,
)

coxnet: Cross validation...
Score:[0.49350649 0.58379888 0.5323741  0.71573604 0.54929577]
Done.
rf: Cross validation...
Score:[0.58441558 0.61452514 0.50719424 0.55837563 0.52112676]
Done.
gb: Cross validation...
Score:[0.60064935 0.60893855 0.58633094 0.56852792 0.51408451]
Done.
fksvm: Cross validation...


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Score:[0.50324675 0.5726257  0.39568345 0.68527919 0.54577465]
Done.


p-value + correlation

In [29]:
# Only the column names of the selected-feature file are used, so read just
# the header row (nrows=0) instead of parsing the whole CSV.
features = list(
    pd.read_csv(
        "../../data/selected_features/cor_p-recur-related-train.csv",
        nrows=0,
    ).columns
)
# Restrict the training matrix to the selected columns.
X_train_Selected = X_train[features]
len(features)

6649

In [30]:
# Cross-validate every pipeline on the combined p-value + correlation features ...
p_corr_scores = cx_pipelines(
    pipelines,
    X_train_Selected,
    y_train_struct,
)
# ... and save the per-fold scores, one column per model.
pd.DataFrame(p_corr_scores, columns=list(p_corr_scores)).to_csv(
    "../../data/results/002-004-p_corr_features_cx_cindex.csv",
    index=False,
)

coxnet: Cross validation...
Score:[0.52597403 0.61452514 0.48561151 0.6142132  0.54929577]
Done.
rf: Cross validation...
Score:[0.49025974 0.59776536 0.51798561 0.58375635 0.43309859]
Done.
gb: Cross validation...
Score:[0.47727273 0.55865922 0.39928058 0.45177665 0.49647887]
Done.
fksvm: Cross validation...


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Score:[0.55519481 0.61592179 0.4028777  0.66497462 0.48943662]
Done.


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
