In [None]:
# Install the third-party packages imported by this notebook (pandas, numpy,
# matplotlib) via the %pip magic. No versions are pinned, so the
# versions installed depend on when and where this cell is run.
%pip install pandas numpy matplotlib

In [None]:
import pandas as pd
import numpy as np
import pickle
import warnings
import matplotlib.pyplot as plt

# ------------------------------------------------------------------------------
# USER CONFIGURATION - edit the values below before running the notebook
# ------------------------------------------------------------------------------
# Model type (also used to build the names of the pickled result files),
# bootstrap data file, and the names of the outcome and outcome-time columns.
model_name = "Cox"
data_name_boot = "mock_boots.csv"
outcome_name = "outcome"
outcome_time = "outcome_time"
# ------------------------------------------------------------------------------

# Silence every warning from this point on.
warnings.filterwarnings('ignore')

# Fix NumPy's legacy global random state.
np.random.seed(seed=7)

# Tag shared by the result files of this model.
savestr = model_name

# Read the bootstrap data; the untouched frame stays available as
# `data_original`.
data_original = pd.read_csv(data_name_boot)

# Covariate-only frame: start from a copy and strip the "boot" column and both
# outcome columns one at a time (a missing column raises KeyError).
data = data_original.copy()
for non_covariate in ("boot", outcome_name, outcome_time):
    data = data.drop(columns=non_covariate)

# Load pipeline optimization results.
# NOTE(security): pickle can execute arbitrary code while loading - only open
# result files produced by your own run of the optimisation step.
# The context manager guarantees the file handle is closed after loading.
with open("backward_optimisation_results_" + savestr + ".p", "rb") as results_file:
    final_res = pickle.load(results_file)

# Best score of every entry in `final_res` (presumably one entry per
# bootstrap - confirm against the optimisation step). Element [1] of each
# record is read directly instead of coercing the whole record list into a
# NumPy array, so records whose first element is a list, tuple or string do
# not break the extraction or silently turn the scores into strings.
best_final_res = np.asarray(
    [max(step[1] for step in boot_steps) for boot_steps in final_res]
)

# Show bootstrap performance: 25th percentile, median and 75th percentile of
# the per-entry best scores, rounded to 3 decimals.
n_boots = len(best_final_res)
for label, value in (
    ("Percentile 25", np.percentile(best_final_res, 25)),
    ("Median", np.median(best_final_res)),
    ("Percentile 75", np.percentile(best_final_res, 75)),
):
    print(f"{label} C-Index on {n_boots} bootstraps: {round(value, 3)!s}")

In [5]:
# Computations for feature selection

covariates_names = data.columns
max_len = len(covariates_names)

# NOTE(security): pickle can execute arbitrary code while loading - only open
# result files produced by your own run of the optimisation step.
# The context manager guarantees the file handle is closed after loading.
with open("backward_worst_covariates_" + savestr + ".p", "rb") as worst_file:
    worst_covariates = pickle.load(worst_file)

# One rank vector per entry of `worst_covariates`. Each entry is used as a set
# of positions into the covariate columns: the position listed first receives
# the highest value (len - 1), the one listed last receives 0, and positions
# that are not listed keep 0.
# Presumably each entry is the elimination order of one bootstrap - confirm
# against the optimisation step.
rank = []
for covariate_order in worst_covariates:
    ranks = np.zeros(len(covariates_names))
    ranks[covariate_order] = np.arange(len(covariate_order) - 1, -1, -1)
    rank.append(ranks)

# Stack once (rows = entries, columns = covariates) instead of rebuilding the
# array for every statistic.
rank_matrix = np.asarray(rank)

mean_ranks = np.mean(rank_matrix, 0)
std_ranks = np.std(rank_matrix, 0)
# Despite their names, these hold the distance from the mean rank to the 5th
# and 95th percentile respectively; the names are kept because the plotting
# cell refers to them.
ranks_25 = abs(mean_ranks - np.percentile(rank_matrix, axis=0, q=5))
ranks_75 = abs(np.percentile(rank_matrix, axis=0, q=95) - mean_ranks)

# Bucket the scores by step position: var_performances[k] collects element [1]
# of the k-th record of every entry in `final_res`.
var_performances = [[] for _ in covariates_names]
for boot_steps in final_res:
    for step_idx in range(len(boot_steps)):
        var_performances[step_idx].append(boot_steps[step_idx][1])

# Per-step summary. `mean_perf` holds the MEDIAN score (name kept because the
# plotting cell refers to it); `perc_25` / `perc_75` are the distances from
# the median down to the 25th and up to the 75th percentile.
mean_perf, perc_25, perc_75 = [], [], []
for step_scores in var_performances:
    centre = np.median(step_scores)
    lower_quartile = np.percentile(step_scores, 25)
    upper_quartile = np.percentile(step_scores, 75)
    mean_perf.append(centre)
    perc_25.append(centre - lower_quartile)
    perc_75.append(upper_quartile - centre)

# Covariate positions ordered from highest to lowest mean rank.
sort_idx = np.argsort(-mean_ranks)

In [None]:
# Plot figure for model performance
# WARNING: you may need to change axis limit according to your results -
# adjust the three values below; ticks and limits are both derived from them.
y_min, y_max, y_step = 0.6, 0.8, 0.05

plt.figure(figsize=(10, 5))
# Entry k of `mean_perf` is drawn at x = max_len - k. The error bars are
# asymmetric: row 0 is the distance below the marker, row 1 the distance above.
plt.errorbar(
    np.arange(max_len, 0, -1),
    mean_perf,
    yerr=np.array([perc_25, perc_75]),
    fmt='o',
    color='red',
    ecolor='black',
    elinewidth=2,
    capsize=5,
    markersize=6,
    markeredgewidth=2,
)
plt.xlabel("Number of variables")
plt.ylabel("C-Index")
plt.xticks(range(1, max_len + 1))
# np.arange with a float step includes or drops the upper limit depending on
# rounding error; linspace with an explicit tick count always includes both
# limits.
n_ticks = int(round((y_max - y_min) / y_step)) + 1
plt.yticks(np.linspace(y_min, y_max, n_ticks))
plt.ylim([y_min, y_max])
# The first positional argument of grid() is the on/off flag, not the axis
# selector, so the flag and the axis are passed explicitly.
plt.grid(True, which="major", axis="both")
plt.show()

In [None]:
# Feature ranking figure: one horizontal error bar per covariate, drawn in the
# order given by `sort_idx`.
ordered_names = covariates_names[sort_idx]
ordered_means = mean_ranks[sort_idx]

# Asymmetric horizontal error: row 0 = distance to the left of the marker,
# row 1 = distance to the right.
spread_pairs = list(zip(ranks_25[sort_idx], ranks_75[sort_idx]))
rank_spread = np.array(spread_pairs).T

marker_style = dict(
    fmt='o',
    color='red',
    ecolor='black',
    elinewidth=2,
    capsize=5,
    markersize=6,
    markeredgewidth=2,
)

plt.figure(figsize=(7, 10))
plt.errorbar(ordered_means, ordered_names, xerr=rank_spread, **marker_style)
plt.xlabel("Variable rank, mean, 5th-95th percentile")
plt.grid(axis="y")
plt.show()