# Sample Cox Model Prediction through our model on Box

The *PredictThroughCoxModel.ipynb* notebook provides detailed step-by-step instructions on how to predict Cox hazard results with our pre-trained models for each cancer.
Please place this notebook in the directory that contains the `CL4CaPro_Models` folder, so that the relative path `./CL4CaPro_Models/Cox Models/<Cancer>/` used below resolves.

### Pick Cancer

In [2]:
# Cancer type code; it is substituted into the dataset, feature and model paths used in this notebook.
Cancer = 'BLCA'

### Put Input Patient Info
e.g. put your input in *BLCA_predict_input.csv*

The format of the data adheres to the following rules:
1. The data starts with six columns that hold clinical information. (They can be left blank for both tasks.)
2. The first column, named 'patient bar', contains identifying information for the patient.
3. The second column is labeled 'PFI', which denotes the PFI (Progression-Free Interval) status—either censored or uncensored. A '1' indicates an event such as disease progression, local recurrence, distant metastasis, new primary tumor, or death from cancer without a new tumor event, including cases where the tumor type is unspecified. A '0' represents censored cases, with new primary tumors in other organs also being censored.
4. The third column, 'PFItime', represents the progression-free interval time in days. For events, this could be the time to a new tumor event or death; for censored cases, it's the time to last contact or death, whichever is applicable.
5. The fourth column, 'gen_id', refers to the type of cancer, such as BLCA, BRCA, etc.
6. The fifth column, 'predicted_label': assign 0 or leave it blank for the Cox task.
7. The sixth column is flexible and can include any additional information, such as comments or notes. This column is reserved for the user.
8. The remaining columns contain gene expression values; the header of each column is the corresponding Gene ID.

In [None]:
# Path of the CSV file with the patient records to score; it is read with pandas in the next step.
input_pth = 'BLCA_predict_input.csv'

### Read Input and Check

In [None]:
import os

import pandas as pd

# Load the user-supplied patient table.
input_df = pd.read_csv(input_pth)

# Save a copy under ./Dataset, creating the folder if needed so the write does
# not fail on a fresh checkout. index=False keeps the row index out of the file.
os.makedirs('./Dataset', exist_ok=True)
input_df.to_csv('./Dataset/CancerRNA_Prediction_{}_WholeTimeSeq_3.txt'.format(Cancer), index=False)

# Display the table so the input can be checked by eye.
input_df

### Generate contrastive learning features based on the public cancer model

#### Get model path

In [None]:
import os

def find_clcp_folder_name(directory):
    """Return the name of the first entry in ``directory`` that starts with 'CLCP'.

    Entries are examined in sorted order so the result is deterministic even
    when several matching entries exist (``os.listdir`` order is arbitrary).

    Parameters
    ----------
    directory : str
        Path of the directory to scan.

    Returns
    -------
    str
        The matching entry name, or the message 'No CLCP folder found.' when
        nothing matches. Callers should check for that message before using
        the result as a path component.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``directory`` cannot be listed.
    """
    for folder_name in sorted(os.listdir(directory)):
        if folder_name.startswith('CLCP'):
            return folder_name
    return 'No CLCP folder found.'

# Folder that holds the pre-trained Cox models for the selected cancer type.
directory_to_search = './CL4CaPro_Models/Cox Models/{}'.format(Cancer)
clcp_folder_name = find_clcp_folder_name(directory_to_search)
# find_clcp_folder_name reports "not found" with a message string instead of an
# exception; stop here so that message is never used as a path component.
if not clcp_folder_name.startswith('CLCP'):
    raise FileNotFoundError('No CLCP folder found in {}'.format(directory_to_search))
model_pth = os.path.join(directory_to_search, clcp_folder_name)

#### Generate feature

In [None]:
para = clcp_folder_name.split('_')
input_dim = para[1]
model_n_hidden_1 = para[2]
model_out_dim = para[3]
feat_dim = para[5]
batch_size = para[-3]
l2_rate = para[9]
seed = para[13]
round = para[11]
device = 0
lr = para[7]

In [None]:
! python GenerateFeatures_Predict.py --layer_name feat --model_in_dim {input_dim} --dim_1_list {model_n_hidden_1} \
                                     --dim_2_list {model_out_dim} --dim_3_list {feat_dim} --batch_size {batch_size} \
                                     --l2_rate {l2_rate} --seed {seed} --round {round} --gpu_device {device} \
                                     --learning_rate_list {lr} --task WholeTimeSeq --model_pth {model_pth} \
                                     --cancer_group {cancer}

#### Predict Results

##### Cox-XGB
Predict Results

In [None]:
import xgboost as xgb

# Load the pre-trained Cox-XGB booster from the model folder.
loaded_cox_model = xgb.Booster()
loaded_cox_model.load_model('./CL4CaPro_Models/Cox Models/{}/coxxgb_model.json'.format(Cancer))

# Prediction features (presumably written by GenerateFeatures_Predict.py);
# everything after the first six columns is used as model input.
predict_input_df = pd.read_csv('Features/PredictFeature_{}.txt'.format(Cancer))
X = predict_input_df.iloc[:, 6:]

# Booster.predict only accepts a DMatrix, so wrap the feature frame first.
test_pred = loaded_cox_model.predict(xgb.DMatrix(X))

Calculate C-index and IBS

In [None]:
import glob

from xgbse.converters import convert_data_to_xgb_format
from sksurv.metrics import concordance_index_censored, integrated_brier_score
from Cal_IBS import BreslowEstimator
import numpy as np

# pd.read_csv does not expand wildcards, so resolve the training-feature file first.
train_feature_pattern = './CL4CaPro_Models/Cox Models/{}/model_CLCP_*.txt'.format(Cancer)
train_feature_files = sorted(glob.glob(train_feature_pattern))
if not train_feature_files:
    raise FileNotFoundError('No training feature file matches {}'.format(train_feature_pattern))
train_input_df = pd.read_csv(train_feature_files[0])
x_train = train_input_df.iloc[:, 6:]

# scikit-survival and xgbse expect structured arrays with a boolean event field
# followed by the time field, not a plain DataFrame.
# NOTE: PFI and PFItime must be filled in for the metrics to be meaningful.
survival_dtype = [('Status', '?'), ('Survival_in_days', '<f8')]
y = np.empty(len(train_input_df), dtype=survival_dtype)
y['Status'] = train_input_df['PFI'].astype(bool).to_numpy()
y['Survival_in_days'] = train_input_df['PFItime'].to_numpy()
y_test = np.empty(len(predict_input_df), dtype=survival_dtype)
y_test['Status'] = predict_input_df['PFI'].astype(bool).to_numpy()
y_test['Survival_in_days'] = predict_input_df['PFItime'].to_numpy()

# Calculate the time points. Each cut-off is the time of the last event that
# still has at least num_thre samples after it (falling back to the last sample).
num_thre = 20

train_order = np.argsort(y['Survival_in_days'], kind='stable')
sorted_status_train = y['Status'][train_order]
sorted_time_train = y['Survival_in_days'][train_order]
train_event_idx = np.flatnonzero(sorted_status_train[:max(len(sorted_status_train) - num_thre, 0)])
sorted_time_train_end = sorted_time_train[train_event_idx[-1] if train_event_idx.size else -1]
sorted_time_train_start = sorted_time_train[0]

test_order = np.argsort(y_test['Survival_in_days'], kind='stable')
sorted_status_test = y_test['Status'][test_order]
sorted_time_test = y_test['Survival_in_days'][test_order]
test_event_idx = np.flatnonzero(sorted_status_test[:max(len(sorted_status_test) - num_thre, 0)])
# The test cut-off is read from the test times, not from the training times.
sorted_time_test_end = sorted_time_test[test_event_idx[-1] if test_event_idx.size else -1]
sorted_time_test_start = sorted_time_test[0]

times = np.arange(max(sorted_time_test_start, sorted_time_train_start),
                  min(sorted_time_test_end, sorted_time_train_end))

# The booster only predicts on DMatrix objects.
dval = convert_data_to_xgb_format(X, y_test, 'survival:cox')
dtrain = convert_data_to_xgb_format(x_train, y, 'survival:cox')

test_pred = loaded_cox_model.predict(dval)
train_pred = loaded_cox_model.predict(dtrain)

# concordance_index_censored returns a tuple whose first item is the C-index.
# np.round is used because `round` may have been rebound earlier in the notebook.
scores = concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], test_pred)
c_index = float(np.round(scores[0], 10))

baseline_model = BreslowEstimator().fit(train_pred, y['Status'], y['Survival_in_days'])
survs = baseline_model.get_survival_function(test_pred)
preds = np.asarray([[fn(t) for t in times] for fn in survs])
# integrated_brier_score returns a single float, so it must not be indexed.
ibs = float(np.round(integrated_brier_score(y, y_test, preds, times), 6))

print(c_index, ibs)

##### Cox-EN
Predict Results

In [None]:
from joblib import load

# Restore the fitted Cox-EN estimator stored in the model folder.
coxen_model_file = './CL4CaPro_Models/Cox Models/{}/coxen_model.joblib'.format(Cancer)
estimator_loaded = load(coxen_model_file)

# Read the prediction features; everything after the first six columns is model input.
feature_file = 'Features/PredictFeature_{}.txt'.format(Cancer)
predict_input_df = pd.read_csv(feature_file)
X = predict_input_df.iloc[:, 6:]

# Score the input with the restored estimator.
test_pred = estimator_loaded.predict(X)

Calculate C-index and IBS

In [None]:
import glob

from sksurv.metrics import concordance_index_censored, integrated_brier_score
from Cal_IBS import BreslowEstimator
import numpy as np

# pd.read_csv does not expand wildcards, so resolve the training-feature file first.
train_feature_pattern = './CL4CaPro_Models/Cox Models/{}/model_CLCP_*.txt'.format(Cancer)
train_feature_files = sorted(glob.glob(train_feature_pattern))
if not train_feature_files:
    raise FileNotFoundError('No training feature file matches {}'.format(train_feature_pattern))
train_input_df = pd.read_csv(train_feature_files[0])
x_train = train_input_df.iloc[:, 6:]

# scikit-survival expects structured arrays with a boolean event field followed
# by the time field, not a plain DataFrame.
# NOTE: PFI and PFItime must be filled in for the metrics to be meaningful.
survival_dtype = [('Status', '?'), ('Survival_in_days', '<f8')]
y = np.empty(len(train_input_df), dtype=survival_dtype)
y['Status'] = train_input_df['PFI'].astype(bool).to_numpy()
y['Survival_in_days'] = train_input_df['PFItime'].to_numpy()
y_test = np.empty(len(predict_input_df), dtype=survival_dtype)
y_test['Status'] = predict_input_df['PFI'].astype(bool).to_numpy()
y_test['Survival_in_days'] = predict_input_df['PFItime'].to_numpy()

# Calculate the time points. Each cut-off is the time of the last event that
# still has at least num_thre samples after it (falling back to the last sample).
num_thre = 20

train_order = np.argsort(y['Survival_in_days'], kind='stable')
sorted_status_train = y['Status'][train_order]
sorted_time_train = y['Survival_in_days'][train_order]
train_event_idx = np.flatnonzero(sorted_status_train[:max(len(sorted_status_train) - num_thre, 0)])
sorted_time_train_end = sorted_time_train[train_event_idx[-1] if train_event_idx.size else -1]
sorted_time_train_start = sorted_time_train[0]

test_order = np.argsort(y_test['Survival_in_days'], kind='stable')
sorted_status_test = y_test['Status'][test_order]
sorted_time_test = y_test['Survival_in_days'][test_order]
test_event_idx = np.flatnonzero(sorted_status_test[:max(len(sorted_status_test) - num_thre, 0)])
# The test cut-off is read from the test times, not from the training times.
sorted_time_test_end = sorted_time_test[test_event_idx[-1] if test_event_idx.size else -1]
sorted_time_test_start = sorted_time_test[0]

times = np.arange(max(sorted_time_test_start, sorted_time_train_start),
                  min(sorted_time_test_end, sorted_time_train_end))

test_pred = estimator_loaded.predict(X)
train_pred = estimator_loaded.predict(x_train)

# concordance_index_censored returns a tuple whose first item is the C-index.
# np.round is used because `round` may have been rebound earlier in the notebook.
scores = concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], test_pred)
c_index = float(np.round(scores[0], 6))

baseline_model = BreslowEstimator().fit(train_pred, y['Status'], y['Survival_in_days'])
survs = baseline_model.get_survival_function(test_pred)
preds = np.asarray([[fn(t) for t in times] for fn in survs])
# integrated_brier_score returns a single float, so it must not be indexed.
ibs = float(np.round(integrated_brier_score(y, y_test, preds, times), 6))

print(c_index, ibs)

##### Cox-nnet
Predict Results

In [None]:
# Import only what is needed; a star import would spill every public cox_nnet
# name into the notebook namespace and could shadow existing variables.
from cox_nnet import loadModel

# Load the pre-trained Cox-nnet model from the model folder.
coxnnet_loaded = loadModel('./CL4CaPro_Models/Cox Models/{}/coxnn_model.pkl'.format(Cancer))

# Read the prediction features; everything after the first six columns is model input.
predict_input_df = pd.read_csv('Features/PredictFeature_{}.txt'.format(Cancer))
X = predict_input_df.iloc[:, 6:]

test_pred = coxnnet_loaded.predictNewData(X)

Calculate C-index and IBS

In [None]:
import glob

from sksurv.metrics import concordance_index_censored, integrated_brier_score
from Cal_IBS import BreslowEstimator
import numpy as np

# pd.read_csv does not expand wildcards, so resolve the training-feature file first.
train_feature_pattern = './CL4CaPro_Models/Cox Models/{}/model_CLCP_*.txt'.format(Cancer)
train_feature_files = sorted(glob.glob(train_feature_pattern))
if not train_feature_files:
    raise FileNotFoundError('No training feature file matches {}'.format(train_feature_pattern))
train_input_df = pd.read_csv(train_feature_files[0])
x_train = train_input_df.iloc[:, 6:]

# scikit-survival expects structured arrays with a boolean event field followed
# by the time field, not a plain DataFrame.
# NOTE: PFI and PFItime must be filled in for the metrics to be meaningful.
survival_dtype = [('Status', '?'), ('Survival_in_days', '<f8')]
y = np.empty(len(train_input_df), dtype=survival_dtype)
y['Status'] = train_input_df['PFI'].astype(bool).to_numpy()
y['Survival_in_days'] = train_input_df['PFItime'].to_numpy()
y_test = np.empty(len(predict_input_df), dtype=survival_dtype)
y_test['Status'] = predict_input_df['PFI'].astype(bool).to_numpy()
y_test['Survival_in_days'] = predict_input_df['PFItime'].to_numpy()

# Calculate the time points. Each cut-off is the time of the last event that
# still has at least num_thre samples after it (falling back to the last sample).
num_thre = 20

train_order = np.argsort(y['Survival_in_days'], kind='stable')
sorted_status_train = y['Status'][train_order]
sorted_time_train = y['Survival_in_days'][train_order]
train_event_idx = np.flatnonzero(sorted_status_train[:max(len(sorted_status_train) - num_thre, 0)])
sorted_time_train_end = sorted_time_train[train_event_idx[-1] if train_event_idx.size else -1]
sorted_time_train_start = sorted_time_train[0]

test_order = np.argsort(y_test['Survival_in_days'], kind='stable')
sorted_status_test = y_test['Status'][test_order]
sorted_time_test = y_test['Survival_in_days'][test_order]
test_event_idx = np.flatnonzero(sorted_status_test[:max(len(sorted_status_test) - num_thre, 0)])
# The test cut-off is read from the test times, not from the training times.
sorted_time_test_end = sorted_time_test[test_event_idx[-1] if test_event_idx.size else -1]
sorted_time_test_start = sorted_time_test[0]

times = np.arange(max(sorted_time_test_start, sorted_time_train_start),
                  min(sorted_time_test_end, sorted_time_train_end))

test_pred = coxnnet_loaded.predictNewData(X)
train_pred = coxnnet_loaded.predictNewData(x_train)

# concordance_index_censored returns a tuple whose first item is the C-index.
# np.round is used because `round` may have been rebound earlier in the notebook.
scores = concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], test_pred)
c_index = float(np.round(scores[0], 6))

baseline_model = BreslowEstimator().fit(train_pred, y['Status'], y['Survival_in_days'])
survs = baseline_model.get_survival_function(test_pred)
preds = np.asarray([[fn(t) for t in times] for fn in survs])
# integrated_brier_score returns a single float, so it must not be indexed.
ibs = float(np.round(integrated_brier_score(y, y_test, preds, times), 6))

print(c_index, ibs)