# Contrastive Learning for Predicting Cancer Prognosis Using Gene Expression Values

## CPTAC3 and DKFZ Validation

The *CPTAC3&DKFZ.ipynb* notebook provides detailed step-by-step instructions for validating the CPTAC-3 LUAD, CPTAC-3 LUSC and DKFZ PRAD datasets with contrastive learning models trained on the TCGA dataset.

### Setup Dataset

Pick the `Cancer` and the `Task`, then choose the corresponding prepared, mapped dataset (available for download in the CPTAC-3&DKFZ folder of the GitHub repo).

In [None]:
# Prediction task. 'Risk' selects the classifier models; any other value
# selects the Cox models when the model folder is looked up below.
Task = 'Risk'
# Cancer type; it is substituted into the model and feature paths below.
Cancer = 'LUAD'
dataset_pth = 'CancerRNA_CPTAC3_LUAD_Risk_2.txt' # Put the dataset in the folder called CPTAC-3, or DKFZ if the DKFZ dataset is picked

Get Contrastive Learning Model path

In [None]:
import os

def find_clcp_folder_name(directory):
    """Return the name of the CLCP model entry found in *directory*.

    ``os.listdir`` yields names in arbitrary order, so the candidates are
    compared and the lexicographically smallest one is returned. This keeps
    the result reproducible when more than one entry starts with ``'CLCP'``.

    Args:
        directory: Path of the directory to scan.

    Returns:
        The smallest entry name starting with ``'CLCP'``, or the string
        ``'No CLCP folder found.'`` when no entry matches.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example when
            *directory* does not exist).
    """
    return min(
        (name for name in os.listdir(directory) if name.startswith('CLCP')),
        default='No CLCP folder found.',
    )

# Model folders are looked up relative to the current working directory:
# the 'Risk' task uses the classifier models, every other task the Cox models.
search_template = './Classifier Models/{}' if Task == 'Risk' else './Cox Models/{}'
directory_to_search = search_template.format(Cancer)

clcp_folder_name = find_clcp_folder_name(directory_to_search)
# Bare expression: it has no visible effect here because it is not the last
# statement of the cell.
clcp_folder_name
model_pth = './{}/{}'.format(Cancer, clcp_folder_name)

Generate Contrastive Learning Model features

In [None]:
para = clcp_folder_name.split('_')
input_dim = para[1]
model_n_hidden_1 = para[2]
model_out_dim = para[3]
feat_dim = para[5]
batch_size = para[-3]
l2_rate = para[9]
seed = para[13]
round = para[11]
device = 0
lr = para[7]

In [None]:
! python GenerateFeatures_CPTAC3.py --layer_name feat --model_in_dim {input_dim} --dim_1_list {model_n_hidden_1} \
                                    --dim_2_list {model_out_dim} --dim_3_list {feat_dim} --batch_size {batch_size} \
                                    --l2_rate {l2_rate} --seed {seed} --round {round} --gpu_device {device} \
                                    --learning_rate_list {lr} --task Risk \
                                    --cancer_group {cancer}

### Predict Results

#### XGB Classifier
Predict Results and Calculate AUC_ROC

In [None]:
import numpy as np
def calculate_sensitivity_specificity(y_test, predictions):
    """Compute sensitivity and specificity for binary 0/1 labels.

    Args:
        y_test: Array-like of true labels (1 = positive, 0 = negative).
        predictions: Array-like of predicted labels, aligned position by
            position with ``y_test``.

    Returns:
        A ``(sensitivity, specificity)`` tuple, where sensitivity is
        TP / (TP + FN) and specificity is TN / (TN + FP). A value is NaN
        when its denominator is zero (no positives or no negatives in
        ``y_test``).
    """
    # Coerce to arrays so that plain Python lists are compared element-wise;
    # `some_list == 1` would otherwise evaluate to a single False.
    actual = np.asarray(y_test)
    predicted = np.asarray(predictions)

    true_positive = np.sum((actual == 1) & (predicted == 1))
    true_negative = np.sum((actual == 0) & (predicted == 0))
    false_positive = np.sum((actual == 0) & (predicted == 1))
    false_negative = np.sum((actual == 1) & (predicted == 0))

    positives = true_positive + false_negative
    negatives = true_negative + false_positive

    # An undefined ratio is reported as NaN explicitly instead of relying on
    # a 0/0 division that emits a RuntimeWarning.
    sensitivity = true_positive / positives if positives else float('nan')
    specificity = true_negative / negatives if negatives else float('nan')

    return sensitivity, specificity

In [None]:
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, auc, roc_curve, roc_auc_score

# Initialize a model instance
loaded_classifier_model = XGBClassifier()

# Load the model from the file
loaded_classifier_model.load_model('./Classifier Models/{}/classifier_model.json'.format(Cancer))

# Features generated for the validation cohort; the columns from index 6
# onwards are fed to the classifier.
predict_input_df = pd.read_csv('Features/PredictFeature_{}.txt'.format(Cancer))
X = predict_input_df.iloc[:, 6:]
# NOTE(review): y_test holds two columns (PFItime, PFI), while roc_curve, the
# sklearn scores and calculate_sensitivity_specificity below expect a single
# binary label vector. Select the risk-label column here once its name is
# confirmed against the feature file.
y_test = predict_input_df[['PFItime', 'PFI']]

# Hard class predictions
predictions = loaded_classifier_model.predict(X)

# Probability of the positive class (second column of predict_proba)
prob_predictions = loaded_classifier_model.predict_proba(X)[:, 1]

# Compute the ROC curve and the area under it
fpr, tpr, _ = roc_curve(y_test, prob_predictions)
auc_roc = roc_auc_score(y_test, prob_predictions)
print(auc_roc)

# Plot the ROC curve against the chance diagonal
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % auc_roc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
# plt.show must be called; the bare attribute reference does nothing.
plt.show()

# Threshold-based metrics computed from the hard predictions
f1 = f1_score(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
sensitivity, specificity = calculate_sensitivity_specificity(y_test, predictions)

#### Cox-XGB
Predict Results and Calculate C-index and IBS

In [None]:
import glob

import xgboost as xgb
from xgbse.converters import convert_data_to_xgb_format
from sksurv.linear_model.coxph import BreslowEstimator
from sksurv.metrics import concordance_index_censored, integrated_brier_score


def _to_survival_array(frame):
    """Pack the PFI / PFItime columns into an (event, time) structured array."""
    # NOTE(review): assumes PFI is a 0/1 event flag and PFItime the follow-up
    # time — confirm against the dataset.
    surv = np.empty(len(frame), dtype=[('Status', bool), ('Survival_in_days', float)])
    surv['Status'] = frame['PFI'].astype(bool)
    surv['Survival_in_days'] = frame['PFItime']
    return surv


# Initialize a model instance
loaded_cox_model = xgb.Booster()

# Load the model from the file
loaded_cox_model.load_model('./Cox Models/{}/coxxgb_model.json'.format(Cancer))

# Validation cohort: features start at column 6, outcome is PFI / PFItime.
predict_input_df = pd.read_csv('Features/PredictFeature_{}.txt'.format(Cancer))
X = predict_input_df.iloc[:, 6:]
y_test = _to_survival_array(predict_input_df)

# pd.read_csv does not expand wildcards, so resolve the training feature file
# with glob first (sorted so the choice is reproducible).
train_feature_files = sorted(glob.glob('./Cox Models/{}/model_CLCP_*.txt'.format(Cancer)))
if not train_feature_files:
    raise FileNotFoundError(
        'No model_CLCP_*.txt file found in ./Cox Models/{}'.format(Cancer))
train_input_df = pd.read_csv(train_feature_files[0])
x_train = train_input_df.iloc[:, 6:]
y = _to_survival_array(train_input_df)

dval = convert_data_to_xgb_format(X, y_test, 'survival:cox')
dtrain = convert_data_to_xgb_format(x_train, y, 'survival:cox')

# A Booster predicts on DMatrix objects, not on DataFrames.
# NOTE(review): for survival:cox XGBoost documents predictions on the hazard
# ratio scale, whereas BreslowEstimator takes a linear predictor — check
# whether output_margin=True is intended here.
test_pred = loaded_cox_model.predict(dval)
train_pred = loaded_cox_model.predict(dtrain)
predictions = test_pred

# C-index on the validation cohort: labels and predictions must describe the
# same samples.
scores = concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], test_pred)
# np.round is used instead of the builtin because the name `round` may be
# rebound to a hyper-parameter value earlier in this notebook.
c_index = float(np.round(scores[0], 10))

# Baseline hazard fitted on the training cohort, then survival curves for the
# validation cohort.
baseline_model = BreslowEstimator().fit(train_pred, y['Status'], y['Survival_in_days'])
survs = baseline_model.get_survival_function(test_pred)

# Time grid for the Brier score: 10th-90th percentile of the validation
# follow-up, clipped to the training follow-up range so that the fitted
# survival step functions can be evaluated on it.
# NOTE(review): the percentile range and grid size are a choice — adjust if a
# specific evaluation horizon is required.
lower = max(np.percentile(y_test['Survival_in_days'], 10), y['Survival_in_days'].min())
upper = min(np.percentile(y_test['Survival_in_days'], 90), y['Survival_in_days'].max())
times = np.linspace(lower, upper, 100)

preds = np.asarray([[fn(t) for t in times] for fn in survs])
# integrated_brier_score returns a single float.
ibs = float(np.round(integrated_brier_score(y, y_test, preds, times), 6))

print(predictions, c_index, ibs)

#### Cox-EN
Predict Results and Calculate C-index and IBS

In [None]:
import glob

from joblib import load
from sksurv.linear_model.coxph import BreslowEstimator
from sksurv.metrics import concordance_index_censored, integrated_brier_score


def _to_survival_array(frame):
    """Pack the PFI / PFItime columns into an (event, time) structured array."""
    # NOTE(review): assumes PFI is a 0/1 event flag and PFItime the follow-up
    # time — confirm against the dataset.
    surv = np.empty(len(frame), dtype=[('Status', bool), ('Survival_in_days', float)])
    surv['Status'] = frame['PFI'].astype(bool)
    surv['Survival_in_days'] = frame['PFItime']
    return surv


# Load the model from file
estimator_loaded = load('./Cox Models/{}/coxen_model.joblib'.format(Cancer))

# Validation cohort: features start at column 6, outcome is PFI / PFItime.
predict_input_df = pd.read_csv('Features/PredictFeature_{}.txt'.format(Cancer))
X = predict_input_df.iloc[:, 6:]
y_test = _to_survival_array(predict_input_df)

# pd.read_csv does not expand wildcards, so resolve the training feature file
# with glob first (sorted so the choice is reproducible).
train_feature_files = sorted(glob.glob('./Cox Models/{}/model_CLCP_*.txt'.format(Cancer)))
if not train_feature_files:
    raise FileNotFoundError(
        'No model_CLCP_*.txt file found in ./Cox Models/{}'.format(Cancer))
train_input_df = pd.read_csv(train_feature_files[0])
x_train = train_input_df.iloc[:, 6:]
y = _to_survival_array(train_input_df)

# Risk scores for the validation and training cohorts
test_pred = estimator_loaded.predict(X)
train_preds = estimator_loaded.predict(x_train)

# C-index on the validation cohort: labels and predictions must describe the
# same samples.
scores = concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], test_pred)
# np.round is used instead of the builtin because the name `round` may be
# rebound to a hyper-parameter value earlier in this notebook.
c_index = float(np.round(scores[0], 6))

# Baseline hazard fitted on the training cohort, then survival curves for the
# validation cohort.
baseline_model = BreslowEstimator().fit(train_preds, y['Status'], y['Survival_in_days'])
survs = baseline_model.get_survival_function(test_pred)

# Time grid for the Brier score: 10th-90th percentile of the validation
# follow-up, clipped to the training follow-up range so that the fitted
# survival step functions can be evaluated on it.
# NOTE(review): the percentile range and grid size are a choice — adjust if a
# specific evaluation horizon is required.
lower = max(np.percentile(y_test['Survival_in_days'], 10), y['Survival_in_days'].min())
upper = min(np.percentile(y_test['Survival_in_days'], 90), y['Survival_in_days'].max())
times = np.linspace(lower, upper, 100)

preds = np.asarray([[fn(t) for t in times] for fn in survs])
# integrated_brier_score returns a single float.
ibs = float(np.round(integrated_brier_score(y, y_test, preds, times), 6))

# Print this model's own validation predictions.
print(test_pred, c_index, ibs)

#### Cox-nnet
Predict Results and Calculate C-index and IBS

In [None]:
import glob

from cox_nnet import *
from sksurv.linear_model.coxph import BreslowEstimator
from sksurv.metrics import concordance_index_censored, integrated_brier_score


def _to_survival_array(frame):
    """Pack the PFI / PFItime columns into an (event, time) structured array."""
    # NOTE(review): assumes PFI is a 0/1 event flag and PFItime the follow-up
    # time — confirm against the dataset.
    surv = np.empty(len(frame), dtype=[('Status', bool), ('Survival_in_days', float)])
    surv['Status'] = frame['PFI'].astype(bool)
    surv['Survival_in_days'] = frame['PFItime']
    return surv


# Load the model from file
coxnnet_loaded = loadModel('./Cox Models/{}/coxnn_model.pkl'.format(Cancer))

# Validation cohort: features start at column 6, outcome is PFI / PFItime.
predict_input_df = pd.read_csv('Features/PredictFeature_{}.txt'.format(Cancer))
X = predict_input_df.iloc[:, 6:]
y_test = _to_survival_array(predict_input_df)

# pd.read_csv does not expand wildcards, so resolve the training feature file
# with glob first (sorted so the choice is reproducible).
train_feature_files = sorted(glob.glob('./Cox Models/{}/model_CLCP_*.txt'.format(Cancer)))
if not train_feature_files:
    raise FileNotFoundError(
        'No model_CLCP_*.txt file found in ./Cox Models/{}'.format(Cancer))
train_input_df = pd.read_csv(train_feature_files[0])
x_train = train_input_df.iloc[:, 6:]
y = _to_survival_array(train_input_df)

# Risk scores for the validation and training cohorts. np.ravel is a no-op on
# 1-D output and flattens the result should the network return a column vector.
test_pred = np.ravel(coxnnet_loaded.predictNewData(X))
train_preds = np.ravel(coxnnet_loaded.predictNewData(x_train))

# C-index on the validation cohort: labels and predictions must describe the
# same samples.
scores = concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], test_pred)
# np.round is used instead of the builtin because the name `round` may be
# rebound to a hyper-parameter value earlier in this notebook.
c_index = float(np.round(scores[0], 6))

# Baseline hazard fitted on the training cohort, then survival curves for the
# validation cohort.
baseline_model = BreslowEstimator().fit(train_preds, y['Status'], y['Survival_in_days'])
survs = baseline_model.get_survival_function(test_pred)

# Time grid for the Brier score: 10th-90th percentile of the validation
# follow-up, clipped to the training follow-up range so that the fitted
# survival step functions can be evaluated on it.
# NOTE(review): the percentile range and grid size are a choice — adjust if a
# specific evaluation horizon is required.
lower = max(np.percentile(y_test['Survival_in_days'], 10), y['Survival_in_days'].min())
upper = min(np.percentile(y_test['Survival_in_days'], 90), y['Survival_in_days'].max())
times = np.linspace(lower, upper, 100)

preds = np.asarray([[fn(t) for t in times] for fn in survs])
# integrated_brier_score returns a single float.
ibs = float(np.round(integrated_brier_score(y, y_test, preds, times), 6))

# Print this model's own validation predictions.
print(test_pred, c_index, ibs)