# Contrastive Learning for Predicting Cancer Prognosis Using Gene Expression Values

## Sample Model Prediction

The *PredictThroughClassifierModel.ipynb* notebook provides detailed step-by-step instructions on how to predict classifier results with our models for each cancer.
Please place this notebook in the directory that contains the `CL4CaPro_Models` folder, so that the relative path `./CL4CaPro_Models/Classifier Models/<Cancer>` used below resolves correctly.

### Pick Cancer

In [2]:
# Cancer code; it selects the model sub-folder and the feature file used in later cells.
Cancer = 'BLCA'

### Put Input Patient Info
For example, save your input as *BLCA_predict_input.csv*.

In [None]:
input_pth = 'BLCA_predict_input.csv'  # path to your prepared RNA-seq input CSV (loaded with pd.read_csv in the next cell)

### Read Input and Check

In [None]:
import pandas as pd
# Load the prepared input table from the CSV path stored in input_pth.
input_df = pd.read_csv(input_pth)
# Bare expression: as the last line of the cell, the notebook renders the
# DataFrame so the input can be checked visually.
input_df

### Generate contrastive learning features based on the public cancer model

#### Get model path

In [None]:
import os

def find_clcp_folder_name(directory):
    """Return the name of the first entry in *directory* starting with 'CLCP'.

    Entries are examined in sorted order, so the same name is chosen on every
    run even when several candidates exist (``os.listdir`` itself returns
    names in arbitrary order).

    Args:
        directory: Path of the directory to scan.

    Returns:
        The matching entry name (not a full path), or the sentinel string
        'No CLCP folder found.' when nothing matches.

    Raises:
        OSError: Propagated from ``os.listdir``, e.g. ``FileNotFoundError``
            when *directory* does not exist.
    """
    # next() with a default preserves the original sentinel return value;
    # callers should check for it before parsing the name.
    return next(
        (name for name in sorted(os.listdir(directory)) if name.startswith('CLCP')),
        'No CLCP folder found.',
    )

# Each cancer has its own sub-folder under "Classifier Models"; the path is
# relative to the notebook's working directory.
directory_to_search = f'./CL4CaPro_Models/Classifier Models/{Cancer}'
clcp_folder_name = find_clcp_folder_name(directory=directory_to_search)

#### Generate feature

In [None]:
# The CLCP folder name is split on '_' and each setting is picked by token
# position. Every value stays a string; they are only interpolated into the
# feature-generation command line.
para = clcp_folder_name.split('_')

# Network dimensions
input_dim = para[1]
model_n_hidden_1 = para[2]
model_out_dim = para[3]
feat_dim = para[5]

# Training settings
lr = para[7]
l2_rate = para[9]
# NOTE(review): `round` shadows the built-in of the same name; it is kept
# because the feature-generation command interpolates `{round}`.
round = para[11]
seed = para[13]
# Indexed from the end, unlike the other fields — presumably the token count
# before it is stable but this was written relative to the tail; TODO confirm.
batch_size = para[-3]

# Value passed as --gpu_device to the feature-generation script.
device = 0

In [None]:
! python GenerateFeatures_Predict.py --layer_name feat --model_in_dim {input_dim} --dim_1_list {model_n_hidden_1} \
                                     --dim_2_list {model_out_dim} --dim_3_list {feat_dim} --batch_size {batch_size} \
                                     --l2_rate {l2_rate} --seed {seed} --round {round} --gpu_device {device} \
                                     --learning_rate_list {lr} --task Risk \
                                     --cancer_group {cancer}

#### Predict Results

Predict Risk

In [None]:
from xgboost import XGBClassifier

# Restore the saved classifier for the selected cancer from its JSON file.
classifier_model_path = f'./CL4CaPro_Models/Classifier Models/{Cancer}/classifier_model.json'
loaded_classifier_model = XGBClassifier()
loaded_classifier_model.load_model(classifier_model_path)

# Feature table for the selected cancer (presumably written by the
# feature-generation step — verify the output location of that script).
predict_input_df = pd.read_csv(f'Features/PredictFeature_{Cancer}.txt')

# Only columns from position 6 onward are fed to the model.
# NOTE(review): presumably the first six columns hold patient metadata and
# outcome fields rather than features — TODO confirm.
X = predict_input_df.iloc[:, 6:]

# Hard class predictions for every row of the feature table.
predictions = loaded_classifier_model.predict(X)

Calculate AUC for multiple patients

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_curve, roc_auc_score

# Predicted probability of the positive class (second column of predict_proba).
prob_predictions = loaded_classifier_model.predict_proba(X)[:, 1]

# roc_curve and the classification metrics below need a single 1-D binary
# label vector; a two-column ['PFItime', 'PFI'] frame is rejected by them.
# NOTE(review): assumes the PFI event indicator is the label the classifier
# was trained on — confirm against the training pipeline.
y = predict_input_df['PFI']

# Compute the ROC curve and the area under it for the binary outcome
fpr, tpr, _ = roc_curve(y, prob_predictions)
auc_roc = roc_auc_score(y, prob_predictions)
print(auc_roc)

# Plot
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % auc_roc)
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
# show must be called; the bare attribute `plt.show` does nothing.
plt.show()

# Metrics computed from the hard class predictions; they are stored in
# variables and not printed.
f1 = f1_score(y, predictions)
accuracy = accuracy_score(y, predictions)
precision = precision_score(y, predictions)
recall = recall_score(y, predictions)