# CS598 DLH Project: SADRE & CADRE
Reproducing the work of "Predicting Drug Sensitivity of Cancer Cell Lines via Collaborative Filtering with Contextual Attention" by Yifeng Tao, Shuangxia Ren, Michael Q. Ding, Russell Schwartz, and Xinghua Lu.

### Mounting the Repository
Run the following cell to mount the repository located in your Google Drive into Google Colab.

In [3]:
from google.colab import drive
# Mount Google Drive under /content/drive so the notebook can read the project
# files stored there; force_remount=True is passed on every run of this cell.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### Listing the Python Version used and Importing the Necessary Packages
A random seed is set to make testing more reliable across different experiments.

In [None]:
# Print the version of the `python` executable available on the runtime.
!python --version

In [None]:
import numpy as np
import torch
import random
import pandas as pd
from sklearn.metrics import auc, roc_curve, precision_recall_curve
# Seed every random number generator imported above (Python's `random`, NumPy
# and PyTorch) with the same value, so that anything random executed inside
# this notebook's kernel is repeatable from run to run.
random.seed(598)
np.random.seed(598)
torch.manual_seed(598)

### Data Description and Analysis
Use pandas to describe and analyze the provided CSV files.

In [None]:
# Folder on the mounted Drive that holds the input CSV files read by the cells
# below (gdsc.csv, exp_gdsc.csv, drug_info_gdsc.csv). The trailing slash is
# required because file names are appended with plain string concatenation.
# NOTE(review): this path is rooted at MyDrive/CS598_DL4H_Project/, while the
# training command later in the notebook is rooted at MyDrive/CS598/ — confirm
# which folder actually holds CADRE-master.
dataset_path = "/content/drive/MyDrive/CS598_DL4H_Project/CADRE-master/data/input/"

In [None]:
# Load gdsc.csv: one row per cosmic id, the first column holds the cosmic ids
# and every remaining column is one drug.
gdsc_df = pd.read_csv(dataset_path+"gdsc.csv")
# Find the number of cosmic ids and drug ids. The id column is not a drug, so
# it is subtracted from the column count (the raw shape would over-count by one).
num_cosmic_ids = gdsc_df.shape[0]
num_drugs = gdsc_df.shape[1] - 1
# Skip the first column containing the cosmic ids and sum each row over the
# drug columns; missing values are ignored by the sum.
gdsc_df_summed = gdsc_df.iloc[:, 1:].sum(axis=1, skipna=True)
# Smallest, largest and mean of those per-row sums
min_gene_response = gdsc_df_summed.min()
max_gene_response = gdsc_df_summed.max()
avg_gene_response = gdsc_df_summed.mean()
# Create a table to display stats about the gdsc.csv file (one row per statistic)
stats = {'num_cosmic_ids':num_cosmic_ids, 'num_drugs':num_drugs, 'min_gene_response':min_gene_response, 'max_gene_response':max_gene_response, 'avg_gene_response':avg_gene_response}
gdsc_stats_df = pd.DataFrame.from_dict(stats, orient='index')
print("Table 1: gdsc.csv Description")
gdsc_stats_df

In [None]:
# Load exp_gdsc.csv: one row per cosmic id, the first column holds the cosmic
# ids and every remaining column is one gene symbol.
exp_gdsc_df = pd.read_csv(dataset_path+"exp_gdsc.csv")
# Find the number of cosmic ids and gene symbols. The id column is not a gene,
# so it is subtracted from the column count (the raw shape would over-count by one).
num_cosmic_ids = exp_gdsc_df.shape[0]
num_gene_symbols = exp_gdsc_df.shape[1] - 1
# Skip the first column containing the cosmic ids and sum each row over the
# gene columns; missing values are ignored by the sum.
exp_gdsc_df_summed = exp_gdsc_df.iloc[:, 1:].sum(axis=1, skipna=True)
# Smallest, largest and mean of those per-row sums
min_gene_expression = exp_gdsc_df_summed.min()
max_gene_expression = exp_gdsc_df_summed.max()
avg_gene_expression = exp_gdsc_df_summed.mean()
# Create a table to display stats about the exp_gdsc.csv file (one row per statistic)
stats = {'num_cosmic_ids':num_cosmic_ids, 'num_gene_symbols':num_gene_symbols, 'min_gene_expression':min_gene_expression, 'max_gene_expression':max_gene_expression, 'avg_gene_expression':avg_gene_expression}
exp_gdsc_stats_df = pd.DataFrame.from_dict(stats, orient='index')
print("Table 2: exp_gdsc.csv Description")
exp_gdsc_stats_df

In [None]:
drug_info_gdsc_df = pd.read_csv(dataset_path+"drug_info_gdsc.csv")
# Count the distinct values in the drug id, target and target pathway columns
num_drug_ids, unique_targets, unique_target_pathways = (
    len(drug_info_gdsc_df[column].unique())
    for column in ('drug_id', 'Targets', 'Target pathway')
)
# Smallest, largest and mean value of the 'Sample Size' column
drug_info_gdsc_df_sample_size = drug_info_gdsc_df['Sample Size']
min_sample_size, max_sample_size, avg_sample_size = (
    drug_info_gdsc_df_sample_size.min(),
    drug_info_gdsc_df_sample_size.max(),
    drug_info_gdsc_df_sample_size.mean(),
)
# Gather the stats about the drug_info_gdsc.csv file into a table, one row per statistic
stats = dict(
    num_drug_ids=num_drug_ids,
    unique_targets=unique_targets,
    unique_target_pathways=unique_target_pathways,
    min_sample_size=min_sample_size,
    max_sample_size=max_sample_size,
    avg_sample_size=avg_sample_size,
)
drug_info_gdsc_stats_df = pd.DataFrame.from_dict(stats, orient='index')
print("Table 3: drug_info_gdsc.csv Description")
drug_info_gdsc_stats_df

### Executing the training and evaluation of the SADRE & CADRE models
Run the following cell to execute the training and evaluation of the models. 

**Note**: Be sure to change the runtime to "T4 GPU" in order to have the necessary resources to run the model.

In [None]:
# Takes ~25 min to finish running, CF/SADRE is fast, CADRE training is slower
# NOTE(review): this path is rooted at MyDrive/CS598/, while dataset_path
# earlier in the notebook is rooted at MyDrive/CS598_DL4H_Project/ — confirm
# which folder actually holds CADRE-master before running.
!python /content/drive/MyDrive/CS598/CADRE-master/run_cf.py

### Results Analysis and Breakdown

In [None]:
def _tag_rows(model_name, rows):
    """Return one list per row, with the model name placed in front of the
    (dropout_rate, learning_rate, f1 score, accuracy, AUPR, AUROC) values."""
    return [[model_name, *row] for row in rows]


# One entry per (dropout_rate, learning_rate) combination for each model.
CF = _tag_rows('Collaborative Filtering', [
    (0.4, 0.1, 41.7, 52.8, 37.0, 52.9),
    (0.4, 0.3, 42.5, 54.8, 37.2, 54.5),
    (0.4, 0.5, 44.2, 57.1, 39.2, 57.4),
    (0.6, 0.1, 40.3, 50.7, 34.7, 50.9),
    (0.6, 0.3, 42.1, 54.8, 36.6, 54.5),
    (0.6, 0.5, 44.3, 56.6, 41.1, 57.6),
    (0.8, 0.1, 40.3, 51.3, 34.6, 51.2),
    (0.8, 0.3, 42.6, 52.8, 36.3, 53.5),
    (0.8, 0.5, 41.3, 53.6, 36.7, 53.6),
])
SADRE = _tag_rows('SADRE', [
    (0.4, 0.1, 39.9, 50.0, 34.5, 49.8),
    (0.4, 0.3, 42.1, 55.5, 38.8, 55.3),
    (0.4, 0.5, 44.1, 57.1, 39.4, 57.7),
    (0.6, 0.1, 41.1, 52.4, 35.0, 52.4),
    (0.6, 0.3, 42.1, 53.5, 35.9, 53.5),
    (0.6, 0.5, 43.0, 56.1, 37.9, 56.1),
    (0.8, 0.1, 40.2, 50.3, 33.7, 50.0),
    (0.8, 0.3, 40.8, 51.4, 34.6, 51.1),
    (0.8, 0.5, 42.6, 55.3, 38.0, 55.2),
])
CADRE_no_pretrain = _tag_rows('CADRE w/o pretrain', [
    (0.4, 0.1, 53.6, 69.2, 56.1, 70.9),
    (0.4, 0.3, 54.2, 70.4, 56.8, 72.4),
    (0.4, 0.5, 55.3, 71.8, 59.1, 73.8),
    (0.6, 0.1, 52.1, 67.5, 53.2, 69.1),
    (0.6, 0.3, 53.2, 68.9, 55.3, 70.7),
    (0.6, 0.5, 53.7, 69.6, 56.9, 71.5),
    (0.8, 0.1, 48.2, 62.7, 47.6, 63.9),
    (0.8, 0.3, 49.7, 64.5, 49.0, 65.6),
    (0.8, 0.5, 50.4, 65.2, 50.4, 66.8),
])
CADRE_with_pretrain = _tag_rows('CADRE with pretrain', [
    (0.4, 0.1, 47.8, 62.4, 46.6, 63.5),
    (0.4, 0.3, 52.9, 68.7, 52.8, 69.8),
    (0.4, 0.5, 53.9, 69.9, 55.2, 71.5),
    (0.6, 0.1, 45.3, 59.0, 43.8, 59.6),
    (0.6, 0.3, 48.9, 63.6, 45.7, 64.6),
    (0.6, 0.5, 50.5, 66.1, 46.9, 67.2),
    (0.8, 0.1, 42.2, 55.0, 35.6, 54.0),
    (0.8, 0.3, 10.2, 65.4, 37.6, 51.1),
    (0.8, 0.5, 0.5, 65.9, 27.5, 50.2),
])

# Stack the four per-model tables, in this order, into a single results frame.
results = pd.DataFrame(
    [*CF, *SADRE, *CADRE_no_pretrain, *CADRE_with_pretrain],
    columns=['model', 'dropout_rate', 'learning_rate', 'f1 score', 'accuracy', 'AUPR', 'AUROC'],
)
results

In [None]:
# For every (dropout_rate, learning_rate) pair keep the row with the highest
# f1 score, then list those winning rows from highest to lowest f1 score.
best_by_test_f1 = (
    results
    .loc[lambda frame: frame.groupby(['dropout_rate', 'learning_rate'])['f1 score'].idxmax()]
    .sort_values(by='f1 score', ascending=False)
)
best_by_test_f1

In [None]:
# For every (dropout_rate, learning_rate) pair keep the row with the highest
# accuracy, then list those winning rows from highest to lowest accuracy.
best_by_test_acc = (
    results
    .loc[lambda frame: frame.groupby(['dropout_rate', 'learning_rate'])['accuracy'].idxmax()]
    .sort_values(by='accuracy', ascending=False)
)
best_by_test_acc

In [None]:
# Per-model mean of every metric over the rows with dropout_rate == 0.4
# (i.e. averaged over the learning rates); the learning_rate mean is dropped.
(
    results
    .loc[results['dropout_rate'].eq(0.4)]
    .groupby('model')
    .mean()
    .drop(columns='learning_rate')
    .reset_index()
)

In [None]:
# Per-model mean of every metric over the rows with dropout_rate == 0.6
# (i.e. averaged over the learning rates); the learning_rate mean is dropped.
(
    results
    .loc[results['dropout_rate'].eq(0.6)]
    .groupby('model')
    .mean()
    .drop(columns='learning_rate')
    .reset_index()
)

In [None]:
# Per-model mean of every metric over the rows with dropout_rate == 0.8
# (i.e. averaged over the learning rates); the learning_rate mean is dropped.
(
    results
    .loc[results['dropout_rate'].eq(0.8)]
    .groupby('model')
    .mean()
    .drop(columns='learning_rate')
    .reset_index()
)

In [None]:
# One single-row table per model: model name followed by its
# f1 score, accuracy, AUPR and AUROC.
ablation_CF = [
    ['Collaborative Filtering', 43.1, 57.5, 40.4, 57.3],
]
ablation_SADRE = [
    ['SADRE', 45.3, 59.4, 42.3, 59.8],
]
ablation_CADRE_no_pretrain = [
    ['CADRE w/o pretrain', 53.9, 70.3, 57.1, 72.0],
]
ablation_CADRE_with_pretrain = [
    ['CADRE with pretrain', 30.4, 62.4, 38.6, 54.0],
]

# Stack the four rows, in this order, into one comparison frame.
ablation_results = pd.DataFrame(
    [
        *ablation_CF,
        *ablation_SADRE,
        *ablation_CADRE_no_pretrain,
        *ablation_CADRE_with_pretrain,
    ],
    columns=['model', 'f1 score', 'accuracy', 'AUPR', 'AUROC'],
)
ablation_results