This notebook implements the baseline CODE-AE on the raw mutations, for drugs using cell lines and TCGA on all genes.

### Imports

In [1]:
import pandas as pd
import numpy as np

In [2]:
import sys

sys.path.append("../benchmarks/CODE-AE/")
sys.path.append("../src/")

In [3]:
import datetime
import logging
import os
import time
import torch
import random
import pickle

In [4]:
from torch import nn
from torch.nn import functional as F

from functools import cached_property

from torch.nn import Linear, ReLU, Sequential
from sklearn.metrics import average_precision_score, ndcg_score, roc_auc_score
from sklearn.model_selection import train_test_split

from datasets_drug_filtered import (
    TcgaDataset,
    AggCategoricalAnnotatedCellLineDatasetFilteredByDrug,
    AggCategoricalAnnotatedTcgaDatasetFilteredByDrug,
    AggCategoricalAnnotatedRad51DatasetFilteredByDrug
)
from utils import get_kld_loss, get_zinb_loss

In [5]:
from testbed import EvaluationTestbed
from model import (
    BaseDruidModel
)
from seaborn import scatterplot

In [6]:
torch.manual_seed(2020)
random.seed(2020)
np.random.seed(2020)

In [7]:
# To avoid randomness in DataLoaders - https://pytorch.org/docs/stable/notes/randomness.html
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    numpy.random.seed(worker_seed)
    random.seed(worker_seed)
    
g = torch.Generator()
g.manual_seed(0)

<torch._C.Generator at 0x7f146302f130>

In [8]:
sample_id = 0

### CODE-AE model

#### Pre-training CODE-AE with unlabelled datasets

In [9]:
import itertools
from torch.utils.data import TensorDataset, DataLoader
import json

In [10]:
import train_code_adv

In [11]:
# arguments class for hyperparameters like weight decay, learning rate etc
class arguments():
    def __init__(self, is_train=True):
        self.method = "code_adv" # we will use CODE-AE ADV since that was the best performing one in the original paper.
        if is_train:
            self.retrain_flag = True
        else:
            self.retrain_flag = False
        self.pdtc_flag = False
        self.norm_flag = False
        self.measurement = "AUC"
        

In [12]:
args = arguments()

In [13]:
params_grid = {
    "pretrain_num_epochs": [300],
    "train_num_epochs": [500],
    "dop": [0.1]
}

if args.method not in ['code_adv', 'adsn', 'adae', 'dsnw']:
    params_grid.pop('pretrain_num_epochs')

keys, values = zip(*params_grid.items())
update_params_dict_list = [dict(zip(keys, v)) for v in itertools.product(*values)]


In [14]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.device_count() > 0:
        torch.cuda.manual_seed_all(seed)


In [15]:
def unlabeled_dataloaders_mutations(batch_size, seed):
    set_seed(seed)
    # train cell line mutations
    ccle_dataset = AggCategoricalAnnotatedCellLineDatasetFilteredByDrug(is_train=True, filter_for="tcga", sample_id = sample_id)
    ccle_mutations_df = ccle_dataset.embedded_raw_mutations_all_genes_v2[ccle_dataset.embedded_raw_mutations_all_genes_v2.index.isin(ccle_dataset.y_df.depmap_id.unique())]
    ccle_mutations_dataset = TensorDataset(
        torch.from_numpy(ccle_mutations_df.values.astype('float32'))
    )
    
    # test cell line mutations
    ccle_dataset_test = AggCategoricalAnnotatedCellLineDatasetFilteredByDrug(is_train=False, filter_for="tcga", sample_id = sample_id)
    ccle_mutations_df_test = ccle_dataset_test.embedded_raw_mutations_all_genes_v2[ccle_dataset_test.embedded_raw_mutations_all_genes_v2.isin(ccle_dataset_test.y_df.depmap_id.unique())]
    ccle_mutations_dataset_test = TensorDataset(
        torch.from_numpy(ccle_mutations_df_test.values.astype('float32'))
    )
    
    ccle_mutations_df_full = ccle_dataset.embedded_raw_mutations_all_genes_v2
    ccle_dataloader = DataLoader(TensorDataset(torch.from_numpy(ccle_mutations_df_full.values.astype('float32'))),
                                 batch_size=batch_size, shuffle=True, generator=g, worker_init_fn=seed_worker
                                )
    train_ccle_dataloader = DataLoader(ccle_mutations_dataset,
                                       batch_size=batch_size,
                                       shuffle=True, generator=g, worker_init_fn=seed_worker)
    test_ccle_dataloader = DataLoader(ccle_mutations_dataset_test,
                                      batch_size=batch_size,
                                      shuffle=True, generator=g, worker_init_fn=seed_worker)
    rad51_dataset = AggCategoricalAnnotatedTcgaDatasetFilteredByDrug(is_train=True, filter_for="tcga", sample_id = sample_id)
    rad51_mutations_df_train = rad51_dataset.embedded_raw_mutations_all_genes_v2[rad51_dataset.embedded_raw_mutations_all_genes_v2.index.isin(rad51_dataset.tcga_response.submitter_id.unique())]
    rad51_dataloader = DataLoader(TensorDataset(torch.from_numpy(rad51_mutations_df_train.values.astype('float32'))),
                                 batch_size=batch_size, shuffle=True, generator=g, worker_init_fn=seed_worker)
    
    rad51_dataset_test = AggCategoricalAnnotatedTcgaDatasetFilteredByDrug(is_train=False, filter_for="tcga", sample_id = sample_id)
    rad51_mutations_df_test = rad51_dataset_test.embedded_raw_mutations_all_genes_v2[rad51_dataset_test.embedded_raw_mutations_all_genes_v2.index.isin(rad51_dataset_test.tcga_response.submitter_id.unique())]
    rad51_dataloader_test = DataLoader(TensorDataset(torch.from_numpy(rad51_mutations_df_test.values.astype('float32'))),
                                 batch_size=batch_size, shuffle=True, generator=g, worker_init_fn=seed_worker)
    
    return (ccle_dataloader, test_ccle_dataloader), (rad51_dataloader, rad51_dataloader_test)

In [16]:
def generate_encoded_features(encoder, dataloader, normalize_flag=False):
    """

    :param normalize_flag:
    :param encoder:
    :param dataloader:
    :return:
    """
    encoder.eval()
    raw_feature_tensor = dataloader.dataset.tensors[0].cpu()
    label_tensor = dataloader.dataset.tensors[1].cpu()

    encoded_feature_tensor = encoder.cpu()(raw_feature_tensor)
    if normalize_flag:
        encoded_feature_tensor = torch.nn.functional.normalize(encoded_feature_tensor, p=2, dim=1)
    return encoded_feature_tensor, label_tensor


def load_pickle(pickle_file):
    data = []
    with open(pickle_file, 'rb') as f:
        try:
            while True:
                data.append(pickle.load(f))
        except EOFError:
            pass

    return data


def wrap_training_params(training_params, type='unlabeled'):
    aux_dict = {k: v for k, v in training_params.items() if k not in ['unlabeled', 'labeled']}
    aux_dict.update(**training_params[type])

    return aux_dict


def safe_make_dir(new_folder_name):
    if not os.path.exists(new_folder_name):
        os.makedirs(new_folder_name)
    else:
        print(new_folder_name, 'exists!')


def dict_to_str(d):
    return "_".join(["_".join([k, str(v)]) for k, v in d.items()])


In [17]:
# From https://github.com/XieResearchGroup/CODE-AE/blob/main/code/pretrain_hyper_main.py
def pretrain(args, update_params_dict):
    if args.method == 'dsn':
        train_fn = train_dsn.train_dsn
    elif args.method == 'adae':
        train_fn = train_adae.train_adae
    elif args.method == 'coral':
        train_fn = train_coral.train_coral
    elif args.method == 'dae':
        train_fn = train_dae.train_dae
    elif args.method == 'vae':
        train_fn = train_vae.train_vae
    elif args.method == 'vaen':
        train_fn = train_vae.train_vae
    elif args.method == 'ae':
        train_fn = train_ae.train_ae
    elif args.method == 'code_mmd':
        train_fn = train_code_mmd.train_code_mmd
    elif args.method == 'code_base':
        train_fn = train_code_base.train_code_base
    elif args.method == 'dsna':
        train_fn = train_dsna.train_dsna
    else:
        train_fn = train_code_adv.train_code_adv

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    

    with open(os.path.join('../benchmarks/CODE-AE/model_save/train_params.json'), 'r') as f:
        training_params = json.load(f)

    training_params['unlabeled'].update(update_params_dict)
    param_str = dict_to_str(update_params_dict)

    if not args.norm_flag:
        method_save_folder = os.path.join('../benchmarks/CODE-AE/model_save', args.method)
    else:
        method_save_folder = os.path.join('../benchmarks/CODE-AE/model_save', f'{args.method}_norm')

    training_params.update(
        {
            'device': device,
            'input_dim': 324,
            'model_save_folder': os.path.join(method_save_folder, param_str),
            'es_flag': False,
            'retrain_flag': args.retrain_flag,
            'norm_flag': args.norm_flag
        })

    safe_make_dir(training_params['model_save_folder'])
    random.seed(2020)

    
    s_dataloaders, t_dataloaders = unlabeled_dataloaders_mutations(
        batch_size=training_params['unlabeled']['batch_size'],
        seed=2020,
    )

    # start unlabeled training
    encoder, historys = train_fn(s_dataloaders=s_dataloaders,
                                 t_dataloaders=t_dataloaders,
                                 **wrap_training_params(training_params, type='unlabeled'))
    with open(os.path.join(training_params['model_save_folder'], f'unlabel_train_history.pickle'),
              'wb') as f:
        for history in historys:
            pickle.dump(dict(history), f)

In [18]:
# for param_dict in update_params_dict_list:
#     pretrain(args, param_dict)

#### Fine tuning for drugs

In [19]:
from collections import defaultdict
from copy import deepcopy

In [20]:
from scipy.stats import zscore

In [21]:
import fine_tuning

In [22]:
def get_labeled_dataloader(drug, batch_size):
    # CCLE dataset labelled
    # train CCLE dataset
    ccle_dataset = AggCategoricalAnnotatedCellLineDatasetFilteredByDrug(is_train=True, filter_for="tcga", sample_id = sample_id)
    ccle_mutations_df = ccle_dataset.embedded_raw_mutations_all_genes_v2[ccle_dataset.embedded_raw_mutations_all_genes_v2.index.isin(ccle_dataset.y_df.depmap_id.unique())]
    ccle_mutations_dataset = TensorDataset(
        torch.from_numpy(ccle_mutations_df.values.astype('float32'))
    )
    
    # test CCLE dataset
    ccle_dataset_test = AggCategoricalAnnotatedCellLineDatasetFilteredByDrug(is_train=False, filter_for="tcga", sample_id = sample_id)
    ccle_mutations_df_test = ccle_dataset_test.embedded_raw_mutations_all_genes_v2[ccle_dataset_test.embedded_raw_mutations_all_genes_v2.isin(ccle_dataset_test.y_df.depmap_id.unique())]
    ccle_mutations_dataset_test = TensorDataset(
        torch.from_numpy(ccle_mutations_df_test.values.astype('float32'))
    )
    
    # Converting AUDRC scores to ZScores (as in the original implementation) and binarizing as 1 if Zscore < 0, else 0.
    ccle_full_dataset = AggCategoricalAnnotatedCellLineDatasetFilteredByDrug(is_train=None, filter_for="tcga", sample_id = sample_id)
    drug_specific_audrc = ccle_full_dataset.y_df[ccle_full_dataset.y_df.drug_name == drug]
    features = ccle_full_dataset.embedded_raw_mutations_all_genes_v2[ccle_full_dataset.embedded_raw_mutations_all_genes_v2.index.isin(drug_specific_audrc.depmap_id)]
    if len(drug_specific_audrc) == 0:
        print(f"Drug {drug} nor found in cell lines!") 
    else:
        zscores_from_auc = zscore(drug_specific_audrc["auc"].values, nan_policy="omit")
        zscore_threshold = 0.0
        labels = (zscores_from_auc < zscore_threshold)
        features["labels"] = labels
        features["labels"] = features["labels"].astype(int)
        ccle_train = torch.from_numpy(features[features.index.isin(ccle_mutations_df.index)].drop("labels", axis = 1).values.astype('float32'))
        ccle_labels_train = torch.from_numpy(features[features.index.isin(ccle_mutations_df.index)]["labels"].values)
        ccle_test = torch.from_numpy(features[features.index.isin(ccle_mutations_df_test.index)].drop("labels", axis = 1).values.astype('float32'))
        ccle_labels_test = torch.from_numpy(features[features.index.isin(ccle_mutations_df_test.index)]["labels"].values)
        train_labeled_ccle_dataset = TensorDataset(ccle_train, ccle_labels_train)
        test_labeled_ccle_dataset = TensorDataset(ccle_test, ccle_labels_test)
        train_labeled_ccle_dataloader = DataLoader(train_labeled_ccle_dataset, batch_size=batch_size, shuffle=True, generator=g, worker_init_fn=seed_worker)
        test_labeled_ccle_dataloader = DataLoader(test_labeled_ccle_dataset, batch_size=batch_size, shuffle=True, generator=g, worker_init_fn=seed_worker)
        
#     # Rad51 dataset
#     rad51_train_data = AggCategoricalAnnotatedRad51Dataset(is_train=True, only_cat_one_drugs=False)
#     rad51_drug_specific_df = rad51_train_data.rad51_response[rad51_train_data.rad51_response.drug_name == drug]
#     rad51_drug_specific_features = rad51_train_data.mutations[rad51_train_data.mutations.index.isin(rad51_drug_specific_df.patient_id.unique())]

#     if len(rad51_drug_specific_df) == 0:
#         print(f"Drug {drug} not found in Rad51")
#         train_labeled_rad51_dataloader = None
#     else:
#         rad51_df = rad51_drug_specific_features.merge(rad51_drug_specific_df[["response", "patient_id"]], on="patient_id").set_index("patient_id")
#         rad51_train = torch.from_numpy(rad51_df.drop("response", axis = 1).values)
#         rad51_labels_train = torch.from_numpy(rad51_df["response"].values.astype('float32'))
#         train_labeled_rad51_dataset = TensorDataset(rad51_train, rad51_labels_train)
#         train_labeled_rad51_dataloader = DataLoader(train_labeled_rad51_dataset, batch_size=batch_size, shuffle=True)
    
    return train_labeled_ccle_dataloader, test_labeled_ccle_dataloader, None
        

In [29]:
def fine_tune(args, drug, update_params_dict):
    if args.method == 'dsn':
        train_fn = train_dsn.train_dsn
    elif args.method == 'adae':
        train_fn = train_adae.train_adae
    elif args.method == 'coral':
        train_fn = train_coral.train_coral
    elif args.method == 'dae':
        train_fn = train_dae.train_dae
    elif args.method == 'vae':
        train_fn = train_vae.train_vae
    elif args.method == 'ae':
        train_fn = train_ae.train_ae
    elif args.method == 'code_mmd':
        train_fn = train_code_mmd.train_code_mmd
    elif args.method == 'code_base':
        train_fn = train_code_base.train_code_base
    elif args.method == 'dsna':
        train_fn = train_dsna.train_dsna
    else:
        train_fn = train_code_adv.train_code_adv

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    with open(os.path.join('../benchmarks/CODE-AE/model_save/train_params.json'), 'r') as f:
        training_params = json.load(f)

    training_params['unlabeled'].update(update_params_dict)
    param_str = dict_to_str(update_params_dict)

    if not args.norm_flag:
        method_save_folder = os.path.join('../benchmarks/CODE-AE/model_save', args.method)
    else:
        method_save_folder = os.path.join('../benchmarks/CODE-AE/model_save', f'{args.method}_norm')

    training_params.update(
        {
            'device': device,
            'input_dim': 324,
            'model_save_folder': os.path.join(method_save_folder, param_str),
            'es_flag': False,
            'retrain_flag': args.retrain_flag,
            'norm_flag': args.norm_flag
        })
    if args.pdtc_flag:
        task_save_folder = os.path.join(f'{method_save_folder}', args.measurement, 'pdtc', drug)
    else:
        task_save_folder = os.path.join(f'{method_save_folder}', args.measurement, drug)

    safe_make_dir(training_params['model_save_folder'])
    safe_make_dir(task_save_folder)

    random.seed(2020)

    s_dataloaders, t_dataloaders = unlabeled_dataloaders_mutations(
        batch_size=training_params['unlabeled']['batch_size'],
        seed=2020,
    )


    # start unlabeled training
    encoder, historys = train_fn(s_dataloaders=s_dataloaders,
                                 t_dataloaders=t_dataloaders,
                                 **wrap_training_params(training_params, type='unlabeled'))
    if args.retrain_flag:
        with open(os.path.join(training_params['model_save_folder'], f'unlabel_train_history.pickle'),
                  'wb') as f:
            for history in historys:
                pickle.dump(dict(history), f)

    # labeled_ccle_dataloader, labeled_tcga_dataloader = data.get_labeled_dataloaders(
    #     gex_features_df=gex_features_df,
    #     seed=2020,
    #     batch_size=training_params['labeled']['batch_size'],
    #     drug=drug,
    #     threshold=args.a_thres,
    #     days_threshold=args.days_thres,
    #     ccle_measurement=args.measurement,
    #     ft_flag=False,
    #     pdtc_flag=args.pdtc_flag
    # )
    # ml_baseline_history = defaultdict(list)
    # ccle_encoded_feature_tensor, ccle_label_tensor = generate_encoded_features(encoder, labeled_ccle_dataloader,
    #                                                                            normalize_flag=args.norm_flag)
    # tcga_encoded_feature_tensor, tcga_label_tensor = generate_encoded_features(encoder, labeled_tcga_dataloader,
    #                                                                            normalize_flag=args.norm_flag)
    # ml_baseline_history['enet'].append(
    #     ml_baseline.n_time_cv(
    #         model_fn=ml_baseline.classify_with_enet,
    #         n=int(args.n),
    #         train_data=(
    #             ccle_encoded_feature_tensor.detach().cpu().numpy(),
    #             ccle_label_tensor.detach().cpu().numpy()
    #         ),
    #         test_data=(
    #             tcga_encoded_feature_tensor.detach().cpu().numpy(),
    #             tcga_label_tensor.detach().cpu().numpy()
    #         ),
    #         metric=args.metric
    #     )[1]
    # )
    #
    # with open(os.path.join(task_save_folder, f'{param_str}_ft_baseline_results.json'), 'w') as f:
    #     json.dump(ml_baseline_history, f)

    ft_evaluation_metrics = defaultdict(list)
    train_labeled_ccle_dataloader, test_labeled_ccle_dataloader, labeled_rad51_dataloader = get_labeled_dataloader(
        batch_size=training_params['labeled']['batch_size'],
        drug=drug
    )

    ft_encoder = deepcopy(encoder)
    
    print(train_labeled_ccle_dataloader.dataset.tensors[1].sum())
    print(test_labeled_ccle_dataloader.dataset.tensors[1].sum())
    # print(labeled_rad51_dataloader.dataset.tensors[1].sum())

    target_classifier, ft_historys = fine_tuning.fine_tune_encoder(
        encoder=ft_encoder,
        train_dataloader=train_labeled_ccle_dataloader,
        val_dataloader=test_labeled_ccle_dataloader,
        test_dataloader=None,
        seed=2020,
        normalize_flag=args.norm_flag,
        task_save_folder=task_save_folder,
        **wrap_training_params(training_params, type='labeled')
    ) # here we are only training the network, so setting test_dataloader = None
    
    return target_classifier
#     ft_evaluation_metrics['best_index'].append(ft_historys[-2]['best_index'])
#     for metric in ['auroc', 'acc', 'aps', 'f1', 'auprc']:
#         ft_evaluation_metrics[metric].append(ft_historys[-1][metric][ft_historys[-2]['best_index']])

#     with open(os.path.join(task_save_folder, f'{param_str}_ft_evaluation_results.json'), 'w') as f:
#         json.dump(ft_evaluation_metrics, f)

In [30]:
test_args = arguments(is_train=True)

In [31]:
rad51_train_data = AggCategoricalAnnotatedTcgaDatasetFilteredByDrug(is_train=True, filter_for="tcga", sample_id = sample_id)

In [32]:
drug_list = list(rad51_train_data.tcga_response.drug_name.unique())
drug_list

['CISPLATIN',
 'PACLITAXEL',
 'GEMCITABINE',
 'DOCETAXEL',
 '5-FLUOROURACIL',
 'CYCLOPHOSPHAMIDE']

In [33]:
all_drugs = list(set(AggCategoricalAnnotatedCellLineDatasetFilteredByDrug(is_train=None, filter_for="tcga", sample_id = sample_id).y_df.drug_name.unique()))
len(all_drugs)

6

In [35]:
models = {}
for drug in [
    "CISPLATIN",
    "PACLITAXEL",
    "5-FLUOROURACIL",
    "CYCLOPHOSPHAMIDE",
    "DOCETAXEL",
    "GEMCITABINE",
]:
    for param_dict in update_params_dict_list:
        models[drug] = fine_tune(args=test_args, drug=drug, update_params_dict=param_dict)

../benchmarks/CODE-AE/model_save/code_adv/pretrain_num_epochs_300_train_num_epochs_500_dop_0.1 exists!
../benchmarks/CODE-AE/model_save/code_adv/AUC/CISPLATIN exists!
AE training epoch 0
AE training epoch 50
AE training epoch 100
AE training epoch 150
AE training epoch 200
AE training epoch 250
confounder wgan training epoch 0
confounder wgan training epoch 50
confounder wgan training epoch 100
confounder wgan training epoch 150
confounder wgan training epoch 200
confounder wgan training epoch 250
confounder wgan training epoch 300
confounder wgan training epoch 350
confounder wgan training epoch 400
confounder wgan training epoch 450


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["labels"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["labels"] = features["labels"].astype(int)


tensor(184)
tensor(237)
Fine tuning epoch 0
Fine tuning epoch 50
Fine tuning epoch 100
Fine tuning epoch 150
Fine tuning epoch 200
Fine tuning epoch 250
Fine tuning epoch 300
Unfreezing 321
Unfreezing 345
Fine tuning epoch 350
Fine tuning epoch 400
Fine tuning epoch 450
Fine tuning epoch 500
Fine tuning epoch 550
Fine tuning epoch 600
Fine tuning epoch 650
Fine tuning epoch 700
Fine tuning epoch 750
Fine tuning epoch 800
Fine tuning epoch 850
Fine tuning epoch 900
Fine tuning epoch 950
Fine tuning epoch 1000
Fine tuning epoch 1050
Fine tuning epoch 1100
Fine tuning epoch 1150
Fine tuning epoch 1200
Fine tuning epoch 1250
Fine tuning epoch 1300
Fine tuning epoch 1350
Fine tuning epoch 1400
Fine tuning epoch 1450
Fine tuning epoch 1500
Fine tuning epoch 1550
Fine tuning epoch 1600
Fine tuning epoch 1650
Fine tuning epoch 1700
Fine tuning epoch 1750
Fine tuning epoch 1800
Fine tuning epoch 1850
Fine tuning epoch 1900
Fine tuning epoch 1950
../benchmarks/CODE-AE/model_save/code_adv/pretrai

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["labels"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["labels"] = features["labels"].astype(int)


tensor(271)
tensor(334)
Fine tuning epoch 0
Fine tuning epoch 50
Fine tuning epoch 100
Fine tuning epoch 150
Fine tuning epoch 200
Fine tuning epoch 250
Unfreezing 266
Fine tuning epoch 300
Unfreezing 336
Fine tuning epoch 350
Fine tuning epoch 400
Fine tuning epoch 450
Fine tuning epoch 500
Fine tuning epoch 550
Fine tuning epoch 600
Fine tuning epoch 650
Fine tuning epoch 700
Fine tuning epoch 750
Fine tuning epoch 800
Fine tuning epoch 850
Fine tuning epoch 900
Fine tuning epoch 950
Fine tuning epoch 1000
Fine tuning epoch 1050
Fine tuning epoch 1100
Fine tuning epoch 1150
Fine tuning epoch 1200
Fine tuning epoch 1250
Fine tuning epoch 1300
Fine tuning epoch 1350
Fine tuning epoch 1400
Fine tuning epoch 1450
Fine tuning epoch 1500
Fine tuning epoch 1550
Fine tuning epoch 1600
Fine tuning epoch 1650
Fine tuning epoch 1700
Fine tuning epoch 1750
Fine tuning epoch 1800
Fine tuning epoch 1850
Fine tuning epoch 1900
Fine tuning epoch 1950
../benchmarks/CODE-AE/model_save/code_adv/pretrai

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["labels"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["labels"] = features["labels"].astype(int)


tensor(138)
tensor(174)
Fine tuning epoch 0
Fine tuning epoch 50
Fine tuning epoch 100
Fine tuning epoch 150
Fine tuning epoch 200
Fine tuning epoch 250
Fine tuning epoch 300
Unfreezing 303
Unfreezing 324
Fine tuning epoch 350
Fine tuning epoch 400
Fine tuning epoch 450
Fine tuning epoch 500
Fine tuning epoch 550
Fine tuning epoch 600
Fine tuning epoch 650
Fine tuning epoch 700
Fine tuning epoch 750
Fine tuning epoch 800
Fine tuning epoch 850
Fine tuning epoch 900
Fine tuning epoch 950
Fine tuning epoch 1000
Fine tuning epoch 1050
Fine tuning epoch 1100
Fine tuning epoch 1150
Fine tuning epoch 1200
Fine tuning epoch 1250
Fine tuning epoch 1300
Fine tuning epoch 1350
Fine tuning epoch 1400
Fine tuning epoch 1450
Fine tuning epoch 1500
Fine tuning epoch 1550
Fine tuning epoch 1600
Fine tuning epoch 1650
Fine tuning epoch 1700
Fine tuning epoch 1750
Fine tuning epoch 1800
Fine tuning epoch 1850
Fine tuning epoch 1900
Fine tuning epoch 1950
../benchmarks/CODE-AE/model_save/code_adv/pretrai

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["labels"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["labels"] = features["labels"].astype(int)


tensor(128)
tensor(166)
Fine tuning epoch 0
Fine tuning epoch 50
Fine tuning epoch 100
Fine tuning epoch 150
Fine tuning epoch 200
Fine tuning epoch 250
Unfreezing 263
Unfreezing 273
Fine tuning epoch 300
Fine tuning epoch 350
Fine tuning epoch 400
Fine tuning epoch 450
Fine tuning epoch 500
Fine tuning epoch 550
Fine tuning epoch 600
Fine tuning epoch 650
Fine tuning epoch 700
Fine tuning epoch 750
Fine tuning epoch 800
Fine tuning epoch 850
Fine tuning epoch 900
Fine tuning epoch 950
Fine tuning epoch 1000
Fine tuning epoch 1050
Fine tuning epoch 1100
Fine tuning epoch 1150
Fine tuning epoch 1200
Fine tuning epoch 1250
Fine tuning epoch 1300
Fine tuning epoch 1350
Fine tuning epoch 1400
Fine tuning epoch 1450
Fine tuning epoch 1500
Fine tuning epoch 1550
Fine tuning epoch 1600
Fine tuning epoch 1650
Fine tuning epoch 1700
Fine tuning epoch 1750
Fine tuning epoch 1800
Fine tuning epoch 1850
Fine tuning epoch 1900
Fine tuning epoch 1950
../benchmarks/CODE-AE/model_save/code_adv/pretrai

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["labels"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["labels"] = features["labels"].astype(int)


tensor(149)
tensor(198)
Fine tuning epoch 0
Fine tuning epoch 50
Fine tuning epoch 100
Fine tuning epoch 150
Fine tuning epoch 200
Fine tuning epoch 250
Fine tuning epoch 300
Fine tuning epoch 350
Unfreezing 390
Fine tuning epoch 400
Unfreezing 442
Fine tuning epoch 450
Fine tuning epoch 500
Fine tuning epoch 550
Fine tuning epoch 600
Fine tuning epoch 650
Fine tuning epoch 700
Fine tuning epoch 750
Fine tuning epoch 800
Fine tuning epoch 850
Fine tuning epoch 900
Fine tuning epoch 950
Fine tuning epoch 1000
Fine tuning epoch 1050
Fine tuning epoch 1100
Fine tuning epoch 1150
Fine tuning epoch 1200
Fine tuning epoch 1250
Fine tuning epoch 1300
Fine tuning epoch 1350
Fine tuning epoch 1400
Fine tuning epoch 1450
Fine tuning epoch 1500
Fine tuning epoch 1550
Fine tuning epoch 1600
Fine tuning epoch 1650
Fine tuning epoch 1700
Fine tuning epoch 1750
Fine tuning epoch 1800
Fine tuning epoch 1850
Fine tuning epoch 1900
Fine tuning epoch 1950
../benchmarks/CODE-AE/model_save/code_adv/pretrai

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["labels"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["labels"] = features["labels"].astype(int)


tensor(148)
tensor(196)
Fine tuning epoch 0
Fine tuning epoch 50
Fine tuning epoch 100
Fine tuning epoch 150
Fine tuning epoch 200
Fine tuning epoch 250
Fine tuning epoch 300
Fine tuning epoch 350
Fine tuning epoch 400
Fine tuning epoch 450
Unfreezing 452
Fine tuning epoch 500
Fine tuning epoch 550
Fine tuning epoch 600
Fine tuning epoch 650
Fine tuning epoch 700
Fine tuning epoch 750
Fine tuning epoch 800
Fine tuning epoch 850
Fine tuning epoch 900
Fine tuning epoch 950
Fine tuning epoch 1000
Fine tuning epoch 1050
Unfreezing 1093
Fine tuning epoch 1100
Fine tuning epoch 1150
Fine tuning epoch 1200
Fine tuning epoch 1250
Fine tuning epoch 1300
Fine tuning epoch 1350
Fine tuning epoch 1400
Fine tuning epoch 1450
Fine tuning epoch 1500
Fine tuning epoch 1550
Fine tuning epoch 1600
Fine tuning epoch 1650
Fine tuning epoch 1700
Fine tuning epoch 1750
Fine tuning epoch 1800
Fine tuning epoch 1850
Fine tuning epoch 1900
Fine tuning epoch 1950


In [36]:
len(models.keys())

6

### Predicting using the trained model

In [37]:
from model import BaseDruidModel

In [44]:
class CodeAE(BaseDruidModel):
    def __init__(self):
        super().__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        
    def __str__(self):
        return "CODE-AE Adversarial model"
    
    def forward(self, dataset):
        if os.path.exists(f"/data/ajayago/druid/paper_intermediate/model_checkpoints/CODE_AE/tcga_allgenes_drug_fp_sample{sample_id}.csv"):
            res = pd.read_csv(f"/data/ajayago/druid/paper_intermediate/model_checkpoints//CODE_AE/tcga_allgenes_drug_fp_sample{sample_id}.csv")
            return res["response"].values
        x_df = pd.concat(list(dataset[: len(dataset)].values()), axis=1)

        results = []
        for _, row in x_df.iterrows():
            depmap_id = row[dataset.entity_identifier_name]
            drug_name = row["drug_name"]
            if drug_name in models.keys():
                result = torch.sigmoid(models[drug_name](
                    torch.tensor(
                        dataset.embedded_raw_mutations_all_genes_v2.loc[[depmap_id]].to_numpy(), device=self.device, dtype=torch.float,
                    )
                )   
                )
                results.append(result.detach().item())
            else:
                results.append(np.NaN)

        return np.array(results).flatten()

    def postprocess(self, dataset, np_out):

        assert len(np_out) == len(dataset)

        y_true = pd.concat(list(dataset[: len(dataset)].values()), axis=1)

        y_pred = y_true.copy()

        if isinstance(
            dataset, (PdxDataset, NuhDataset, TcgaDataset, AggCategoricalAnnotatedRad51DatasetFilteredByDrug)
        ):
            #y_pred["response"] = (1 - np_out).squeeze() # we already predict the AADRC here
            y_pred["response"] = np_out.squeeze()
            y_pred = y_pred[
                [dataset.entity_identifier_name, "drug_name", "response"]
            ].copy()
            y_pred.to_csv(f"/data/ajayago/druid/paper_intermediate/model_checkpoints//CODE_AE/tcga_allgenes_drug_fp_sample{sample_id}.csv", index=False)
            return y_pred, y_true

        y_pred["auc"] = np_out.squeeze()

        if isinstance(dataset, (AggCategoricalAnnotatedCellLineDataset)):
            y_pred["auc"] = (1-np_out).squeeze() # convert to AUDRC
            y_true_pivotted = y_true.pivot_table(
                "auc", dataset.entity_identifier_name, "drug_name"
            )
            y_pred_pivotted = y_pred.pivot_table(
                "auc", dataset.entity_identifier_name, "drug_name", dropna=False
            )
            print(y_pred_pivotted.shape, y_true_pivotted.shape)
            return y_pred_pivotted, y_true_pivotted

        return y_pred.pivot_table("auc", dataset.entity_identifier_name, "drug_name")

### Evaluation

In [45]:
from testbed import _EvaluationTestbed
from metric import *

In [46]:
EvaluationTestbed = _EvaluationTestbed(
    metrics=[
        NdcgMetric(),
        RecallMetric(),
        MeanAucMetric(),
        SampleCelllinePredictionsMetric(),
        PatientPredictionsMetric(),
        AurocPrecisionMetric(),
        NuhAurocPrecisionMetric(),
        SurvivalCorrelationMetric(),
        SurvivalCorrelationCoxMetric(),
        TcgaSurvivalCorrelationCoxMetric(),
        RecistCorrelationMetric(),
        CellLineCorrelationMetric(),
    ]
)

In [47]:
model = CodeAE()

In [48]:
# # All datasets 
# res = EvaluationTestbed.run(
#     {
#         model: [
#             AggCategoricalAnnotatedCellLineDataset(
#                 is_train=False,
#                 only_cat_one_drugs=False,
#                 scale_y=False,
#                 use_k_best_worst=None,
#             ),
#             AggCategoricalAnnotatedPdxDataset(
#                 apply_train_test_filter=False,
#                 is_train=False,
#                 only_cat_one_drugs=False,
#                 include_all_cell_line_drugs=True,
#             ),
#             AggCategoricalAnnotatedTcgaDataset(
#                 apply_train_test_filter=False,
#                 is_train=False,
#                 only_cat_one_drugs=False,
#                 include_all_cell_line_drugs=True,
#             ),
#             AggCategoricalAnnotatedMooresDataset(
#                 is_train=False, only_cat_one_drugs=False, include_all_cell_line_drugs=True
#             ),
#             AggCategoricalAnnotatedRad51Dataset(
#                 is_train=False, only_cat_one_drugs=False, include_all_cell_line_drugs=True
#             ),
#         ],
#     },
# )
# pd.set_option("display.max_rows", 100)
# res_df = pd.DataFrame(res)
# res_df.set_index(["model", "dataset", "metric"], inplace=True)
# res_df

## Drug Specific Analysis

In [49]:
from scipy import stats
from numpy import argmax
from sklearn.metrics import roc_curve

### TCGA

In [50]:
patient_dataset = AggCategoricalAnnotatedTcgaDatasetFilteredByDrug(
                is_train=False,
                filter_for="tcga", sample_id = sample_id
            )
patient_results = model(patient_dataset)
patient_pp_out = model.postprocess(patient_dataset, patient_results)
y_pred, y_true = patient_pp_out


y_pred_pivotted = y_pred.pivot_table(
                "response", "submitter_id", "drug_name"
            )
y_pred_pivotted = y_pred_pivotted.fillna(0)
dict_idx_drug = pd.DataFrame(y_pred_pivotted.columns).to_dict()["drug_name"]
dict_id_drug = {}

for patient_id, predictions in y_pred_pivotted.iterrows():

    cur_pred_scores = predictions.values
    cur_recom_drug_idx = np.argsort(cur_pred_scores)[:-11:-1]
    #
    dict_recom_drug = {}
    for idx, cur_idx in enumerate(cur_recom_drug_idx):
        dict_recom_drug[
            dict_idx_drug[cur_idx]
        ] = f"{cur_pred_scores[cur_idx]} ({idx+1})"
    #
    dict_id_drug[patient_id] = dict_recom_drug

predictions_display_tcga = pd.DataFrame.from_dict(dict_id_drug)

na_mask = y_pred.response.isna()
if na_mask.sum():
    print(
        f"[KaplanMeierFitterMetric] Found {na_mask.sum()} rows with invalid response values"
    )
    y_pred = y_pred[~na_mask]
    y_true = y_true.loc[~(na_mask.values)]
na_mask = y_true.response.isna()
y_true = y_true[~na_mask]
y_pred = y_pred[~na_mask]
print(y_pred.shape)
y_pred.head()
y_combined = y_pred.merge(y_true, on=["submitter_id", "drug_name"])

from sklearn.metrics import average_precision_score, ndcg_score, roc_auc_score, f1_score, accuracy_score, precision_score

drugs_with_enough_support = [
    "CISPLATIN",
    "PACLITAXEL",
    "5-FLUOROURACIL",
    "CYCLOPHOSPHAMIDE",
    "DOCETAXEL",
    "GEMCITABINE",
]


# for drug_name in y_true.drug_name.unique():
for drug_name in drugs_with_enough_support:
    try:
        roc = roc_auc_score(
            y_true[y_true.drug_name == drug_name].response.values,
            y_pred[y_pred.drug_name == drug_name].response.values,
            average="micro",
        )
        aupr = average_precision_score(
            y_true[y_true.drug_name == drug_name].response.values,
            y_pred[y_pred.drug_name == drug_name].response.values,
            average="micro",
        )
        # Choosing the right threshold for F1, accuracy and precision calculation from ref: https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/
        fpr, tpr, thresholds = roc_curve(
            y_true[y_true.drug_name == drug_name].response.values,
            y_pred[y_pred.drug_name == drug_name].response.values,
        )
        J = tpr - fpr
        ix = argmax(J)
        best_thresh = thresholds[ix]
        
        f1 = f1_score(
            y_true[y_true.drug_name == drug_name].response.values,
            (y_pred[y_pred.drug_name == drug_name].response.values > best_thresh).astype(int),
#             average="micro",
        )
        acc_score = accuracy_score(
            y_true[y_true.drug_name == drug_name].response.values,
            (y_pred[y_pred.drug_name == drug_name].response.values > best_thresh).astype(int),
        )
        prec_score = precision_score(
            y_true[y_true.drug_name == drug_name].response.values,
            (y_pred[y_pred.drug_name == drug_name].response.values > best_thresh).astype(int),
        )
        spearman_stats = stats.spearmanr(
            y_true[y_true.drug_name == drug_name].response.values,
            y_pred[y_pred.drug_name == drug_name].response.values,
        )
        mw_stats = stats.mannwhitneyu(
            y_combined[
                (y_combined.drug_name == drug_name) & (y_combined.response_y == 0)
            ].response_x.values,
            y_combined[
                (y_combined.drug_name == drug_name) & (y_combined.response_y == 1)
            ].response_x.values,
            alternative="greater",
        )
        denominator = (
            y_combined[
                (y_combined.drug_name == drug_name) & (y_combined.response_y == 0)
            ].shape[0]
            * y_combined[
                (y_combined.drug_name == drug_name) & (y_combined.response_y == 1)
            ].shape[0]
        )
        print(f"AUROC for {drug_name}: {roc}")
        print(f"AUPR for {drug_name}: {aupr}")
        print(f"F1 for {drug_name}: {f1}")
        print(f"Accuracy Score for {drug_name}: {acc_score}")
        print(f"Precision Score for {drug_name}: {prec_score}")
        print(
            f"Spearman for {drug_name}: {round(spearman_stats.correlation, 4)} (p-val: {round(spearman_stats.pvalue, 4)})"
        )
        print(
            f"Mann-Whitney for {drug_name}: {round(mw_stats.statistic/denominator, 4)} (p-val: {round(mw_stats.pvalue, 4)})"
        )
    except Exception as e:
        print(f"Error processing {drug_name} - {e}")


drugs_with_enough_support2 = ["CISPLATIN", "PACLITAXEL", "5-FLUOROURACIL"]

print("For CISPLATIN, PACLITAXEL and 5-FU")
roc = roc_auc_score(
    y_true[y_true.drug_name.isin(drugs_with_enough_support2)].response.values,
    y_pred[y_pred.drug_name.isin(drugs_with_enough_support2)].response.values,
    average="micro",
)
aupr = average_precision_score(
    y_true[y_true.drug_name.isin(drugs_with_enough_support2)].response.values,
    y_pred[y_pred.drug_name.isin(drugs_with_enough_support2)].response.values,
    average="micro",
)
# Choosing the right threshold for F1, accuracy and precision calculation from ref: https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/
fpr, tpr, thresholds = roc_curve(
    y_true[y_true.drug_name.isin(drugs_with_enough_support2)].response.values,
    y_pred[y_pred.drug_name.isin(drugs_with_enough_support2)].response.values,
)
J = tpr - fpr
ix = argmax(J)
best_thresh = thresholds[ix]

f1 = f1_score(
    y_true[y_true.drug_name.isin(drugs_with_enough_support2)].response.values,
    (y_pred[y_pred.drug_name.isin(drugs_with_enough_support2)].response.values> best_thresh).astype(int),
#     average="micro",
)
acc_score = accuracy_score(
    y_true[y_true.drug_name.isin(drugs_with_enough_support2)].response.values,
    (y_pred[y_pred.drug_name.isin(drugs_with_enough_support2)].response.values> best_thresh).astype(int),
)
prec_score = precision_score(
    y_true[y_true.drug_name.isin(drugs_with_enough_support2)].response.values,
    (y_pred[y_pred.drug_name.isin(drugs_with_enough_support2)].response.values> best_thresh).astype(int),
)
spearman_stats = stats.spearmanr(
    y_true[y_true.drug_name.isin(drugs_with_enough_support2)].response.values,
    y_pred[y_pred.drug_name.isin(drugs_with_enough_support2)].response.values,
)
mw_stats = stats.mannwhitneyu(
    y_combined[
        (y_combined.drug_name.isin(drugs_with_enough_support2))
        & (y_combined.response_y == 0)
    ].response_x.values,
    y_combined[
        (y_combined.drug_name.isin(drugs_with_enough_support2))
        & (y_combined.response_y == 1)
    ].response_x.values,
    alternative="greater",
)
denominator = (
    y_combined[
        (y_combined.drug_name.isin(drugs_with_enough_support2))
        & (y_combined.response_y == 0)
    ].shape[0]
    * y_combined[
        (y_combined.drug_name.isin(drugs_with_enough_support2))
        & (y_combined.response_y == 1)
    ].shape[0]
)

print(f"Overall AUROC: {roc}")
print(f"Overall AUPR: {aupr}")
print(f"Overall F1: {f1}")
print(f"Overall Accuracy Score: {acc_score}")
print(f"Overall Precision Score: {prec_score}")
print(
    f"Overall Spearman: {round(spearman_stats.correlation, 4)} (p-val: {round(spearman_stats.pvalue, 4)})"
)
print(
    f"Overall Mann-Whitney: {round(mw_stats.statistic/denominator, 4)} (p-val: {round(mw_stats.pvalue, 4)})"
)




(126, 3)
AUROC for CISPLATIN: 0.43333333333333335
AUPR for CISPLATIN: 0.7653643822026431
F1 for CISPLATIN: 0.64
Accuracy Score for CISPLATIN: 0.5384615384615384
Precision Score for CISPLATIN: 0.8
Spearman for CISPLATIN: -0.0973 (p-val: 0.5555)
Mann-Whitney for CISPLATIN: 0.5667 (p-val: 0.2798)
AUROC for PACLITAXEL: 0.08571428571428572
AUPR for PACLITAXEL: 0.6579612920576423
F1 for PACLITAXEL: 0.0
Accuracy Score for PACLITAXEL: 0.19230769230769232
Precision Score for PACLITAXEL: 0.0
Spearman for PACLITAXEL: -0.566 (p-val: 0.0026)
Mann-Whitney for PACLITAXEL: 0.9143 (p-val: 0.0013)
AUROC for 5-FLUOROURACIL: 0.4285714285714286
AUPR for 5-FLUOROURACIL: 0.6796923019348649
F1 for 5-FLUOROURACIL: 0.6451612903225806
Accuracy Score for 5-FLUOROURACIL: 0.56
Precision Score for 5-FLUOROURACIL: 0.7692307692307693
Spearman for 5-FLUOROURACIL: -0.1112 (p-val: 0.5967)
Mann-Whitney for 5-FLUOROURACIL: 0.5714 (p-val: 0.3075)
AUROC for CYCLOPHOSPHAMIDE: 0.8888888888888888
AUPR for CYCLOPHOSPHAMIDE: 0.99

  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
predictions_display_tcga

Unnamed: 0,TCGA-05-4384,TCGA-05-4390,TCGA-05-4427,TCGA-05-5425,TCGA-34-5240,TCGA-43-A474,TCGA-56-A4BW,TCGA-62-A46Y,TCGA-69-7973,TCGA-73-7498,...,TCGA-VS-A8EH,TCGA-VS-A9U6,TCGA-VS-A9UC,TCGA-VS-A9UY,TCGA-VS-A9V3,TCGA-WT-AB41,TCGA-XF-A9SX,TCGA-XF-AAMR,TCGA-XF-AAMY,TCGA-XF-AAN5
CISPLATIN,0.45959949493408203 (1),0.34599584341049194 (1),0.12188302725553513 (1),0.0 (5),0.2847336530685425 (2),0.0 (5),0.0 (5),0.46016404032707214 (1),0.32868289947509766 (1),0.44431817531585693 (1),...,0.603904128074646 (1),0.43941572308540344 (1),0.5282601118087769 (1),0.5704662203788757 (1),0.4010109305381775 (1),0.0 (5),0.0 (5),0.6275138258934021 (2),0.0 (5),0.2575593888759613 (2)
PACLITAXEL,0.0 (2),0.0 (2),0.0 (2),0.7319815158843994 (1),0.0 (3),0.0 (2),0.613555371761322 (1),0.0 (2),0.0 (2),0.0 (2),...,0.0 (2),0.0 (2),0.0 (2),0.0 (2),0.0 (2),0.0 (3),0.0 (2),0.0 (3),0.0 (2),0.0 (3)
GEMCITABINE,0.0 (3),0.0 (3),0.0 (3),0.0 (2),0.0 (4),0.0 (3),0.0 (2),0.0 (3),0.0 (3),0.0 (3),...,0.0 (3),0.0 (3),0.0 (3),0.0 (3),0.0 (3),0.0 (4),0.7589364647865295 (1),0.7622241973876953 (1),0.7564153671264648 (1),0.6371418237686157 (1)
DOCETAXEL,0.0 (4),0.0 (4),0.0 (4),0.0 (3),0.5455200672149658 (1),0.5494821667671204 (1),0.0 (3),0.0 (4),0.0 (4),0.0 (4),...,0.0 (4),0.0 (4),0.0 (4),0.0 (4),0.0 (4),0.5518983602523804 (1),0.0 (3),0.0 (4),0.0 (3),0.0 (4)
CYCLOPHOSPHAMIDE,0.0 (5),0.0 (5),0.0 (5),0.0 (4),0.0 (5),0.0 (4),0.0 (4),0.0 (5),0.0 (5),0.0 (5),...,0.0 (5),0.0 (5),0.0 (5),0.0 (5),0.0 (5),0.2784028947353363 (2),0.0 (4),0.0 (5),0.0 (4),0.0 (5)
5-FLUOROURACIL,0.0 (6),0.0 (6),0.0 (6),0.0 (6),0.0 (6),0.0 (6),0.0 (6),0.0 (6),0.0 (6),0.0 (6),...,0.0 (6),0.0 (6),0.0 (6),0.0 (6),0.0 (6),0.0 (6),0.0 (6),0.0 (6),0.0 (6),0.0 (6)


In [52]:
y_true[y_true.drug_name == "GEMCITABINE"]

Unnamed: 0,submitter_id,drug_name,response
47,TCGA-FD-A5C1,GEMCITABINE,1
58,TCGA-FD-A6TC,GEMCITABINE,1
68,TCGA-XF-A9SX,GEMCITABINE,0
81,TCGA-XF-AAMR,GEMCITABINE,1
82,TCGA-XF-AAMY,GEMCITABINE,1
87,TCGA-XF-AAN5,GEMCITABINE,1
439,TCGA-MP-A4TD,GEMCITABINE,1


In [53]:
y_pred[y_pred.drug_name == "GEMCITABINE"]

Unnamed: 0,submitter_id,drug_name,response
47,TCGA-FD-A5C1,GEMCITABINE,0.762515
58,TCGA-FD-A6TC,GEMCITABINE,0.541762
68,TCGA-XF-A9SX,GEMCITABINE,0.758936
81,TCGA-XF-AAMR,GEMCITABINE,0.762224
82,TCGA-XF-AAMY,GEMCITABINE,0.756415
87,TCGA-XF-AAN5,GEMCITABINE,0.637142
439,TCGA-MP-A4TD,GEMCITABINE,0.750508


In [54]:
from sklearn.metrics import roc_curve

In [55]:
fpr, tpr, thresholds = roc_curve(
            y_true[y_true.drug_name == "PACLITAXEL"].response.values,
            y_pred[y_pred.drug_name == "PACLITAXEL"].response.values,
        )

In [56]:
from math import sqrt

In [57]:
# Ref: https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/
# calculate the g-mean for each threshold
gmeans = np.array([sqrt(i) for i in tpr * (1-fpr)])
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

Best Threshold=0.716851, G-Mean=0.239
