In [None]:
# remember to create and activate the data_env environment first (or install specified libraries in base environment)
import os
import json
import random
from itertools import chain, combinations
import pandas as pd
import numpy as np
import pubchempy as pcp


# Drug response preprocessing

## PAN-CANCER IC and AUC

In [None]:
def preprocess_drug_response(pancancer_gdsc, index='Cell Line Name', columns='Drug Name', ic_values='IC50', auc_values='AUC'):
    """Pivot a long drug-response table into wide response matrices.

    The table is sorted by (index, columns); when such a pair occurs more than
    once only its last row is kept (an arbitrary tie-break). The result is then
    pivoted and both axes are sorted.

    Parameters
    ----------
    pancancer_gdsc : pd.DataFrame
        Long-format table with one row per measurement.
    index : str
        Column whose values become the row labels.
    columns : str
        Column whose values become the column labels.
    ic_values : str
        Column providing the cell values of the first matrix.
    auc_values : str or None
        Column providing the cell values of the second matrix; None skips it.

    Returns
    -------
    pd.DataFrame or tuple of pd.DataFrame
        Only the ``ic_values`` matrix if ``auc_values`` is None, otherwise the
        pair ``(ic_matrix, auc_matrix)``.
    """
    pair_columns = [index, columns]
    deduplicated = pancancer_gdsc.sort_values(by=pair_columns)
    deduplicated = deduplicated.drop_duplicates(subset=pair_columns, keep='last')

    def _as_sorted_matrix(value_column):
        # one row per `index` value, one column per `columns` value
        wide = deduplicated.pivot(index=index, columns=columns, values=value_column)
        return wide.sort_index().sort_index(axis=1)

    ic_matrix = _as_sorted_matrix(ic_values)
    if auc_values is None:
        return ic_matrix
    return ic_matrix, _as_sorted_matrix(auc_values)

In [None]:
# either: https://www.cancerrxgene.org/downloads/genetic_features?screening_set=GDSC2&tissue=PANCANCER&mutation=both
# or: https://www.cancerrxgene.org/downloads/bulk_download -> GDSC2-dataset (almost the same, but has a few less measurements)
# load the long-format response table and pivot it into two matrices (default index/columns: 'Cell Line Name' x 'Drug Name')
pancancer_gdsc2_raw = pd.read_csv('GDSC_Drug_Data/PANCANCER_IC_GDSC2.csv')
pancancer_ic_gdsc2, pancancer_auc_gdsc2 = preprocess_drug_response(pancancer_gdsc2_raw)

In [None]:
# sanity check: (rows, columns) of the IC50 and AUC matrices
pancancer_ic_gdsc2.shape, pancancer_auc_gdsc2.shape

## PAN-CANCER IC and AUC, but only PubChem drugs

In [None]:
# https://www.cancerrxgene.org/compounds -> select All and Export: CSV
drug_list = pd.read_csv('GDSC_Drug_Data/Drug_list.csv')
# the column names below are used exactly as spelled here (note the leading spaces)
drug_key_columns = ['Drug Id', ' Name', ' Datasets']
# maps the tuple (drug ID, name, dataset) to the raw ' PubCHEM' entry of that row
drugID_to_pubchemID_dict_raw = drug_list.set_index(drug_key_columns)[' PubCHEM'].to_dict()

In [None]:
drugID_to_pubchemID_dict = {}
for drug_key, raw_pubchem in drugID_to_pubchemID_dict_raw.items():
    # NaN is the only value that is not equal to itself
    has_no_usable_id = raw_pubchem != raw_pubchem or raw_pubchem == 'none' or raw_pubchem == 'several'
    if has_no_usable_id:
        # stored as NaN so that these drugs are dropped together with the ones that were NaN from the start
        drugID_to_pubchemID_dict[drug_key] = np.nan
        continue
    # sometimes multiple comma-separated PubChem IDs are listed; only the first is used
    first_listed_id = raw_pubchem.split(',')[0]
    drugID_to_pubchemID_dict[drug_key] = int(first_listed_id)

In [None]:
# this cell is needed because between GDSC1 and GDSC2, some drug IDs differ while the PubChem IDs are identical
# there are also some drugs that are both in GDSC2 and have a different drug ID, but there are no PubChem IDs for them (see previous notebook cell)
# keys of the source dict are (drug ID, name, dataset) tuples; keep the GDSC2 entries and key them by drug ID only
drugID_to_pubchemID_dict_GDSC2 = {}
for (drug_id, _drug_name, dataset), pubchem_id in drugID_to_pubchemID_dict.items():
    if dataset == 'GDSC2':
        drugID_to_pubchemID_dict_GDSC2[drug_id] = pubchem_id

In [None]:
pancancer_gdsc2_pubchem = pancancer_gdsc2_raw.copy()
# .map() turns drug IDs that are absent from the lookup into NaN; .replace() would have silently kept the
# raw drug ID and treated it as if it were a PubChem ID
pancancer_gdsc2_pubchem['PubChem ID'] = pancancer_gdsc2_pubchem['Drug ID'].map(drugID_to_pubchemID_dict_GDSC2)
# only drop rows without a PubChem ID; a bare dropna() would also discard rows that merely have a NaN in an unrelated column
pancancer_gdsc2_pubchem = pancancer_gdsc2_pubchem.dropna(subset=['PubChem ID'])
# the NaNs forced the column to float; convert back to integer IDs
pancancer_gdsc2_pubchem['PubChem ID'] = pancancer_gdsc2_pubchem['PubChem ID'].astype('int64')
pancancer_ic_pubchem_gdsc2, pancancer_auc_pubchem_gdsc2 = preprocess_drug_response(pancancer_gdsc2_pubchem, columns='PubChem ID')

In [None]:
# sanity check: (rows, columns) of the PubChem-keyed IC50 and AUC matrices
pancancer_ic_pubchem_gdsc2.shape, pancancer_auc_pubchem_gdsc2.shape

## Save target files

In [None]:
targets_dir = '../targets'
if not os.path.exists(targets_dir):
    os.makedirs(targets_dir)

# file stem -> response matrix; files are written in this order
gdsc2_target_matrices = {
    'pancancer_ic_gdsc2': pancancer_ic_gdsc2,
    'pancancer_auc_gdsc2': pancancer_auc_gdsc2,
    'pancancer_ic_pubchem_gdsc2': pancancer_ic_pubchem_gdsc2,
    'pancancer_auc_pubchem_gdsc2': pancancer_auc_pubchem_gdsc2,
}
for file_stem, response_matrix in gdsc2_target_matrices.items():
    response_matrix.to_csv(targets_dir + '/' + file_stem + '.csv')

# Cell-line feature preprocessing

In [None]:
# https://www.cancerrxgene.org/downloads/bulk_download -> Download from Cell Model Passports, then you reach the website:
# https://cellmodelpassports.sanger.ac.uk/downloads -> Model Annotation -> under Model List, click View all versions
# model annotation table; its 'model_id' and 'model_name' columns are used further below to rename model IDs
model_list = pd.read_csv('Bulk_Cell_line_Genomic_Data/Cell_Model_Passports/Model_Annotation/model_list_20240110.csv')

## PAN-CANCER MUT&CNV

In [None]:
pancancer_genetic_features_gdsc2_raw = pd.read_csv(
    'GDSC_Genetic_Features/PANCANCER_Genetic_features_GDSC2.csv',
    low_memory=False, # because columns 'Recurrent Gain Loss' and 'Genes in Segment' throw warning otherwise
)
# wide matrix (cell line x genetic feature) of the 'IS Mutated' values; rows with any missing feature are removed
pancancer_genetic_features_gdsc2 = (
    pancancer_genetic_features_gdsc2_raw
    .pivot(index='Cell Line Name', columns='Genetic Feature', values='IS Mutated')
    .dropna()
)

In [None]:
# sanity check: (cell lines, genetic features) left after dropna
pancancer_genetic_features_gdsc2.shape

## MUT

In [None]:
# https://cellmodelpassports.sanger.ac.uk/downloads -> Mutation Data -> under Mutations Summary, click View all versions
# long-format mutation table; the next cell uses its 'gene_symbol' and 'model_id' columns
mut_all_raw = pd.read_csv('Bulk_Cell_line_Genomic_Data/Cell_Model_Passports/MUT/mutations_all_20230202.csv')

In [None]:
# every row stands for a mutation, so flag each with 1; several mutations of the same gene in the same model count once
mut_all_raw2 = (
    mut_all_raw
    .assign(is_mutated=1)
    .drop_duplicates(subset=['gene_symbol', 'model_id'], keep='first')
)
model_list_dict = dict(zip(model_list['model_id'], model_list['model_name']))
# binary model x gene matrix; model/gene pairs without a row become 0, model IDs are replaced by model names
mut_all = (
    mut_all_raw2
    .pivot(index='model_id', columns='gene_symbol', values='is_mutated')
    .fillna(0)
    .astype(int)
    .rename(index=model_list_dict)
    .sort_index()
    .sort_index(axis=1)
)
genes_with_mutation = (mut_all != 0).any(axis=0)
mut_all = mut_all.loc[:, genes_with_mutation] # drop zero-only columns (but our mut version has none because pivot)
models_with_mutation = (mut_all != 0).any(axis=1)
mut_all = mut_all.loc[models_with_mutation] # drop zero-only rows (but our mut version has none because pivot)

In [None]:
# sanity check: (models, genes) of the binary mutation matrix
mut_all.shape

## CNV

In [None]:
# https://cellmodelpassports.sanger.ac.uk/downloads -> Copy Number Data -> under Copy Number (SNP6), click View all versions -> zip contains two files: cnv_gistic_20191101.csv and cnv_abs_copy_number_picnic_20191101.csv
# we use cnv_gistic_20191101.csv instead of cnv_abs_copy_number_picnic_20191101.csv because the latter has positive real values, and the numbers have different meanings, see cnv_summary_20230303.csv: 2 is neutral, but sometimes loss, 4 is also neutral, 3 is gain or loss; gistics -2, -1, 0, 1, and 2 are way easier to interpret
# (to get cnv_summary_20230303.csv: https://cellmodelpassports.sanger.ac.uk/downloads -> Copy Number Data -> under CNV Summary, click View all versions)
# the file is read as-is; the next cell transposes it and promotes rows/columns to labels
cnv_all_raw = pd.read_csv('Bulk_Cell_line_Genomic_Data/Cell_Model_Passports/CNV/cnv_20191101/cnv_gistic_20191101.csv', low_memory=False) # 'low_memory=False' to remove warnings

In [None]:
# Reshape the raw CNV table into a labelled matrix. The steps depend on their exact order.
# presumably the result is cell lines (rows) x genes (columns) - TODO confirm against the file layout
cnv_all_raw2 = cnv_all_raw.T
# drop the first row of the transposed frame (i.e. what was the first column of the CSV)
cnv_all_raw2 = cnv_all_raw2.drop(cnv_all_raw2.index[0])
# use the values of the first column as row labels ...
cnv_all_raw2.index = cnv_all_raw2.iloc[:, 0]
# ... then remove the second column and the first column (the latter now lives in the index)
cnv_all_raw2 = cnv_all_raw2.drop(cnv_all_raw2.columns[1], axis=1)
cnv_all_raw2 = cnv_all_raw2.drop(cnv_all_raw2.columns[0], axis=1)
# promote the first remaining row to column labels and remove it from the data
cnv_all_raw2.columns = cnv_all_raw2.iloc[0]
cnv_all_raw2 = cnv_all_raw2.drop(cnv_all_raw2.index[0])
# make sure all row labels are strings
cnv_all_raw2.index = [str(s) for s in cnv_all_raw2.index]
cnv_all = cnv_all_raw2.sort_index().sort_index(axis=1)
# NOTE(review): the values are never cast to a numeric dtype in this cell; if they are strings, `!= 0` is True everywhere - confirm
cnv_all = cnv_all.loc[:, (cnv_all != 0).any(axis=0)] # drop zero-only columns (but our cnv version has none)
cnv_all = cnv_all.loc[(cnv_all != 0).any(axis=1)] # drop zero-only rows (but our cnv version has none)
cnv_all = cnv_all.dropna(axis=1) # remove nan columns (removing rows would remove all rows)

In [None]:
# sanity check: (rows, columns) of the CNV matrix after removing NaN columns
cnv_all.shape

## EXP

In [None]:
# https://cellmodelpassports.sanger.ac.uk/downloads -> Expression Data -> under RNA-Seq, click View all versions -> zip contains multiple files, of which we chose rnaseq_tpm_20220624.csv because CCLE also uses TPM (but choosing any other file is of course also possible)
# the file is read as-is; the next cell transposes it and promotes rows/columns to labels
exp_tpm_raw = pd.read_csv('Bulk_Cell_line_Genomic_Data/Cell_Model_Passports/GE/rnaseq_all_20220624/rnaseq_tpm_20220624.csv', low_memory=False) # 'low_memory=False' to remove warnings

In [None]:
# Reshape the raw TPM table into a labelled float matrix. The steps depend on their exact order.
# presumably the result is cell lines (rows) x genes (columns) - TODO confirm against the file layout
exp_tpm_raw2 = exp_tpm_raw.T
# use the values of the first column as row labels
exp_tpm_raw2.index = exp_tpm_raw2.iloc[:, 0]
# remove the second column
exp_tpm_raw2 = exp_tpm_raw2.drop(exp_tpm_raw2.columns[1], axis=1)
# the second row provides the column labels
exp_tpm_raw2.columns = exp_tpm_raw2.iloc[1]
# strip the leading header material: twice remove the current first row and the current first column
exp_tpm_raw2 = exp_tpm_raw2.drop(exp_tpm_raw2.index[0])
exp_tpm_raw2 = exp_tpm_raw2.drop(exp_tpm_raw2.columns[0], axis=1)
exp_tpm_raw2 = exp_tpm_raw2.drop(exp_tpm_raw2.index[0])
exp_tpm_raw2 = exp_tpm_raw2.drop(exp_tpm_raw2.columns[0], axis=1)
# drop columns containing NaN, then convert the remaining values to float
exp_tpm_raw2 = exp_tpm_raw2.dropna(axis=1)
exp_tpm_raw2 = exp_tpm_raw2.astype(float)
exp_tpm_raw2 = exp_tpm_raw2.sort_index().sort_index(axis=1)
# for repeated column labels only the first occurrence is kept
exp_tpm_raw2 = exp_tpm_raw2.loc[:, ~exp_tpm_raw2.columns.duplicated()] # EEF1AKNMT and SEPTIN4 are duplicated columns, we remove them

In [None]:
# because CCLE also calculates log2(TPM+1) and we want to compare them
# after running this notebook cell, both have a similar range of values (0 to 17 and 0 to 19)
# element-wise transform; row and column labels are unchanged
exp_tpm = np.log2(exp_tpm_raw2 + 1)

In [None]:
# sanity check: (rows, columns) of the log2(TPM+1) matrix
exp_tpm.shape

## Proteomics

In [None]:
# careful! NaN in input, see two notebook cells below
# https://cellmodelpassports.sanger.ac.uk/downloads -> Proteomics -> Download proteomics data -> zip contains multiple files, of which the correct one is Protein_matrix_averaged_20221214.tsv
proteomics_raw = pd.read_csv('Bulk_Cell_line_Genomic_Data/Cell_Model_Passports/Proteomics/Protein_matrix_averaged_20221214.tsv', sep='\t', low_memory=False) # 'low_memory=False' to remove warnings
# remove the second data row and the second column (presumably extra header/ID material - TODO confirm against the file)
proteomics_raw = proteomics_raw.drop(proteomics_raw.index[1])
proteomics_raw = proteomics_raw.drop(proteomics_raw.columns[1], axis=1)
# promote the first data row to column labels and remove it from the data
proteomics_raw.columns = proteomics_raw.iloc[0]
proteomics_raw = proteomics_raw.drop(proteomics_raw.index[0])
# promote the first column to row labels and remove it from the data
proteomics_raw.index = proteomics_raw.iloc[:, 0]
proteomics_raw = proteomics_raw.drop(proteomics_raw.columns[0], axis=1)
# NOTE: this frame is inspected below but not written by the save cell that follows
proteomics = proteomics_raw.sort_index().sort_index(axis=1)

In [None]:
# sanity check: (rows, columns) of the proteomics matrix
proteomics.shape

In [None]:
# fraction of missing entries: NaN is the only value that differs from itself
proteomics_nan_mask = proteomics != proteomics
proteomics_nan_mask.sum().sum() / proteomics.size
# 38.6 % NaNs, probably unfit for ANNs as they cannot just be imputed with 0 because NaN means the q-value is high, indicating unreliable protein abundance measurements

## Save feature files

In [None]:
# exist_ok avoids the check-then-create race and makes the cell safe to re-run
os.makedirs('../features/cell_features', exist_ok=True)

# rename features to enable backtracking of feature types
# the suffix is only appended when it is missing, so re-running this cell does not produce names like 'TP53.mut.mut'
mut_all.columns = [i if i.endswith('.mut') else i + '.mut' for i in mut_all.columns]
cnv_all.columns = [i if i.endswith('.cnv') else i + '.cnv' for i in cnv_all.columns]
exp_tpm.columns = [i if i.endswith('.exp') else i + '.exp' for i in exp_tpm.columns]

pancancer_genetic_features_gdsc2.to_csv('../features/cell_features/pancancer_genetic_features_gdsc2.csv')
mut_all.to_csv('../features/cell_features/mut_all.csv')
cnv_all.to_csv('../features/cell_features/cnv_all.csv')
exp_tpm.to_csv('../features/cell_features/exp_tpm.csv')

In [None]:
# seeded generator so that the shuffled control features are reproducible across kernel restarts
# (the SMILES shuffle further below is seeded as well)
shuffle_rng = np.random.default_rng(0)


def _shuffle_all_values(frame, rng):
    """Return the values of `frame` permuted across the whole table, in the original shape.

    `frame` itself is left untouched because flatten() returns a copy.
    """
    flat_values = frame.values.flatten()
    rng.shuffle(flat_values)
    return flat_values.reshape(frame.shape)


mut_all_shuffled_numpy = _shuffle_all_values(mut_all, shuffle_rng)
cnv_all_shuffled_numpy = _shuffle_all_values(cnv_all, shuffle_rng)
exp_tpm_shuffled_numpy = _shuffle_all_values(exp_tpm, shuffle_rng)

# same row/column labels as the originals, only the cell values are scrambled
mut_all_shuffled = pd.DataFrame(mut_all_shuffled_numpy, index=mut_all.index, columns=mut_all.columns)
cnv_all_shuffled = pd.DataFrame(cnv_all_shuffled_numpy, index=cnv_all.index, columns=cnv_all.columns)
exp_tpm_shuffled = pd.DataFrame(exp_tpm_shuffled_numpy, index=exp_tpm.index, columns=exp_tpm.columns)

In [None]:
# file stem -> shuffled feature matrix; files are written in this order
shuffled_feature_matrices = {
    'mut_all_shuffled': mut_all_shuffled,
    'cnv_all_shuffled': cnv_all_shuffled,
    'exp_tpm_shuffled': exp_tpm_shuffled,
}
for file_stem, shuffled_matrix in shuffled_feature_matrices.items():
    shuffled_matrix.to_csv('../features/cell_features/' + file_stem + '.csv')

# Save shared cell lines between all possible data subsets

In [None]:
def powerset_without_empty_set(iterable):
    """Return every non-empty combination of the sorted input as a list of tuples.

    Combinations are ordered by size first (all singletons, then all pairs, ...)
    and, within one size, in the order produced by itertools.combinations.
    """
    ordered = sorted(iterable) # must be alphabetically ordered for config.yaml
    non_empty_subsets = []
    # starting at size 1 leaves out the empty combination
    for size in range(1, len(ordered) + 1):
        non_empty_subsets.extend(combinations(ordered, size))
    return non_empty_subsets

In [None]:
shared_cell_lines_dir = '../features/shared_cell_lines'
if not os.path.exists(shared_cell_lines_dir):
    os.makedirs(shared_cell_lines_dir)

# row labels (cell lines) of each GDSC2 response matrix, as sets
(
    pancancer_ic_gdsc2_cell_lines,
    pancancer_auc_gdsc2_cell_lines,
    pancancer_ic_pubchem_gdsc2_cell_lines,
    pancancer_auc_pubchem_gdsc2_cell_lines,
) = (
    set(response_matrix.index)
    for response_matrix in (
        pancancer_ic_gdsc2,
        pancancer_auc_gdsc2,
        pancancer_ic_pubchem_gdsc2,
        pancancer_auc_pubchem_gdsc2,
    )
)

## PAN-CANCER MUT&CNV

In [None]:
data = ['pancancer_genetic_features_gdsc2']
data_subsets = powerset_without_empty_set(data) # results in 4 files
# explicit name -> object lookups instead of eval() on strings
feature_tables = {'pancancer_genetic_features_gdsc2': pancancer_genetic_features_gdsc2}
response_cell_lines = {
    'pancancer_ic_gdsc2_cell_lines': pancancer_ic_gdsc2_cell_lines,
    'pancancer_auc_gdsc2_cell_lines': pancancer_auc_gdsc2_cell_lines,
    'pancancer_ic_pubchem_gdsc2_cell_lines': pancancer_ic_pubchem_gdsc2_cell_lines,
    'pancancer_auc_pubchem_gdsc2_cell_lines': pancancer_auc_pubchem_gdsc2_cell_lines,
}
assert set(feature_tables) == set(data)
data_dict = dict()
for cell_lines, measured_cell_lines in response_cell_lines.items():
    # cell lines that have a response AND are present in every feature table of the subset
    for subset in data_subsets:
        data_dict[subset] = set(measured_cell_lines)
        for dataset in subset:
            data_dict[subset] = data_dict[subset].intersection(feature_tables[dataset].index)
    # one JSON file per (feature subset, response matrix); subset members are joined with '-'
    for k, v in data_dict.items():
        filename = '../features/shared_cell_lines/' + '-'.join(k) + '.' + cell_lines + '.json'
        with open(filename, 'w') as f:
            json.dump(sorted(v), f)
        print(filename, len(v))

## MUT + CNV + EXP

In [None]:
data = ['cnv_all', 'exp_tpm', 'mut_all'] # alphabetical order for config.yaml
data_subsets = powerset_without_empty_set(data) # results in 28 files
# explicit name -> object lookups instead of eval() on strings
feature_tables = {'cnv_all': cnv_all, 'exp_tpm': exp_tpm, 'mut_all': mut_all}
response_cell_lines = {
    'pancancer_ic_gdsc2_cell_lines': pancancer_ic_gdsc2_cell_lines,
    'pancancer_auc_gdsc2_cell_lines': pancancer_auc_gdsc2_cell_lines,
    'pancancer_ic_pubchem_gdsc2_cell_lines': pancancer_ic_pubchem_gdsc2_cell_lines,
    'pancancer_auc_pubchem_gdsc2_cell_lines': pancancer_auc_pubchem_gdsc2_cell_lines,
}
assert set(feature_tables) == set(data)
data_dict = dict()

for cell_lines, measured_cell_lines in response_cell_lines.items():
    # cell lines that have a response AND are present in every feature table of the subset
    for subset in data_subsets:
        data_dict[subset] = set(measured_cell_lines)
        for dataset in subset:
            data_dict[subset] = data_dict[subset].intersection(feature_tables[dataset].index)
    # one JSON file per (feature subset, response matrix); subset members are joined with '-'
    for k, v in data_dict.items():
        filename = '../features/shared_cell_lines/' + '-'.join(k) + '.' + cell_lines + '.json'
        with open(filename, 'w') as f:
            json.dump(sorted(v), f)
        print(filename, len(v))

In [None]:
data = ['cnv_all_shuffled', 'exp_tpm_shuffled', 'mut_all_shuffled'] # alphabetical order for config.yaml
data_subsets = powerset_without_empty_set(data) # results in 28 files
# explicit name -> object lookups instead of eval() on strings
feature_tables = {
    'cnv_all_shuffled': cnv_all_shuffled,
    'exp_tpm_shuffled': exp_tpm_shuffled,
    'mut_all_shuffled': mut_all_shuffled,
}
response_cell_lines = {
    'pancancer_ic_gdsc2_cell_lines': pancancer_ic_gdsc2_cell_lines,
    'pancancer_auc_gdsc2_cell_lines': pancancer_auc_gdsc2_cell_lines,
    'pancancer_ic_pubchem_gdsc2_cell_lines': pancancer_ic_pubchem_gdsc2_cell_lines,
    'pancancer_auc_pubchem_gdsc2_cell_lines': pancancer_auc_pubchem_gdsc2_cell_lines,
}
assert set(feature_tables) == set(data)
data_dict = dict()

for cell_lines, measured_cell_lines in response_cell_lines.items():
    # cell lines that have a response AND are present in every feature table of the subset
    for subset in data_subsets:
        data_dict[subset] = set(measured_cell_lines)
        for dataset in subset:
            data_dict[subset] = data_dict[subset].intersection(feature_tables[dataset].index)
    # one JSON file per (feature subset, response matrix); subset members are joined with '-'
    for k, v in data_dict.items():
        filename = '../features/shared_cell_lines/' + '-'.join(k) + '.' + cell_lines + '.json'
        with open(filename, 'w') as f:
            json.dump(sorted(v), f)
        print(filename, len(v))

# Create SMILES file for GNNs (GDSC2)

In [None]:
# two-way lookup between drug name and PubChem ID, restricted to GDSC2 entries
# keys of the source dict are (drug ID, name, dataset) tuples; later entries overwrite earlier ones with the same key
pubchemID_to_name_dict_GDSC2 = {}
name_to_pubchemID_dict_GDSC2 = {}
for (_drug_id, drug_name, dataset), pubchem_id in drugID_to_pubchemID_dict.items():
    if dataset != 'GDSC2':
        continue
    pubchemID_to_name_dict_GDSC2[pubchem_id] = drug_name
    name_to_pubchemID_dict_GDSC2[drug_name] = pubchem_id

In [None]:
# exist_ok avoids the check-then-create race and makes the cell safe to re-run
os.makedirs('../features/drug_smiles', exist_ok=True)

In [None]:
# one [name, CID, canonical SMILES, isomeric SMILES] entry per PubChem ID column; each lookup goes through pubchempy
smiles_gdsc2 = []
for pubchem_cid in pancancer_ic_pubchem_gdsc2.columns:
    compound = pcp.Compound.from_cid(pubchem_cid)
    drug_name = pubchemID_to_name_dict_GDSC2[pubchem_cid]
    smiles_entry = [drug_name, pubchem_cid, compound.canonical_smiles, compound.isomeric_smiles]
    smiles_gdsc2.append(smiles_entry)

In [None]:
# the resulting file drug_smiles_GDSC2.csv could also be used for the 9 drugs below from CCLE that overlap with GDSC2
# written through pandas so that a drug name containing a comma is quoted instead of shifting the columns
pd.DataFrame(
    smiles_gdsc2,
    columns=['name', 'CID', 'CanonicalSMILES', 'IsomericSMILES'],
).to_csv('../features/drug_smiles/drug_smiles_GDSC2.csv', index=False)

In [None]:
# the file resulting from this cell was not used in the paper
random.seed(0)

# shuffle the SMILES pairs (entries 2 and 3) across drugs while names and CIDs stay in place
last_two_columns = [sublist[2:] for sublist in smiles_gdsc2]
random.shuffle(last_two_columns)
# build a NEW list: mutating smiles_gdsc2 in place would make a re-run of the unshuffled writer emit shuffled SMILES
smiles_gdsc2_shuffled = [
    sublist[:2] + shuffled_smiles
    for sublist, shuffled_smiles in zip(smiles_gdsc2, last_two_columns)
]

# written through pandas so that a drug name containing a comma is quoted instead of shifting the columns
pd.DataFrame(
    smiles_gdsc2_shuffled,
    columns=['name', 'CID', 'CanonicalSMILES', 'IsomericSMILES'],
).to_csv('../features/drug_smiles/drug_smiles_GDSC2_shuffled.csv', index=False)

# CCLE drug response

In [None]:
# https://depmap.org/portal/data_page/?tab=allData -> Select a file set to view: Pharmacological Profiling
# the first CSV column is used as the row index
ccle_ic_raw = pd.read_csv('CCLE_Data/CCLE_NP24.2009_Drug_data_2015.02.24.csv', index_col=0)

In [None]:
# wide matrix of 'IC50 (uM)' values: one row per 'Primary Cell Line Name', one column per 'Compound', both axes sorted
# (pivot raises if a cell line/compound pair occurs more than once)
ccle_ic = ccle_ic_raw.pivot(index='Primary Cell Line Name', columns='Compound', values='IC50 (uM)').sort_index().sort_index(axis=1)

In [None]:
# sanity check: (cell lines, compounds) of the CCLE response matrix
ccle_ic.shape

# Create SMILES file for GNNs (CCLE)

In [None]:
# one [name, CID, canonical SMILES, isomeric SMILES] entry per CCLE compound
smiles_ccle = []
ccle_names_to_cid_dict = {} # used later in last notebook cell
for name in ccle_ic.columns:
    # name-based lookup through pubchempy; only the first returned compound is used
    first_match = pcp.get_compounds(name, 'name')[0]
    ccle_names_to_cid_dict[name] = first_match.cid
    smiles_entry = [name, first_match.cid, first_match.canonical_smiles, first_match.isomeric_smiles]
    smiles_ccle.append(smiles_entry)

In [None]:
# written through pandas so that a compound name containing a comma is quoted instead of shifting the columns
pd.DataFrame(
    smiles_ccle,
    columns=['name', 'CID', 'CanonicalSMILES', 'IsomericSMILES'],
).to_csv('../features/drug_smiles/drug_smiles_CCLE.csv', index=False)

# CCLE multi-omics data (24Q2)

In [None]:
# https://depmap.org/portal/data_page/?tab=allData -> Select a file set to view: DepMap Public, Version: DepMap Public 24Q2
# model annotation table; its 'ModelID' and 'StrippedCellLineName' columns are used in the next cell
cell_mapping_raw = pd.read_csv('CCLE_Data/Model.csv')

In [None]:
# lookup from 'ModelID' to 'StrippedCellLineName'
cell_mapping = cell_mapping_raw.set_index('ModelID')['StrippedCellLineName'].to_dict()

In [None]:
# give these cell lines unambiguous names up front: once special characters are stripped (see strip_list_names
# further below), names such as 'KM-H2' and 'KMH-2' would otherwise become identical
ic_cell_line_renames = {'KM-H2': 'KMH2', 'KMH-2': 'KMHDASH2', 'MS-1': 'MSDASH1'}
# the omics tables additionally rename 'T-T'
omics_cell_line_renames = {'KM-H2': 'KMH2', 'KMH-2': 'KMHDASH2', 'MS-1': 'MSDASH1', 'T-T': 'TDOTT'}

pancancer_ic_gdsc2_renamed = pancancer_ic_gdsc2.rename(index=ic_cell_line_renames).sort_index().sort_index(axis=1)
mut_all_renamed, cnv_all_renamed, exp_tpm_renamed = (
    omics_table.rename(index=omics_cell_line_renames).sort_index().sort_index(axis=1)
    for omics_table in (mut_all, cnv_all, exp_tpm)
)

In [None]:
# raw CCLE omics tables; in each file the first CSV column is used as the row index
ccle_mut_raw = pd.read_csv('CCLE_Data/OmicsSomaticMutations.csv', index_col=0, low_memory=False)
ccle_exp_raw = pd.read_csv('CCLE_Data/OmicsExpressionProteinCodingGenesTPMLogp1.csv', index_col=0, low_memory=False)
ccle_cnv_raw = pd.read_csv('CCLE_Data/OmicsCNGene.csv', index_col=0, low_memory=False)

In [None]:
ccle_mut_no_duplicates = ccle_mut_raw.drop_duplicates(subset=['ModelID', 'HugoSymbol'], keep='first') # arbitrary choice
# binary model x gene matrix: 1 where a variant type is recorded (SNV, insertion, etc.), 0 where the pivot left a NaN
# notna().astype(int) yields a proper integer frame instead of an object-dtype frame patched through self-comparison masks
ccle_mut = (
    ccle_mut_no_duplicates
    .pivot(index='ModelID', columns='HugoSymbol', values='VariantType')
    .sort_index()
    .sort_index(axis=1)
    .notna()
    .astype(int)
)

In [None]:
# 416 or 31 columns have NaN values, we remove the columns because then we lose less data
# afterwards only the part of each column label before the first space is kept
ccle_cnv = (
    ccle_cnv_raw
    .dropna(axis=1)
    .rename(columns=lambda label: label.split(' ')[0])
)

In [None]:
# only the part of each column label before the first space is kept
ccle_exp = ccle_exp_raw.rename(columns=lambda label: label.split(' ')[0])

In [None]:
# replace model IDs in the row index by the names from cell_mapping, then sort both axes
ccle_mut, ccle_cnv, ccle_exp = (
    omics_table.rename(index=cell_mapping).sort_index().sort_index(axis=1)
    for omics_table in (ccle_mut, ccle_cnv, ccle_exp)
)

In [None]:
# sanity check: (rows, columns) of the three CCLE omics matrices
ccle_mut.shape, ccle_cnv.shape, ccle_exp.shape

# Save shared cell lines between all possible data subsets (was not needed and thus not mentioned in our paper)

In [None]:
# needed because cell lines are not necessarily named the same, but careful, this function might not cover all special characters (but you can find out with the previous notebook cell) or might lead to two names becoming the same (see above, for example KM-H2 and KMH-2)
def strip_list_names(a):
    """Return the names in `a` upper-cased, with the characters ``.- ()/:_[]`` removed.

    Distinct inputs can collapse to the same output, e.g. 'KM-H2' and 'KMH-2'
    both become 'KMH2'.
    """
    removed_characters = str.maketrans('', '', '.- ()/:_[]')
    return [name.translate(removed_characters).upper() for name in a]

In [None]:
# normalise the cell line names (row labels) of all four CCLE tables in place
for ccle_table in (ccle_ic, ccle_mut, ccle_cnv, ccle_exp):
    ccle_table.index = strip_list_names(ccle_table.index)

In [None]:
# row labels (cell lines) of the CCLE response matrix, as a set
ccle_ic_cell_lines = set(ccle_ic.index)

In [None]:
data = ['ccle_cnv', 'ccle_exp', 'ccle_mut'] # alphabetical order for config.yaml
data_subsets = powerset_without_empty_set(data) # results in 7 files
# explicit name -> object lookups instead of eval() on strings
feature_tables = {'ccle_cnv': ccle_cnv, 'ccle_exp': ccle_exp, 'ccle_mut': ccle_mut}
response_cell_lines = {'ccle_ic_cell_lines': ccle_ic_cell_lines}
assert set(feature_tables) == set(data)
data_dict = dict()
for cell_lines, measured_cell_lines in response_cell_lines.items():
    # cell lines that have a response AND are present in every feature table of the subset
    for subset in data_subsets:
        data_dict[subset] = set(measured_cell_lines)
        for dataset in subset:
            data_dict[subset] = data_dict[subset].intersection(feature_tables[dataset].index)
    # one JSON file per (feature subset, response matrix); subset members are joined with '-'
    for k, v in data_dict.items():
        filename = '../features/shared_cell_lines/' + '-'.join(k) + '.' + cell_lines + '.json'
        with open(filename, 'w') as f:
            json.dump(sorted(v), f)
        print(filename, len(v))

In [None]:
# rename features to enable backtracking of feature types
# the suffix is only appended when it is missing, so re-running this cell does not produce names like 'TP53.mut.mut'
ccle_mut.columns = [i if i.endswith('.mut') else i + '.mut' for i in ccle_mut.columns]
ccle_cnv.columns = [i if i.endswith('.cnv') else i + '.cnv' for i in ccle_cnv.columns]
ccle_exp.columns = [i if i.endswith('.exp') else i + '.exp' for i in ccle_exp.columns]

ccle_mut.to_csv('../features/cell_features/ccle_mut.csv')
ccle_cnv.to_csv('../features/cell_features/ccle_cnv.csv')
ccle_exp.to_csv('../features/cell_features/ccle_exp.csv')

# Shared cell lines, features, and PubChem drugs of GDSC2 and CCLE

In [None]:
# stripped drug name -> drug name as spelled in GDSC2 (needed later to get back to PubChem IDs)
stripped_gdsc2_drug_names = strip_list_names(pancancer_ic_gdsc2_renamed.columns)
gdsc2_drugnames_dict_before_strip = dict(zip(stripped_gdsc2_drug_names, pancancer_ic_gdsc2_renamed.columns))

# copy of the response matrix with normalised cell line and drug names
pancancer_ic_gdsc2_stripped = pancancer_ic_gdsc2_renamed.copy()
pancancer_ic_gdsc2_stripped.index = strip_list_names(pancancer_ic_gdsc2_renamed.index)
pancancer_ic_gdsc2_stripped.columns = stripped_gdsc2_drug_names

In [None]:
# copies of the GDSC omics tables with normalised cell line names (feature names stay untouched)
mut_all_stripped, cnv_all_stripped, exp_tpm_stripped = mut_all_renamed.copy(), cnv_all_renamed.copy(), exp_tpm_renamed.copy()
for gdsc_stripped_table in (mut_all_stripped, cnv_all_stripped, exp_tpm_stripped):
    gdsc_stripped_table.index = strip_list_names(gdsc_stripped_table.index)

In [None]:
# copy of the CCLE response matrix with normalised cell line and compound names
ccle_ic_stripped = ccle_ic.copy()
stripped_ccle_cell_lines = strip_list_names(ccle_ic.index)
stripped_ccle_compounds = strip_list_names(ccle_ic.columns)
ccle_ic_stripped.index = stripped_ccle_cell_lines
ccle_ic_stripped.columns = stripped_ccle_compounds

In [None]:
# copies of the CCLE omics tables with normalised cell line names (feature names stay untouched)
ccle_mut_stripped, ccle_cnv_stripped, ccle_exp_stripped = ccle_mut.copy(), ccle_cnv.copy(), ccle_exp.copy()
for ccle_stripped_table in (ccle_mut_stripped, ccle_cnv_stripped, ccle_exp_stripped):
    ccle_stripped_table.index = strip_list_names(ccle_stripped_table.index)

In [None]:
# only for examine_data.ipynb: gdsc2 dataset with IC50 capped at maximum tested dose
# almost no difference to uncapped dataset, thus we did not mention it in our paper
# NOTE(review): the cap compares 'IC50' with 'Max Conc' directly, which assumes both are on the same scale;
# if the export stores IC50 log-transformed, the cap would rarely trigger - confirm against the GDSC file description
pancancer_gdsc2_raw_capped = pancancer_gdsc2_raw.copy()
exceeds_max_conc = pancancer_gdsc2_raw_capped['IC50'] > pancancer_gdsc2_raw_capped['Max Conc']
pancancer_gdsc2_raw_capped.loc[exceeds_max_conc, 'IC50'] = pancancer_gdsc2_raw_capped.loc[exceeds_max_conc, 'Max Conc']

# same pipeline as for the uncapped IC50 matrix: pivot, rename ambiguous cell lines, normalise names
pancancer_ic_gdsc2_capped = preprocess_drug_response(pancancer_gdsc2_raw_capped, auc_values=None)
capped_cell_line_renames = {'KM-H2': 'KMH2', 'KMH-2': 'KMHDASH2', 'MS-1': 'MSDASH1'}
pancancer_ic_gdsc2_capped_renamed = (
    pancancer_ic_gdsc2_capped
    .rename(index=capped_cell_line_renames)
    .sort_index()
    .sort_index(axis=1)
)
pancancer_ic_gdsc2_capped_stripped = pancancer_ic_gdsc2_capped_renamed.copy()
pancancer_ic_gdsc2_capped_stripped.index = strip_list_names(pancancer_ic_gdsc2_capped_renamed.index)
pancancer_ic_gdsc2_capped_stripped.columns = strip_list_names(pancancer_ic_gdsc2_capped_renamed.columns)
In [None]:
shared_rows_ic = sorted(set(ccle_ic_stripped.index).intersection(set(pancancer_ic_gdsc2_stripped.index)))
shared_cols_ic = set(ccle_ic_stripped.columns).intersection(set(pancancer_ic_gdsc2_stripped.columns))

# stripped drug name -> drug ID, built ONCE (previously rebuilt for every shared drug)
# .copy() gives an independent frame, so the column assignment does not write to a slice of the raw table
drug_names_drug_ID = pancancer_gdsc2_raw[['Drug Name', 'Drug ID']].copy()
drug_names_drug_ID['Drug Name'] = strip_list_names(drug_names_drug_ID['Drug Name'])
drug_names_drug_ID = drug_names_drug_ID.sort_values(by=['Drug Name', 'Drug ID']).drop_duplicates(subset=['Drug Name'], keep='last') # arbitrary decision
stripped_name_to_drug_ID = dict(zip(drug_names_drug_ID['Drug Name'], drug_names_drug_ID['Drug ID']))

# only keep pubchem drugs because we want to compare with TGSA (which requires pubchem drugs)
no_pubchemID = set()
drugname_to_pubchemID = {}
for c in shared_cols_ic:
    if c not in stripped_name_to_drug_ID:
        continue
    # a drug ID that is missing from the lookup is treated like a drug without PubChem ID instead of raising KeyError
    pubchem_id = drugID_to_pubchemID_dict_GDSC2.get(stripped_name_to_drug_ID[c], np.nan)
    if pubchem_id != pubchem_id: # NaN is the only value that is not equal to itself
        no_pubchemID.add(c)
    else:
        drugname_to_pubchemID[c] = pubchem_id
shared_cols_ic = sorted(shared_cols_ic - no_pubchemID)

In [None]:
def _sorted_overlap(ccle_labels, gdsc_labels):
    """Return the labels present in both collections as a sorted list."""
    return sorted(set(ccle_labels) & set(gdsc_labels))


# cell lines (rows) and features (columns) available in both CCLE and GDSC, per omics type
shared_rows_mut = _sorted_overlap(ccle_mut_stripped.index, mut_all_stripped.index)
shared_cols_mut = _sorted_overlap(ccle_mut_stripped.columns, mut_all_stripped.columns)
shared_rows_cnv = _sorted_overlap(ccle_cnv_stripped.index, cnv_all_stripped.index)
shared_cols_cnv = _sorted_overlap(ccle_cnv_stripped.columns, cnv_all_stripped.columns)
shared_rows_exp = _sorted_overlap(ccle_exp_stripped.index, exp_tpm_stripped.index)
shared_cols_exp = _sorted_overlap(ccle_exp_stripped.columns, exp_tpm_stripped.columns)

In [None]:
# sanity check: number of shared rows and columns for IC50, MUT, CNV and EXP (in that order)
len(shared_rows_ic), len(shared_cols_ic), len(shared_rows_mut), len(shared_cols_mut), len(shared_rows_cnv), len(shared_cols_cnv), len(shared_rows_exp), len(shared_cols_exp)

In [None]:
# restrict both response matrices to the shared cell lines and shared drugs
gdsc2_ic_shared_with_ccle = pancancer_ic_gdsc2_stripped.loc[shared_rows_ic, shared_cols_ic].sort_index().sort_index(axis=1)
ccle_ic_shared_with_gdsc2 = ccle_ic_stripped.loc[shared_rows_ic, shared_cols_ic].sort_index().sort_index(axis=1)
# relabel the drug columns in two steps: stripped name -> GDSC2 drug name -> PubChem ID (stored as a str Series)
# both matrices use the GDSC2 lookups, so their column labels end up identical
gdsc2_ic_shared_with_ccle.columns = pd.Series(name_to_pubchemID_dict_GDSC2, dtype=str)[pd.Series(gdsc2_drugnames_dict_before_strip)[gdsc2_ic_shared_with_ccle.columns]]
ccle_ic_shared_with_gdsc2.columns = pd.Series(name_to_pubchemID_dict_GDSC2, dtype=str)[pd.Series(gdsc2_drugnames_dict_before_strip)[ccle_ic_shared_with_gdsc2.columns]]

# only for examine_data.ipynb
# NOTE: unlike the two matrices above, the capped matrix keeps the stripped drug names as column labels
gdsc2_ic_capped_shared_with_ccle = pancancer_ic_gdsc2_capped_stripped.loc[shared_rows_ic, shared_cols_ic].sort_index().sort_index(axis=1)

In [None]:
def _restrict_to_shared(table, shared_rows, shared_cols):
    """Select the shared rows/columns of `table` and sort both axes."""
    return table.loc[shared_rows, shared_cols].sort_index().sort_index(axis=1)


# per omics type: the GDSC table and the CCLE table cut down to the same cell lines and features
gdsc2_mut_shared_with_ccle = _restrict_to_shared(mut_all_stripped, shared_rows_mut, shared_cols_mut)
ccle_mut_shared_with_gdsc2 = _restrict_to_shared(ccle_mut_stripped, shared_rows_mut, shared_cols_mut)
gdsc2_cnv_shared_with_ccle = _restrict_to_shared(cnv_all_stripped, shared_rows_cnv, shared_cols_cnv)
ccle_cnv_shared_with_gdsc2 = _restrict_to_shared(ccle_cnv_stripped, shared_rows_cnv, shared_cols_cnv)
gdsc2_exp_shared_with_ccle = _restrict_to_shared(exp_tpm_stripped, shared_rows_exp, shared_cols_exp)
ccle_exp_shared_with_gdsc2 = _restrict_to_shared(ccle_exp_stripped, shared_rows_exp, shared_cols_exp)

In [None]:
# one line per data type: shape of the GDSC2 table followed by the shape of its CCLE counterpart
shared_table_pairs = [
    (gdsc2_ic_shared_with_ccle, ccle_ic_shared_with_gdsc2),
    (gdsc2_mut_shared_with_ccle, ccle_mut_shared_with_gdsc2),
    (gdsc2_cnv_shared_with_ccle, ccle_cnv_shared_with_gdsc2),
    (gdsc2_exp_shared_with_ccle, ccle_exp_shared_with_gdsc2),
]
for gdsc2_table, ccle_table in shared_table_pairs:
    print(gdsc2_table.shape, ccle_table.shape)

In [None]:
# row labels (cell lines) of the two shared response matrices, as sets
gdsc2_ic_shared_with_ccle_cell_lines = set(gdsc2_ic_shared_with_ccle.index)
ccle_ic_shared_with_gdsc2_cell_lines = set(ccle_ic_shared_with_gdsc2.index)

In [None]:
data = ['gdsc2_cnv_shared_with_ccle', 'gdsc2_exp_shared_with_ccle', 'gdsc2_mut_shared_with_ccle'] # alphabetical order for config.yaml
data_subsets = powerset_without_empty_set(data) # results in 7 files
# explicit name -> object lookups instead of eval() on strings
feature_tables = {
    'gdsc2_cnv_shared_with_ccle': gdsc2_cnv_shared_with_ccle,
    'gdsc2_exp_shared_with_ccle': gdsc2_exp_shared_with_ccle,
    'gdsc2_mut_shared_with_ccle': gdsc2_mut_shared_with_ccle,
}
response_cell_lines = {'gdsc2_ic_shared_with_ccle_cell_lines': gdsc2_ic_shared_with_ccle_cell_lines}
assert set(feature_tables) == set(data)
data_dict = dict()
for cell_lines, measured_cell_lines in response_cell_lines.items():
    # cell lines that have a response AND are present in every feature table of the subset
    for subset in data_subsets:
        data_dict[subset] = set(measured_cell_lines)
        for dataset in subset:
            data_dict[subset] = data_dict[subset].intersection(feature_tables[dataset].index)
    # one JSON file per (feature subset, response matrix); subset members are joined with '-'
    for k, v in data_dict.items():
        filename = '../features/shared_cell_lines/' + '-'.join(k) + '.' + cell_lines + '.json'
        with open(filename, 'w') as f:
            json.dump(sorted(v), f)
        print(filename, len(v))

In [None]:
data = ['ccle_cnv_shared_with_gdsc2', 'ccle_exp_shared_with_gdsc2', 'ccle_mut_shared_with_gdsc2'] # alphabetical order for config.yaml
data_subsets = powerset_without_empty_set(data) # results in 7 files
# explicit name -> object lookups instead of eval() on strings
feature_tables = {
    'ccle_cnv_shared_with_gdsc2': ccle_cnv_shared_with_gdsc2,
    'ccle_exp_shared_with_gdsc2': ccle_exp_shared_with_gdsc2,
    'ccle_mut_shared_with_gdsc2': ccle_mut_shared_with_gdsc2,
}
response_cell_lines = {'ccle_ic_shared_with_gdsc2_cell_lines': ccle_ic_shared_with_gdsc2_cell_lines}
assert set(feature_tables) == set(data)
data_dict = dict()
for cell_lines, measured_cell_lines in response_cell_lines.items():
    # cell lines that have a response AND are present in every feature table of the subset
    for subset in data_subsets:
        data_dict[subset] = set(measured_cell_lines)
        for dataset in subset:
            data_dict[subset] = data_dict[subset].intersection(feature_tables[dataset].index)
    # one JSON file per (feature subset, response matrix); subset members are joined with '-'
    for k, v in data_dict.items():
        filename = '../features/shared_cell_lines/' + '-'.join(k) + '.' + cell_lines + '.json'
        with open(filename, 'w') as f:
            json.dump(sorted(v), f)
        print(filename, len(v))

In [None]:
# (table, destination) pairs; files are written in this order
shared_outputs = [
    (gdsc2_ic_shared_with_ccle, '../targets/gdsc2_ic_shared_with_ccle.csv'),
    (gdsc2_mut_shared_with_ccle, '../features/cell_features/gdsc2_mut_shared_with_ccle.csv'),
    (gdsc2_cnv_shared_with_ccle, '../features/cell_features/gdsc2_cnv_shared_with_ccle.csv'),
    (gdsc2_exp_shared_with_ccle, '../features/cell_features/gdsc2_exp_shared_with_ccle.csv'),
    (ccle_ic_shared_with_gdsc2, '../targets/ccle_ic_shared_with_gdsc2.csv'),
    (ccle_mut_shared_with_gdsc2, '../features/cell_features/ccle_mut_shared_with_gdsc2.csv'),
    (ccle_cnv_shared_with_gdsc2, '../features/cell_features/ccle_cnv_shared_with_gdsc2.csv'),
    (ccle_exp_shared_with_gdsc2, '../features/cell_features/ccle_exp_shared_with_gdsc2.csv'),
    # only for examine_data.ipynb
    (gdsc2_ic_capped_shared_with_ccle, '../targets/gdsc2_ic_capped_shared_with_ccle.csv'),
]
for shared_table, destination in shared_outputs:
    shared_table.to_csv(destination)

# Prepare TGDRP/TGSA data

In [None]:
# does not matter which file we use because mu.csv has the same columns as cn.csv and exp.csv
cosmic_706_cancer_related_geneIDs = pd.read_csv('TGSA_Data/mu.csv', index_col=0)
enterez_NCBI_to_hugo_gene_symbol = pd.read_csv('TGSA_Data/enterez_NCBI_to_hugo_gene_symbol_march_2019.txt', sep='\t').dropna()

# lookup from '(<NCBI gene ID>)' to the approved symbol (only the part before the first '~' is kept)
ncbi_gene_ids = enterez_NCBI_to_hugo_gene_symbol['NCBI Gene ID(supplied by NCBI)'].astype(int)
approved_symbols = enterez_NCBI_to_hugo_gene_symbol['Approved symbol']
ID_to_name_dict = {
    '(' + str(gene_id) + ')': symbol.split('~')[0]
    for gene_id, symbol in zip(ncbi_gene_ids, approved_symbols)
}
# translate the column labels of mu.csv; labels without an entry in the lookup are kept unchanged
cosmic_706_cancer_related_genes = sorted(
    ID_to_name_dict.get(column_label, column_label)
    for column_label in cosmic_706_cancer_related_geneIDs.columns
)

In [None]:
# remove the feature-type suffixes again to get back to plain gene names
# the suffix is only cut off when it is present, so re-running this cell does not truncate the gene names themselves
mut_all.columns = [i[:-4] if i.endswith('.mut') else i for i in mut_all.columns]
cnv_all.columns = [i[:-4] if i.endswith('.cnv') else i for i in cnv_all.columns]
exp_tpm.columns = [i[:-4] if i.endswith('.exp') else i for i in exp_tpm.columns]
# cancer-related genes that are available in all three GDSC omics tables
cosmic_cancer_related_genes_gdsc = sorted(set(cosmic_706_cancer_related_genes).intersection(set(mut_all.columns)).intersection(set(cnv_all.columns)).intersection(set(exp_tpm.columns)))

In [None]:
# store the sorted gene list as a JSON array
with open('../features/cosmic_cancer_related_genes_gdsc.json', 'w') as f:
    json.dump(cosmic_cancer_related_genes_gdsc, f)

In [None]:
# remove the feature-type suffixes again to get back to plain gene names
# the suffix is only cut off when it is present, so re-running this cell does not truncate the gene names themselves
ccle_mut.columns = [i[:-4] if i.endswith('.mut') else i for i in ccle_mut.columns]
ccle_cnv.columns = [i[:-4] if i.endswith('.cnv') else i for i in ccle_cnv.columns]
ccle_exp.columns = [i[:-4] if i.endswith('.exp') else i for i in ccle_exp.columns]
# cancer-related genes that are available in all three CCLE omics tables
cosmic_cancer_related_genes_ccle = sorted(set(cosmic_706_cancer_related_genes).intersection(set(ccle_mut.columns)).intersection(set(ccle_cnv.columns)).intersection(set(ccle_exp.columns)))

In [None]:
# store the sorted gene list as a JSON array
with open('../features/cosmic_cancer_related_genes_ccle.json', 'w') as f:
    json.dump(cosmic_cancer_related_genes_ccle, f)

In [None]:
# replace the compound names in the column labels by the PubChem CIDs collected during the SMILES lookup
# NOTE: the row labels were already normalised by strip_list_names in an earlier cell
ccle_ic = ccle_ic.rename(columns=ccle_names_to_cid_dict)
ccle_ic.to_csv('../targets/ccle_ic.csv')