In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [2]:
from pathlib import Path
# read deduplicated data
response_path = Path('./data/deduplicated_combined_single_response_rescaled_agg.txt')
cell_cancer_types_map_path = Path('./data/combined_cancer_types')
cell_name_map_path = Path('./data/cell_line_ID_mapping.txt')
drug_descriptors_mordred_path = Path('./data/deduplicated_combined_drug_descriptors_mordred.txt')
cell_rnaseq_path = Path('./data/deduplicated_combined_rnaseq_data_combat.txt')

In [3]:
df_response = pd.read_csv(response_path, sep='\t', engine='c', low_memory=False)
df_response

Unnamed: 0,SOURCE,CELL,DRUG,STUDY,AUC,IC50,EC50,EC50se,R2fit,Einf,HS,AAC1,AUC1,DSS1
0,CCLE,CCL_61,Drug_1,fake_exp,0.7153,5.6600,5.6600,0.6867,0.9533,0.0000,0.6669,0.2240,0.7760,0.1661
1,CCLE,CCL_61,Drug_6,fake_exp,0.5743,7.0040,7.0600,0.0128,0.9989,0.1652,3.1500,0.4675,0.5325,0.4438
2,CCLE,CCL_61,Drug_7,fake_exp,0.9335,,2.9570,0.0000,0.0000,0.8670,0.0000,0.0665,0.9335,0.0000
3,CCLE,CCL_61,Drug_9,fake_exp,0.8823,,5.7930,0.0020,1.0000,0.6063,2.8980,0.0784,0.9216,0.0544
4,CCLE,CCL_61,Drug_11,fake_exp,0.7019,,8.1290,0.1276,0.8550,0.5668,2.1750,0.3725,0.6275,0.3054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523365,GDSC2,CCL_393,Drug_654,19498,0.9548,,7.8900,0.0000,0.0000,0.9096,0.0000,0.0452,0.9548,0.0000
523366,GDSC2,CCL_393,Drug_655,19498,0.8190,3.0070,3.0070,46.7600,0.4604,0.0000,0.1818,0.1943,0.8057,0.1047
523367,GDSC2,CCL_393,Drug_656,19498,0.9105,,3.4040,0.0000,0.0000,0.8209,0.0000,0.0895,0.9105,0.0000
523368,GDSC2,CCL_393,Drug_657,19498,0.9566,0.2428,0.2428,233.0000,0.1946,0.0000,0.2200,0.0438,0.9562,0.0000


# Identify target cancer types

This step is equivalent to below command

``` cut -f 2,3 ./data/combined_single_response_rescaled_agg | sort | uniq > uniq_cl_drugs```

In [4]:
df_uniq_cl_drugs = df_response[['CELL','DRUG']].drop_duplicates().reset_index(drop=True)
df_uniq_cl_drugs

Unnamed: 0,CELL,DRUG
0,CCL_61,Drug_1
1,CCL_61,Drug_6
2,CCL_61,Drug_7
3,CCL_61,Drug_9
4,CCL_61,Drug_11
...,...,...
523365,CCL_393,Drug_654
523366,CCL_393,Drug_655
523367,CCL_393,Drug_656
523368,CCL_393,Drug_657


In [5]:
df_cl_cancer_map = pd.read_csv(cell_cancer_types_map_path, sep='\t', header=None, names=['CELL', 'CANCER_TYPE'])
df_cl_name_map = pd.read_csv(cell_name_map_path, sep='\t', header=0, names=['CELL', 'CELL_ID'])
df_cl_cancer_map = df_cl_cancer_map.merge(df_cl_name_map, on='CELL', how='right')
df_cl_cancer_map

Unnamed: 0,CELL,CANCER_TYPE,CELL_ID
0,CCLE.ALLSIL,Acute_Lymphoblastic_Leukemia,CCL_95
1,CCLE.DND41,Acute_Lymphoblastic_Leukemia,CCL_211
2,CCLE.KE37,Acute_Lymphoblastic_Leukemia,CCL_457
3,CCLE.LOUCY,Acute_Lymphoblastic_Leukemia,CCL_537
4,CCLE.MOLT13,Acute_Lymphoblastic_Leukemia,CCL_599
...,...,...,...
2916,gCSI.HEC-6,Uterine_Corpus_Endometrial_Carcinoma,CCL_316
2917,gCSI.KLE,Uterine_Corpus_Endometrial_Carcinoma,CCL_465
2918,gCSI.RL95-2,Uterine_Corpus_Endometrial_Carcinoma,CCL_829
2919,gCSI.SNG-M,Uterine_Corpus_Endometrial_Carcinoma,CCL_900


In [6]:
df_cl_cancer_map.drop(labels=['CELL'], axis=1, inplace=True)
df_cl_cancer_map.rename(columns={'CELL_ID':'CELL'}, inplace=True)
df_cl_cancer_map

Unnamed: 0,CANCER_TYPE,CELL
0,Acute_Lymphoblastic_Leukemia,CCL_95
1,Acute_Lymphoblastic_Leukemia,CCL_211
2,Acute_Lymphoblastic_Leukemia,CCL_457
3,Acute_Lymphoblastic_Leukemia,CCL_537
4,Acute_Lymphoblastic_Leukemia,CCL_599
...,...,...
2916,Uterine_Corpus_Endometrial_Carcinoma,CCL_316
2917,Uterine_Corpus_Endometrial_Carcinoma,CCL_465
2918,Uterine_Corpus_Endometrial_Carcinoma,CCL_829
2919,Uterine_Corpus_Endometrial_Carcinoma,CCL_900


In [7]:
df_uniq_cl_drugs

Unnamed: 0,CELL,DRUG
0,CCL_61,Drug_1
1,CCL_61,Drug_6
2,CCL_61,Drug_7
3,CCL_61,Drug_9
4,CCL_61,Drug_11
...,...,...
523365,CCL_393,Drug_654
523366,CCL_393,Drug_655
523367,CCL_393,Drug_656
523368,CCL_393,Drug_657


Merge and get top n. This step is equivalent to below command line. 

```Intersection.pl $df_dir/cell_lines/combined_cancer_types 1 uniq_cl_drugs 1 -i | cut -f 2 | sort | uniq -c | sort -nr | perl -pe 's/^ *//' | perl -pe 's/ /\t/' > top_cancer_types
```

In [8]:
df_cl_cancer_drug = df_cl_cancer_map.merge(df_uniq_cl_drugs, on='CELL', how='left', sort='true')
df_cl_cancer_drug

Unnamed: 0,CANCER_TYPE,CELL,DRUG
0,Pancreatic_Adenocarcinoma,CCL_100,Drug_1
1,Pancreatic_Adenocarcinoma,CCL_100,Drug_2
2,Pancreatic_Adenocarcinoma,CCL_100,Drug_3
3,Pancreatic_Adenocarcinoma,CCL_100,Drug_5
4,Pancreatic_Adenocarcinoma,CCL_100,Drug_6
...,...,...,...
1733659,Lung_NOS,CCL_998,Drug_496
1733660,Lung_NOS,CCL_998,Drug_497
1733661,Lung_NOS,CCL_998,Drug_499
1733662,Lung_NOS,CCL_998,Drug_500


In [9]:
df_cl_cancer_drug['CELL_DRUG'] = df_cl_cancer_drug.CELL.astype(str) + '.' + df_cl_cancer_drug.DRUG.astype(str)
df_cl_cancer_drug

Unnamed: 0,CANCER_TYPE,CELL,DRUG,CELL_DRUG
0,Pancreatic_Adenocarcinoma,CCL_100,Drug_1,CCL_100.Drug_1
1,Pancreatic_Adenocarcinoma,CCL_100,Drug_2,CCL_100.Drug_2
2,Pancreatic_Adenocarcinoma,CCL_100,Drug_3,CCL_100.Drug_3
3,Pancreatic_Adenocarcinoma,CCL_100,Drug_5,CCL_100.Drug_5
4,Pancreatic_Adenocarcinoma,CCL_100,Drug_6,CCL_100.Drug_6
...,...,...,...,...
1733659,Lung_NOS,CCL_998,Drug_496,CCL_998.Drug_496
1733660,Lung_NOS,CCL_998,Drug_497,CCL_998.Drug_497
1733661,Lung_NOS,CCL_998,Drug_499,CCL_998.Drug_499
1733662,Lung_NOS,CCL_998,Drug_500,CCL_998.Drug_500


In [10]:
top_n = df_cl_cancer_drug.groupby(['CANCER_TYPE']).count().sort_values('CELL_DRUG', ascending=False).head(21)
top_n

Unnamed: 0_level_0,CELL,DRUG,CELL_DRUG
CANCER_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Lung_Adenocarcinoma,145027,145025,145027
Skin_Cutaneous_Melanoma,120684,120683,120684
Colon_Adenocarcinoma,118856,118852,118856
Breast_Invasive_Carcinoma,112673,112668,112673
Lymphoid_Leukemia,86748,86742,86748
Lung_Small_Cell_Carcinoma,66759,66756,66759
Ovarian_Serous_Cystadenocarcinoma,63839,63839,63839
Sarcoma,63803,63792,63803
Pancreatic_Adenocarcinoma,62027,62027,62027
Lung_Non-Small_Cell_Carcinoma,58341,58341,58341


In [11]:
top_n_cancer_types = top_n.index.to_list()
top_n_cancer_types

['Lung_Adenocarcinoma',
 'Skin_Cutaneous_Melanoma',
 'Colon_Adenocarcinoma',
 'Breast_Invasive_Carcinoma',
 'Lymphoid_Leukemia',
 'Lung_Small_Cell_Carcinoma',
 'Ovarian_Serous_Cystadenocarcinoma',
 'Sarcoma',
 'Pancreatic_Adenocarcinoma',
 'Lung_Non-Small_Cell_Carcinoma',
 'Glioblastoma_Multiforme',
 'Kidney_Renal_Clear_Cell_Carcinoma',
 'Esophageal_Carcinoma',
 'Lymphoid_Neoplasm_Diffuse_Large_B-cell_Lymphoma',
 'Liver_Hepatocellular_Carcinoma',
 'Stomach_Adenocarcinoma',
 'Head_and_Neck_Squamous_Cell_Carcinoma',
 'Lung_Squamous_Cell_Carcinoma',
 'Uterine_Corpus_Endometrial_Carcinoma',
 'Acute_Myeloid_Leukemia',
 'Myeloma']

## Indentify cell lines associated with the target cancer types

In [12]:
df_cl = df_cl_cancer_drug[df_cl_cancer_drug['CANCER_TYPE'].isin(top_n_cancer_types)][['CELL']].drop_duplicates().reset_index(drop=True)
df_cl

Unnamed: 0,CELL
0,CCL_100
1,CCL_1000
2,CCL_1001
3,CCL_1002
4,CCL_1003
...,...
725,CCL_990
726,CCL_992
727,CCL_993
728,CCL_994


## Identify drugs associated with the target cancer type & filtered by drug_list

In [13]:
df_drugs = df_cl_cancer_drug[df_cl_cancer_drug['CANCER_TYPE'].isin(top_n_cancer_types)][['DRUG']].drop_duplicates().reset_index(drop=True)
df_drugs

Unnamed: 0,DRUG
0,Drug_1
1,Drug_2
2,Drug_3
3,Drug_5
4,Drug_6
...,...
1488,Drug_1364
1489,Drug_1557
1490,Drug_1560
1491,Drug_1567


## Filter response by cell lines (730) and drugs (1493)

In [14]:
cl_filter = df_cl.CELL.to_list()
dr_filter = df_drugs.DRUG.to_list()

In [15]:
df_response = df_response[df_response.CELL.isin(cl_filter) & df_response.DRUG.isin(dr_filter)][['CELL','DRUG','AUC']].drop_duplicates().reset_index(drop=True)
df_response 

Unnamed: 0,CELL,DRUG,AUC
0,CCL_65,Drug_1,0.8126
1,CCL_65,Drug_3,0.4959
2,CCL_65,Drug_7,0.8691
3,CCL_65,Drug_8,0.7881
4,CCL_65,Drug_16,0.7088
...,...,...,...
411299,CCL_393,Drug_654,0.9548
411300,CCL_393,Drug_655,0.8190
411301,CCL_393,Drug_656,0.9105
411302,CCL_393,Drug_657,0.9566


## Join response data with Drug descriptor & RNASeq

In [16]:
df_rnaseq = pd.read_csv(cell_rnaseq_path, sep='\t', low_memory=False)
df_rnaseq = df_rnaseq[df_rnaseq['Sample'].isin(cl_filter)].reset_index(drop=True)

In [17]:
df_rnaseq.rename(columns={'Sample':'CELL'}, inplace=True)
df_rnaseq = df_rnaseq.set_index(['CELL'])

In [18]:
# add prefix on the attribute columns
cols = df_rnaseq.columns.to_list()
ge_cols = [f'GE_{col}' for col in cols]
df_rnaseq.columns = ge_cols

In [19]:
df_rnaseq

Unnamed: 0_level_0,GE_A1BG,GE_A1CF,GE_A2M,GE_A2ML1,GE_A3GALT2,GE_A4GALT,GE_A4GNT,GE_AAAS,GE_AACS,GE_AADAC,...,GE_ZWILCH,GE_ZWINT,GE_ZXDA,GE_ZXDB,GE_ZXDC,GE_ZYG11A,GE_ZYG11B,GE_ZYX,GE_ZZEF1,GE_ZZZ3
CELL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CCL_62,-1.421816,2.175727,-0.466091,-0.222140,-0.408307,-0.429530,-0.427922,-0.462058,-0.368245,-0.507094,...,0.949667,0.090989,0.491263,1.117663,0.445099,-0.988673,0.801770,-0.364203,-0.014804,1.247922
CCL_65,1.209724,-0.278733,-0.416899,-0.372810,1.115141,-0.753543,-0.080482,-0.130010,-0.254613,-0.436202,...,0.632992,-0.097421,-0.918279,-1.917117,-1.377897,-1.020918,0.201952,0.551684,-0.176323,1.228964
CCL_70,1.126623,-0.339088,-0.367707,-0.362765,-0.408307,-1.341159,-0.393178,0.356993,-3.297435,-0.535450,...,-0.079526,0.632668,-1.937947,-0.858835,0.396701,-1.020918,-0.114619,-0.822147,0.049804,-0.534133
CCL_71,-1.454133,0.777490,-0.373173,-0.332631,0.244599,0.317347,0.475421,0.179901,0.225168,-0.450381,...,0.854664,0.114540,-0.153528,-0.594264,-0.877783,0.309214,-1.064331,1.405687,-0.353995,0.243146
CCL_72,-1.537235,-0.278733,-0.416899,-0.372810,0.353417,0.136120,-0.184714,-1.081880,0.136787,-0.535450,...,1.931358,0.891732,0.221351,-0.205190,-1.103641,-0.988673,-0.081296,1.418064,-1.387718,0.736055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CCL_42,0.412532,-0.265602,-0.420079,-0.259848,-0.310406,1.707794,-0.158447,-0.180041,0.728996,-0.383944,...,0.790846,-0.293543,1.003597,0.955632,-0.224492,1.240912,0.329235,1.404058,0.378114,1.252223
CCL_46,0.934353,-0.265602,2.265315,0.083384,-0.310406,1.769761,-0.158447,1.154452,1.315011,-0.383944,...,1.098708,-0.104170,1.660447,2.175937,1.473070,-1.008849,0.775270,0.448669,0.139148,0.981329
CCL_50,1.430925,-0.265602,-0.438163,-0.259848,-0.310406,0.053276,-0.158447,-0.808038,-0.593292,-0.383944,...,-0.112214,1.017499,-0.211576,-0.357354,-1.157218,1.111265,-0.490894,0.023324,-1.154084,-1.127772
CCL_56,-1.439092,-0.265602,-0.460767,-0.259848,-0.310406,-0.021084,-0.158447,-0.760938,0.263190,-0.383944,...,1.078184,-0.410080,-0.917690,-0.743526,-1.231836,0.851970,0.113412,-0.127183,-0.943231,0.845883


In [22]:
df_descriptor = pd.read_csv(drug_descriptors_mordred_path, sep='\t', low_memory=False, na_values='na')
df_descriptor = df_descriptor[df_descriptor.ID.isin(dr_filter)].set_index(['ID']).fillna(0)

In [23]:
df_descriptor.drop(labels=['NAME', 'CLEAN_NAME', 'SMILES'], axis=1, inplace=True)
df_descriptor[df_descriptor.columns] = scaler.fit_transform(df_descriptor[df_descriptor.columns]).astype(dtype=np.float32)
df_descriptor

Unnamed: 0_level_0,dd_ABC,dd_ABCGG,dd_nAcid,dd_nBase,dd_SpAbs_A,dd_SpMax_A,dd_SpDiam_A,dd_SpAD_A,dd_SpMAD_A,dd_LogEE_A,...,dd_SRW10,dd_TSRW10,dd_MW,dd_AMW,dd_WPath,dd_WPol,dd_Zagreb1,dd_Zagreb2,dd_mZagreb1,dd_mZagreb2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Drug_1,0.178943,0.026685,-0.296681,0.307032,0.187441,0.380043,0.267720,0.187441,0.301024,0.372629,...,0.573775,0.374900,-0.013642,-0.550884,-0.055783,-0.031402,0.219639,0.261155,-0.394765,0.054656
Drug_2,-0.066723,-0.179484,-0.296681,0.307032,-0.008114,0.079384,0.163981,-0.008114,0.249253,0.174481,...,0.076371,-0.490116,0.128748,0.368511,-0.055783,-0.099157,-0.079380,-0.095725,-0.175268,-0.066202
Drug_3,-0.340374,-0.409640,-0.296681,0.307032,-0.247362,0.048792,-0.128580,-0.247362,0.261627,-0.058516,...,-0.523674,-0.070684,-0.381396,-0.528306,-0.055783,-0.505682,-0.401400,-0.452605,-0.353842,-0.243934
Drug_4,0.024618,-0.056992,-0.296681,-0.398811,0.037584,-0.105526,-0.024433,0.037584,-0.035223,0.239573,...,0.025458,-0.393486,0.087830,0.850482,-0.055783,-0.099157,-0.010375,-0.075898,0.186531,0.026219
Drug_5,0.773429,0.486916,-0.296681,0.307032,0.846020,0.567565,0.547725,0.846020,0.440356,0.746141,...,1.235954,0.913274,0.586806,-0.467614,-0.055783,1.188175,0.932685,1.103790,0.324182,0.612735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Drug_1563,-0.501424,-0.561854,-0.296681,-0.398811,-0.421235,-0.057452,0.024552,-0.421235,0.302990,-0.223737,...,-0.479939,-0.889142,-0.565242,-0.556871,-0.055783,-0.539559,-0.539409,-0.571565,-0.454290,-0.435885
Drug_1564,-0.893817,-0.885040,-0.296681,-0.398811,-0.841376,0.185123,0.016088,-0.841376,0.845137,-0.589147,...,-0.636688,-0.250929,-0.973778,-0.293147,-0.055784,-1.013840,-0.861430,-0.829312,-1.004893,-0.969081
Drug_1565,-0.366195,-0.401324,-0.296681,0.307032,-0.259657,0.103924,0.188986,-0.259657,0.495251,-0.071912,...,0.019703,-0.750884,-0.471206,-0.696284,-0.055783,-0.234665,-0.355397,-0.323732,-0.506374,-0.307917
Drug_1566,-1.266211,-1.101860,-0.296681,-0.398811,-1.284552,-0.134919,-0.054383,-1.284552,-1.151456,-1.403381,...,-1.501341,-1.627462,-1.161396,-1.793639,-0.055784,-1.115471,-1.321460,-1.354720,-0.404996,-1.118376


In [24]:
%%time
df_1 = df_response.merge(df_rnaseq, on='CELL', how='left', sort='true')

CPU times: user 6.94 s, sys: 11.4 s, total: 18.4 s
Wall time: 16.2 s


In [25]:
df_1.set_index(['DRUG'])

Unnamed: 0_level_0,CELL,AUC,GE_A1BG,GE_A1CF,GE_A2M,GE_A2ML1,GE_A3GALT2,GE_A4GALT,GE_A4GNT,GE_AAAS,...,GE_ZWILCH,GE_ZWINT,GE_ZXDA,GE_ZXDB,GE_ZXDC,GE_ZYG11A,GE_ZYG11B,GE_ZYX,GE_ZZEF1,GE_ZZZ3
DRUG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Drug_1,CCL_100,0.7194,-1.440283,3.845564,-0.471557,-0.362765,0.244599,0.767670,-0.427922,0.113491,...,-2.137911,-1.840215,-0.528406,-0.547575,-0.893916,-1.020918,-0.914376,0.075175,-0.418602,-0.609965
Drug_2,CCL_100,0.7905,-1.440283,3.845564,-0.471557,-0.362765,0.244599,0.767670,-0.427922,0.113491,...,-2.137911,-1.840215,-0.528406,-0.547575,-0.893916,-1.020918,-0.914376,0.075175,-0.418602,-0.609965
Drug_3,CCL_100,0.4439,-1.440283,3.845564,-0.471557,-0.362765,0.244599,0.767670,-0.427922,0.113491,...,-2.137911,-1.840215,-0.528406,-0.547575,-0.893916,-1.020918,-0.914376,0.075175,-0.418602,-0.609965
Drug_5,CCL_100,0.6256,-1.440283,3.845564,-0.471557,-0.362765,0.244599,0.767670,-0.427922,0.113491,...,-2.137911,-1.840215,-0.528406,-0.547575,-0.893916,-1.020918,-0.914376,0.075175,-0.418602,-0.609965
Drug_6,CCL_100,0.7049,-1.440283,3.845564,-0.471557,-0.362765,0.244599,0.767670,-0.427922,0.113491,...,-2.137911,-1.840215,-0.528406,-0.547575,-0.893916,-1.020918,-0.914376,0.075175,-0.418602,-0.609965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Drug_654,CCL_994,0.9743,-1.366415,3.312423,-0.444228,-0.372810,-0.408307,-0.819443,-0.427922,-0.240693,...,-0.997882,-0.839286,-0.153528,-0.127375,-0.200210,-1.012857,-0.097957,0.149436,0.195171,-0.761630
Drug_655,CCL_994,0.8121,-1.366415,3.312423,-0.444228,-0.372810,-0.408307,-0.819443,-0.427922,-0.240693,...,-0.997882,-0.839286,-0.153528,-0.127375,-0.200210,-1.012857,-0.097957,0.149436,0.195171,-0.761630
Drug_656,CCL_994,0.9229,-1.366415,3.312423,-0.444228,-0.372810,-0.408307,-0.819443,-0.427922,-0.240693,...,-0.997882,-0.839286,-0.153528,-0.127375,-0.200210,-1.012857,-0.097957,0.149436,0.195171,-0.761630
Drug_657,CCL_994,0.8585,-1.366415,3.312423,-0.444228,-0.372810,-0.408307,-0.819443,-0.427922,-0.240693,...,-0.997882,-0.839286,-0.153528,-0.127375,-0.200210,-1.012857,-0.097957,0.149436,0.195171,-0.761630


In [26]:
%%time
df_2 = df_1.merge(df_descriptor, left_on='DRUG', right_on='ID', how='left', sort='true')

CPU times: user 1min 24s, sys: 35.6 s, total: 2min
Wall time: 1min 20s


In [27]:
df_2

Unnamed: 0,CELL,DRUG,AUC,GE_A1BG,GE_A1CF,GE_A2M,GE_A2ML1,GE_A3GALT2,GE_A4GALT,GE_A4GNT,...,dd_SRW10,dd_TSRW10,dd_MW,dd_AMW,dd_WPath,dd_WPol,dd_Zagreb1,dd_Zagreb2,dd_mZagreb1,dd_mZagreb2
0,CCL_100,Drug_1,0.7194,-1.440283,3.845564,-0.471557,-0.362765,0.244599,0.767670,-0.427922,...,0.573775,0.374900,-0.013642,-0.550884,-0.055783,-0.031402,0.219639,0.261155,-0.394765,0.054656
1,CCL_1000,Drug_1,0.8588,0.831151,-0.339088,-0.395036,-0.181962,-0.408307,0.548000,0.579653,...,0.573775,0.374900,-0.013642,-0.550884,-0.055783,-0.031402,0.219639,0.261155,-0.394765,0.054656
2,CCL_1001,Drug_1,0.8150,-1.449517,2.467445,-0.482489,-0.292453,-0.408307,-0.264778,-0.427922,...,0.573775,0.374900,-0.013642,-0.550884,-0.055783,-0.031402,0.219639,0.261155,-0.394765,0.054656
3,CCL_1002,Drug_1,0.7922,1.158940,-0.339088,-0.099885,-0.372810,0.353417,0.421690,0.475421,...,0.573775,0.374900,-0.013642,-0.550884,-0.055783,-0.031402,0.219639,0.261155,-0.394765,0.054656
4,CCL_1004,Drug_1,0.8194,0.863469,-0.339088,-0.449694,-0.302497,0.244599,0.569967,-0.288946,...,0.573775,0.374900,-0.013642,-0.550884,-0.055783,-0.031402,0.219639,0.261155,-0.394765,0.054656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411299,CCL_88,Drug_999,0.9109,-0.216848,-0.087607,-0.526215,-0.332631,-0.408307,0.020793,0.822861,...,0.655956,0.901895,0.634924,0.838391,-0.055783,0.409001,0.679669,0.746909,-0.115743,0.431448
411300,CCL_889,Drug_999,0.9730,0.992737,-0.308911,-0.373173,-0.252274,-0.408307,1.124633,-0.184714,...,0.655956,0.901895,0.634924,0.838391,-0.055783,0.409001,0.679669,0.746909,-0.115743,0.431448
411301,CCL_93,Drug_999,0.9461,-1.149429,-0.278733,-0.449694,-0.312542,-0.408307,0.729228,4.783671,...,0.655956,0.901895,0.634924,0.838391,-0.055783,0.409001,0.679669,0.746909,-0.115743,0.431448
411302,CCL_961,Drug_999,0.8778,0.969654,-0.329029,-0.438762,-0.372810,-0.408307,-1.330176,-0.010994,...,0.655956,0.901895,0.634924,0.838391,-0.055783,0.409001,0.679669,0.746909,-0.115743,0.431448


In [28]:
%%time
df_2.to_parquet('top21.parquet', index=False)

CPU times: user 2min 10s, sys: 3.55 s, total: 2min 14s
Wall time: 2min 14s


In [29]:
%%time
df_2.to_hdf('top21.h5', key='df', mode='w')

CPU times: user 3min 1s, sys: 1min 51s, total: 4min 53s
Wall time: 3min 45s


In [None]:
# post analysis - count uniq cell/drug label/profiles
cols = df_2.columns.to_list()
cl_columns = list(filter(lambda x: x.startswith('GE_'), cols))
dd_columns = list(filter(lambda x: x.startswith('dd_'), cols))

df_cl_label = df_2['CELL'].drop_duplicates()
df_cl_prof = df_2[cl_columns].drop_duplicates()
print(f'uniq cell labels: {len(df_cl_label)}, profiles: {len(df_cl_prof)}')

df_dd_label = df_2['DRUG'].drop_duplicates()
df_dd_prof = df_2[dd_columns].drop_duplicates()
print(f'uniq drug labels: {len(df_dd_label)}, profiles: {len(df_dd_prof)}')

In [30]:
# save cell line & drug labels
df_cl_label.to_csv('top21_cell.txt', header=False, index=False)
df_dd_label.to_csv('top21_drug.txt', header=False, index=False)