In [1]:
import pandas as pd

In [2]:
from pathlib import Path
# read deduplicated data
response_path = Path('./data/deduplicated_combined_single_response_rescaled_agg.txt')
cell_cancer_types_map_path = Path('./data/combined_cancer_types')
cell_name_map_path = Path('./data/cell_line_ID_mapping.txt')
drug_descriptors_mordred_path = Path('./data/deduplicated_combined_drug_descriptors_mordred.txt')
cell_rnaseq_path = Path('./data/deduplicated_combined_rnaseq_data_combat.txt')

In [3]:
df_response = pd.read_csv(response_path, sep='\t', engine='c', low_memory=False)
df_response

Unnamed: 0,SOURCE,CELL,DRUG,STUDY,AUC,IC50,EC50,EC50se,R2fit,Einf,HS,AAC1,AUC1,DSS1
0,CCLE,CCL_61,Drug_1,fake_exp,0.7153,5.6600,5.6600,0.6867,0.9533,0.0000,0.6669,0.2240,0.7760,0.1661
1,CCLE,CCL_61,Drug_6,fake_exp,0.5743,7.0040,7.0600,0.0128,0.9989,0.1652,3.1500,0.4675,0.5325,0.4438
2,CCLE,CCL_61,Drug_7,fake_exp,0.9335,,2.9570,0.0000,0.0000,0.8670,0.0000,0.0665,0.9335,0.0000
3,CCLE,CCL_61,Drug_9,fake_exp,0.8823,,5.7930,0.0020,1.0000,0.6063,2.8980,0.0784,0.9216,0.0544
4,CCLE,CCL_61,Drug_11,fake_exp,0.7019,,8.1290,0.1276,0.8550,0.5668,2.1750,0.3725,0.6275,0.3054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523365,GDSC2,CCL_393,Drug_654,19498,0.9548,,7.8900,0.0000,0.0000,0.9096,0.0000,0.0452,0.9548,0.0000
523366,GDSC2,CCL_393,Drug_655,19498,0.8190,3.0070,3.0070,46.7600,0.4604,0.0000,0.1818,0.1943,0.8057,0.1047
523367,GDSC2,CCL_393,Drug_656,19498,0.9105,,3.4040,0.0000,0.0000,0.8209,0.0000,0.0895,0.9105,0.0000
523368,GDSC2,CCL_393,Drug_657,19498,0.9566,0.2428,0.2428,233.0000,0.1946,0.0000,0.2200,0.0438,0.9562,0.0000


# Identify target cancer types

This step is equivalent to below command

``` cut -f 2,3 ./data/combined_single_response_rescaled_agg | sort | uniq > uniq_cl_drugs```

In [4]:
df_uniq_cl_drugs = df_response[['CELL','DRUG']].drop_duplicates().reset_index(drop=True)
df_uniq_cl_drugs

Unnamed: 0,CELL,DRUG
0,CCL_61,Drug_1
1,CCL_61,Drug_6
2,CCL_61,Drug_7
3,CCL_61,Drug_9
4,CCL_61,Drug_11
...,...,...
523365,CCL_393,Drug_654
523366,CCL_393,Drug_655
523367,CCL_393,Drug_656
523368,CCL_393,Drug_657


In [5]:
df_cl_cancer_map = pd.read_csv(cell_cancer_types_map_path, sep='\t', header=None, names=['CELL', 'CANCER_TYPE'])
df_cl_name_map = pd.read_csv(cell_name_map_path, sep='\t', header=0, names=['CELL', 'CELL_ID'])
df_cl_cancer_map = df_cl_cancer_map.merge(df_cl_name_map, on='CELL', how='right')
df_cl_cancer_map

Unnamed: 0,CELL,CANCER_TYPE,CELL_ID
0,CCLE.ALLSIL,Acute_Lymphoblastic_Leukemia,CCL_95
1,CCLE.DND41,Acute_Lymphoblastic_Leukemia,CCL_211
2,CCLE.KE37,Acute_Lymphoblastic_Leukemia,CCL_457
3,CCLE.LOUCY,Acute_Lymphoblastic_Leukemia,CCL_537
4,CCLE.MOLT13,Acute_Lymphoblastic_Leukemia,CCL_599
...,...,...,...
2916,gCSI.HEC-6,Uterine_Corpus_Endometrial_Carcinoma,CCL_316
2917,gCSI.KLE,Uterine_Corpus_Endometrial_Carcinoma,CCL_465
2918,gCSI.RL95-2,Uterine_Corpus_Endometrial_Carcinoma,CCL_829
2919,gCSI.SNG-M,Uterine_Corpus_Endometrial_Carcinoma,CCL_900


In [6]:
df_cl_cancer_map.drop(labels=['CELL'], axis=1, inplace=True)
df_cl_cancer_map.rename(columns={'CELL_ID':'CELL'}, inplace=True)
df_cl_cancer_map

Unnamed: 0,CANCER_TYPE,CELL
0,Acute_Lymphoblastic_Leukemia,CCL_95
1,Acute_Lymphoblastic_Leukemia,CCL_211
2,Acute_Lymphoblastic_Leukemia,CCL_457
3,Acute_Lymphoblastic_Leukemia,CCL_537
4,Acute_Lymphoblastic_Leukemia,CCL_599
...,...,...
2916,Uterine_Corpus_Endometrial_Carcinoma,CCL_316
2917,Uterine_Corpus_Endometrial_Carcinoma,CCL_465
2918,Uterine_Corpus_Endometrial_Carcinoma,CCL_829
2919,Uterine_Corpus_Endometrial_Carcinoma,CCL_900


In [7]:
df_uniq_cl_drugs

Unnamed: 0,CELL,DRUG
0,CCL_61,Drug_1
1,CCL_61,Drug_6
2,CCL_61,Drug_7
3,CCL_61,Drug_9
4,CCL_61,Drug_11
...,...,...
523365,CCL_393,Drug_654
523366,CCL_393,Drug_655
523367,CCL_393,Drug_656
523368,CCL_393,Drug_657


Merge and get top n. This step is equivalent to below command line. 

```Intersection.pl $df_dir/cell_lines/combined_cancer_types 1 uniq_cl_drugs 1 -i | cut -f 2 | sort | uniq -c | sort -nr | perl -pe 's/^ *//' | perl -pe 's/ /\t/' > top_cancer_types
```

In [8]:
df_cl_cancer_drug = df_cl_cancer_map.merge(df_uniq_cl_drugs, on='CELL', how='left', sort='true')
df_cl_cancer_drug

Unnamed: 0,CANCER_TYPE,CELL,DRUG
0,Pancreatic_Adenocarcinoma,CCL_100,Drug_1
1,Pancreatic_Adenocarcinoma,CCL_100,Drug_2
2,Pancreatic_Adenocarcinoma,CCL_100,Drug_3
3,Pancreatic_Adenocarcinoma,CCL_100,Drug_5
4,Pancreatic_Adenocarcinoma,CCL_100,Drug_6
...,...,...,...
1733659,Lung_NOS,CCL_998,Drug_496
1733660,Lung_NOS,CCL_998,Drug_497
1733661,Lung_NOS,CCL_998,Drug_499
1733662,Lung_NOS,CCL_998,Drug_500


In [9]:
df_cl_cancer_drug['CELL_DRUG'] = df_cl_cancer_drug.CELL.astype(str) + '.' + df_cl_cancer_drug.DRUG.astype(str)
df_cl_cancer_drug

Unnamed: 0,CANCER_TYPE,CELL,DRUG,CELL_DRUG
0,Pancreatic_Adenocarcinoma,CCL_100,Drug_1,CCL_100.Drug_1
1,Pancreatic_Adenocarcinoma,CCL_100,Drug_2,CCL_100.Drug_2
2,Pancreatic_Adenocarcinoma,CCL_100,Drug_3,CCL_100.Drug_3
3,Pancreatic_Adenocarcinoma,CCL_100,Drug_5,CCL_100.Drug_5
4,Pancreatic_Adenocarcinoma,CCL_100,Drug_6,CCL_100.Drug_6
...,...,...,...,...
1733659,Lung_NOS,CCL_998,Drug_496,CCL_998.Drug_496
1733660,Lung_NOS,CCL_998,Drug_497,CCL_998.Drug_497
1733661,Lung_NOS,CCL_998,Drug_499,CCL_998.Drug_499
1733662,Lung_NOS,CCL_998,Drug_500,CCL_998.Drug_500


In [10]:
top_n = df_cl_cancer_drug.groupby(['CANCER_TYPE']).count().sort_values('CELL_DRUG', ascending=False).head(21)
top_n

Unnamed: 0_level_0,CELL,DRUG,CELL_DRUG
CANCER_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Lung_Adenocarcinoma,145027,145025,145027
Skin_Cutaneous_Melanoma,120684,120683,120684
Colon_Adenocarcinoma,118856,118852,118856
Breast_Invasive_Carcinoma,112673,112668,112673
Lymphoid_Leukemia,86748,86742,86748
Lung_Small_Cell_Carcinoma,66759,66756,66759
Ovarian_Serous_Cystadenocarcinoma,63839,63839,63839
Sarcoma,63803,63792,63803
Pancreatic_Adenocarcinoma,62027,62027,62027
Lung_Non-Small_Cell_Carcinoma,58341,58341,58341


In [11]:
top_n_cancer_types = top_n.index.to_list()
top_n_cancer_types

['Lung_Adenocarcinoma',
 'Skin_Cutaneous_Melanoma',
 'Colon_Adenocarcinoma',
 'Breast_Invasive_Carcinoma',
 'Lymphoid_Leukemia',
 'Lung_Small_Cell_Carcinoma',
 'Ovarian_Serous_Cystadenocarcinoma',
 'Sarcoma',
 'Pancreatic_Adenocarcinoma',
 'Lung_Non-Small_Cell_Carcinoma',
 'Glioblastoma_Multiforme',
 'Kidney_Renal_Clear_Cell_Carcinoma',
 'Esophageal_Carcinoma',
 'Lymphoid_Neoplasm_Diffuse_Large_B-cell_Lymphoma',
 'Liver_Hepatocellular_Carcinoma',
 'Stomach_Adenocarcinoma',
 'Head_and_Neck_Squamous_Cell_Carcinoma',
 'Lung_Squamous_Cell_Carcinoma',
 'Uterine_Corpus_Endometrial_Carcinoma',
 'Acute_Myeloid_Leukemia',
 'Myeloma']

## Indentify cell lines associated with the target cancer types

In [12]:
df_cl = df_cl_cancer_drug[df_cl_cancer_drug['CANCER_TYPE'].isin(top_n_cancer_types)][['CELL']].drop_duplicates().reset_index(drop=True)
df_cl

Unnamed: 0,CELL
0,CCL_100
1,CCL_1000
2,CCL_1001
3,CCL_1002
4,CCL_1003
...,...
725,CCL_990
726,CCL_992
727,CCL_993
728,CCL_994


## Identify drugs associated with the target cancer type & filtered by drug_list

In [13]:
df_drugs = df_cl_cancer_drug[df_cl_cancer_drug['CANCER_TYPE'].isin(top_n_cancer_types)][['DRUG']].drop_duplicates().reset_index(drop=True)
df_drugs

Unnamed: 0,DRUG
0,Drug_1
1,Drug_2
2,Drug_3
3,Drug_5
4,Drug_6
...,...
1488,Drug_1364
1489,Drug_1557
1490,Drug_1560
1491,Drug_1567


## Filter response by cell lines (730) and drugs (1493)

In [14]:
cl_filter = df_cl.CELL.to_list()
dr_filter = df_drugs.DRUG.to_list()

In [15]:
df_response = df_response[df_response.CELL.isin(cl_filter) & df_response.DRUG.isin(dr_filter)][['CELL','DRUG','AUC']].drop_duplicates().reset_index(drop=True)
df_response 

Unnamed: 0,CELL,DRUG,AUC
0,CCL_65,Drug_1,0.8126
1,CCL_65,Drug_3,0.4959
2,CCL_65,Drug_7,0.8691
3,CCL_65,Drug_8,0.7881
4,CCL_65,Drug_16,0.7088
...,...,...,...
411299,CCL_393,Drug_654,0.9548
411300,CCL_393,Drug_655,0.8190
411301,CCL_393,Drug_656,0.9105
411302,CCL_393,Drug_657,0.9566


## Join response data with Drug descriptor & RNASeq

In [16]:
df_rnaseq = pd.read_csv(cell_rnaseq_path, sep='\t', low_memory=False)
df_rnaseq = df_rnaseq[df_rnaseq['Sample'].isin(cl_filter)].reset_index(drop=True)

In [17]:
df_rnaseq.rename(columns={'Sample':'CELL'}, inplace=True)
df_rnaseq = df_rnaseq.set_index(['CELL'])

In [18]:
# add prefix on the attribute columns
cols = df_rnaseq.columns.to_list()
ge_cols = [f'GE_{col}' for col in cols]
df_rnaseq.columns = ge_cols

In [19]:
df_rnaseq

Unnamed: 0_level_0,GE_A1BG,GE_A1CF,GE_A2M,GE_A2ML1,GE_A3GALT2,GE_A4GALT,GE_A4GNT,GE_AAAS,GE_AACS,GE_AADAC,...,GE_ZWILCH,GE_ZWINT,GE_ZXDA,GE_ZXDB,GE_ZXDC,GE_ZYG11A,GE_ZYG11B,GE_ZYX,GE_ZZEF1,GE_ZZZ3
CELL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CCL_62,-1.421816,2.175727,-0.466091,-0.222140,-0.408307,-0.429530,-0.427922,-0.462058,-0.368245,-0.507094,...,0.949667,0.090989,0.491263,1.117663,0.445099,-0.988673,0.801770,-0.364203,-0.014804,1.247922
CCL_65,1.209724,-0.278733,-0.416899,-0.372810,1.115141,-0.753543,-0.080482,-0.130010,-0.254613,-0.436202,...,0.632992,-0.097421,-0.918279,-1.917117,-1.377897,-1.020918,0.201952,0.551684,-0.176323,1.228964
CCL_70,1.126623,-0.339088,-0.367707,-0.362765,-0.408307,-1.341159,-0.393178,0.356993,-3.297435,-0.535450,...,-0.079526,0.632668,-1.937947,-0.858835,0.396701,-1.020918,-0.114619,-0.822147,0.049804,-0.534133
CCL_71,-1.454133,0.777490,-0.373173,-0.332631,0.244599,0.317347,0.475421,0.179901,0.225168,-0.450381,...,0.854664,0.114540,-0.153528,-0.594264,-0.877783,0.309214,-1.064331,1.405687,-0.353995,0.243146
CCL_72,-1.537235,-0.278733,-0.416899,-0.372810,0.353417,0.136120,-0.184714,-1.081880,0.136787,-0.535450,...,1.931358,0.891732,0.221351,-0.205190,-1.103641,-0.988673,-0.081296,1.418064,-1.387718,0.736055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CCL_42,0.412532,-0.265602,-0.420079,-0.259848,-0.310406,1.707794,-0.158447,-0.180041,0.728996,-0.383944,...,0.790846,-0.293543,1.003597,0.955632,-0.224492,1.240912,0.329235,1.404058,0.378114,1.252223
CCL_46,0.934353,-0.265602,2.265315,0.083384,-0.310406,1.769761,-0.158447,1.154452,1.315011,-0.383944,...,1.098708,-0.104170,1.660447,2.175937,1.473070,-1.008849,0.775270,0.448669,0.139148,0.981329
CCL_50,1.430925,-0.265602,-0.438163,-0.259848,-0.310406,0.053276,-0.158447,-0.808038,-0.593292,-0.383944,...,-0.112214,1.017499,-0.211576,-0.357354,-1.157218,1.111265,-0.490894,0.023324,-1.154084,-1.127772
CCL_56,-1.439092,-0.265602,-0.460767,-0.259848,-0.310406,-0.021084,-0.158447,-0.760938,0.263190,-0.383944,...,1.078184,-0.410080,-0.917690,-0.743526,-1.231836,0.851970,0.113412,-0.127183,-0.943231,0.845883


In [20]:
df_descriptor = pd.read_csv(drug_descriptors_mordred_path, sep='\t', low_memory=False, na_values='na')
df_descriptor = df_descriptor[df_descriptor.ID.isin(dr_filter)].set_index(['ID']).fillna(0)

In [21]:
df_descriptor.drop(labels=['NAME', 'CLEAN_NAME', 'SMILES'], axis=1, inplace=True)
df_descriptor

Unnamed: 0_level_0,dd_ABC,dd_ABCGG,dd_nAcid,dd_nBase,dd_SpAbs_A,dd_SpMax_A,dd_SpDiam_A,dd_SpAD_A,dd_SpMAD_A,dd_LogEE_A,...,dd_SRW10,dd_TSRW10,dd_MW,dd_AMW,dd_WPath,dd_WPol,dd_Zagreb1,dd_Zagreb2,dd_mZagreb1,dd_mZagreb2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Drug_1,26.736807,19.061773,0,1,43.343460,2.553010,4.969379,43.343460,1.313438,4.468119,...,10.617099,84.249115,439.23720,7.084471,3463.0,50,184,221,7.472222,7.083334
Drug_2,23.500933,17.128252,0,1,39.126250,2.449710,4.899419,39.126250,1.304208,4.332830,...,10.315497,65.746980,474.10666,8.779753,2677.0,48,158,185,9.111111,6.611111
Drug_3,19.896450,14.969769,0,1,33.966774,2.439199,4.702121,33.966774,1.306415,4.173748,...,9.951658,74.718376,349.17902,7.126102,2128.0,36,130,149,7.777778,5.916666
Drug_4,24.704065,18.277016,0,0,40.111744,2.386178,4.772356,40.111744,1.253492,4.377273,...,10.284626,67.813840,464.08630,9.668465,3619.0,48,164,187,11.812500,6.972222
Drug_5,34.567260,23.377972,0,1,57.545963,2.617439,5.158210,57.545963,1.338278,4.723140,...,11.018613,95.764595,586.27910,7.238014,7049.0,86,246,306,12.840278,9.263889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Drug_1563,17.775130,13.542258,0,0,30.217138,2.402695,4.805391,30.217138,1.313789,4.060940,...,9.978177,57.212070,304.15756,7.073431,1318.0,35,118,137,7.027778,5.166666
Drug_1564,12.606602,10.511310,0,0,21.156641,2.486039,4.799683,21.156641,1.410443,3.811451,...,9.883132,70.863040,204.11235,7.559716,351.0,21,90,111,2.916667,3.083333
Drug_1565,19.556349,15.047755,0,1,33.701626,2.458141,4.916282,33.701626,1.348065,4.164601,...,10.281136,60.169334,327.18558,6.816366,1456.0,44,134,162,6.638889,5.666666
Drug_1566,7.701497,8.477901,0,0,11.599388,2.376079,4.752158,11.599388,1.054490,3.255520,...,9.358847,41.419890,158.16707,4.792942,142.0,18,50,58,7.395834,2.500000


In [22]:
%%time
df_1 = df_response.merge(df_rnaseq, on='CELL', how='left', sort='true')

In [23]:
df_1.set_index(['DRUG'])

Unnamed: 0_level_0,CELL,AUC,GE_A1BG,GE_A1CF,GE_A2M,GE_A2ML1,GE_A3GALT2,GE_A4GALT,GE_A4GNT,GE_AAAS,...,GE_ZWILCH,GE_ZWINT,GE_ZXDA,GE_ZXDB,GE_ZXDC,GE_ZYG11A,GE_ZYG11B,GE_ZYX,GE_ZZEF1,GE_ZZZ3
DRUG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Drug_1,CCL_100,0.7194,-1.440283,3.845564,-0.471557,-0.362765,0.244599,0.767670,-0.427922,0.113491,...,-2.137911,-1.840215,-0.528406,-0.547575,-0.893916,-1.020918,-0.914376,0.075175,-0.418602,-0.609965
Drug_2,CCL_100,0.7905,-1.440283,3.845564,-0.471557,-0.362765,0.244599,0.767670,-0.427922,0.113491,...,-2.137911,-1.840215,-0.528406,-0.547575,-0.893916,-1.020918,-0.914376,0.075175,-0.418602,-0.609965
Drug_3,CCL_100,0.4439,-1.440283,3.845564,-0.471557,-0.362765,0.244599,0.767670,-0.427922,0.113491,...,-2.137911,-1.840215,-0.528406,-0.547575,-0.893916,-1.020918,-0.914376,0.075175,-0.418602,-0.609965
Drug_5,CCL_100,0.6256,-1.440283,3.845564,-0.471557,-0.362765,0.244599,0.767670,-0.427922,0.113491,...,-2.137911,-1.840215,-0.528406,-0.547575,-0.893916,-1.020918,-0.914376,0.075175,-0.418602,-0.609965
Drug_6,CCL_100,0.7049,-1.440283,3.845564,-0.471557,-0.362765,0.244599,0.767670,-0.427922,0.113491,...,-2.137911,-1.840215,-0.528406,-0.547575,-0.893916,-1.020918,-0.914376,0.075175,-0.418602,-0.609965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Drug_654,CCL_994,0.9743,-1.366415,3.312423,-0.444228,-0.372810,-0.408307,-0.819443,-0.427922,-0.240693,...,-0.997882,-0.839286,-0.153528,-0.127375,-0.200210,-1.012857,-0.097957,0.149436,0.195171,-0.761630
Drug_655,CCL_994,0.8121,-1.366415,3.312423,-0.444228,-0.372810,-0.408307,-0.819443,-0.427922,-0.240693,...,-0.997882,-0.839286,-0.153528,-0.127375,-0.200210,-1.012857,-0.097957,0.149436,0.195171,-0.761630
Drug_656,CCL_994,0.9229,-1.366415,3.312423,-0.444228,-0.372810,-0.408307,-0.819443,-0.427922,-0.240693,...,-0.997882,-0.839286,-0.153528,-0.127375,-0.200210,-1.012857,-0.097957,0.149436,0.195171,-0.761630
Drug_657,CCL_994,0.8585,-1.366415,3.312423,-0.444228,-0.372810,-0.408307,-0.819443,-0.427922,-0.240693,...,-0.997882,-0.839286,-0.153528,-0.127375,-0.200210,-1.012857,-0.097957,0.149436,0.195171,-0.761630


In [24]:
%%time
df_2 = df_1.merge(df_descriptor, left_on='DRUG', right_on='ID', how='left', sort='true')

CPU times: user 41 s, sys: 44.1 s, total: 1min 25s
Wall time: 1min 23s


In [25]:
df_2

Unnamed: 0,CELL,DRUG,AUC,GE_A1BG,GE_A1CF,GE_A2M,GE_A2ML1,GE_A3GALT2,GE_A4GALT,GE_A4GNT,...,dd_SRW10,dd_TSRW10,dd_MW,dd_AMW,dd_WPath,dd_WPol,dd_Zagreb1,dd_Zagreb2,dd_mZagreb1,dd_mZagreb2
0,CCL_100,Drug_1,0.7194,-1.440283,3.845564,-0.471557,-0.362765,0.244599,0.767670,-0.427922,...,10.617099,84.249115,439.23720,7.084471,3463.0,50,184,221,7.472222,7.083334
1,CCL_1000,Drug_1,0.8588,0.831151,-0.339088,-0.395036,-0.181962,-0.408307,0.548000,0.579653,...,10.617099,84.249115,439.23720,7.084471,3463.0,50,184,221,7.472222,7.083334
2,CCL_1001,Drug_1,0.8150,-1.449517,2.467445,-0.482489,-0.292453,-0.408307,-0.264778,-0.427922,...,10.617099,84.249115,439.23720,7.084471,3463.0,50,184,221,7.472222,7.083334
3,CCL_1002,Drug_1,0.7922,1.158940,-0.339088,-0.099885,-0.372810,0.353417,0.421690,0.475421,...,10.617099,84.249115,439.23720,7.084471,3463.0,50,184,221,7.472222,7.083334
4,CCL_1004,Drug_1,0.8194,0.863469,-0.339088,-0.449694,-0.302497,0.244599,0.569967,-0.288946,...,10.617099,84.249115,439.23720,7.084471,3463.0,50,184,221,7.472222,7.083334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411299,CCL_88,Drug_999,0.9109,-0.216848,-0.087607,-0.526215,-0.332631,-0.408307,0.020793,0.822861,...,10.666930,95.521210,598.06256,9.646171,6163.0,63,224,270,9.555555,8.555555
411300,CCL_889,Drug_999,0.9730,0.992737,-0.308911,-0.373173,-0.252274,-0.408307,1.124633,-0.184714,...,10.666930,95.521210,598.06256,9.646171,6163.0,63,224,270,9.555555,8.555555
411301,CCL_93,Drug_999,0.9461,-1.149429,-0.278733,-0.449694,-0.312542,-0.408307,0.729228,4.783671,...,10.666930,95.521210,598.06256,9.646171,6163.0,63,224,270,9.555555,8.555555
411302,CCL_961,Drug_999,0.8778,0.969654,-0.329029,-0.438762,-0.372810,-0.408307,-1.330176,-0.010994,...,10.666930,95.521210,598.06256,9.646171,6163.0,63,224,270,9.555555,8.555555


In [26]:
%%time
df_2.to_parquet('top21.parquet', index=False)

CPU times: user 2min 9s, sys: 2.7 s, total: 2min 12s
Wall time: 2min 12s


In [None]:
%%time
df_2.to_hdf('top21.h5', key='df', mode='w')