In [1]:
import flexynesis 
import torch
# Cap the number of CPU threads PyTorch uses at 12.
# NOTE(review): 12 is hard-coded; adjust to the cores available on your machine.
torch.set_num_threads(12)

# Modeling Drug Response Using Publicly Available Pharmacogenomics Datasets

We train models on CCLE (Cancer Cell Line Encyclopedia) data and evaluate their predictions on GDSC (Genomics of Drug Sensitivity in Cancer) data.

# Download Data

In [2]:
# Download the benchmark archive as ccle_gdsc.tgz and, only if the download
# succeeds (&&), unpack it. Extraction creates ./dataset1/ with train/ and test/
# subfolders, each holding clin.csv, cnv.csv and gex.csv.
!wget -O ccle_gdsc.tgz "https://bimsbstatic.mdc-berlin.de/akalin/buyar/flexynesis-benchmark-datasets/dataset1.tgz" && tar -xzvf ccle_gdsc.tgz

--2023-09-18 14:37:18--  https://bimsbstatic.mdc-berlin.de/akalin/buyar/flexynesis-benchmark-datasets/dataset1.tgz
Resolving bimsbstatic.mdc-berlin.de (bimsbstatic.mdc-berlin.de)... 141.80.181.47, 141.80.181.46
Connecting to bimsbstatic.mdc-berlin.de (bimsbstatic.mdc-berlin.de)|141.80.181.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 55132132 (53M) [application/octet-stream]
Saving to: ‘ccle_gdsc.tgz’


2023-09-18 14:37:19 (35.6 MB/s) - ‘ccle_gdsc.tgz’ saved [55132132/55132132]

dataset1/
dataset1/train/
dataset1/train/clin.csv
dataset1/train/cnv.csv
dataset1/train/gex.csv
dataset1/test/
dataset1/test/gex.csv
dataset1/test/cnv.csv
dataset1/test/clin.csv


## Define Data Importer 

In [3]:
# Configure the importer for the extracted dataset folder. The two data types
# match the gex.csv and cnv.csv files under dataset1/train and dataset1/test.
# NOTE(review): min_features / top_percentile presumably control how many
# features per layer survive feature selection - confirm against the
# flexynesis documentation.
data_importer = flexynesis.DataImporter(
    path='./dataset1/',
    data_types=['gex', 'cnv'],
    log_transform=False,
    concatenate=False,
    min_features=1000,
    top_percentile=0.2,
)

In [4]:
# Load the train and test datasets. Per the log printed by this call, it reads
# the clin/cnv/gex CSVs, reports NA counts and removed features/samples, runs
# Laplacian-score feature selection per layer, harmonizes features between
# train and test, and normalizes the data.
train_dataset, test_dataset = data_importer.import_data()

importing  ./dataset1/train/clin.csv
importing  ./dataset1/train/cnv.csv
importing  ./dataset1/train/gex.csv
importing  ./dataset1/test/gex.csv
importing  ./dataset1/test/cnv.csv
importing  ./dataset1/test/clin.csv
Number of NA values:  0
DataFrame gex - Removed 0 features.
Number of NA values:  0
DataFrame cnv - Removed 0 features.
DataFrame gex - Removed 0 samples (0.00%).
DataFrame cnv - Removed 0 samples (0.00%).
Implementing feature selection using laplacian score for layer: gex with  3422 features


Calculating Laplacian scores: 100%|██████████| 3422/3422 [00:00<00:00, 19603.62it/s]
Removing redundant features among top scoring ones: 100%|██████████| 1342/1342 [00:00<00:00, 1901.18it/s]

Implementing feature selection using laplacian score for layer: cnv with  157 features
Returning original matrix, demanded # of features is  larger than existing number of features
Number of NA values:  0
DataFrame gex - Removed 0 features.
Number of NA values:  0
DataFrame cnv - Removed 0 features.
DataFrame gex - Removed 0 samples (0.00%).
DataFrame cnv - Removed 0 samples (0.00%).
Harmonizing features between train and test
normalizing data
normalizing data





In [6]:
# Set up hyperparameter tuning of a supervised VAE on the training data, with
# the "Erlotinib" column as the target variable and no batch variables.
# NOTE(review): n_iter=5 is presumably the number of hyperparameter
# configurations to try - confirm against the flexynesis documentation.
tuner = flexynesis.HyperparameterTuning(
    train_dataset,
    model_class=flexynesis.supervised_vae,
    config_name="SVAE",
    target_variables="Erlotinib",
    batch_variables=None,
    n_iter=5,
)

In [None]:
# Run the tuning configured on `tuner`; the result unpacks into a model and a
# parameter set (presumably the best-scoring configuration - confirm).
model, best_params = tuner.perform_tuning()

In [9]:
# Generate predictions for the held-out test dataset with the tuned model.
# NOTE(review): the result is presumably a dict keyed by target variable - confirm.
y_pred_dict = model.predict(test_dataset)

In [10]:
# Score the predictions against the test dataset. The result is stored in
# metrics_df but not displayed by this cell.
metrics_df = flexynesis.evaluate_wrapper(y_pred_dict, test_dataset)

In [13]:
import pandas as pd

# Compute a feature-importance table for each target variable of the model,
# then stack the per-target tables into one frame with a fresh 0..n-1 index.
# NOTE(review): steps=20 is passed straight through to
# compute_feature_importance; its exact meaning is defined by flexynesis.
df_list = [
    model.compute_feature_importance(target, steps=20)
    for target in model.target_variables
]
df_imp = pd.concat(df_list, ignore_index=True)

In [14]:
# Display the names of the ten features with the highest importance,
# most important first.
df_imp.sort_values(by='importance', ascending=False)['name'].head(10)

166      ANXA3
140     AMOTL2
132     ALS2CL
88       AFTPH
3        AADAT
176     ARFIP2
146    ANGPTL4
251      MFAP3
268     PARP11
67     ADAMTS5
Name: name, dtype: object