In [1]:
import numpy as np
import pandas as pd

# import data for breat cancer datasets
cna_data_frame = pd.read_csv("./dream_proteogenomics_challenge_dataset/sub_challenge_2_3/breast_cancer_dataset/retrospective_breast_CNA_median_sort_common_gene_16884.txt", 
                header=0, index_col=0, sep="\t")
rna_data_frame = pd.read_csv("./dream_proteogenomics_challenge_dataset/sub_challenge_2_3/breast_cancer_dataset/retrospective_breast_rna_seq_sort_common_gene_15115.txt", 
                header=0, index_col=0, sep="\t")
proteome_data_frame = pd.read_csv("./dream_proteogenomics_challenge_dataset/sub_challenge_2_3/breast_cancer_dataset/retrospective_breast_proteome_filtered.txt", 
                header=0, index_col=0, sep="\t")

In [3]:
# convert rna data to numpy array to impute
rna_data = rna_data_frame.values

In [4]:
# use knn to fill in missing values in rna data
from fancyimpute import KNN
knn = KNN(k=5, verbose=0)

rna_data = knn.complete(rna_data)

# proteome data also has missing values, but I am ignoring rows with missing values for now
# proteome_data = knn.complete(proteome_data)

Using cuDNN version 6021 on context None
Mapped name None to device cuda: GeForce GTX 980M (0000:01:00.0)


In [5]:
# update rna data frame with imputed data
rna_data_frame = pd.DataFrame(rna_data, index = rna_data_frame.index, columns = rna_data_frame.columns)

In [6]:
# find common proteins and patients across all datasets
proteins = proteome_data_frame.index.intersection(cna_data_frame.index).intersection(rna_data_frame.index)
patients = proteome_data_frame.columns.intersection(cna_data_frame.columns).intersection(rna_data_frame.columns)

In [8]:
# locate common data and return as numpy array
cna_data = cna_data_frame.loc[proteins, patients].values
rna_data = rna_data_frame.loc[proteins, patients].values
proteome_data = proteome_data_frame.loc[proteins, patients].values

In [9]:
# masks for proteins with complete data and with missing value
complete_proteins = proteins[~(np.isnan(proteome_data).any(axis=1))]
missing_value_proteins = proteins[np.isnan(proteome_data).any(axis=1)]

In [10]:
# use complete data as training data
cna_training_data = cna_data_frame.loc[complete_proteins, patients].values
rna_training_data = rna_data_frame.loc[complete_proteins, patients].values
proteome_training_data = proteome_data_frame.loc[complete_proteins, patients].values

In [11]:
# standard scaling
from sklearn.preprocessing import StandardScaler

cna_scaler = StandardScaler()
rna_scaler = StandardScaler()
proteome_scaler = StandardScaler()

cna_training_data = cna_scaler.fit_transform(cna_training_data)
rna_training_data = rna_scaler.fit_transform(rna_training_data)
proteome_training_data = proteome_scaler.fit_transform(proteome_training_data)

In [16]:
# dimensionality of data
num_genes, num_patients = cna_training_data.shape

In [17]:
# regression model
from keras.layers import Input, Dense, Concatenate, Dropout, BatchNormalization, Activation
from keras.models import Model
from keras.regularizers import l1, l2

Using TensorFlow backend.


In [18]:
# construct model
dropout = 0.2
num_hidden = [64]
reg = l2(1e-3)

cna = Input(shape=(num_patients,))

rna = Input(shape=(num_patients,))

y = Concatenate()([cna, rna])

for h in num_hidden:
    y = Dense(h, activation="relu", kernel_regularizer=reg)(y)
    y = BatchNormalization()(y)
    y = Dropout(dropout)(y)
    
y = Dense(num_patients, kernel_regularizer=reg)(y)

regression_model = Model([cna, rna], y)
regression_model.compile(optimizer="adam", loss="mse")

In [21]:
regression_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 77)            0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 77)            0                                            
____________________________________________________________________________________________________
concatenate_1 (Concatenate)      (None, 154)           0                                            
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 64)            9920                                         
___________________________________________________________________________________________

In [19]:
from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor="val_loss", patience=1000, min_delta=0, )

from keras_tqdm import TQDMNotebookCallback
tqdm = TQDMNotebookCallback(leave_inner=False, leave_outer=True)

In [20]:
# train model
regression_model.fit([cna_training_data, rna_training_data], proteome_training_data, 
                     verbose=0, epochs=10000, batch_size=128, shuffle=True, validation_split=0.2,
                     callbacks=[early_stopping, tqdm])

KeyboardInterrupt: 

          896/|/[loss: nan]  18%|| 896/4846 [00:18<00:38, 102.27it/s]

In [49]:
# protein abundance predictions
abundance_predictions = regression_model.predict([cna_data, rna_data])

In [50]:
abundance_predictions[0]

array([ -6.82100058e-01,  -2.56733030e-01,  -2.00909913e-01,
         8.23054910e-01,   3.70218933e-01,   1.79737449e-01,
         8.89809579e-02,   6.05575383e-01,   5.25549889e-01,
         1.09271765e+00,   4.89442050e-01,   1.03979611e+00,
         8.34455311e-01,   6.02376461e-01,   1.03729892e+00,
        -3.61817420e-01,  -5.55552602e-01,   9.76115704e-01,
         9.44922805e-01,   1.74032431e-03,  -1.79783076e-01,
         4.33013588e-01,   3.07582676e-01,   4.59638238e-03,
        -1.24766558e-01,   1.02880269e-01,  -5.35111666e-01,
        -2.32684284e-01,   6.50270641e-01,   1.30939305e-01,
        -5.82404211e-02,   1.02170348e+00,   2.21706137e-01,
        -3.97738397e-01,   6.15926832e-02,   7.72632658e-01,
         4.64321554e-01,   1.03759974e-01,   6.93065345e-01,
         3.04191709e-01,  -7.23502576e-01,  -1.26853958e-01,
        -2.92388260e-01,  -2.83210874e-01,  -1.21441036e-01,
         4.34939563e-01,   1.23527575e+00,  -1.75851300e-01,
        -1.55331820e-01,

In [51]:
proteome_data[0]

array([-0.27630672,  0.01275913, -0.05321061,  0.7227355 , -0.06371484,
        0.36992851,  0.24325388,  0.65878829,  0.01259709,  0.66040108,
        0.04895953,  0.99471189,  0.39949655,  0.7906901 ,  1.46088147,
        0.3146815 , -0.40517284,  0.04653142,  0.51521998, -0.31876222,
       -0.73156853, -0.12497422,  0.14667959,  0.47113405, -0.55770873,
        0.06877902,  0.047882  , -0.38025886,  0.63592276,  0.0221391 ,
       -0.54852515,  1.2062614 ,  0.05383457, -0.41756088,  0.5188831 ,
        0.15998838,  0.27399504, -0.07887607, -0.00342251, -0.40215398,
       -0.40677194,  0.25721829, -0.06691453, -1.26508033, -0.01253816,
       -0.54888199,  1.46309005,  0.45347341, -0.05272791, -0.29023065,
       -0.66880365, -0.64128345,  1.01948703,  0.52762039, -0.4423625 ,
        0.55994559, -0.10680315, -0.17640062,  0.98721957,  1.20094576,
        0.10842626,  0.07710229,  0.19853758, -0.25274997,  0.31796977,
       -0.08976698,  0.51728521,  1.06340826,  0.09968483, -0.09