In [6]:
import numpy as np
import pandas as pd

# Load the three retrospective breast-cancer tables. Every file is tab-separated;
# the first row supplies the column labels and the first column the row index.
cna_data_frame = pd.read_csv(
    "./dream_proteogenomics_challenge_dataset/sub_challenge_2_3/breast_cancer_dataset/retrospective_breast_CNA_median_sort_common_gene_16884.txt",
    sep="\t",
    index_col=0,
    header=0,
)
rna_data_frame = pd.read_csv(
    "./dream_proteogenomics_challenge_dataset/sub_challenge_2_3/breast_cancer_dataset/retrospective_breast_rna_seq_sort_common_gene_15115.txt",
    sep="\t",
    index_col=0,
    header=0,
)
proteome_data_frame = pd.read_csv(
    "./dream_proteogenomics_challenge_dataset/sub_challenge_2_3/breast_cancer_dataset/retrospective_breast_proteome_filtered.txt",
    sep="\t",
    index_col=0,
    header=0,
)

In [7]:
# (rows, columns) of the CNA table
cna_data_frame.shape

(16884, 77)

In [8]:
# (rows, columns) of the RNA table
rna_data_frame.shape

(15115, 77)

In [9]:
# (rows, columns) of the proteome table
proteome_data_frame.shape

(10597, 105)

In [10]:
# convert rna data to a numpy array so it can be imputed
rna_data = rna_data_frame.values

In [11]:
# use knn (k=5) to fill in missing values in rna data
# Rows of rna_data are genes and columns are patients, so the neighbours used
# for imputation are presumably other genes -- TODO confirm against fancyimpute's KNN.
from fancyimpute import KNN
knn = KNN(k=5, verbose=0)

# NOTE(review): newer fancyimpute releases appear to expose this operation as
# `fit_transform` rather than `complete` -- confirm against the installed version.
rna_data = knn.complete(rna_data)

# proteome data also has missing values, but I am ignoring rows with missing values for now
# proteome_data = knn.complete(proteome_data)

In [13]:
# Wrap the imputed matrix back into a DataFrame, reusing the row and column
# labels of the frame it was extracted from.
rna_data_frame = pd.DataFrame(
    data=rna_data,
    index=rna_data_frame.index,
    columns=rna_data_frame.columns,
)

In [14]:
# Row labels (proteins) and column labels (patients) present in all three
# tables, starting from the proteome table's labels.
proteins = (
    proteome_data_frame.index
    .intersection(cna_data_frame.index)
    .intersection(rna_data_frame.index)
)
patients = (
    proteome_data_frame.columns
    .intersection(cna_data_frame.columns)
    .intersection(rna_data_frame.columns)
)

In [15]:
# number of proteins shared by the proteome, CNA and RNA tables
len(proteins)

9012

In [16]:
# number of patients shared by the proteome, CNA and RNA tables
len(patients)

77

In [17]:
# Slice every table down to the shared proteins (rows) and patients (columns)
# and keep the result as a plain numpy array.
cna_data, rna_data, proteome_data = (
    frame.loc[proteins, patients].values
    for frame in (cna_data_frame, rna_data_frame, proteome_data_frame)
)

In [18]:
# Flag every protein (row) with at least one NaN across the patient columns,
# then split the protein labels into fully observed and partially missing sets.
has_missing_value = np.isnan(proteome_data).any(axis=1)
complete_proteins = proteins[~has_missing_value]
missing_value_proteins = proteins[has_missing_value]

In [19]:
# Training set: only the proteins whose proteome row has no missing values,
# restricted to the shared patients, taken from each of the three tables.
cna_training_data, rna_training_data, proteome_training_data = (
    frame.loc[complete_proteins, patients].values
    for frame in (cna_data_frame, rna_data_frame, proteome_data_frame)
)

In [20]:
# standard scaling
from sklearn.preprocessing import StandardScaler

# One independent scaler per data type. StandardScaler standardizes column-wise,
# so each patient column is centred and scaled across the training proteins.
cna_scaler = StandardScaler()
cna_training_data = cna_scaler.fit_transform(cna_training_data)

rna_scaler = StandardScaler()
rna_training_data = rna_scaler.fit_transform(rna_training_data)

proteome_scaler = StandardScaler()
proteome_training_data = proteome_scaler.fit_transform(proteome_training_data)

In [21]:
# dimensionality of data: rows are the complete proteins, columns are the
# shared patients (num_patients sets the width of the model inputs and output)
num_genes, num_patients = cna_training_data.shape

In [22]:
# regression model
from keras.layers import Input, Dense, Concatenate, Dropout, BatchNormalization, Activation
from keras.models import Model
from keras.regularizers import l1, l2

Using TensorFlow backend.


In [23]:
# construct model
# Hyper-parameters: dropout rate, units per hidden layer, and an L2 kernel
# penalty that is reused by every Dense layer below.
dropout = 0.2
num_hidden = [64, 64]
reg = l2(1e-3)

# First input: one vector of length num_patients per sample (fed with the CNA matrix).
cna = Input(shape=(num_patients,))

# Second input: same width (fed with the RNA matrix).
rna = Input(shape=(num_patients,))

# Join both inputs into a single feature vector of length 2 * num_patients.
y = Concatenate()([cna, rna])

# Hidden blocks: Dense (relu) -> BatchNormalization -> Dropout.
for h in num_hidden:
    y = Dense(h, activation="relu", kernel_regularizer=reg)(y)
    y = BatchNormalization()(y)
    y = Dropout(dropout)(y)
    
# Output layer has no activation (linear): one predicted value per patient.
y = Dense(num_patients, kernel_regularizer=reg)(y)

# Two-input, one-output model trained with Adam on mean-squared error.
regression_model = Model([cna, rna], y)
regression_model.compile(optimizer="adam", loss="mse")

In [24]:
# print the layer-by-layer architecture and parameter counts
regression_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 77)            0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 77)            0                                            
____________________________________________________________________________________________________
concatenate_1 (Concatenate)      (None, 154)           0                                            
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 64)            9920                                         
___________________________________________________________________________________________

In [25]:
# Stop training once the validation loss has not improved for 1000 epochs.
from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=1000,
)

# Notebook progress bars; the outer bar is kept, the inner bars are discarded.
from keras_tqdm import TQDMNotebookCallback
tqdm = TQDMNotebookCallback(
    leave_outer=True,
    leave_inner=False,
)

In [26]:
# Fit the regression model: CNA and RNA matrices in, proteome matrix out.
# NOTE(review): per the Keras docs, validation_split holds out the last 20% of
# the rows before shuffling -- confirm this is the intended validation set.
regression_model.fit(
    [cna_training_data, rna_training_data],
    proteome_training_data,
    epochs=10000,
    batch_size=128,
    shuffle=True,
    validation_split=0.2,
    verbose=0,
    callbacks=[early_stopping, tqdm],
)

KeyboardInterrupt: 

In [36]:
# protein abundance predictions
# These are made on the same standardized matrices the model was fitted on, so
# they are in-sample predictions in standardized units.
abundance_predictions = regression_model.predict([cna_training_data, rna_training_data])

In [37]:
# predicted values for the first training protein, one entry per patient
abundance_predictions[0]

array([-0.59273922,  0.33661628, -0.18270642,  0.97955567,  1.08380818,
        0.51743466,  0.20083116,  0.81438828,  1.08634794,  1.4711318 ,
        0.65549827,  1.25322866,  0.63456285,  0.78632605,  1.32807457,
       -0.29054323,  0.01207206,  0.70493257,  0.68769491, -0.11013012,
       -0.19304174,  0.62145776,  0.48704165, -0.11669907, -0.30114099,
        0.15262258, -0.12054441,  0.09665491,  0.7446714 , -0.0650611 ,
       -0.09825604,  1.35582459,  0.2131383 , -0.17842478,  0.49355331,
        0.88744068,  0.68147522, -0.02957805,  0.55244315,  0.23228718,
       -0.75318813, -0.03211762, -0.33791441, -0.43380883, -0.32816824,
        0.2315356 ,  1.21954691,  0.05777193,  0.24214992, -0.38445684,
       -0.3786284 , -0.35150191,  1.33116508,  0.84513283, -0.50623673,
        0.91230255,  0.29141679, -0.29972315,  1.64704514,  1.89052951,
        0.82801116,  0.35355037,  1.00779903, -0.46924111,  0.67512381,
       -0.07365488,  0.37716913,  0.66994482,  0.5225535 , -0.13

In [29]:
# NOTE(review): proteome_data is unscaled and its rows follow `proteins`, whereas
# abundance_predictions is in standardized units with rows following
# `complete_proteins`; row 0 only lines up if proteins[0] has no missing values.
# proteome_training_data[0] looks like the intended reference -- confirm.
proteome_data[0]

array([-0.27630672,  0.01275913, -0.05321061,  0.7227355 , -0.06371484,
        0.36992851,  0.24325388,  0.65878829,  0.01259709,  0.66040108,
        0.04895953,  0.99471189,  0.39949655,  0.7906901 ,  1.46088147,
        0.3146815 , -0.40517284,  0.04653142,  0.51521998, -0.31876222,
       -0.73156853, -0.12497422,  0.14667959,  0.47113405, -0.55770873,
        0.06877902,  0.047882  , -0.38025886,  0.63592276,  0.0221391 ,
       -0.54852515,  1.2062614 ,  0.05383457, -0.41756088,  0.5188831 ,
        0.15998838,  0.27399504, -0.07887607, -0.00342251, -0.40215398,
       -0.40677194,  0.25721829, -0.06691453, -1.26508033, -0.01253816,
       -0.54888199,  1.46309005,  0.45347341, -0.05272791, -0.29023065,
       -0.66880365, -0.64128345,  1.01948703,  0.52762039, -0.4423625 ,
        0.55994559, -0.10680315, -0.17640062,  0.98721957,  1.20094576,
        0.10842626,  0.07710229,  0.19853758, -0.25274997,  0.31796977,
       -0.08976698,  0.51728521,  1.06340826,  0.09968483, -0.09

In [30]:
from sklearn.metrics import mean_squared_error as mse

In [31]:
def rmse(y_pred, y_true):
    """Return the root-mean-squared error between predictions and ground truth.

    Both arguments are forwarded to scikit-learn's ``mean_squared_error``
    (imported in this notebook as ``mse``); the square root of that value is returned.
    """
    mean_squared = mse(y_true, y_pred)
    return np.sqrt(mean_squared)

In [38]:
# sanity check: shape of the predictions, to compare with the target matrix
abundance_predictions.shape

(6058, 77)

In [39]:
# sanity check: shape of the standardized target matrix
proteome_training_data.shape

(6058, 77)

In [40]:
# RMSE in standardized units, computed on the matrices passed to fit() (which
# include the rows held out by validation_split) -- not a test-set score.
rmse(abundance_predictions, proteome_training_data)

0.75012842228031629