## Methods:

1) Sample Mean

2) SoftImpute

3) IterativeSVD

4) MICE

5) Matrix Factorisation

6) NuclearNormMinimization

7) Autoencoder with adapted objective

8) PCA + Autoencoder

In [1]:
# read in data

import numpy as np
import pandas as pd

# training data: the ten tab-separated files data_obs_1.txt .. data_obs_10.txt,
# each read with its first row as header and its first column as index
observed_path_template = "./dream_proteogenomics_challenge_dataset/sub_challenge_1/data_obs_{}.txt"
dfs = []
for i in range(1, 11):
    dfs.append(pd.read_csv(observed_path_template.format(i), header=0, index_col=0, sep="\t"))

# ground truth: same file layout as the training tables
ground_truth_path = "./dream_proteogenomics_challenge_dataset/sub_challenge_1/data_true.txt"
ground_truth_table = pd.read_csv(ground_truth_path, header=0, index_col=0, sep="\t")

In [2]:
# convert from data frames to numpy arrays (row/column labels are dropped)
ground_truth = ground_truth_table.values
datas = [frame.values for frame in dfs]

In [4]:
# dimensionality of data: first axis -> num_samples, second axis -> num_features
num_samples, num_features = np.shape(ground_truth)

In [5]:
from sklearn.metrics import mean_squared_error as mse

In [6]:
# root mean squared error with three masks
def rmse(original_data, y_pred, y_true=ground_truth):
    """Return three root-mean-squared errors of ``y_pred`` against ``y_true``.

    The result is a length-3 array holding, in order:

    0. the RMSE over every entry (no mask),
    1. the RMSE over entries where ``y_true > 0``,
    2. the RMSE over entries where ``y_true > 0`` and ``original_data`` is NaN,
       i.e. values that were missing from the training data.

    ``original_data`` is only used to locate the NaN positions.
    The default for ``y_true`` is the module-level ``ground_truth`` as it was
    when this function was defined.
    """
    def _root_mse(truth, estimate):
        # sklearn's mean_squared_error followed by a square root
        return np.sqrt(mse(truth, estimate))

    # entries with a strictly positive ground-truth value
    positive = y_true > 0
    # of those, the ones that were missing (NaN) in the training data
    missing_and_positive = np.isnan(original_data) & positive

    return np.array([
        _root_mse(y_true, y_pred),
        _root_mse(y_true[positive], y_pred[positive]),
        _root_mse(y_true[missing_and_positive], y_pred[missing_and_positive]),
    ])

In [7]:
# imputation methods
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from fancyimpute import SimpleFill, KNN, SoftImpute, IterativeSVD, MICE, MatrixFactorization, NuclearNormMinimization
from keras.models import Model
from keras.layers import Dense, Input, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping
from keras.objectives import binary_crossentropy, mean_squared_error
import keras.backend as K
import tensorflow as tf
from keras_tqdm import TQDMNotebookCallback

# impute with sample mean
def sample_mean(data, **kwargs):
    """Complete ``data`` with fancyimpute's ``SimpleFill(fill_method="mean")``.

    Extra keyword arguments are accepted so that every imputation method
    shares one call signature; they are ignored here.
    """
    return SimpleFill(fill_method="mean").complete(data)

# impute with knn-3
def knn_3(data, **kwargs):
    """Complete ``data`` with fancyimpute's ``KNN`` using ``k=3``, silently.

    Extra keyword arguments are accepted and ignored.
    """
    return KNN(k=3, verbose=0).complete(data)

# impute with knn-5
def knn_5(data, **kwargs):
    """Complete ``data`` with fancyimpute's ``KNN`` using ``k=5``, silently.

    Extra keyword arguments are accepted and ignored.
    """
    return KNN(k=5, verbose=0).complete(data)

# knn for any k
def knn(data, k, **kwargs):
    """Complete ``data`` with fancyimpute's ``KNN`` for a caller-chosen ``k``.

    Runs with ``verbose=0``; extra keyword arguments are accepted and ignored.
    """
    return KNN(k=k, verbose=0).complete(data)

# softimpute from fancyimpute package
def soft_impute(data, **kwargs):
    """Complete ``data`` with fancyimpute's ``SoftImpute`` at default settings.

    Runs with ``verbose=0``; extra keyword arguments are accepted and ignored.
    """
    return SoftImpute(verbose=0).complete(data)

# iterativeSVD from fancy impute package
def iterative_SVD(data, **kwargs):
    """Complete ``data`` with fancyimpute's ``IterativeSVD`` at default settings.

    Runs with ``verbose=0``; extra keyword arguments are accepted and ignored.
    """
    return IterativeSVD(verbose=0).complete(data)

# MICE for fancyimpute package
def mice(data, **kwargs):
    """Complete ``data`` with fancyimpute's ``MICE`` at default settings.

    Runs with ``verbose=0``; extra keyword arguments are accepted and ignored.
    """
    return MICE(verbose=0).complete(data)

# modified autoencoder that does not propagate error from missing values
def modified_autoencoder(data, num_hidden=[32], dropout=0.1, **kwargs):
    """Impute ``data`` with an autoencoder whose loss ignores missing targets.

    The data are mean-filled and standard-scaled; missing positions are then
    set to 0 in the network input and to NaN in the training target. The
    custom loss drops NaN targets, so only observed entries produce error.

    Parameters
    ----------
    data : 2-D array containing NaN at the missing positions.
    num_hidden : sequence of layer widths for the encoder; the decoder mirrors
        it (all widths but the last, in reverse order). Must be non-empty.
        The default list is never mutated.
    dropout : dropout rate applied after every hidden layer.
    **kwargs : accepted for signature compatibility with the other imputation
        methods; ignored.

    Returns
    -------
    The network's reconstruction of the whole matrix, mapped back to the
    original scale. Note that observed entries are also replaced by their
    reconstructed values, not copied through from ``data``.
    """
    # accept any sequence (list or tuple) without touching the caller's object
    hidden_sizes = list(num_hidden)
    # take the width from the data itself rather than from a notebook global
    n_features = data.shape[1]

    # to normalise the data we must impute
    mean_imputer = SimpleFill(fill_method="mean")
    data_imputed = mean_imputer.complete(data)
    missing = np.isnan(data)

    # standard scaling for normalisation
    standard_scaler = StandardScaler()
    data_imputed_and_scaled = standard_scaler.fit_transform(data_imputed)

    # replace all missing values with 0 so they do not contribute to input
    data_imputed_and_scaled[missing] = 0

    # maintain nan in target data so we know which outputs should not produce any error
    data_scaled_with_nan = data_imputed_and_scaled.copy()
    data_scaled_with_nan[missing] = np.nan

    # custom MSE that only produces error on non-nan terms
    def custom_MSE(y_true, y_pred):

        y_true = K.flatten(y_true)
        y_pred = K.flatten(y_pred)

        # mask for targets that are not nan
        mask = ~tf.is_nan(y_true)

        # apply the mask to targets and output of network and then compute MSE with what remains
        y_true = tf.boolean_mask(tensor=y_true, mask=mask)
        y_pred = tf.boolean_mask(tensor=y_pred, mask=mask)

        return mean_squared_error(y_true, y_pred)

    # construct model
    x = Input(shape=(n_features,))

    # first fully connected layer
    y = Dense(hidden_sizes[0], activation="relu")(x)
    y = BatchNormalization()(y)
    y = Dropout(dropout)(y)

    # all remaining fully connected layers: rest of the encoder, then the mirrored decoder
    for h in hidden_sizes[1:] + hidden_sizes[-2::-1]:
        y = Dense(h, activation="relu")(y)
        y = BatchNormalization()(y)
        y = Dropout(dropout)(y)

    # output -- no activation function
    y = Dense(n_features, activation="linear")(y)
    autoencoder = Model(x, y)
    # use the masked loss defined above (the previous name here was undefined)
    autoencoder.compile(optimizer="adam", loss=custom_MSE)
    early_stopping = EarlyStopping(monitor="loss", patience=1000, min_delta=0)
    tqdm = TQDMNotebookCallback(leave_inner=False, leave_outer=True)
    # train model
    autoencoder.fit(data_imputed_and_scaled, data_scaled_with_nan,
                    verbose=0, epochs=10000, batch_size=100, callbacks=[early_stopping, tqdm])
    # predict data
    prediction = autoencoder.predict(data_imputed_and_scaled)

    # reverse normalise and return
    return standard_scaler.inverse_transform(prediction)

# PCA and then autoencoder
def pca_autoencoder(data, num_hidden=[32], dropout=0.1, pca_dim=64, **kwargs):
    """Reconstruct ``data`` by autoencoding its PCA projection.

    ``data`` is completed with a default ``SimpleFill``, projected onto
    ``pca_dim`` principal components, and an autoencoder is trained to
    reproduce that projection. The autoencoder's output is mapped back
    through the inverse PCA transform and returned.

    ``num_hidden`` gives the encoder widths; the decoder mirrors all but the
    last of them in reverse. ``dropout`` is the rate used after each hidden
    layer. Extra keyword arguments are accepted and ignored.
    """
    # construct model (built before the data are touched)
    inputs = Input(shape=(pca_dim,))
    hidden = Dropout(1e-8)(inputs)
    layer_widths = num_hidden + num_hidden[-2::-1]
    for width in layer_widths:
        hidden = Dense(width, activation="relu")(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(dropout)(hidden)
    outputs = Dense(pca_dim)(hidden)

    autoencoder = Model(inputs, outputs)
    autoencoder.compile(optimizer="adam", loss="mse")

    # fill missing values, then project with pca
    data_imputed = SimpleFill().complete(data)
    pca = PCA(n_components=pca_dim)
    data_transformed = pca.fit_transform(data_imputed)

    # stop once the training loss has not improved for 100 epochs
    callbacks = [
        EarlyStopping(monitor="loss", patience=100, min_delta=0),
        TQDMNotebookCallback(leave_inner=False, leave_outer=True),
    ]
    autoencoder.fit(data_transformed, data_transformed,
                    verbose=0, epochs=10000, batch_size=100, callbacks=callbacks)

    # reconstruct in PCA space, then map back to the original feature space
    return pca.inverse_transform(autoencoder.predict(data_transformed))

# order matters: the first axis of the results array computed from this list
# follows the order of the entries here
imputation_methods = [
    sample_mean,
    knn_3,
    knn_5,
    soft_impute,
    iterative_SVD,
    mice,
    modified_autoencoder,
    pca_autoencoder,
]


ERROR (theano.gpuarray): Could not initialize pygpu, support disabled
Traceback (most recent call last):
  File "/home/david/miniconda2/lib/python2.7/site-packages/theano/gpuarray/__init__.py", line 179, in <module>
    use(config.device)
  File "/home/david/miniconda2/lib/python2.7/site-packages/theano/gpuarray/__init__.py", line 166, in use
    init_dev(device, preallocate=preallocate)
  File "/home/david/miniconda2/lib/python2.7/site-packages/theano/gpuarray/__init__.py", line 65, in init_dev
    sched=config.gpuarray.sched)
  File "pygpu/gpuarray.pyx", line 634, in pygpu.gpuarray.init (pygpu/gpuarray.c:9424)
  File "pygpu/gpuarray.pyx", line 584, in pygpu.gpuarray.pygpu_init (pygpu/gpuarray.c:9115)
  File "pygpu/gpuarray.pyx", line 1057, in pygpu.gpuarray.GpuContext.__cinit__ (pygpu/gpuarray.c:13417)
GpuArrayException: No cuda device available
Using TensorFlow backend.


In [8]:
# compute mean rmse (and its standard error) across a number of repeats
def mean_rmse(data, imputation_method, num_repeats=1, **kwargs):
    """Run ``imputation_method`` ``num_repeats`` times and summarise its RMSEs.

    Each repeat calls ``imputation_method(data, **kwargs)`` and scores the
    result with ``rmse(data, ...)``. Returns a 2-row array: row 0 is the mean
    of the scores over repeats, row 1 is their standard deviation divided by
    ``sqrt(num_repeats)``.
    """
    scores = []
    for _ in range(num_repeats):
        completed = imputation_method(data, **kwargs)
        scores.append(rmse(data, completed))
    scores = np.array(scores)

    average = scores.mean(axis=0)
    standard_error = scores.std(axis=0) / np.sqrt(num_repeats)
    return np.array([average, standard_error])

In [12]:
# iterate over all training data and imputation methods and compute mean rmse for num repeats
# outer axis: imputation method; inner axis: training dataset
results_per_method = []
for imputation_method in imputation_methods:
    results_per_dataset = []
    for data in datas:
        results_per_dataset.append(mean_rmse(data, imputation_method, num_repeats=1))
    results_per_method.append(results_per_dataset)
rmses = np.array(results_per_method)

KeyboardInterrupt: 

In [11]:
# axes: (imputation method, training dataset, [mean, standard error], rmse mask variant)
rmses.shape

(8, 10, 2, 3)

In [14]:
# mean RMSE (index 0) under the third mask (index 2): entries that were NaN in
# the training data and > 0 in the ground truth; rows are methods, columns datasets
rmses[:,:,0, 2]

array([[ 2.7597732 ,  2.76309638,  2.75672952,  2.75636435,  2.76485654,
         2.76196537,  2.76340014,  2.76509131,  2.75448701,  2.77039009],
       [ 0.38680467,  0.38285721,  0.38561355,  0.38344047,  0.38623934,
         0.38712029,  0.38773104,  0.38611536,  0.38358458,  0.384058  ],
       [ 0.37879539,  0.3745887 ,  0.37762995,  0.37547017,  0.3781731 ,
         0.37907788,  0.37988369,  0.37869369,  0.37573435,  0.37556994],
       [ 0.61878808,  0.61355408,  0.61921208,  0.6157398 ,  0.62097853,
         0.61875009,  0.61796618,  0.62030517,  0.61419182,  0.61377519],
       [ 0.42367615,  0.57823638,  0.46558489,  0.53764899,  0.44285712,
         0.63407889,  0.60058502,  0.66988099,  0.44737126,  0.41292973],
       [ 0.47535888,  0.47287617,  0.47466673,  0.47235295,  0.47387391,
         0.47581249,  0.47394509,  0.47590199,  0.47230753,  0.47212648],
       [ 0.91146462,  0.94572842,  0.96605951,  0.97682642,  0.97955146,
         0.92247776,  1.01936476,  0.95515879

In [15]:
# write the mean RMSE table (methods x datasets) for each of the three masks
output_files = (
    "rmses_no_mask_repeat.csv",
    "rmses_no_zeros_repeat.csv",
    "rmses_nan_no_zeros_repeat.csv",
)
for mask_index, output_file in enumerate(output_files):
    np.savetxt(X=rmses[:, :, 0, mask_index], fname=output_file, delimiter=",")

In [12]:
# parenthesised form prints the same text under Python 2 and is also valid Python 3
print("done")

done


In [21]:
# load the unmasked RMSE table: one row per imputation method, one column per training dataset
# NOTE(review): this reads "rmses_no_mask.csv" while the save cell writes
# "rmses_no_mask_repeat.csv" -- confirm which run this table comes from
no_mask_df = pd.read_csv("rmses_no_mask.csv", delimiter=",", header=None)
no_mask_df.index = [
    "mean",
    "knn-3",
    "knn-5",
    "SoftImpute",
    "IterativeSVD",
    "MICE",
    "Autoencoder",
    "PCA + Autoencoder",
]
no_mask_df.columns = ["training data {}".format(dataset_id) for dataset_id in range(1, 11)]
# extra column: each method's average over the ten training datasets
no_mask_df["mean"] = no_mask_df.mean(axis=1)

In [22]:
# parenthesised form prints the same table under Python 2 and is also valid Python 3
print(no_mask_df)

                   training data 1  training data 2  training data 3  \
mean                      7.133199         7.132806         7.132971   
knn-3                     7.026231         7.026359         7.026145   
knn-5                     7.025893         7.026301         7.026007   
SoftImpute                6.884854         6.884924         6.884749   
IterativeSVD              7.013616         7.009545         7.011087   
MICE                      7.024887         7.025130         7.024706   
Autoencoder               7.030034         7.040505         7.019529   
PCA + Autoencoder         7.118199         7.117698         7.117205   

                   training data 4  training data 5  training data 6  \
mean                      7.131805         7.132475         7.132670   
knn-3                     7.026199         7.025655         7.026029   
knn-5                     7.026201         7.025750         7.025946   
SoftImpute                6.884928         6.884682         6.8

In [32]:
# load the RMSE table restricted to ground-truth entries > 0:
# one row per imputation method, one column per training dataset
# NOTE(review): this reads "rmses_no_zero.csv" while the save cell writes
# "rmses_no_zeros_repeat.csv" -- confirm which run this table comes from
mask_zeros_df = pd.read_csv("rmses_no_zero.csv", delimiter=",", header=None)
mask_zeros_df.index = [
    "mean",
    "knn-3",
    "knn-5",
    "SoftImpute",
    "IterativeSVD",
    "MICE",
    "Autoencoder",
    "PCA + Autoencoder",
]
mask_zeros_df.columns = ["training data {}".format(dataset_id) for dataset_id in range(1, 11)]
# extra column: each method's average over the ten training datasets
mask_zeros_df["mean"] = mask_zeros_df.mean(axis=1)

In [33]:
# parenthesised form prints the same table under Python 2 and is also valid Python 3
print(mask_zeros_df)

                   training data 1  training data 2  training data 3  \
mean                      1.302482         1.303398         1.299847   
knn-3                     0.182553         0.180600         0.181824   
knn-5                     0.178773         0.176700         0.178059   
SoftImpute                0.292039         0.289424         0.291970   
IterativeSVD              0.199955         0.272764         0.219532   
MICE                      0.224463         0.223010         0.223827   
Autoencoder               0.460790         0.453687         0.452376   
PCA + Autoencoder         1.028992         1.020798         1.003990   

                   training data 4  training data 5  training data 6  \
mean                      1.301221         1.305107         1.302762   
knn-3                     0.181014         0.182318         0.182597   
knn-5                     0.177251         0.178511         0.178803   
SoftImpute                0.290678         0.293123         0.2

In [34]:
# load the RMSE table restricted to entries that were NaN in the training data
# and > 0 in the ground truth: one row per method, one column per training dataset
# NOTE(review): this reads "rmses_nan_no_zero.csv" while the save cell writes
# "rmses_nan_no_zeros_repeat.csv" -- confirm which run this table comes from
mask_nan_zeros_df = pd.read_csv("rmses_nan_no_zero.csv", delimiter=",", header=None)
mask_nan_zeros_df.index = [
    "mean",
    "knn-3",
    "knn-5",
    "SoftImpute",
    "IterativeSVD",
    "MICE",
    "Autoencoder",
    "PCA + Autoencoder",
]
mask_nan_zeros_df.columns = ["training data {}".format(dataset_id) for dataset_id in range(1, 11)]
# extra column: each method's average over the ten training datasets
mask_nan_zeros_df["mean"] = mask_nan_zeros_df.mean(axis=1)

In [36]:
# parenthesised form prints the same table under Python 2 and is also valid Python 3
print(mask_nan_zeros_df)

                   training data 1  training data 2  training data 3  \
mean                      2.759773         2.763096         2.756730   
knn-3                     0.386805         0.382857         0.385614   
knn-5                     0.378795         0.374589         0.377630   
SoftImpute                0.618788         0.613554         0.619212   
IterativeSVD              0.423676         0.578236         0.465585   
MICE                      0.475537         0.472904         0.474732   
Autoencoder               0.498770         0.478248         0.496549   
PCA + Autoencoder         1.297658         1.307910         1.301271   

                   training data 4  training data 5  training data 6  \
mean                      2.756364         2.764857         2.761965   
knn-3                     0.383440         0.386239         0.387120   
knn-5                     0.375470         0.378173         0.379078   
SoftImpute                0.615740         0.620979         0.6