In [92]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from fancyimpute import SimpleFill, KNN, SoftImpute, IterativeSVD, MICE, MatrixFactorization, NuclearNormMinimization
from keras.models import Model
from keras.layers import Dense, Input, Dropout, BatchNormalization
from keras.objectives import binary_crossentropy, mean_squared_error
from keras.callbacks import EarlyStopping
import keras.backend as K
import tensorflow as tf
from functools import partial

# root mean squared error with three masks
def rmse (original_data, y_pred, y_true): 
    # rsme prediction and ground truth
    rmse_no_mask = np.sqrt(mse(y_true, y_pred))
    
    # ignore all zeros in the ground truth data
    no_zeros = y_true > 0
    rmse_no_zeros = np.sqrt(mse(y_true[no_zeros], y_pred[no_zeros]))
    
    # ignore zeros and only consider data that was originally nan in the training data
    nan_no_zeros = np.isnan(original_data) & (y_true > 0)
    rmse_nan_no_zeros = np.sqrt(mse(y_true[nan_no_zeros], y_pred[nan_no_zeros]))
    
    # concatenate all three results
    return np.array([rmse_no_mask, rmse_no_zeros, rmse_nan_no_zeros])

# compute mean rmse across a number of repreats
def mean_rmse(data, imputation_method, y_true, num_repeats=1, **kwargs):
    
    imputed_predictions = [imputation_method(data, **kwargs) for i in range(num_repeats)]
    
    rmses = np.array([rmse(data, imputed_prediction, y_true) for imputed_prediction in imputed_predictions])

    return rmses.mean(axis=0)

# imputation methods
# impute with sample mean
def sample_mean(data, **kwargs):
    fill = SimpleFill(fill_method="mean")
    return fill.complete(data)

# impute with knn-3
def knn_3(data, **kwargs):
    fill = KNN(k=3, verbose=0)
    return fill.complete(data)

# impute with knn-5
def knn_5(data, **kwargs):
    fill = KNN(k=5, verbose=0)
    return fill.complete(data)

# knn for any k
def knn(data, k, **kwargs):
    fill = KNN(k=k, verbose=0)
    return fill.complete(data)

def knn_bootstrap(data, k, sample_size, num_samples, **kwargs):
    
    num_proteins = data.shape[0]
    
    idxs = np.array([])
    imputed_predictions = None
    
    sample = 0
    while sample < num_samples and np.array([x not in idxs for x in range(num_proteins)]).any():
        
        idx = np.random.choice(num_proteins, sample_size, replace=True)
        
        bootstrap_sample = data[idx]
        imputed_prediction = knn(bootstrap_sample, k, **kwargs)
        
        idxs = np.append(idxs, idx)
        if imputed_predictions is None:
            imputed_predictions = imputed_prediction
        else:
            imputed_predictions = np.row_stack([imputed_predictions, imputed_prediction])
                                      
    imputed_predictions = np.array(imputed_predictions)
    
    complete_prediction = np.zeros_like(data)
    
    for i in range(num_proteins):
        
        row_predictions = imputed_predictions[np.where(idxs==i)[0]]
        if len(row_predictions) == 1:
            complete_prediction[i] = row_predictions
        else:
            complete_prediction[i] = row_predictions.mean(axis=0)
        
    return complete_prediction



# softimpute from fancyimpute package
# def soft_impute(data, **kwargs):
#     fill = SoftImpute(verbose=0)
#     return fill.complete(data)

# removing to focus on optimising iterativeSVD

# # iterativeSVD from fancy impute package
def iterative_SVD(data, **kwargs):
    fill = IterativeSVD(verbose=0)
    return fill.complete(data)

# # MICE for fancyimpute package
# def mice(data, **kwargs):
#     fill = MICE(verbose=0)
#     return fill.complete(data)

# modified autoencoder that does not propagate error from missing values
def modified_autoencoder(data, num_hidden=[32], dropout=0.1, **kwargs):
    
    # dimensionality of data
    num_proteins, num_features = data.shape
    
    # to normalise the data we must impute 
    mean_imputer = SimpleFill(fill_method="mean")
    data_imputed = mean_imputer.complete(data)
    
    # standard scaling for normalisation
    standard_scaler = StandardScaler()
    data_imputed_and_scaled = standard_scaler.fit_transform(data_imputed)
    
    # replace all missing values with 0 so they do not contribute to input
    data_imputed_and_scaled[np.isnan(data)] = 0
    
    # maintain nan in target data so we know which outputs should not prodice any error
    data_scaled_with_nan = np.array([[data_imputed_and_scaled[i, j] if ~np.isnan(data[i, j]) else np.nan
                                     for j in range(num_features)] for i in range(num_proteins)])
    
    # custom MSE that only produces error on non-nan terms
    def custom_MSE(y_true, y_pred):
    
        y_true = K.flatten(y_true)
        y_pred = K.flatten(y_pred)

        # mask for targets that are not nan
        mask = ~tf.is_nan(y_true)

        # apply the mask to targets and output of network and then compute MSE with what remains
        y_true = tf.boolean_mask(tensor=y_true, mask=mask)
        y_pred = tf.boolean_mask(tensor=y_pred, mask=mask)

        return mean_squared_error(y_true, y_pred)

    
    # construct model
    x = Input(shape=(num_features,))
    
    # first fully connected layer layer
    y = Dense(num_hidden[0], activation="relu")(x)
    y = BatchNormalization()(y)
    y = Dropout(dropout)(y)

    # all remaining fully connected layers
    for h in num_hidden[1:] + num_hidden[-2::-1]:
        y = Dense(h, activation="relu")(y)
        y = BatchNormalization()(y)
        y = Dropout(dropout)(y)
    
    # output -- no activation function 
    y = Dense(num_features, activation="linear")(y)
    autoencoder = Model(x, y)
    autoencoder.compile(optimizer="adam", loss=custom_MSE)
    early_stopping = EarlyStopping(monitor="loss", patience=100, min_delta=0)
    # train model
    autoencoder.fit(data_imputed_and_scaled, data_scaled_with_nan, 
                    verbose=0, epochs=10000, callbacks=[early_stopping])
    
    print "trained autoencoder"
    # predict data
    prediction = autoencoder.predict(data_imputed_and_scaled)
    
    # reverse normalise and return
    return standard_scaler.inverse_transform(prediction)

# PCA and then autoencoder
def pca_autoencoder(data, num_hidden=[32], dropout=0.1, pca_dim=64, **kwargs):
    
    
    # dimensionality of data
    num_proteins, num_features = data.shape
    
    #construct model
    x = Input(shape=(pca_dim,))
    y = Dropout(1e-8)(x)
    for h in num_hidden + num_hidden[-2::-1]:
        y = Dense(h, activation="relu")(y)
        y = BatchNormalization()(y)
        y = Dropout(dropout)(y)
    y = Dense(pca_dim)(y)
    
    autoencoder = Model(x, y)
    autoencoder.compile(optimizer="adam", loss="mse")
    
    
    # project with pca
    mean_imputer = SimpleFill()
    data_imputed = mean_imputer.complete(data)
    pca = PCA(n_components=pca_dim)
    data_transformed = pca.fit_transform(data_imputed)
    early_stopping = EarlyStopping(monitor="loss", patience=1000, min_delta=0)
    autoencoder.fit(data_transformed, data_transformed, 
                    verbose=0, epochs=10000, callbacks=[early_stopping])
    
    prediction = autoencoder.predict(data_transformed)
    
    return pca.inverse_transform(prediction)



def main():
    
    print "Loading data"

    # training data
    dfs = [pd.read_csv("../data/sub_challenge_1/data_obs_{}.txt".format(i), 
                    header=0, index_col=0, sep="\t") for i in range(1, 11)]

    # ground truth
    ground_truth_table = pd.read_csv("../data/sub_challenge_1/data_true.txt", 
                    header=0, index_col=0, sep="\t")

    # conver from data frame ot numpy array
    datas = [df.values for df in dfs]
    ground_truth = ground_truth_table.values

    # list of imputation tecniques
#     imputation_methods = [sample_mean, knn_3, knn_5, soft_impute, 
#                           modified_autoencoder, pca_autoencoder]


#     imputation_methods = [partial(modified_autoencoder, num_hidden=[32]),#]
#                          partial(modified_autoencoder, num_hidden=[64, 32]),
#                          partial(modified_autoencoder, num_hidden=[128, 64, 32])]
#     imputation_method_names = ["autoencoder_32", #]
#                                "autoencoder_64_32", 
#                                "autoencoder_128_64_32"]
#     imputation_methods = [iterative_SVD,
#                          partial(iterative_SVD, )]
    imputation_methods = [#knn_5, 
                          partial(knn_bootstrap, sample_size=100, num_samples = 1000, k=5),
        partial(knn_bootstrap, sample_size=ground_truth.shape[0], num_samples = 100, k=5),
        partial(knn_bootstrap, sample_size=100, num_samples = 1000, k=5),]
    imputation_method_names = ["bootstrap_knn_100_100", "bootstrap_knn_num_proteins_100", "bootstrap_knn_100_1000"]
    
    print "Computing rmse"
    
    # iterate over all training data and imputation methods and compute mean rmse for num repeats
    rmses = np.array([[mean_rmse(data, imputation_method, ground_truth, num_repeats=1) for data in datas] 
                      for imputation_method in imputation_methods])
    
    print "Saving rmse to file"
    
    data = rmses[:,:,2]
    
    data_df = pd.DataFrame(data, index=imputation_method_names, 
                           columns=["training_data_{}".format(i) for i in range(1, 11)])
    data_df.to_csv("../results/subchallenge_1/{}_rmses_only_nan_ignore_zeros.csv".format("_".join(imputation_method_names)), sep=",")
    
    
    print "RMSE"
    print data
    
    # save to file
#     np.savetxt(X=rmses[:,:,0], 
#                fname="../results/subchallenge_1/{}_rmses_no_mask.csv".format("_".join(imputation_method_names)), delimiter=",")
#     np.savetxt(X=rmses[:,:,1], 
#                fname="../results/subchallenge_1/{}_rmses_ignore_zeros.csv".format("_".join(imputation_method_names)), delimiter=",")
#     np.savetxt(X=rmses[:,:,2], 
#                fname="../results/subchallenge_1/{}_rmses_only_nan_ignore_zeros.csv".format("_".join(imputation_method_names)), 
#                delimiter=",")
    

if __name__ == "__main__":
    main()

Loading data
Computing rmse
Saving rmse to file
RMSE
[[ 0.48025014  0.48158118  0.48047411  0.47901986  0.48033528  0.48376452
   0.48452314  0.48262392  0.48283767  0.48443891]
 [ 0.3837956   0.37947918  0.38237012  0.38034215  0.38368019  0.38396877
   0.38545407  0.38270399  0.38047192  0.38018053]
 [ 0.48294988  0.47640953  0.48014478  0.47834866  0.48017847  0.48542613
   0.48424895  0.48213823  0.48037038  0.4830848 ]]


In [22]:
def load_data(filename):
    
    df = pd.read_csv(filename, sep="\t", header=0, index_col=0, )
    
    return df.values, df.index, df.columns

In [84]:
def knn_bootstrap(data, k, sample_size, num_samples, **kwargs):
    
    num_proteins = data.shape[0]
    
    idxs = np.array([])
    imputed_predictions = None
    
    sample = 0
    while sample < num_samples and np.array([x not in idxs for x in range(num_proteins)]).any():
        
        idx = np.random.choice(num_proteins, sample_size, replace=True)
        
        bootstrap_sample = data[idx]
        imputed_prediction = knn(bootstrap_sample, k, **kwargs)
        
        idxs = np.append(idxs, idx)
        if imputed_predictions is None:
            imputed_predictions = imputed_prediction
        else:
            imputed_predictions = np.row_stack([imputed_predictions, imputed_prediction])
                                      
    imputed_predictions = np.array(imputed_predictions)
    
    complete_prediction = np.zeros_like(data)
    
    for i in range(num_proteins):
        
        row_predictions = imputed_predictions[np.where(idxs==i)[0]]
        if len(row_predictions) == 1:
            complete_prediction[i] = row_predictions
        else:
            complete_prediction[i] = row_predictions.mean(axis=0)
        
    return complete_prediction

In [85]:
data, index, columns = load_data("../data/sub_challenge_1/data_obs_1.txt")

In [86]:
knn_bootstrap(data, sample_size=500, k=5, num_samples=100)

array([[ 30.39102228,  29.10387767,  30.07013839, ...,  29.38377411,
         29.53601563,  29.67869975],
       [ 30.38635995,  29.09353396,  30.05899567, ...,  29.37105018,
         29.52330719,  29.66496016],
       [ 29.86446814,  29.09184396,  30.05933845, ...,  29.36936577,
         29.52366181,  29.72324976],
       ..., 
       [ 23.08616549,  21.93517499,  23.02122151, ...,  22.40058085,
         23.30060219,  23.64940526],
       [ 21.26722578,  21.76599461,  21.25573788, ...,  21.32673303,
         20.59719677,  21.47215192],
       [ 26.3676424 ,  24.64069933,  25.9333985 , ...,  25.5465123 ,
         26.21346231,  26.59695673]])

In [50]:
a = np.array([[1,2,3,4], [3,4,5,6]])

In [51]:
empty = np.row_stack([a, a])

In [75]:
empty.mean(axis=0)

array([ 2.,  3.,  4.,  5.])

In [60]:
a = None

In [80]:
len(empty[0].shape)

1

In [88]:
df = pd.DataFrame()

In [None]:
df.to_csv()