A comparison of the running times of the `IterativeSVD`, `SoftImpute`, and `KNN` imputation methods, run with the `sleipnir_fancyimpute` Docker image

In [1]:
import argparse
import pandas as pd
import numpy as np
import random
import time
from fancyimpute import KNN, BiScaler, SoftImpute, IterativeSVD
from sklearn import preprocessing

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Input table for the imputation comparison. The path is relative, so it is
# resolved against the notebook's working directory.
input_file = (
    "impute_requirements/sampled_data/"
    "all_log_1663_MCAR_genes_15000_samples_1000.pcl"
)

In [3]:
# Seed both global generators (NumPy's and the standard library's) with the
# same fixed value so anything drawing from either one is repeatable.
np.random.seed(123)
random.seed(123)

In [4]:
# Load the tab-separated table: the first line supplies the column labels and
# the first column supplies the row labels.
# NOTE(review): error_bad_lines=False asks pandas to drop malformed rows
# instead of raising, so rows can go missing without any error -- confirm
# that this is intended for this input.
data = pd.read_csv(
    input_file,
    sep='\t',
    header=0,
    index_col=0,
    error_bad_lines=False,
)

# Work on a copy so `data` stays exactly as read, then swap rows and columns;
# every imputation step below operates on `transposed`.
new_data = data.copy()
transposed = new_data.transpose()

In [5]:
# Time the IterativeSVD run (solver construction plus fit_transform) with
# wall-clock timestamps taken immediately before and after.
# NOTE(review): the previous comment here read "standard scaled", but
# `transposed` is passed in as loaded -- no scaling step is applied in this
# cell. Confirm whether scaling was intended for this method.
svd_t0 = time.time()
svd_solver = IterativeSVD(rank=10)
imputed_svd = svd_solver.fit_transform(transposed)
svd_t1 = time.time()

[IterativeSVD] Iter 1: observed MAE=1.474894
[IterativeSVD] Iter 2: observed MAE=0.908067
[IterativeSVD] Iter 3: observed MAE=0.683420
[IterativeSVD] Iter 4: observed MAE=0.526641
[IterativeSVD] Iter 5: observed MAE=0.483810
[IterativeSVD] Iter 6: observed MAE=0.479939
[IterativeSVD] Iter 7: observed MAE=0.479618
[IterativeSVD] Iter 8: observed MAE=0.479640
[IterativeSVD] Iter 9: observed MAE=0.479674


In [6]:
# DataFrame.as_matrix() is deprecated (it emits a FutureWarning) and was
# removed in pandas 1.0; `.values` yields the same NumPy array and exists in
# every pandas release, so this cell keeps working after an upgrade.
transposed_mat = transposed.values
biscaler = BiScaler()

softimpute_t0 = time.time()
# perform the scaling appropriate for this imputation strategy; the BiScaler
# is fitted inside the timed region, so its cost is part of the reported
# SoftImpute time
transposed_normalized = biscaler.fit_transform(transposed_mat)

# the imputation itself
imputed_softimpute = SoftImpute().fit_transform(transposed_normalized)

# we don't want the transformed values, so undo the BiScaler scaling
# NOTE(review): the original comment also asked for samples to be columns,
# but no transpose is applied in this cell -- confirm whether one is needed
# downstream.
inverse_softimpute = biscaler.inverse_transform(imputed_softimpute)
softimpute_t1 = time.time()

  """Entry point for launching an IPython kernel.


[BiScaler] Initial log residual value = 12.793512
[BiScaler] Iter 1: log residual = 6.534825, log improvement ratio=6.258687
[BiScaler] Iter 2: log residual = 6.142832, log improvement ratio=0.391992
[BiScaler] Iter 3: log residual = 4.319951, log improvement ratio=1.822881
[BiScaler] Iter 4: log residual = 2.208265, log improvement ratio=2.111686
[BiScaler] Iter 5: log residual = 0.291531, log improvement ratio=1.916734
[BiScaler] Iter 6: log residual = -1.474242, log improvement ratio=1.765773
[BiScaler] Iter 7: log residual = -3.149361, log improvement ratio=1.675118
[BiScaler] Iter 8: log residual = -4.772535, log improvement ratio=1.623174
[BiScaler] Iter 9: log residual = -6.366557, log improvement ratio=1.594022
[BiScaler] Iter 10: log residual = -7.944430, log improvement ratio=1.577873
[BiScaler] Iter 11: log residual = -9.513400, log improvement ratio=1.568970
[BiScaler] Iter 12: log residual = -11.077466, log improvement ratio=1.564066
[BiScaler] Iter 13: log residual = -12.

In [7]:
# Fit the standard scaler up front. This happens before the timer starts, so
# fitting is not included in the reported KNN time.
scaler = preprocessing.StandardScaler(copy=True).fit(transposed)

knn_t0 = time.time()
# Standardize the data while keeping the original row and column labels.
scaled = pd.DataFrame(
    data=scaler.transform(transposed),
    index=transposed.index,
    columns=transposed.columns,
)

# Impute with k=10 neighbours, the value the original author cites as
# standard for gene expression data.
knn_solver = KNN(k=10)
imputed_knn_row = knn_solver.fit_transform(scaled)

# Map the imputed standard scores back onto the original scale.
inverse_knn_row = scaler.inverse_transform(imputed_knn_row)
knn_t1 = time.time()

Imputing row 1/1000 with 4532 missing, elapsed time: 119.958
Imputing row 101/1000 with 4637 missing, elapsed time: 125.990
Imputing row 201/1000 with 4525 missing, elapsed time: 132.005
Imputing row 301/1000 with 4415 missing, elapsed time: 137.993
Imputing row 401/1000 with 4513 missing, elapsed time: 144.070
Imputing row 501/1000 with 4518 missing, elapsed time: 150.093
Imputing row 601/1000 with 4563 missing, elapsed time: 156.162
Imputing row 701/1000 with 4435 missing, elapsed time: 162.200
Imputing row 801/1000 with 4328 missing, elapsed time: 168.303
Imputing row 901/1000 with 4506 missing, elapsed time: 174.321


In [8]:
# Elapsed wall-clock seconds between the two IterativeSVD timestamps.
"IterativeSVD: {}".format(svd_t1 - svd_t0)

'IterativeSVD: 10.729404926300049'

In [9]:
# Elapsed wall-clock seconds between the two KNN timestamps.
"KNN: {}".format(knn_t1 - knn_t0)

'KNN: 180.74890208244324'

In [10]:
# Elapsed wall-clock seconds between the two SoftImpute timestamps.
"SoftImpute: {}".format(softimpute_t1 - softimpute_t0)

'SoftImpute: 152.42456245422363'