In [1]:
import numpy as np
import numpy.ma as ma
import nmf as nimfa
import copy
import csv
from sklearn import cluster

import sys
sys.path.append("..")
import distance_correlation as dc
from statistics import median, mean
import utils as ut
from FastSTMF import FastSTMF

  warn("PIL must be installed to run CBCL images example.")
  warn("PIL must be installed to run ORL images example.")


In [2]:
# Experiment configuration -- every value below is read by the later cells.
rank = 4                   # factorization rank handed to FastSTMF; set depending on the dataset
missing_value = 0          # value that marks a missing entry (masked via ma.masked_equal)
repeat = 10                # number of FastSTMF runs performed at the chosen rank
sparsity = 0.2             # 20%, passed to ut.create_matrix_with_missing_values
init_tmf = 'random_vcol'   # passed as FastSTMF's `initialization` argument
version = "OV/v5"          # sub-path appended to `folder` for this experiment's outputs
time_in_seconds = 5 * 60   # 300 for small matrices or 600 for large matrices
epsilon = 1e-7             # offset added to the data before ut.check_zeros is called
folder = "results_td_real/"

In [3]:
# Dataset selection: read the tab-separated OV gene subset and transpose it,
# then drop the first row and the first column of the transposed table.
raw_table = np.genfromtxt("../multi_omic/OV/OV_gene_subset.txt", delimiter="\t").T
original_data = np.delete(np.delete(raw_table, 0, axis=0), 0, axis=1)

# Feature agglomeration over the columns.
# Use n_clusters=100 for small matrices or 1000 for large matrices.
agglo = cluster.FeatureAgglomeration(n_clusters=100)
original_data = agglo.fit(original_data).transform(original_data)

# POLO clustering; only the first returned value is kept.
# Comment this step out for large matrices.
original_data, _, _ = ut.polo_clustering(original_data)

# Shift every entry by epsilon (in place) and run the zero check on the result.
original_data += epsilon
ut.check_zeros(original_data)

# Work on a deep copy so `original_data` stays available as the reference
# matrix; entries selected as missing are set via `missing_value`.
data = ut.create_matrix_with_missing_values(
    copy.deepcopy(original_data), sparsity, missing_value
)
data_missing = copy.deepcopy(data)  # unmasked snapshot, used later by the error metrics
data = ma.masked_equal(data, missing_value)  # masked array hiding the missing entries
print(original_data.shape)

(291, 100)


In [4]:
# Aliases for the three forms of the dataset: the complete matrix, the matrix
# with missing entries marked by `missing_value`, and its masked-array form.
X_maxplus_orig = original_data
X_missing_values = data_missing
X_maxplus = data

s = 1  # one dataset
transpose = False

# Per-run metrics; each fitted model appends one value to each list.
approx_error_stmf = []
pred_error_stmf = []
corr_stmf = []

# FastSTMF: the outer range holds only the configured `rank`; the inner loop
# fits `repeat` models at that rank.
for i in range(rank, rank + 1):
    for j in range(repeat):
        # NOTE(review): `threshold` presumably is the time budget in seconds -- confirm in FastSTMF.
        model = FastSTMF(rank=i, initialization=init_tmf, threshold=time_in_seconds)
        model.fit(X_maxplus)
        model.get_statistics(version, s, j, folder, transpose)
        approx = model.predict_all()

        # Metrics against the complete matrix; only the first value returned
        # by dc.dcor is recorded.
        corr, a, b = dc.dcor(original_data, approx)
        corr_stmf.append(corr)
        # NOTE(review): rmse_approx / rmse presumably score observed vs. held-out
        # entries respectively -- confirm in utils.
        approx_err = ut.rmse_approx(original_data, data_missing, approx, missing_value)
        approx_error_stmf.append(approx_err)
        pred_err = ut.rmse(original_data, data_missing, approx, missing_value)
        pred_error_stmf.append(pred_err)

In [5]:
def print_metric_summary(title, values, rank):
    """Print the max, min and median of one metric collected over the runs.

    Args:
        title: heading line printed before the statistics.
        values: sequence of per-run metric values (must be non-empty, since
            max/min/median raise on an empty sequence).
        rank: factorization rank shown at the start of every statistics line.
    """
    print(title)
    for label, stat in (("max", max), ("min", min), ("median", median)):
        print("rank " + str(rank) + ", " + label + ": " + str(stat(values)))


# The rank comes from the configured `rank` rather than from the loop variable
# `i` left over by the fitting cell, so this cell no longer depends on that
# leaked state. The fitting loop only visits `rank`, so the output is unchanged.
print_metric_summary("DISTANCE CORRELATION FAST-STMF", corr_stmf, rank)
print_metric_summary("PREDICTION ERROR FAST-STMF", pred_error_stmf, rank)
print_metric_summary("APPROXIMATION ERROR FAST-STMF", approx_error_stmf, rank)

DISTANCE CORRELATION FAST-STMF
rank 4, max: 0.8206565015916526
rank 4, min: 0.6209793905255387
rank 4, median: 0.7179039383280508
PREDICTION ERROR FAST-STMF
rank 4, max: 1.5419076197319128
rank 4, min: 1.3824610286758712
rank 4, median: 1.461131831095708
APPROXIMATION ERROR FAST-STMF
rank 4, max: 1.5100874385786205
rank 4, min: 1.3580842293484527
rank 4, median: 1.4213119698901244


In [6]:
# Write each per-run metric list to its own CSV file under folder + version.
results_dir = folder + version
for csv_suffix, metric_values in (
    ('/OV_corr_fast_stmf.csv', corr_stmf),
    ('/OV_approx_fast_stmf.csv', approx_error_stmf),
    ('/OV_pred_fast_stmf.csv', pred_error_stmf),
):
    np.savetxt(results_dir + csv_suffix, np.array(metric_values), delimiter=',')