# NMF RANK DETERMINATION

In [15]:
import os
from sklearn.decomposition import NMF
import pandas as pd
import numpy as np
from IPython.display import clear_output
import time
from sklearn.preprocessing import normalize
from scipy.cluster.hierarchy import linkage, cophenet

# Folder holding the input CSV files (relative to the notebook location).
DATA_PATH = '../datasets'

# Active dataset; swap the assignment below for one of the alternatives to analyse another file.
DATASET_NAME = 'swimmer_2.csv'             # Swimmer
# DATASET_NAME = 'Sausage Raw NIR.csv'     # Sausage
# DATASET_NAME = 'ALL-AML Brunet.csv'      # Brunet

# Reading the data

In [16]:
# Reading the data
df = pd.read_csv(os.path.join(DATA_PATH, DATASET_NAME))
display(df)

# Data Processing
# Keep only the numeric measurement columns; the number of leading non-numeric
# columns to skip depends on the dataset, hence the per-dataset alternatives.
# np.float64 is used instead of the np.float_ alias, which was removed in NumPy 2.0.
m0 = df.values[:, 2:].astype(np.float64)  # Swimmer_2 Dataset
# m0 = df.values[:, 8:].astype(np.float64)  # Sausage Dataset
# m0 = df.values[:, 1:].astype(np.float64)  # esg_posneg Dataset
# m0 = df.values[:, 2:].astype(np.float64)  # Brunet Dataset

# n = number of rows (samples), p = number of columns (features) of the data matrix
n, p = m0.shape

# for Brunet Dataset only:
# m0 = np.log2(df.values[:, 2:].astype(np.float64))
# m0 -= np.repeat(np.min(m0, axis=0)[:, np.newaxis].T, n, axis=0)

Unnamed: 0,Label,Column 2,Row 1,Row 2,Row 3,Row 4,Row 5,Row 6,Row 7,Row 8,...,Row 1115.1,Row 1116.1,Row 1117.1,Row 1118.1,Row 1119.1,Row 1121,Row 1121.1,Row 1122,Row 1123,Row 1124
0,c111112,image,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,c111113,image,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,c111114,image,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,c111115,image,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,c111116,image,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,c11111253,image,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
252,c11111254,image,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
253,c11111255,image,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
254,c11111256,image,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


# Parameters

In [None]:
# Range of candidate ranks scanned by the main workflow (both ends inclusive).
min_comp, max_comp = 10, 20

# Number of randomly initialised NMF fits per candidate rank.
n_runs = 50

# Iteration cap passed to every NMF fit as max_iter.
iter_max = 10

# Main Workflow

In [29]:
time_start = time.time()

# Dedicated, seeded generator for the random NMF initialisations. The original
# code drew from the unseeded global np.random state, so the consensus matrices
# (and therefore the estimated rank) changed on every run.
rng = np.random.default_rng(0)

# Per-rank scores: slot (n_comp - 1) holds the score for rank n_comp; ranks that
# are not scanned keep the initial 0.
test_w = np.zeros(max_comp)
test_h = np.zeros(max_comp)

# Index pairs (i < j) of the strict upper triangle, used to flatten a square
# co-clustering matrix into the condensed vector passed to linkage/cophenet.
iln1 = np.triu_indices(n, 1)
ilp1 = np.triu_indices(p, 1)

for n_comp in range(min_comp, max_comp + 1):
    # Deterministic (NNDSVDa-initialised) fit, used only for the reconstruction error.
    my_nmfmodel = NMF(n_components=n_comp, init='nndsvda', solver='cd', beta_loss='frobenius', max_iter=iter_max, random_state=0)
    w4 = my_nmfmodel.fit_transform(m0)
    h4 = my_nmfmodel.components_.T
    error = np.linalg.norm(m0 - w4 @ h4.T)

    # Accumulators: how often each pair of rows (co_w) / columns (co_h) is
    # assigned to the same component across the random restarts.
    co_w = np.zeros(n * (n - 1) // 2)
    co_h = np.zeros(p * (p - 1) // 2)
    my_nmfmodel_random = NMF(n_components=n_comp, init='custom', solver='cd', beta_loss='frobenius', max_iter=iter_max, random_state=0)

    for i_run in range(n_runs):
        print('n_comp = ' + str(n_comp) + '; i_run = ' + str(i_run))
        # Uniform [0, 1) starting factors: W is (n, n_comp), H is (n_comp, p).
        w4_init = rng.random((n, n_comp))
        h4_init = rng.random((n_comp, p))
        w4 = my_nmfmodel_random.fit_transform(m0, W=w4_init, H=h4_init)
        h4 = my_nmfmodel_random.components_.T
        # Hard cluster label = component with the largest loading after scaling
        # every component (column) to unit norm.
        c_w = np.argmax(normalize(w4, axis=0), axis=1)
        c_h = np.argmax(normalize(h4, axis=0), axis=1)
        # Vectorised pairwise "same cluster?" test, restricted to pairs i < j.
        co_w += np.equal.outer(c_w, c_w)[iln1]
        co_h += np.equal.outer(c_h, c_h)[ilp1]

    # Turn co-clustering frequencies into distances in [0, 1].
    co_w = 1 - co_w / n_runs
    co_h = 1 - co_h / n_runs
    # Cophenetic correlation between the consensus distances and their hierarchical clustering.
    # NOTE(review): scipy documents 'ward' as correctly defined only for Euclidean
    # distances; confirm it is the intended linkage for consensus distances.
    cpc_w, cp_w = cophenet(linkage(co_w, method='ward'), co_w)
    cpc_h, cp_h = cophenet(linkage(co_h, method='ward'), co_h)
    clear_output(wait=True)
    # Score for this rank: cophenetic correlation divided by the reconstruction error.
    test_w[n_comp - 1] = cpc_w / error
    test_h[n_comp - 1] = cpc_h / error

time_elapsed = (time.time() - time_start)
print(time_elapsed)

49.76531171798706


# CUSUM Calculation

In [30]:
# Combined per-rank score: geometric mean of the W-side and H-side scores.
test = np.sqrt(test_w * test_h)

# cusum[k - 1] is the running counter associated with rank k.
cusum = np.zeros(max_comp)
# Seed value just below the scanned range: 1 if the score drops when moving to min_comp, else 0.
cusum[min_comp - 2] = (test[min_comp - 2] - test[min_comp - 1] > 0)

for n_comp in range(min_comp, max_comp):
    # +1 when the score decreases from rank n_comp to n_comp + 1, otherwise -1.
    deltax = 1 if test[n_comp - 1] - test[n_comp] > 0 else -1
    # The counter is clipped at zero so it never goes negative.
    cusum[n_comp - 1] = max(cusum[n_comp - 2] + deltax, 0)

# No successor exists for the last rank, so it repeats the previous counter value.
cusum[max_comp - 1] = cusum[max_comp - 2]

Estimated rank =  17


# Rank Estimation

In [None]:
# The estimate is the first rank at which the CUSUM counter and its two
# successors have a strictly positive product.
for n_comp in range(1, max_comp - 2):
    if cusum[n_comp - 1] * cusum[n_comp] * cusum[n_comp + 1] > 0:
        n_comp_est = n_comp
        break
else:
    # Sentinel reported when no rank satisfies the criterion.
    n_comp_est = 999

print('Estimated rank = ', n_comp_est)

In [None]:
# Show the W-side score per rank (cophenetic correlation / reconstruction error);
# entries for ranks outside the scanned range remain 0.
test_w

# Saving the results

In [None]:
# Save the per-rank scores (test_w followed by test_h, one value per line) for the
# active dataset. Paths are assembled with os.path.join, as for reading the input,
# instead of hard-coded '\' separators that only work on Windows.
# np.savetxt(os.path.join(DATA_PATH, 'brunet', 'test_scipy_cd_brunet.csv'), np.concatenate((test_w, test_h), axis=0), delimiter=',')
# np.savetxt(os.path.join(DATA_PATH, 'sausage', 'test_scipy_cd_sausage.csv'), np.concatenate((test_w, test_h), axis=0), delimiter=',')
np.savetxt(os.path.join(DATA_PATH, 'swimmer', 'test_scipy_cd_swimmer.csv'), np.concatenate((test_w, test_h), axis=0), delimiter=',')
# np.savetxt(os.path.join(DATA_PATH, 'esg', 'test_scipy_cd_esg.csv'), np.concatenate((test_w, test_h), axis=0), delimiter=',')

# co_h holds the condensed distances (1 - co-clustering frequency) for the last rank
# scanned, so 1 - co_h writes the co-clustering frequencies themselves.
np.savetxt(os.path.join(DATA_PATH, 'foo.csv'), 1 - co_h, delimiter=',')