# NMF RANK DETERMINATION

In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes, so edits to local helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Libraries

In [None]:
import time

import numpy as np
from IPython.display import clear_output
from scipy.cluster.hierarchy import linkage, cophenet
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from tqdm import tqdm

from nmf_utils import *

# Ignore ConvergenceWarning messages from Scikit-learn.
# The NMF fits in this notebook are capped at `iter_max` iterations, so these
# warnings would otherwise be emitted repeatedly -- presumably why they are
# silenced here.
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("ignore", category=ConvergenceWarning)

# Directory (relative path) that holds the input CSV files.
DATA_PATH = "../datasets"

# Dataset to analyse. Alternatives are kept below for quick switching;
# uncomment exactly one assignment.
#   DATASET_NAME = "Sausage Raw NIR.csv"   # Sausage
#   DATASET_NAME = "ALL-AML Brunet.csv"    # Brunet
DATASET_NAME = "swimmer_2.csv"             # Swimmer

# Read Dataset

In [None]:
# Load the selected dataset; `m0` is the matrix factorised further down.
# NOTE(review): `np` is presumably re-exported by `from nmf_utils import *`
# -- confirm, or import numpy explicitly.
df, m0 = read_dataset(DATA_PATH, DATASET_NAME, show_data=False)
n, p = np.shape(m0)

# Index pairs (i < j) of the strict upper triangle. They are used later to
# extract the condensed pairwise entries from the n x n and p x p
# co-membership matrices.
iln1 = np.triu_indices(n, k=1)
ilp1 = np.triu_indices(p, k=1)

# Placeholders for the per-row and per-column cluster labels.
c_w = np.zeros(n)
c_h = np.zeros(p)

# Parameters

In [None]:
# Inclusive range of candidate ranks to scan, and per-rank run settings.
min_comp, max_comp = 10, 20  # defaults: 10 and 20
n_runs = 50                  # default=50 (random restarts per rank)
iter_max = 10                # default=10 (passed to NMF as max_iter)

# Score vectors; the score for rank r is stored at index r - 1.
test_h = np.zeros(max_comp)
test_w = np.zeros(max_comp)

# Main Workflow

In [None]:
# Main scan: for every candidate rank, measure how stable the row (W) and
# column (H) clusterings are across `n_runs` randomly initialised NMF fits,
# and store a stability score scaled by the reconstruction error.

# Seeded generator for the random W/H initialisations, so that re-running
# this cell reproduces the same scores (the global `np.random` state used
# before was never seeded).
RANDOM_SEED = 0
rng = np.random.default_rng(RANDOM_SEED)

# Number of unordered row pairs / column pairs, i.e. the length of the
# condensed pairwise vectors. Integer arithmetic avoids a float round-trip.
n_pairs_w = n * (n - 1) // 2
n_pairs_h = p * (p - 1) // 2

time_start = time.time()

for n_comp in tqdm(range(min_comp, max_comp + 1)):
    # Reference fit with the deterministic NNDSVDa initialisation. Only its
    # Frobenius reconstruction error is kept, to scale the scores below.
    my_nmfmodel = NMF(n_components=n_comp, init='nndsvda', solver='cd', beta_loss='frobenius', max_iter=iter_max, random_state=0)
    w4 = my_nmfmodel.fit_transform(m0)
    h4 = my_nmfmodel.components_.T
    error = np.linalg.norm(m0 - w4 @ h4.T)

    # Per-pair counters: in how many runs were two rows (resp. columns)
    # assigned to the same component.
    co_w = np.zeros(n_pairs_w)
    co_h = np.zeros(n_pairs_h)
    my_nmfmodel_random = NMF(n_components=n_comp, init='custom', solver='cd', beta_loss='frobenius', max_iter=iter_max, random_state=0)

    for i_run in tqdm(range(n_runs)):
        # Fresh uniform [0, 1) starting point for this run.
        w4_init = rng.random((n, n_comp))
        h4_init = rng.random((p, n_comp)).T
        w4 = my_nmfmodel_random.fit_transform(m0, W=w4_init, H=h4_init)
        h4 = my_nmfmodel_random.components_.T

        # Hard cluster label = component with the largest column-normalised
        # loading for each row of W (resp. each row of H^T).
        c_w = np.argmax(normalize(w4, axis=0), axis=1)
        c_h = np.argmax(normalize(h4, axis=0), axis=1)

        # Vectorised form of `c[i] == c[j] for all i < j`: build the full
        # equality matrix and keep its strict upper triangle.
        co_w += np.equal.outer(c_w, c_w)[iln1]
        co_h += np.equal.outer(c_h, c_h)[ilp1]

    # Turn co-membership counts into dissimilarities: the fraction of runs
    # in which a pair was NOT placed in the same cluster.
    co_w = 1 - co_w / n_runs
    co_h = 1 - co_h / n_runs

    # Cophenetic correlation between the consensus dissimilarities and the
    # hierarchical clustering built from them.
    # NOTE(review): SciPy documents Ward linkage as assuming Euclidean
    # distances, which these consensus dissimilarities are not -- confirm
    # that method='ward' is intended here.
    cpc_w, cp_w = cophenet(linkage(co_w, method='ward'), co_w)
    cpc_h, cp_h = cophenet(linkage(co_h, method='ward'), co_h)

    clear_output(wait=True)
    # Score for rank `n_comp` lives at index `n_comp - 1`.
    test_w[n_comp - 1] = cpc_w / error
    test_h[n_comp - 1] = cpc_h / error

time_elapsed = (time.time() - time_start)
print(time_elapsed)

# Cusum Calculation

In [None]:
# Combine the per-rank W and H scores over [min_comp, max_comp] using the
# project helper `cusum_calculation`.
# NOTE(review): presumably a cumulative-sum statistic, judging by the name --
# confirm against nmf_utils.
cusum = cusum_calculation(min_comp, max_comp, test_w, test_h)

# Rank Estimation

In [None]:
# Derive the rank estimate from `cusum` with the project helper
# `rank_estimation`, then print it.
# NOTE(review): the selection rule lives in nmf_utils and is not visible here.
estimated_rank = rank_estimation(cusum, max_comp)
print(estimated_rank)

In [None]:
# Display the W score vector as the cell output (entries below index
# min_comp - 1 are never written by the main loop and stay at 0).
test_w

# Saving the results

In [None]:
# # np.savetxt(DATA_PATH + r'\brunet\test_scipy_cd_brunet.csv',  np.concatenate((test_w, test_h), axis=0), delimiter=',')
# # np.savetxt(DATA_PATH + r'\sausage\test_scipy_cd_sausage.csv',  np.concatenate((test_w, test_h), axis=0), delimiter=',')
# np.savetxt(DATA_PATH + r'\swimmer\test_scipy_cd_swimmer.csv',  np.concatenate((test_w, test_h), axis=0), delimiter=',')
# # np.savetxt(DATA_PATH + r'\esg\test_scipy_cd_esg.csv',  np.concatenate((test_w, test_h), axis=0), delimiter=',')
# np.savetxt(DATA_PATH + r'\foo.csv', 1 - co_h, delimiter=',')