# Libraries

In [1]:
import os

# Go up one directory, but only resolve the target once per kernel session.
# A bare os.chdir("..") climbs one more level every time this cell is re-run;
# remembering the absolute path makes re-running the cell a no-op.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("..")
os.chdir(PROJECT_ROOT)

import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn_extra.cluster import KMedoids
from functions import cosmic_val
from models.nmf import NMF_mult_tol
from functions import cosmic_val
from sklearn.decomposition import NMF as nmf_sklearn
from functions import data_handling as dh


# Data

In [7]:
# Input files: the Ovary SBS catalogue and the COSMIC v3.4 SBS (GRCh37) table.
data_path = "data/catalogues_Ovary_SBS.tsv"
cosmic_path = "data/COSMIC_v3.4_SBS_GRCh37.txt"

# Folder and file names used for the processed ("ordered") copies of both inputs.
output_folder = "data/processed"
output_filename = "Ordered_Ovary_SBS.csv"
ordered_cosmic_filename = "ordered_cosmic.csv"

# Full paths of the two processed files, both located inside `output_folder`.
ordered_data_path, ordered_cosmic_path = (
    os.path.join(output_folder, filename)
    for filename in (output_filename, ordered_cosmic_filename)
)

In [8]:
# Run the project preprocessing helper on both tab-separated inputs.
# Presumably it writes the ordered catalogue and COSMIC files into
# `output_folder` under the given names — confirm in functions/data_handling.
dh.load_preprocess_data(
    data_path,
    cosmic_path,
    sep1="\t",
    sep2="\t",
    output_folder=output_folder,
    output_filename=output_filename,
    out_cosmic_filename=ordered_cosmic_filename,
)

In [12]:
# Read the processed files back from disk.
# The COSMIC table takes its row index from the first CSV column;
# the catalogue is read with pandas' default integer index.
cosmic = pd.read_csv(ordered_cosmic_path, index_col=0)
data = pd.read_csv(ordered_data_path)

In [None]:
# Factorisation settings used by the NMF cell below.
LATENT_DIM = 4                  # number of NMF components
TOLERANCE = 1e-10               # passed as `tol` to the solver
MAX_ITERATIONS = 100_000_000    # passed as `max_iter` to the solver
SEED = 0                        # seed for the random initial matrix


print(data.shape)

# Random initial matrix with one row per column of `data` and LATENT_DIM columns,
# uniform in [0, 1). Drawn from a seeded generator so that a fresh
# "Restart & Run All" reproduces the same values (np.random.rand was unseeded).
E_init = np.random.default_rng(SEED).random((data.shape[1], LATENT_DIM))

print(E_init.shape)

In [None]:
# Collect the final loss and the normalised signature matrix of every NMF run.
losses_train = []
signatures = []
iterations = 5


# Alternative solver (disabled): the project's multiplicative-update NMF.
# for i in tqdm(range(iterations)):
#
#     # Applying NMF
#     signatures_nmf, exposures_nmf, loss_nmf, _, _, n_iter_nmf = NMF_mult_tol(data.to_numpy(),
#                                                                              rank = LATENT_DIM,
#                                                                              tol = TOLERANCE,
#                                                                              mse=True,
#                                                                              G_0 = E_init.T)

#     # Calculating signatures and exposures for NMF
#     diagonals_nmf = signatures_nmf.sum(axis=0)
#     exposures_nmf = exposures_nmf.T @ np.diag(diagonals_nmf)
#     signatures_nmf = (signatures_nmf) @ np.diag(1 / diagonals_nmf)
#
#     losses_train.append(loss_nmf[-1])
#     signatures.append(signatures_nmf)

for i in tqdm(range(iterations)):

    # Sklearn NMF
    # Each run is seeded with its own index. A constant random_state made every
    # run start from the same random initialisation, so all `iterations` results
    # were identical and the later consensus step had nothing to aggregate.
    # Using `i` keeps the runs distinct yet reproducible.
    nmf = nmf_sklearn(n_components = LATENT_DIM, init = 'random', random_state = i, max_iter = MAX_ITERATIONS, tol = TOLERANCE)

    signatures_nmf = nmf.fit_transform(data)
    exposures_nmf = nmf.components_
    loss = nmf.reconstruction_err_

    # Rescale so every signature column sums to 1 and move the removed scale
    # into the exposures; the product signatures @ exposures.T is unchanged.
    # Column-wise broadcasting is equivalent to multiplying by np.diag(...).
    diagonals_nmf = signatures_nmf.sum(axis=0)
    exposures_nmf = exposures_nmf.T * diagonals_nmf
    signatures_nmf = signatures_nmf / diagonals_nmf

    losses_train.append(loss)
    signatures.append(signatures_nmf)


In [None]:
# Average of the final reconstruction errors collected over all NMF runs.
mean_train_loss = np.mean(losses_train)
print("Losses train: ", mean_train_loss)

In [None]:
# Shape of the list of per-run signature matrices when viewed as one array.
signatures_shape = np.shape(signatures)
print("Signatures: ", signatures_shape)

In [10]:
# Place the signature matrices of all runs side by side (joined along the column axis).
all_signatures = np.concatenate(signatures, axis=1)

In [11]:
# Keep the signature matrix of the last NMF run for the COSMIC comparison below.
signature_test = signatures[-1]

In [None]:
# Sanity check: dimensions of the single-run signature matrix.
print(signature_test.shape)

In [None]:
# Cluster the columns of `all_signatures` (rows after transposing) into
# LATENT_DIM groups using cosine distance, then keep the medoid column of
# each cluster as the consensus signatures.
pam = KMedoids(n_clusters=LATENT_DIM, metric="cosine")
pam.fit(all_signatures.T)
medoid_columns = pam.medoid_indices_
consensus_signatures = all_signatures[:, medoid_columns]

In [None]:
# Sanity check: dimensions of the consensus signature matrix.
print(consensus_signatures.shape)

In [None]:
# Compare the signatures of the last NMF run with the COSMIC table.
# NOTE(review): `consensus_signatures` is computed earlier but not passed here —
# confirm that matching only the last run (rather than the consensus) is intended.
match = cosmic_val.compute_match(signature_test, cosmic)

In [None]:
# Display the value returned by cosmic_val.compute_match.
match