# Libraries

In [1]:
import os

# Go up one directory so the relative "data/..." paths used below resolve
# against the parent folder (presumably the local `functions` / `models`
# imports rely on this too -- verify).
# The starting directory is recorded the first time this cell runs, so
# re-running the cell lands in the same parent directory instead of climbing
# one more level on every execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn_extra.cluster import KMedoids
from functions import cosmic_val
from models.nmf import NMF_mult_tol
from functions import cosmic_val



# Data

In [2]:
# Read the two tab-separated inputs; both paths are relative to the current
# working directory.
data = pd.read_table("data/catalogues_Ovary_SBS.tsv")

# The COSMIC table is kept without its "Type" column.
cosmic = pd.read_table("data/COSMIC_v3.4_SBS_GRCh37.txt").drop(columns="Type")

In [None]:
# Run configuration.
LATENT_DIM = 4  # factorisation rank; also the second dimension of E_init
TOLERANCE = 1e-10  # handed to NMF_mult_tol as `tol`
MAX_ITERATIONS = 100_000_000  # NOTE(review): not referenced by any cell visible here
SEED = 42  # fixes the random initial matrix so a fresh run reproduces it


print(data.shape)

# Random initial matrix with one row per column of `data`. It is drawn from a
# seeded generator instead of the global NumPy state, so Restart & Run All
# always starts from the same values (uniform on [0, 1), as before).
E_init = np.random.default_rng(SEED).random((data.shape[1], LATENT_DIM))

print(E_init.shape)

In [None]:
# Repeat the NMF fit `iterations` times, keeping the final loss and the
# column-normalised signature matrix of every run.
losses_train = []
signatures = []
iterations = 5

# The input matrix is identical for every run, so convert it only once.
data_matrix = data.to_numpy()

# NOTE(review): every run is given the same G_0 (E_init.T). Whether the runs
# still differ depends on how NMF_mult_tol initialises its other factor --
# confirm that the repeated runs are not all identical.
for i in tqdm(range(iterations)):

    # Applying NMF
    signatures_nmf, exposures_nmf, loss_nmf, _, _, n_iter_nmf = NMF_mult_tol(
        data_matrix,
        rank=LATENT_DIM,
        tol=TOLERANCE,
        mse=True,
        G_0=E_init.T,
    )

    # Rescale so that every signature column sums to 1 and move the removed
    # scale onto the (transposed) exposures. Broadcasting over the columns is
    # equivalent to multiplying by np.diag(...) without building the dense
    # diagonal matrices.
    diagonals_nmf = signatures_nmf.sum(axis=0)
    # NOTE(review): the rescaled exposures are not stored anywhere after the loop.
    exposures_nmf = exposures_nmf.T * diagonals_nmf
    signatures_nmf = signatures_nmf * (1 / diagonals_nmf)

    # Only the last entry of the loss history is kept for this run.
    losses_train.append(loss_nmf[-1])
    signatures.append(signatures_nmf)


In [None]:
# Average of the final loss values collected over all NMF runs.
mean_train_loss = np.mean(losses_train)
print("Losses train: ", mean_train_loss)

In [6]:
# Place the signature matrices of all runs side by side (joined along columns).
all_signatures = np.concatenate(signatures, axis=1)

In [7]:
# Cluster the columns of `all_signatures` (rows after the transpose) into
# LATENT_DIM groups under the cosine metric, then keep the medoid column of
# each cluster as a consensus signature.
# NOTE(review): the variable is called `pam`, but no `method` argument is
# passed to KMedoids -- confirm the library default is the intended algorithm.
pam = KMedoids(n_clusters=LATENT_DIM, metric='cosine')
pam.fit(all_signatures.T)
medoid_columns = pam.medoid_indices_
consensus_signatures = all_signatures[:, medoid_columns]

In [None]:
# Compare the consensus signatures with the COSMIC table through the
# project's cosmic_val helper.
match = cosmic_val.compute_match(
    consensus_signatures,
    cosmic,
)

In [None]:
# Bare expression: the value of `match` is shown as this cell's output.
match