# Libraries

In [1]:
import os

# Go up one directory so that the relative paths used below (e.g. "data/...")
# resolve against the parent folder; presumably the project-local imports that
# follow rely on this as well.
# The starting directory is recorded only the first time this cell runs, so
# re-executing the cell always ends up in the same directory instead of
# climbing one level higher on every execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, ".."))

import pandas as pd 
import numpy as np
from tqdm import tqdm
from sklearn_extra.cluster import KMedoids
from functions import cosmic_val
from models.nmf import nmf
from functions import cosmic_val
from functions import data_handling as dh

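# set seed
# NOTE(review): seeding is currently disabled, so repeated runs may give
# different results if nmf draws from NumPy's global RNG — TODO confirm and
# re-enable the line below if reproducibility is required.
# np.random.seed(15)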

# Data

In [2]:
# Input files (relative to the current working directory)
cosmic_path = "data/COSMIC_v3.4_SBS_GRCh37.txt"
data_path = "data/catalogues_Ovary_SBS.tsv"

# Destination of the preprocessed catalogue; folder and file name are kept
# separate because load_preprocess_data receives them as two arguments.
output_folder, output_filename = "data/processed", "Ordered_Ovary_SBS.csv"
ordered_data_path = os.path.join(output_folder, output_filename)

In [3]:
# Run the project's load/preprocess helper on the catalogue and the COSMIC
# table, both read with a tab separator. The result is expected at
# output_folder/output_filename; per the recorded cell output the helper
# reports when that file already exists.
dh.load_preprocess_data(
    data_path,
    cosmic_path,
    sep1="\t",
    sep2="\t",
    output_folder=output_folder,
    output_filename=output_filename,
)

Data already exists in  data/processed/Ordered_Ovary_SBS.csv


In [26]:
# Read both tables, using the first column of each file as the row index:
# the tab-separated COSMIC table and the preprocessed (comma-separated) catalogue.
cosmic = pd.read_csv(cosmic_path, index_col=0, sep="\t")
data = pd.read_csv(ordered_data_path, index_col=0)

In [5]:
# Settings shared by the cells below
MAX_ITERATIONS = 10 ** 8  # passed to nmf as max_iter
TOLERANCE = 0.0001        # passed to nmf as tol
LATENT_DIM = 4            # passed to nmf as num_sign and to KMedoids as n_clusters

In [6]:
# Repeated NMF extraction: every run contributes its final training loss and
# its column-normalised signature matrix to the two lists below.
losses_train = []
signatures = []
iterations = 1

# The catalogue does not change between runs, so convert it to NumPy once
# instead of on every pass through the loop.
catalog_matrix = data.to_numpy()

for i in tqdm(range(iterations)):

    # Applying NMF (raw factors are kept under their own names so they are not
    # silently replaced by the rescaled versions computed below)
    raw_signatures, raw_exposures, loss_nmf = nmf(catalog_matrix = catalog_matrix,
                                                  num_sign = LATENT_DIM,
                                                  tol = TOLERANCE,
                                                  max_iter = MAX_ITERATIONS)

    # Calculating signatures and exposures for NMF:
    # divide every signature column by its column sum (so each column sums to 1)
    # and move that scale into the transposed exposures, which leaves the
    # product of the two factors unchanged.
    # Broadcasting against the 1-D vector of column sums scales the columns
    # exactly like the former `@ np.diag(...)` products (equal up to
    # floating-point rounding) without building dense diagonal matrices.
    # Assumes nmf returns 2-D NumPy arrays with one signature per column of the
    # first factor and one signature per row of the second — implied by the
    # original matrix products; TODO confirm.
    diagonals_nmf = raw_signatures.sum(axis=0)
    exposures_nmf = raw_exposures.T * diagonals_nmf
    signatures_nmf = raw_signatures / diagonals_nmf

    # Keep only the last recorded loss of this run
    losses_train.append(loss_nmf[-1])
    signatures.append(signatures_nmf)



  0%|          | 0/1 [00:00<?, ?it/s]

Iteration: 1000, Loss: 33154.52353135035
Iteration: 2000, Loss: 28161.85573259711
Iteration: 3000, Loss: 26427.429519980473
Iteration: 4000, Loss: 25719.453729811798
Iteration: 5000, Loss: 25502.351944073755
Iteration: 6000, Loss: 25371.92485494772
Iteration: 7000, Loss: 25211.088473150456
Iteration: 8000, Loss: 25012.534101918405
Iteration: 9000, Loss: 24544.367858880974
Iteration: 10000, Loss: 24006.00058179592
Iteration: 11000, Loss: 22205.63691487846
Iteration: 12000, Loss: 19484.102019937898
Iteration: 13000, Loss: 18698.20472974728
Iteration: 14000, Loss: 18544.3602020063
Iteration: 15000, Loss: 18482.42500907173
Iteration: 16000, Loss: 18473.041942140713
Iteration: 17000, Loss: 18463.28133169108
Iteration: 18000, Loss: 18452.927174018034
Iteration: 19000, Loss: 18434.152630659977
Iteration: 20000, Loss: 18430.62139323773
Iteration: 21000, Loss: 18413.935297675027
Iteration: 22000, Loss: 18387.001807071636


100%|██████████| 1/1 [00:52<00:00, 52.17s/it]


In [7]:
# Average of the final losses collected over the NMF runs
mean_train_loss = np.mean(losses_train)
print("Losses train: ", mean_train_loss)

Losses train:  18386.26231605195


In [8]:
# Put the per-run signature matrices side by side, so every column of
# all_signatures is one extracted signature.
all_signatures = np.concatenate(signatures, axis=1)

In [9]:
# Cluster the extracted signatures with cosine-distance k-medoids. The matrix
# is transposed because the signatures are stored as columns while the
# estimator is fitted on rows. The columns selected by the fitted medoid
# indices are kept as the consensus signatures.
clusterer = KMedoids(metric='cosine', n_clusters = LATENT_DIM)
pam = clusterer.fit(all_signatures.T)
medoid_columns = pam.medoid_indices_
consensus_signatures = all_signatures[:, medoid_columns]

In [27]:
# Compare the consensus signatures with the COSMIC table; the helper's result
# unpacks into the match table and the mean similarity.
match_result = cosmic_val.compute_match(consensus_signatures, cosmic)
matched_signatures, mean_similarity = match_result

In [28]:
# Show the first rows of the match table, then the mean similarity
top_matches = matched_signatures.head()
print(top_matches)
print("\nMean similarity of the matched signatures: ", mean_similarity)

   Extracted    True  Similarity
0          0   SBS44    0.842920
1          1  SBS10c    0.743205
2          2  SBS10a    0.931570
3          3  SBS40a    0.728419
Mean similarity of the matched signatures:  0.8115285506351773
