# Libraries

In [1]:
import os

# go up one directory so that the relative "data/..." paths used further down
# resolve from the parent of the current working directory (presumably the
# repository root, which would also expose the local `functions` / `models`
# packages — confirm).
# NOTE(review): the move is relative, so re-running this cell climbs one more
# level each time; run it only once per kernel session.
os.chdir("..")

import pandas as pd 
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn_extra.cluster import KMedoids
from functions import cosmic_val
from models.aenmf import *
from functions import cosmic_val
from functions import data_handling as dh
import torch.optim as optim
import torch.nn as nn

# set seed
# np.random.seed(15)
# torch.manual_seed(15)

# Data

In [2]:
# Raw inputs (paths are relative to the current working directory).
data_path = "data/catalogues_Ovary_SBS.tsv"
cosmic_path = "data/COSMIC_v3.4_SBS_GRCh37.txt"

# Location of the preprocessed table, assembled from folder + file name.
output_folder, output_filename = "data/processed", "Ordered_Ovary_SBS.csv"
ordered_data_path = os.path.join(output_folder, output_filename)

In [3]:
# Hand both tab-separated inputs to the project preprocessing helper.
# Judging by the recorded output, it skips the work when the target file
# already exists — confirm in functions/data_handling.
dh.load_preprocess_data(
    data_path,
    cosmic_path,
    sep1="\t",
    sep2="\t",
    output_folder=output_folder,
    output_filename=output_filename,
)

Data already exists in  data/processed/Ordered_Ovary_SBS.csv


In [4]:
# Read the preprocessed table and the tab-separated COSMIC file;
# in both, the first column is used as the row index.
data = pd.read_csv(ordered_data_path, index_col=0)
cosmic = pd.read_csv(cosmic_path, index_col=0, sep="\t")

In [5]:
# Settings consumed by the training cell below.
LATENT_DIM = 4            # forwarded as `latent_dim` (and as KMedoids `n_clusters`)
CONSTRAINT = "abs"        # forwarded as the model's `constraint` argument
TOLERANCE = 1e-10         # forwarded as `tol`
MAX_ITERATIONS = 10 ** 8  # forwarded as `max_iter`
CRITERION = nn.MSELoss()  # consider frobenius norm!

In [6]:
# Repeat split -> train -> rescale `iterations` times, collecting from every
# run the last recorded training loss and the rescaled signature matrix.
losses_train = []
signatures = []
iterations = 1


for i in tqdm(range(iterations)):
    # Train-test split (here it makes sense, we are working with AE)
    # The split is done on data.T, so it partitions the columns of `data`;
    # transposing back restores the original orientation.
    # NOTE(review): no random_state is passed and the seeds in the import cell
    # are commented out, so every run draws a different split.
    # NOTE(review): `test` is not used anywhere in this cell.
    train, test = train_test_split(data.T, test_size = 0.2)
    train = train.T
    test = test.T

    # Initializing AENMF model
    # input_dim is the number of columns of `train` after transposing back,
    # i.e. the number of entries that landed in the training split.
    aenmf_model = aenmf(input_dim = train.shape[1],
                              latent_dim = LATENT_DIM,
                              constraint= CONSTRAINT,)
    # Training AENMF
    # Of the seven returned values, only training_loss_aenmf, signatures_aenmf
    # and exposures_aenmf are used further in this cell; sig_weights is read
    # by later cells.
    aenmf_mod,training_loss_aenmf, signatures_aenmf, exposures_aenmf , enc_aenmf, sig_weights, exp_weights = train_aenmf( 
                                                model = aenmf_model,
                                                training_data = train,
                                                criterion = CRITERION,
                                                optimizer = optim.Adam(aenmf_model.parameters(), lr=1e-3),
                                                tol = TOLERANCE,
                                                relative_tol = True,
                                                max_iter = MAX_ITERATIONS)
    

    # TESTING PART
    # Keep references to the outputs exactly as returned by train_aenmf,
    # before the rescaling below rebinds signatures_aenmf / exposures_aenmf.

    sig_not_weights = signatures_aenmf
    exp_not_weights = exposures_aenmf

    ####

    # Calculating signatures and exposures for NMF
    # Column sums of the signature matrix are moved into the exposures:
    # each signature column is divided by its own sum (so it sums to 1) and
    # the transposed exposures are multiplied by the same factors.
    # NOTE(review): a column summing to 0 would cause a division by zero here.
    diagonals_aenmf = signatures_aenmf.sum(axis=0)
    exposures_aenmf = exposures_aenmf.T @ np.diag(diagonals_aenmf)
    signatures_aenmf = (signatures_aenmf) @ np.diag(1 / diagonals_aenmf)
    
    # Last element of the loss history returned by train_aenmf
    # (presumably the loss at the final training iteration — confirm).
    losses_train.append(training_loss_aenmf[-1])
    signatures.append(signatures_aenmf)



100%|██████████| 1/1 [00:15<00:00, 15.06s/it]


In [7]:
# Average of the per-run final training losses gathered by the training loop
# (with iterations = 1 this is simply that single run's last loss).
print("Losses train: ", np.mean(losses_train))

Losses train:  6962.92333984375


In [8]:
# Place the per-run signature matrices side by side (one column per signature).
all_signatures = np.concatenate(signatures, axis=1)

In [9]:
# Cluster the signature columns (rows after the transpose) under cosine
# distance, then keep the columns chosen as medoids as consensus signatures.
pam = KMedoids(n_clusters=LATENT_DIM, metric="cosine")
pam.fit(all_signatures.T)
consensus_signatures = all_signatures[:, pam.medoid_indices_]

In [10]:
# Compare the consensus signatures with the COSMIC table using the project's
# matching helper; it returns a match table and a mean similarity.
(matched_signatures, mean_similarity) = cosmic_val.compute_match(
    consensus_signatures,
    cosmic,
)

In [11]:
# Show the first rows of the match table, then the mean similarity returned
# alongside it by compute_match.
print(matched_signatures.head())
print("\nMean similarity of the matched signatures: ", mean_similarity)

   Extracted    True  Similarity
0          0  SBS40a    0.821416
1          1  SBS10c    0.739153
2          2   SBS44    0.845215
3          3  SBS10a    0.926539

Mean similarity of the matched signatures:  0.8330806938638476


In [12]:
# Rescaled signature matrix left over from the last pass of the training loop.
# NOTE(review): this prints the whole array; a slice such as
# signatures_aenmf[:5] would keep the output short.
print(signatures_aenmf)

[[1.97645848e-02 2.00796426e-03 4.00413417e-03 6.77076129e-04]
 [1.57305506e-02 3.13509512e-03 3.21950118e-03 2.17225433e-03]
 [2.43804914e-03 4.14059428e-04 4.00828788e-04 1.13986684e-04]
 [1.70891127e-02 1.43580374e-02 4.54563512e-03 1.21693011e-02]
 [1.22905126e-02 5.72972944e-04 2.95483011e-03 1.62759035e-03]
 [6.13312285e-03 2.39062505e-04 1.15227668e-03 5.48621936e-04]
 [3.34324124e-03 1.92972432e-05 4.11335341e-04 3.70443542e-05]
 [1.20419493e-02 2.83055230e-04 2.81653006e-03 4.83475849e-04]
 [1.73272060e-02 2.56955039e-03 5.80473346e-02 1.39035882e-02]
 [7.91163257e-03 4.03646113e-03 1.24966519e-02 1.17961638e-02]
 [1.64996187e-02 9.22757992e-03 6.24633490e-02 2.49778598e-02]
 [1.56423956e-02 7.29637677e-03 1.56920395e-02 1.30129869e-02]
 [8.15681027e-03 1.12809541e-03 3.51475496e-03 1.29217856e-03]
 [5.95391490e-03 3.69389614e-04 3.52159907e-03 8.24885162e-04]
 [8.62492852e-03 1.15719849e-04 1.38253952e-03 2.02773066e-04]
 [1.15593723e-02 5.05345996e-03 1.81702183e-02 5.540594

In [20]:
# Preview the signature weights returned by train_aenmf.
print(sig_weights.head())

# The raw signatures come out of the training loop as a plain array, which has
# no .head(). Wrap them in a DataFrame here so the cell works on a fresh
# kernel; the wrap is harmless if they are already a DataFrame, so the cell
# can be re-run safely.
sig_not_weights = pd.DataFrame(sig_not_weights)

print(sig_not_weights.head())

                  0            1           2            3
Type                                                     
A[C>A]A  724.152296   567.290069  201.174399   134.220272
A[C>A]C  576.349791   885.727083  161.753125   430.617111
A[C>A]G   89.327395   116.980071   20.138309    22.596165
A[C>A]T  626.125990  4056.432761  228.380312  2412.382940
A[C>G]A  450.310645   161.876318  148.455608   322.645578
            0            1           2            3
0  724.152296   567.290069  201.174399   134.220272
1  576.349791   885.727083  161.753125   430.617111
2   89.327395   116.980071   20.138309    22.596165
3  626.125990  4056.432761  228.380312  2412.382940
4  450.310645   161.876318  148.455608   322.645578


In [22]:
# Both matrices are expected to have the same shape; print one per line.
print(sig_not_weights.shape, sig_weights.shape, sep="\n")

(96, 4)
(96, 4)


In [25]:
# Back to numpy arrays.
# np.asarray accepts DataFrames and ndarrays alike, so this cell does not
# depend on what type the inputs currently have and can be re-run safely
# (unlike `.to_numpy()`, which only exists on pandas objects).
sig_not_weights = np.asarray(sig_not_weights)
sig_weights = np.asarray(sig_weights)

# Compute the element-wise absolute difference between the two
diff = np.abs(sig_not_weights - sig_weights)

print(diff)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0.