# Libraries

In [1]:
import os

# go up one directory
os.chdir("..")

import pandas as pd 
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn_extra.cluster import KMedoids
from functions import cosmic_val
from models.aenmf import *
from functions import cosmic_val
from functions import data_handling as dh
import torch.optim as optim
import torch.nn as nn

# set seed
# np.random.seed(15)
# torch.manual_seed(15)

# Data

In [2]:
data_path = "data/catalogues_Ovary_SBS.tsv"
cosmic_path = "data/COSMIC_v3.4_SBS_GRCh37.txt"
output_folder = "data/processed"
output_filename = "Ordered_Ovary_SBS.csv"
ordered_cosmic_filename = "ordered_cosmic.csv"
ordered_data_path = os.path.join(output_folder, output_filename)
ordered_cosmic_path = os.path.join(output_folder, ordered_cosmic_filename)

In [3]:
dh.load_preprocess_data(data_path, cosmic_path, sep1 = "\t", sep2 = "\t", output_folder = output_folder, output_filename = output_filename, out_cosmic_filename = ordered_cosmic_filename)

In [9]:
# load data
data = pd.read_csv(ordered_data_path, index_col = 0)
cosmic = pd.read_csv(ordered_cosmic_path, index_col = 0)

In [15]:
LATENT_DIM = 4
TOLERANCE = 1e-10
MAX_ITERATIONS = 100_000_000
CONSTRAINT = 'abs'
CRITERION = nn.MSELoss() # consider frobenius norm!

print(data.shape)

E_init = np.random.rand(data.shape[1], LATENT_DIM)

print(E_init.shape)

(96, 523)
(523, 4)


In [20]:
losses_train = []
signatures = []
iterations = 1


for i in tqdm(range(iterations)):
    # Train-test split (here it makes sense, we are working with AE)
    train, test = train_test_split(data.T, test_size = 0.2)
    train = train.T
    test = test.T
    print(type(train))

    # Initializing AENMF model
    aenmf_model = aenmf(input_dim = train.shape[1],
                              latent_dim = LATENT_DIM,
                              constraint= CONSTRAINT,)
    # Training AENMF
    aenmf_mod,training_loss_aenmf, signatures_aenmf, exposures_aenmf , enc_aenmf = train( 
                                                model = aenmf_model,
                                                training_data = train,
                                                criterion = CRITERION,
                                                optimizer = optim.Adam(aenmf_model.parameters(), lr=1e-3),
                                                tol = TOLERANCE,
                                                relative_tol = True,
                                                max_iter = MAX_ITERATIONS)
    
    # Calculating signatures and exposures for NMF
    diagonals_aenmf = signatures_nmf.sum(axis=0)
    exposures_aenmf = exposures_aenmf.T @ np.diag(diagonals_aenmf)
    signatures_nmf = (signatures_nmf) @ np.diag(1 / diagonals_aenmf)
    
    losses_train.append(training_loss_aenmf[-1])
    signatures.append(signatures_aenmf)



  0%|          | 0/1 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>





TypeError: 'DataFrame' object is not callable

In [8]:
print("Losses train: ", np.mean(losses_train))

Losses train:  18307.472785824662


In [9]:
all_signatures = np.hstack(signatures)

In [10]:
pam = KMedoids(n_clusters = LATENT_DIM, metric='cosine').fit(all_signatures.T)
consensus_signatures = all_signatures[:, pam.medoid_indices_]

In [11]:
match = cosmic_val.compute_match(consensus_signatures, cosmic)

           0         1         2         3
0   0.005577  0.000000  0.025359  0.002142
1   0.004096  0.001839  0.019929  0.003253
2   0.000635  0.000027  0.003283  0.000454
3   0.004077  0.012536  0.019291  0.014627
4   0.003387  0.001560  0.015050  0.000395
..       ...       ...       ...       ...
91  0.051070  0.011481  0.000970  0.011779
92  0.000640  0.001736  0.005946  0.021565
93  0.000309  0.008897  0.004173  0.007547
94  0.003936  0.001483  0.007071  0.003821
95  0.015113  0.057556  0.003857  0.147273

[96 rows x 4 columns]
                SBS44    SBS10c    SBS40a        SBS10a
Type                                                   
A[C>A]A  7.680000e-18  0.004331  0.036395  2.190170e-03
A[C>A]C  1.500380e-04  0.014830  0.016772  1.770137e-03
A[C>A]G  9.160000e-07  0.000657  0.003748  1.500120e-04
A[C>A]T  5.781465e-03  0.013128  0.015435  1.700132e-02
A[C>G]A  3.180806e-03  0.000348  0.008213  2.230000e-16
...               ...       ...       ...           ...
T[T>C]T  3.71

In [15]:
match

Unnamed: 0,Extracted,True,Similarity
0,0,SBS44,0.839172
1,1,SBS10c,0.661057
2,2,SBS40a,0.721298
3,3,SBS10a,0.931474


In [16]:
print(np.mean(match['Similarity']))

0.7882502198463217
