# Libraries

In [1]:
import os

# Move the working directory one level up from where the kernel started
# (presumably the project root that holds `data/`, `functions/` and `models/`).
# The target is resolved to an absolute path only the first time this cell runs
# in a kernel, so re-running the cell returns to the same directory instead of
# climbing one level higher on every execution.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("..")
os.chdir(PROJECT_ROOT)

import pandas as pd 
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn_extra.cluster import KMedoids
from functions import cosmic_val
from models.denoising import *
from functions import cosmic_val
from functions import data_handling as dh
import torch.optim as optim
import torch.nn as nn

# set seed
# np.random.seed(15)
# torch.manual_seed(15)

# Data

In [2]:
# Input files (paths are relative to the current working directory).
cosmic_path = "data/COSMIC_v3.4_SBS_GRCh37.txt"
data_path = "data/catalogues_Ovary_SBS.tsv"

# Location of the preprocessed catalogue: folder and file name, then joined.
output_filename = "Ordered_Ovary_SBS.csv"
output_folder = "data/processed"
ordered_data_path = os.path.join(output_folder, output_filename)

In [3]:
# Run the project's preprocessing helper on the catalogue and the COSMIC file,
# both read with a tab separator. The recorded output shows that the helper
# reports when the target file under output_folder already exists.
dh.load_preprocess_data(
    data_path,
    cosmic_path,
    sep1="\t",
    sep2="\t",
    output_folder=output_folder,
    output_filename=output_filename,
)

Data already exists in  data/processed/Ordered_Ovary_SBS.csv


In [4]:
# Read the COSMIC table (tab-separated) and the preprocessed catalogue
# (comma-separated); in both files the first column becomes the index.
cosmic = pd.read_csv(cosmic_path, index_col=0, sep="\t")
data = pd.read_csv(ordered_data_path, index_col=0)

# Denoising Sparse Autoencoder

In [5]:
# Noise settings, forwarded to the training call as `mu` and `sigma`.
MU = 5     # mean of the noise
SIGMA = 3  # standard deviation of the noise

# Data-handling settings.
BATCH_SIZE = 32   # batch size for the dataloader
TEST_SPLIT = 0.2  # proportion of the data used for testing

In [6]:
# Model settings: latent size and the constraint mode given to the autoencoder.
LATENT_DIM = 4
CONSTRAINT = 'abs'

# Optimisation settings.
CRITERION = nn.MSELoss()
LEARNING_RATE = 1e-4
LAMBDA = 1e-6              # forwarded to training as `l1_lambda`
TOLERANCE = 1e-3           # forwarded to training as `tol`
MAX_ITERATIONS = 100_000_000

# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
losses_train = []  # last training-loss value of every run
signatures = []    # column-normalised signature matrix of every run
iterations = 1     # number of independent split + training repetitions


def _as_numpy(values):
    """Return `values` as a NumPy array.

    Objects that expose `detach` (torch tensors) are first detached from the
    autograd graph and copied to host memory, so the conversion also works for
    tensors that require grad or live on a GPU.
    """
    if hasattr(values, "detach"):
        values = values.detach().cpu()
    return np.asarray(values)


for i in tqdm(range(iterations)):
    # Draw a new random train/test split for this run, using the configured
    # test proportion instead of a second hard-coded 0.2.
    train, test = train_test_split(data.T, test_size = TEST_SPLIT)

    # Initialise the denoising sparse autoencoder; the input width is the
    # number of columns of the training frame.
    dsae_model = dsae(input_dim = train.shape[1],
                      latent_dim = LATENT_DIM,
                      constraint = CONSTRAINT,
                      xavier = True)

    # Train the autoencoder.
    # NOTE(review): the optimiser uses lr=1e-3 although the configuration cell
    # defines LEARNING_RATE = 1e-4, which is never referenced. The value is left
    # unchanged so results stay comparable - confirm which one is intended.
    aenmf_mod, training_loss_aenmf, signatures_aenmf, exposures_aenmf = train_dsae(
        model = dsae_model,
        training_data = train,
        criterion = CRITERION,
        optimizer = optim.Adam(dsae_model.parameters(), lr=1e-3),
        l1_lambda = LAMBDA,
        mu = MU,
        sigma = SIGMA,
        tol = TOLERANCE,
        relative_tol = False,
        max_iter = MAX_ITERATIONS)

    print("Shape of signatures: ", signatures_aenmf.shape)
    print("Shape of exposures: ", exposures_aenmf.shape)

    # Rescale so that every signature (column) sums to one and move the removed
    # scale into the exposures; the product signatures @ exposures is unchanged.
    # The signatures come back as a torch tensor (see the printed shape), so
    # they are converted explicitly before mixing them with NumPy. Broadcasting
    # replaces the dense diagonal matrices of the earlier formulation.
    signatures_aenmf = _as_numpy(signatures_aenmf)
    diagonals_aenmf = signatures_aenmf.sum(axis=0)
    exposures_aenmf = exposures_aenmf.T * diagonals_aenmf
    signatures_aenmf = signatures_aenmf / diagonals_aenmf

    losses_train.append(training_loss_aenmf[-1])
    signatures.append(signatures_aenmf)



100%|██████████| 1/1 [00:10<00:00, 10.95s/it]

Shape of signatures:  torch.Size([96, 4])
Shape of exposures:  (4, 418)





In [8]:
# Average of the final training losses collected over all runs.
mean_train_loss = np.mean(losses_train)
print("Losses train: ", mean_train_loss)

Losses train:  5803.52099609375


In [9]:
# Put the per-run signature matrices (2-D, one signature per column) side by
# side so that every column of the result is one extracted signature.
all_signatures = np.concatenate(signatures, axis=1)

In [10]:
# Cluster the extracted signatures (one per row after transposing) with
# k-medoids under the cosine metric, then keep the medoid columns as the
# consensus signatures.
pam = KMedoids(n_clusters=LATENT_DIM, metric='cosine')
pam.fit(all_signatures.T)
medoid_columns = pam.medoid_indices_
consensus_signatures = all_signatures[:, medoid_columns]

In [11]:
# Compare the consensus signatures with the COSMIC table; the helper returns
# the table of matches and their mean similarity.
matched_signatures, mean_similarity = cosmic_val.compute_match(
    consensus_signatures,
    cosmic,
)

In [12]:
# Show the first matches, then the mean similarity over all of them.
matches_preview = matched_signatures.head()
print(matches_preview)
print("\nMean similarity of the matched signatures: ", mean_similarity)

   Extracted    True  Similarity
0          0   SBS44    0.827106
1          1  SBS10a    0.935239
2          2   SBS56    0.820818
3          3   SBS13    0.737863

Mean similarity of the matched signatures:  0.8302564303100336
