# Libraries

In [None]:
import os

# Go up one directory so the parent folder becomes the working directory.
# The relative "data/..." paths used in this notebook are resolved against it,
# and presumably so are the project-local `functions` / `models` imports
# (TODO confirm how the notebook is launched).
# NOTE(review): the move is relative, so re-running this cell climbs one more
# level each time.
os.chdir("..")

import pandas as pd 
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn_extra.cluster import KMedoids
from functions import cosmic_val
from models.denoising import *
from functions import cosmic_val
from functions import data_handling as dh
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

# set seed
# np.random.seed(15)
# torch.manual_seed(15)

# Data

In [2]:
# Input files: the ovary SBS catalogue and the COSMIC v3.4 SBS (GRCh37) table.
data_path = "data/catalogues_Ovary_SBS.tsv"
cosmic_path = "data/COSMIC_v3.4_SBS_GRCh37.txt"

# Folder and file name of the preprocessed catalogue, plus their joined path.
output_folder, output_filename = "data/processed", "Ordered_Ovary_SBS.csv"
ordered_data_path = os.path.join(output_folder, output_filename)

In [None]:
# Run the project's preprocessing helper on the raw catalogue and the COSMIC
# table. sep1/sep2 are presumably the separators of the two input files, and
# the result is presumably written to output_folder/output_filename, which is
# the path the following cell reads back (TODO confirm against data_handling).
dh.load_preprocess_data(
    data_path,
    cosmic_path,
    sep1="\t",
    sep2="\t",
    output_folder=output_folder,
    output_filename=output_filename,
)

In [4]:
# Preprocessed catalogue: default (comma) separator, first column as the index.
data = pd.read_csv(ordered_data_path, index_col=0)

# COSMIC reference table: tab-separated, first column as the index.
cosmic = pd.read_csv(cosmic_path, index_col=0, sep="\t")

# Denoising Sparse Autoencoder

In [5]:
# Noise and data-split settings. SIGMA and MU are handed to add_noise;
# TEST_SPLIT sizes the held-out slice of the samples.
SIGMA = 3 # standard deviation of the noise
MU = 5 # mean of the noise
TEST_SPLIT = 0.2 # proportion of the data used for testing
BATCH_SIZE = 32 # batch size for the dataloader (NOTE(review): not referenced in the visible cells)

In [6]:
# Model and optimisation settings consumed by the training and clustering cells.
LATENT_DIM = 4 # passed to dsae as `latent_dim`; also the number of k-medoids clusters
CRITERION = nn.MSELoss() # loss object passed to train_dsae as `criterion`
LEARNING_RATE = 1e-4 # learning rate of the Adam optimizer
LAMBDA = 1e-6 # passed to train_dsae as `l1_lambda` (presumably the L1 penalty weight)
CONSTRAINT = 'abs' # passed to dsae as `constraint`; semantics are defined by the dsae implementation
TOLERANCE = 1e-3 # passed to train_dsae as `tol`, together with relative_tol=True
MAX_ITERS = 100_000_000 # passed to train_dsae as `max_iter`

In [22]:
# Accumulators filled by the training cells, and the number of training runs.
losses_train = []
signatures = []
iterations = 1

# NOTE(review): this rebinds `data` to its transpose, so executing the cell a
# second time flips the orientation back. Re-load `data` before re-running.
data = data.T
noisy_data = add_noise(data, SIGMA, MU)

# Wrap the noisy values in a frame aligned with `data` so both can be sliced
# the same way. Assumes add_noise returns an array-like with the same shape
# as its input -- TODO confirm; the constructor raises on a shape mismatch.
noisy_frame = pd.DataFrame(np.asarray(noisy_data), index=data.index, columns=data.columns)

num_samples = data.shape[0]
test_size = int(num_samples * TEST_SPLIT)
train_size = num_samples - test_size

# Positional split along the first axis (no shuffling); each slice is
# transposed back so the retained rows become columns.
# The model inputs must come from the noisy copy: slicing `data` for them
# would feed noise-free inputs to the denoising autoencoder.
noisy_train = noisy_frame.iloc[:train_size].T
clean_train = data.iloc[:train_size].T
noisy_test = noisy_frame.iloc[train_size:].T
clean_test = data.iloc[train_size:].T

# Inputs are the noisy data; labels are the clean data
train_dataset = TensorDataset(
    torch.tensor(noisy_train.values).float(),
    torch.tensor(clean_train.values).float(),
)
test_dataset = TensorDataset(
    torch.tensor(noisy_test.values).float(),
    torch.tensor(clean_test.values).float(),
)

In [None]:

for run_idx in tqdm(range(iterations)):
    # Fresh model and Adam optimizer for every run.
    dsae_model = dsae(input_dim=train_size, latent_dim=LATENT_DIM, constraint=CONSTRAINT)
    adam = torch.optim.Adam(dsae_model.parameters(), lr=LEARNING_RATE)

    # Train the DSAE on the training dataset; train_dsae hands back the model,
    # its loss history, and the signature / exposure matrices.
    dsae_model, dsae_train_losses, signatures_dsae, exposures_dsae = train_dsae(
        model=dsae_model,
        training_data=train_dataset,
        criterion=CRITERION,
        optimizer=adam,
        tol=TOLERANCE,
        relative_tol=True,
        max_iter=MAX_ITERS,
        l1_lambda=LAMBDA,
    )

    # Rescale so each signature column sums to one, moving the column totals
    # into the (transposed) exposures.
    diagonals_dsae = signatures_dsae.sum(axis=0)
    exposures_dsae = exposures_dsae.T @ np.diag(diagonals_dsae)
    signatures_dsae = signatures_dsae @ np.diag(1 / diagonals_dsae)

    # Keep the last training loss and the normalised signatures of this run.
    losses_train.append(dsae_train_losses[-1])
    signatures.append(signatures_dsae)



In [None]:
# NOTE(review): this cell reads `clean_data_tensor` and `train_inputs`, which
# are only mentioned in the commented-out lines below and are not assigned in
# any visible cell, so running it as-is after the cells above raises
# NameError. The commented split also indexes `noisy_data` as a 2-D
# array/tensor with samples along the second axis -- confirm the intended
# orientation before re-enabling it.
# # Split the data into train and test sets
# train_inputs = noisy_data[:, :train_size].T  # Transpose to match (samples, features)
# train_targets = clean_data_tensor[:, :train_size].T
# test_inputs = noisy_data[:, train_size:].T
# test_targets = clean_data_tensor[:, train_size:].T

# Training loop: one freshly initialised model per run; results are appended
# to the shared `losses_train` / `signatures` lists.
for i in tqdm(range(iterations)):
    # Initialize the model
    dsae_model = dsae(input_dim=clean_data_tensor.shape[0], latent_dim=LATENT_DIM, constraint=CONSTRAINT)

    optimizer = optim.Adam(dsae_model.parameters(), lr=LEARNING_RATE)

    dsae_model, dsae_train_losses, signatures_dsae, exposures_dsae = train_dsae(
        model=dsae_model,
        training_data=train_inputs,
        criterion=CRITERION,
        optimizer=optimizer,
        tol=TOLERANCE,
        relative_tol=True,
        max_iter=MAX_ITERS,
        l1_lambda=LAMBDA
    )

    # Calculate signatures and exposures: divide every signature column by its
    # sum and scale the transposed exposures by the same column sums.
    diagonals_dsae = signatures_dsae.sum(axis=0)
    exposures_dsae = exposures_dsae.T @ np.diag(diagonals_dsae)
    signatures_dsae = signatures_dsae @ np.diag(1 / diagonals_dsae)

    # Record the last training loss and the normalised signatures of this run.
    losses_train.append(dsae_train_losses[-1])
    signatures.append(signatures_dsae)


In [None]:
# Report the mean of the final training losses collected over all runs.
print("Losses train: ", np.mean(losses_train))

In [8]:
# Stack the per-run signature matrices horizontally, so every column of the
# result is one candidate signature (assumes each entry is 2-D -- TODO confirm).
all_signatures = np.hstack(signatures)

In [9]:
# Cluster the candidate signatures (one per row after the transpose) with
# cosine-metric k-medoids, then keep the medoid columns as the consensus set.
pam = KMedoids(n_clusters=LATENT_DIM, metric="cosine")
pam.fit(all_signatures.T)
medoid_columns = pam.medoid_indices_
consensus_signatures = all_signatures[:, medoid_columns]

In [10]:
# Compare the consensus signatures with the COSMIC table using the project's
# cosmic_val helper; it returns the match table and a mean similarity value.
matched_signatures, mean_similarity = cosmic_val.compute_match(consensus_signatures, cosmic)

In [None]:
# Show the first rows of the match table, then the mean similarity.
print(matched_signatures.head())
print("\nMean similarity of the matched signatures: ", mean_similarity)