# Libraries

In [1]:
import os

# go up one directory
os.chdir("..")

import pandas as pd 
import numpy as np
from tqdm import tqdm
from sklearn_extra.cluster import KMedoids
from sklearn.decomposition import PCA
from functions import cosmic_val
from functions.graph_tools import *
from functions.data_handling import data_augmentation
from models.muse import *
from functions import cosmic_val
from functions import data_handling as dh
from tqdm import tqdm
import torch.optim as optim
import torch.nn as nn

# set seed
# np.random.seed(15)
# torch.manual_seed(15)

# Data

In [2]:
data_path = "data/catalogues_Ovary_SBS.tsv"
cosmic_path = "data/COSMIC_v3.4_SBS_GRCh37.txt"
output_folder = "data/processed"
output_filename = "Ordered_Ovary_SBS.csv"
ordered_data_path = os.path.join(output_folder, output_filename)

In [3]:
dh.load_preprocess_data(data_path, cosmic_path, sep1 = "\t", sep2 = "\t", output_folder = output_folder, output_filename = output_filename)

Data already exists in  data/processed/Ordered_Ovary_SBS.csv


In [4]:
# load data
data = pd.read_csv(ordered_data_path, index_col = 0)
cosmic = pd.read_csv(cosmic_path, sep = "\t", index_col = 0)

In [5]:
L_ONE = 128
TOLERANCE = 1e-10
CONSTRAINT = 'identity'

In [6]:
from collections import defaultdict
import pandas as pd
import numpy as np

# Dictionary of lists: {k: [iterations]}
results_dict = defaultdict(list)

losses_train = []
signatures = []
iterations = 1
k_range = 3

augmented_data = data_augmentation(X=data, augmentation=30)

for u in tqdm(range(2, k_range)):
    for i in range(iterations):
        
        k = 4
        muse_model = HybridAutoencoder(input_dim=data.shape[0],  # 96
                                        l_1=L_ONE,
                                        latent_dim=k,
                                        weights = 'xavier')

        # Training MUSE
        muse_error, muse_signatures, muse_exposures, muse_train_loss, muse_val_loss = train_model_for_extraction(
            model=muse_model,
            X_aug_multi_scaled=augmented_data.T,
            X_scaled=data.T,
            signatures=k,
            epochs=2000,
            batch_size=64,
            save_to='muse_test',
            iteration=i,
            patience=30,
            beta = 0.001
        )

        # Normalize signatures
        diagonals_muse = muse_signatures.sum(axis=0)
        muse_exposures = muse_exposures.T @ np.diag(diagonals_muse)
        muse_signatures = muse_signatures @ np.diag(1 / diagonals_muse)

        # Store results
        losses_train.append(muse_train_loss)
        signatures.append(muse_signatures)

        # Store data in structured format
        results_dict[k].append({
            "iteration": i,
            "muse_error": muse_error,
            "muse_signatures": muse_signatures  # Keep as NumPy array for easier processing
        })

    # Convert dictionary into a DataFrame for better analysis
    df_results = pd.DataFrame([
    {"k": k, "iteration": entry["iteration"], "muse_error": entry["muse_error"], "muse_signatures": entry["muse_signatures"]}
    for k, entries in results_dict.items()
    for entry in entries
    ])


  model.load_state_dict(torch.load(best_model_path))
100%|██████████| 1/1 [03:08<00:00, 188.74s/it]

Early stopping at epoch 144





In [7]:
print(np.mean(muse_val_loss))

488530.5533854167


In [8]:
all_signatures = np.hstack(signatures)

In [9]:
print(all_signatures.shape)

(96, 4)


In [10]:
pam = KMedoids(n_clusters = k, metric='cosine').fit(all_signatures.T)
labels = pam.labels_
medoid_indices = pam.medoid_indices_
consensus_signatures = all_signatures[:, medoid_indices]

In [11]:
matched_signatures, mean_similarity = cosmic_val.compute_match(consensus_signatures, cosmic, index = 0)

In [12]:
print(matched_signatures)
print("\nMean similarity of the matched signatures: ", mean_similarity)

   Extracted_0  True_0  Similarity_0
0            0  SBS10a      0.925463
1            1   SBS44      0.687106
2            2  SBS10c      0.756927
3            3    SBS3      0.809602

Mean similarity of the matched signatures:  0.7947746129920017


In [13]:
reduced_signatures = PCA(n_components=2).fit_transform(all_signatures.T)
plot_clusters(reduced_signatures, labels, medoid_indices, j, "AENMF signature clusters")

NameError: name 'j' is not defined

In [None]:
df_consensus = pd.DataFrame(consensus_signatures, index = data.index)

In [None]:
plot_signature(df_consensus, "AENMF consensus signatures")