# Libraries

In [1]:
import os

# go up one directory
os.chdir("..")

import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn_extra.cluster import KMedoids
from functions import cosmic_val
from models.nmf import NMF_mult_tol
from functions import cosmic_val
from sklearn.decomposition import NMF as nmf_sklearn



In [2]:
def rearrange_tsv_based_on_txt(tsv_file_path, txt_file_path, output_tsv_path):
    # Read the .txt file to get the label order
    with open(txt_file_path, 'r') as txt_file:
        labels = txt_file.readline().strip().split('\t')[1:]  # Skip the first column (Type)

    # Read the .tsv file
    tsv_df = pd.read_csv(tsv_file_path, sep='\t')

    # Ensure the first column is the index
    tsv_df.set_index(tsv_df.columns[0], inplace=True)

    # Reorder the rows based on the labels from the .txt file
    reordered_df = tsv_df.reindex(labels)

    # Reset the index to make the first column a regular column again
    reordered_df.reset_index(inplace=True)

    # Save the reordered DataFrame to a new .tsv file
    reordered_df.to_csv(output_tsv_path, sep='\t', index=False)


def transform_first_column_to_index(df):
    """
    Transforms the first column of a DataFrame into the index and drops it as a column.

    Parameters:
    - df: pandas DataFrame

    Returns:
    - Updated DataFrame with the first column as the index.
    """
    # Set the first column as the index
    df = df.set_index(df.columns[0])
    
    # Optionally rename the index (if needed)
    df.index.name = None  # Set to `None` to remove the name
    
    return df


def align_datasets_by_index(df1, df2):
    """
    Aligns two DataFrames by their indices (row names), ensuring rows appear in the same order.

    Parameters:
    - df1: pandas DataFrame (first dataset)
    - df2: pandas DataFrame (second dataset)

    Returns:
    - Aligned DataFrames (df1_aligned, df2_aligned)
    """
    # Ensure both DataFrames have the same index and are sorted by index
    df1_aligned = df1.loc[df1.index.sort_values()]
    df2_aligned = df2.loc[df1_aligned.index]  # Reorder df2 to match df1's index

    # Check if indices are perfectly aligned (optional)
    assert (df1_aligned.index == df2_aligned.index).all(), "Indices are not perfectly aligned!"

    return df1_aligned, df2_aligned

# Data

In [3]:
# load data
data = pd.read_csv("data/catalogues_Ovary_SBS.tsv", sep="\t")

cosmic = pd.read_csv("data/COSMIC_v3.4_SBS_GRCh37.txt", sep="\t")

In [4]:
cosmic = cosmic.set_index(cosmic.columns[0])

In [5]:
cosmic, data = align_datasets_by_index(cosmic, data)

In [6]:
LATENT_DIM = 4
TOLERANCE = 1e-10
MAX_ITERATIONS = 100_000_000


print(data.shape)

E_init = np.random.rand(data.shape[1], LATENT_DIM)

print(E_init.shape)

(96, 523)
(523, 4)


In [7]:
losses_train = []
signatures = []
iterations = 5


# for i in tqdm(range(iterations)):
    
#     # Applying NMF
#     signatures_nmf, exposures_nmf, loss_nmf, _, _, n_iter_nmf = NMF_mult_tol(data.to_numpy(),
#                                                                              rank = LATENT_DIM,
#                                                                              tol = TOLERANCE,
#                                                                              mse=True,
#                                                                              G_0 = E_init.T)

#     # Calculating signatures and exposures for NMF
#     diagonals_nmf = signatures_nmf.sum(axis=0)
#     exposures_nmf = exposures_nmf.T @ np.diag(diagonals_nmf)
#     signatures_nmf = (signatures_nmf) @ np.diag(1 / diagonals_nmf)
    
#     losses_train.append(loss_nmf[-1])
#     signatures.append(signatures_nmf)

for i in tqdm(range(iterations)):

    # Sklearn NMF

    nmf = nmf_sklearn(n_components = LATENT_DIM, init = 'random', random_state = 0, max_iter = MAX_ITERATIONS, tol = TOLERANCE)

    signatures_nmf = nmf.fit_transform(data)
    exposures_nmf = nmf.components_
    loss = nmf.reconstruction_err_

    diagonals_nmf = signatures_nmf.sum(axis=0)
    exposures_nmf = exposures_nmf.T @ np.diag(diagonals_nmf)
    signatures_nmf = (signatures_nmf) @ np.diag(1 / diagonals_nmf)
    
    losses_train.append(loss)
    signatures.append(signatures_nmf)


100%|██████████| 5/5 [00:03<00:00,  1.46it/s]


In [8]:
print("Losses train: ", np.mean(losses_train))

Losses train:  18282.915131557565


In [9]:
print("Signatures: ", np.shape(signatures))

Signatures:  (5, 96, 4)


In [10]:
all_signatures = np.hstack(signatures)

In [11]:
signature_test = signatures[-1]

In [12]:
print(signature_test.shape)

(96, 4)


In [13]:
pam = KMedoids(n_clusters = LATENT_DIM, metric='cosine').fit(all_signatures.T)
consensus_signatures = all_signatures[:, pam.medoid_indices_]



In [14]:
print(consensus_signatures.shape)

(96, 4)


In [15]:
match = cosmic_val.compute_match(signature_test, cosmic)

           0         1         2         3
0   0.002184  0.025966  0.000509  0.005513
1   0.003262  0.020346  0.002051  0.004107
2   0.000454  0.003366  0.000077  0.000600
3   0.014658  0.019393  0.011910  0.004072
4   0.000426  0.015425  0.001735  0.003396
..       ...       ...       ...       ...
91  0.011512  0.000742  0.014445  0.051129
92  0.021356  0.005459  0.001607  0.000646
93  0.007595  0.004089  0.008242  0.000296
94  0.003791  0.007145  0.001676  0.003943
95  0.146347  0.000000  0.053967  0.015041

[96 rows x 4 columns]
               SBS10a      SBS3    SBS10c         SBS44
Type                                                   
A[C>A]A  2.190170e-03  0.020808  0.004331  7.680000e-18
A[C>A]C  1.770137e-03  0.016507  0.014830  1.500380e-04
A[C>A]G  1.500120e-04  0.001751  0.000657  9.160000e-07
A[C>A]T  1.700132e-02  0.012205  0.013128  5.781465e-03
A[C>G]A  2.230000e-16  0.019708  0.000348  3.180806e-03
...               ...       ...       ...           ...
T[T>C]T  3.25

In [16]:
match

Unnamed: 0,Extracted,True,Similarity
0,0,SBS10a,0.931109
1,1,SBS3,0.713532
2,2,SBS10c,0.650058
3,3,SBS44,0.839054
