In [1]:
import os
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torch
from sklearn.model_selection import train_test_split
from utils.functions import *
from utils import COSMIC_validation
from models import AE_NMF
import seaborn as sns
from tqdm import tqdm
from sklearn_extra.cluster import KMedoids

# Seed NumPy's and PyTorch's global random number generators with one shared value.
SEED = 15
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x26c2a6eaa70>

In [2]:
# Read the tab-separated ovary SBS catalogue into a DataFrame.
catalogue_path = "data/GEL_data/catalogues_Ovary_SBS.tsv"
data = pd.read_csv(catalogue_path, sep="\t")

In [3]:
# Settings consumed by the fitting cell further down:
# AE_NMF.AE_NMF(...) receives LATENT_DIM and CONSTRAINT,
# AE_NMF.train(...) receives CRITERION, TOLERANCE and MAX_ITERATIONS.
MAX_ITERATIONS = 10 ** 8   # passed as max_iter
TOLERANCE = 1e-10          # passed as tol (together with relative_tol=True)
CRITERION = nn.MSELoss()   # passed as criterion
CONSTRAINT = 'abs'         # passed as constraint
LATENT_DIM = 4             # passed as latent_dim

In [4]:
# Read the COSMIC v3.4 SBS (GRCh37) table and discard its "Type" column in one chain.
COSMIC = (
    pd.read_csv("data/COSMIC_v3.4_SBS_GRCh37.txt", sep="\t")
    .drop(columns=["Type"])
)

In [5]:
# Empty DataFrame placeholder; it is not filled or read by any of the cells shown here.
all_sigs_df = pd.DataFrame()

In [6]:
# Fit AE-NMF on several random resplits of the samples and collect the
# column-normalised signature matrix produced by every run in `n_signatures`.
N_RUNS = 10           # number of independent resplit + fit repetitions
TEST_FRACTION = 0.2   # share of samples held out in each repetition
LEARNING_RATE = 1e-3  # Adam step size

n_signatures = []
similarities = []  # not populated in this cell; kept so the name stays defined

for _ in range(N_RUNS):
    # In `data` the columns hold the patients and the rows hold the specific
    # mutations. train_test_split partitions rows, so split the transposed
    # table and transpose each part back afterwards.
    # No random_state is passed, so the split draws from NumPy's global RNG
    # (seeded in the first cell) and differs from one repetition to the next.
    train_by_sample, test_by_sample = train_test_split(data.T, test_size=TEST_FRACTION)
    training_data = train_by_sample.T
    test_data = test_by_sample.T  # held out; not used further in this cell

    # After transposing back, axis 1 indexes the samples of the training split.
    INPUT_DIM = training_data.shape[1]

    # Keep the untrained network under its own name so that AE_NMF_model is
    # not first a model and later the return value of AE_NMF.train.
    ae_nmf_net = AE_NMF.AE_NMF(
        input_dim=INPUT_DIM,
        latent_dim=LATENT_DIM,
        constraint=CONSTRAINT,
        xavier=False,
    )
    OPTIMIZER = optim.Adam(ae_nmf_net.parameters(), lr=LEARNING_RATE)

    AE_NMF_model = AE_NMF.train(
        model=ae_nmf_net,
        training_data=training_data,
        criterion=CRITERION,
        optimizer=OPTIMIZER,
        tol=TOLERANCE,
        relative_tol=True,
        max_iter=MAX_ITERATIONS,
    )

    # Positions 1-4 of the train result: loss, signatures, exposures, encoder matrix.
    train_loss, AENMF_sig, AENMF_exp, enc_mat = (AE_NMF_model[k] for k in (1, 2, 3, 4))

    # Rescale so every signature column sums to 1, moving the scale into the
    # matching exposure row; the product AENMF_sig @ AENMF_exp is unchanged.
    diagonals = AENMF_sig.sum(axis=0)
    column_sums = np.asarray(diagonals)
    if np.any(column_sums == 0) or not np.all(np.isfinite(column_sums)):
        # 1 / diagonals would silently produce inf/NaN and poison the clustering step.
        raise ValueError(
            "AE-NMF returned a signature whose column sum is zero or non-finite; "
            "it cannot be normalised to sum to 1."
        )
    AENMF_exp = np.diag(diagonals) @ AENMF_exp
    AENMF_sig = AENMF_sig @ np.diag(1 / diagonals)

    n_signatures.append(AENMF_sig)
    
    



In [7]:
# Put the signature matrices of all runs side by side (joined along the column axis).
all_signatures = np.concatenate(n_signatures, axis=1)


In [8]:
# Number of clusters requested from KMedoids in the next cell (one medoid per cluster).
# NOTE(review): same value as LATENT_DIM; presumably the two should stay in sync — confirm.
num_clusters = 4

In [9]:
# Cluster the pooled signatures (one row per signature after transposing) with
# cosine-metric k-medoids, then keep the medoid columns as the consensus set.
pam = KMedoids(n_clusters=num_clusters, metric='cosine', random_state=15)
pam.fit(all_signatures.T)
medoid_columns = pam.medoid_indices_
consensus_signatures = all_signatures[:, medoid_columns]

In [10]:
# Show the dimensions of the consensus matrix (one column per selected medoid).
consensus_signatures.shape

(96, 4)

In [11]:
# Compare the consensus signatures against the COSMIC reference table.
# NOTE(review): compute_cosmic_match is a project helper; the table displayed below
# suggests it pairs each extracted signature with one COSMIC signature and a
# similarity score — confirm against utils/COSMIC_validation.
match = COSMIC_validation.compute_cosmic_match(consensus_signatures, COSMIC)

In [12]:
# Render the match table as the cell output.
match

Unnamed: 0,Extracted signature,SBS_COSMIC,similarity
0,0,SBS3,0.758769
1,1,SBS40c,0.581038
2,2,SBS34,0.700078
3,3,SBS40b,0.576385
