In [1]:
import pandas as pd
import numpy as np
import json
import os
import networkx as nx
#import pickle5 as pickle

from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.preprocessing import LabelEncoder

from myclass.CleanMergeDataset import Clean_Merge_Dataset
from myclass.BonferroniTtest import Bonferroni_Ttest

In [2]:
if os.path.exists('final_dataset_common.json') is False:

    data_normal = pd.read_pickle('./data-ready/RNA_dataframe_normal').replace('/', '\\')
    data_tumor = pd.read_pickle('./data-ready/RNA_dataframe').replace('/', '\\')
    dataset_RNA, y_RNA, cases_id_RNA = Clean_Merge_Dataset(name='RNA').transform(data_normal, data_tumor)
    df_RNA = pd.concat([dataset_RNA, cases_id_RNA], axis=1)

    data_normal = pd.read_pickle('./data-ready/miRNA_dataframe_normal').replace('/', '\\')
    data_tumor = pd.read_pickle('./data-ready/miRNA_dataframe').replace('/', '\\')
    dataset_miRNA, y_miRNA, cases_id_miRNA= Clean_Merge_Dataset(name='miRNA').transform(data_normal, data_tumor)
    df_miRNA = pd.concat([dataset_miRNA, cases_id_miRNA], axis=1)

    data_normal = pd.read_pickle('./data-ready/illumina-27-450-normal').replace('/', '\\')
    data_tumor = pd.read_pickle('./data-ready/illumina450-27-tumor').replace('/', '\\')
    dataset_illumina, y_illumina, cases_id_illumina= Clean_Merge_Dataset(name='illumina').transform(data_normal, data_tumor)
    df_illumina = pd.concat([dataset_illumina, cases_id_illumina], axis=1)

    dataset_RNA = Bonferroni_Ttest(label_case_id_into_X=True, alpha=0.05).fit_transform(pd.concat([df_RNA, y_RNA], axis=1), y_RNA)
    dataset_miRNA = Bonferroni_Ttest(label_case_id_into_X=True, alpha=0.05).fit_transform(pd.concat([df_miRNA, y_miRNA], axis=1), y_miRNA)
    dataset_illumina = Bonferroni_Ttest(label_case_id_into_X=True, alpha=0.05).fit_transform(pd.concat([df_illumina, y_illumina], axis=1), y_illumina)

    cases_id = set(dataset_illumina['case_id']) & set(dataset_miRNA['case_id']) & set(dataset_RNA['case_id'])
    df_final_illumina = dataset_illumina.loc[dataset_illumina['case_id'].isin(cases_id)]
    df_final_rna = dataset_RNA.loc[dataset_RNA['case_id'].isin(cases_id)]
    df_final_mirna = dataset_miRNA.loc[dataset_miRNA['case_id'].isin(cases_id)]

    print(df_final_illumina.shape)
    print(df_final_rna.shape)
    print(df_final_mirna.shape)
    
    df_final_illumina.to_pickle('illumina_pickle.pkl')
    df_final_rna.to_pickle('rna_pickle.pkl')
    df_final_mirna.to_pickle('miRNA_pickle.pkl')
    
    my_dict = {
        'miRNA': df_final_mirna.to_dict(),
        'RNA': df_final_rna.to_dict(),
        'illumina': df_final_illumina.to_dict()
    }
    with open('final_dataset_common.json', 'w') as outfile:
        json.dump(my_dict, outfile)
    
    df_illumina = df_final_illumina.copy()
    df_mirna = df_final_mirna.copy()
    df_rna = df_final_rna.copy()
    
    del my_dict
    del df_final_illumina
    del df_final_rna
    del df_final_mirna
    del dataset_illumina
    del dataset_RNA
    del dataset_miRNA
else:
    df_illumina = pd.read_pickle('illumina_pickle.pkl')
    df_mirna = pd.read_pickle('miRNA_pickle.pkl')
    df_rna = pd.read_pickle('rna_pickle.pkl')
    #with open('illumina_pickle.pkl' ,'rb') as f:
        #df_illumina = pickle.load(f)
    #with open('mirna_pickle.pkl' ,'rb') as f:
        #df_mirna = pickle.load(f)
    #with open('rna_pickle.pkl' ,'rb') as f:
        #df_rna = pickle.load(f)

In [8]:
import numpy as np
import pandas as pd
import os

from scipy.spatial.distance import pdist, squareform, cdist
from copy import deepcopy
from sklearn.preprocessing import StandardScaler

class SimilarityNetworkFusion:
    def __init__(self, df_mirna, df_rna, df_illumina, k=3, mu=0.3):
        
        self.cases_id = df_rna.loc[:, 'case_id']
        self.rna = df_rna.copy()
        self.mirna = df_mirna.copy()
        self.illumina = df_illumina.copy()
        
        self.k = k
        self.mu = mu
        self.check_columns()
    
    def calculate_matrix(self):
        if hasattr(self, 'w_rna') is False:
            self.w_rna = self.__weights__(self.rna, 'RNA', save_matrix=True)
            self.w_mirna = self.__weights__(self.mirna, 'miRNA', save_matrix=True)
            self.w_illumina = self.__weights__(self.illumina, 'Illumina', save_matrix=True)
        
        if hasattr(self, 'p_rna') is False:
            self.starting_p_rna = self.P_matrix(self.w_rna.to_numpy().tolist(), self.cases_id.shape[0], 'RNA', save_matrix=True)
            self.starting_p_mirna = self.P_matrix(self.w_mirna.to_numpy().tolist(), self.cases_id.shape[0], 'miRNA', save_matrix=True)
            self.starting_p_illumina = self.P_matrix(self.w_illumina.to_numpy().tolist(), self.cases_id.shape[0], 'Illumina', save_matrix=True)

        self.s_rna = self.S_matrix(self.w_rna.to_numpy().tolist(), self.cases_id.shape[0], 'RNA')
        self.s_mirna = self.S_matrix(self.w_mirna.to_numpy().tolist(), self.cases_id.shape[0], 'miRNA')
        self.s_illumina = self.S_matrix(self.w_illumina.to_numpy().tolist(), self.cases_id.shape[0], 'Illumina')
        
        return self
        

    def __weights__(self, dataset, name, save_matrix=False):
        
        if 'weights_matrix_'+name+'.pkl' in os.listdir('.'):
            print('Read file pickle for weights matrix of {}'.format(name))
            dist = pdist(dataset, 'euclidean')
            self.df_dist = pd.DataFrame(columns=self.cases_id, index=self.cases_id, data=squareform(dist))
            weights = pd.read_pickle('weights_matrix_'+name+'.pkl')
            return weights
        
        print('Calculating weights for {}...'.format(name))
        df = pd.DataFrame(columns=self.cases_id, data=dataset.T.values)
        
        #calculate euclidean distance
        dist = pdist(dataset, 'euclidean')
        self.df_dist = pd.DataFrame(columns=self.cases_id, index=self.cases_id, data=squareform(dist))
        weights = pd.DataFrame(columns=self.cases_id, index=self.cases_id, data=[])
                
        for i, patient_i in enumerate(tqdm(self.cases_id)):
            for patient_j in self.cases_id.iloc[i:]:
                    tokK_mean_i = np.sort(self.df_dist.loc[patient_i, :].to_numpy())[:self.k].mean()
                    topK_mean_j = np.sort(self.df_dist.loc[patient_j, :].to_numpy())[:self.k].mean()
                    
                    eps = (tokK_mean_i + tokK_mean_i + self.df_dist.loc[patient_i, patient_j])/3

                    weights.loc[patient_i, patient_j] = np.exp(-(self.df_dist.loc[patient_i, patient_j]**2/(eps*self.mu)))
                    weights.loc[patient_j, patient_i] = np.exp(-(self.df_dist.loc[patient_j, patient_i]**2/(eps*self.mu)))
        if save_matrix:
            weights.to_pickle('weights_matrix_'+name+'.pkl')
        return weights       
    
    def check_columns(self):
        scaler = MinMaxScaler()
        if 'label' in self.mirna.columns:
            self.mirna.drop(['label'], axis=1, inplace=True)
        if 'case_id' in self.mirna.columns:
            self.mirna.drop(['case_id'], axis=1, inplace=True)
            
        if 'label' in self.rna.columns:
            self.rna.drop(['label'], axis=1, inplace=True)
        if 'case_id' in self.rna.columns:
            self.rna.drop(['case_id'], axis=1, inplace=True)
            
        if 'label' in self.illumina.columns:
            self.illumina.drop(['label'], axis=1, inplace=True)
        if 'case_id' in self.illumina.columns:
            self.illumina.drop(['case_id'], axis=1, inplace=True)
            
        self.mirna = pd.DataFrame(scaler.fit_transform(self.mirna))
        self.rna = pd.DataFrame(scaler.fit_transform(self.rna))
        self.illumina = pd.DataFrame(scaler.fit_transform(self.illumina))

        return


    def find_k_neighbors(self, row, i, k=None): 
        row=deepcopy(row)
        #case of P matrix
        if k==None:
            del row[i]  #delete element of the same column of row index
            return row

        #case of S (find k elements with minimum distance value of W[i][j])
        else:
            k_neighbors_index=[]
            neigh = 0
            max_value = max(row)
            for j in range(0, len(row)):
                if j!=i:
                    min_index = row.index(min(row))
                    k_neighbors_index.append(min_index)
                    neigh+=1
                    row[min_index] = max_value
                    if neigh == k:
                        return k_neighbors_index


    def P_matrix(self, W, n_case_id, name, save_matrix=False):
        if 'pStarting_matrix_'+name+'.pkl' in os.listdir('.'):
            print('Reading the file pickle for the p starting matrix {}'.format(name))
            df_p = pd.read_pickle('pStarting_matrix_'+name+'.pkl')
            return df_p.to_numpy()
            
        print('Calculating P matrix for {}...'.format(name))
        P=[]
        for i in tqdm(range(0, n_case_id)):
            row=[]
            for j in range(0,n_case_id):
                if i==j:
                    row.append(1/2)

                else:
                    k_neighbors = self.find_k_neighbors(W[i], i)
                    denominator = 2*sum(k_neighbors)
                    row.append(W[i][j]/denominator)
            P.append(row)
        #print(np.array(P))
        
        if save_matrix:
            df_P = pd.DataFrame(np.array(P))
            df_P.to_pickle('pStarting_matrix_'+name+'.pkl')
        return np.array(P)

    def S_matrix(self, W, n_case_id, name):
        print('Calculating S matrix for {}...'.format(name))
        S=[]
        for i in tqdm(range(0, n_case_id)):
            S_row=[]
            neighbors_indeces = self.find_k_neighbors(self.df_dist.iloc[i,:].to_numpy().tolist(), i, self.k)
            for j in range(0,n_case_id):
                if j not in neighbors_indeces:
                    S_row.append(0)

                else:
                    np_row = np.array(W[i])
                    denominator = sum(np_row[neighbors_indeces])
                    S_row.append(W[i][j]/denominator)
            S.append(S_row)
        print(np.array(S))
        return np.array(S)
    
    def product_matrix(self, S_matrix, P_matrix):
        result = np.dot(S_matrix, P_matrix)
        result = np.dot(result, S_matrix.T)
        return result
    
    def sum_matrix_P(self, P1, P2):
        return np.add(P1,P2)/2
    
    def fit(self, num_iter=None):
        if num_iter is not None:
            self.p_rna = self.starting_p_rna
            self.p_mirna = self.starting_p_mirna
            self.p_illumina = self.starting_p_illumina
            for i in range(0, num_iter):
                self.p_rna_t1 = self.product_matrix(self.s_rna, self.sum_matrix_P(self.p_mirna, self.p_illumina))
                self.p_mirna_t1 = self.product_matrix(self.s_mirna, self.sum_matrix_P(self.p_rna, self.p_illumina))
                self.p_illumina_t1 = self.product_matrix(self.s_illumina, self.sum_matrix_P(self.p_mirna, self.p_rna))
                print(self.p_rna_t1)
                print(self.p_mirna_t1)
                print(self.p_illumina_t1)
                self.p_rna = self.p_rna_t1
                self.p_mirna = self.p_mirna_t1
                self.p_illumina = self.p_illumina_t1
        else:
            print('ciao')

        return self
    
    def iterations_fit(self, matrices_diff=None, max_iter=100):
        if matrices_diff is not None:
            sm.p_rna = sm.starting_p_rna
            sm.p_mirna = sm.starting_p_mirna
            sm.p_illumina = sm.starting_p_illumina
            for step in range(0, max_iter):
                sm.p_rna_t1 = sm.product_matrix(sm.s_rna, sm.sum_matrix_P(sm.p_mirna, sm.p_illumina))
                sm.p_mirna_t1 = sm.product_matrix(sm.s_mirna, sm.sum_matrix_P(sm.p_rna, sm.p_illumina))
                sm.p_illumina_t1 = sm.product_matrix(sm.s_illumina, sm.sum_matrix_P(sm.p_mirna, sm.p_rna))
               
                sm.p_rna = sm.p_rna_t1
                sm.p_mirna = sm.p_mirna_t1
                sm.p_illumina = sm.p_illumina_t1

                diff_matrix = 0
                for i in range(0, len(sm.p_rna)):
                    for j in range(i, len(sm.p_rna)):
                        diff_matrix += np.abs(sm.p_rna[i][j] - sm.p_mirna[i][j])
                        diff_matrix += np.abs(sm.p_illumina[i][j] - sm.p_mirna[i][j])
                        diff_matrix += np.abs(sm.p_illumina[i][j] - sm.p_rna[i][j])
                
                diff_matrix = diff_matrix**0.5
                print(step, ':', diff_matrix)
                
                #diff_matrix = np.abs(np.subtract(sm.p_rna, sm.p_mirna)) + np.abs(np.subtract(sm.p_rna, sm.p_illumina)) + np.abs(np.subtract(sm.p_mirna, sm.p_illumina))
                #diff_matrix= np.abs(np.mean(diff_matrix))
                if diff_matrix<=np.abs(matrices_diff):
                    print('number of iterations to reach difference: ', step)
                    break
                    
                if step == max_iter-1: ##impossible to reach matrices difference
                    print('impossible to reach indicated difference, try with a bigger difference value')
        else:
            print('no difference for matrices found')

        return self
    
    def local_minimum_fit(self, iters_to_min=None, max_iter=100):
        if iters_to_min is not None:
            sm.p_rna = sm.starting_p_rna
            sm.p_mirna = sm.starting_p_mirna
            sm.p_illumina = sm.starting_p_illumina
            count=0
            prev_diff=0
            for step in range(0, max_iter):
                sm.p_rna_t1 = sm.product_matrix(sm.s_rna, sm.sum_matrix_P(sm.p_mirna, sm.p_illumina))
                sm.p_mirna_t1 = sm.product_matrix(sm.s_mirna, sm.sum_matrix_P(sm.p_rna, sm.p_illumina))
                sm.p_illumina_t1 = sm.product_matrix(sm.s_illumina, sm.sum_matrix_P(sm.p_mirna, sm.p_rna))
               
                sm.p_rna = sm.p_rna_t1
                sm.p_mirna = sm.p_mirna_t1
                sm.p_illumina = sm.p_illumina_t1

                diff_matrix = 0
                for i in range(0, len(sm.p_rna)):
                    for j in range(i, len(sm.p_rna)):
                        diff_matrix += np.abs(sm.p_rna[i][j] - sm.p_mirna[i][j])
                        diff_matrix += np.abs(sm.p_illumina[i][j] - sm.p_mirna[i][j])
                        diff_matrix += np.abs(sm.p_illumina[i][j] - sm.p_rna[i][j])
                
                diff_matrix = diff_matrix**0.5
                print(step, ':', diff_matrix)
                
                #diff_matrix = np.abs(np.subtract(sm.p_rna, sm.p_mirna)) + np.abs(np.subtract(sm.p_rna, sm.p_illumina)) + np.abs(np.subtract(sm.p_mirna, sm.p_illumina))
                #diff_matrix= np.abs(np.mean(diff_matrix))
                
                #check if a local minimum is found
                if int(diff_matrix)==prev_diff:
                    count+=1
                    if count>=iters_to_min:
                        print('local minimum reached in ', step, 'iterations')
                        break
                else:
                    count=0
                    
                prev_diff = int(diff_matrix)
                    
                if step == max_iter-1: ##impossible to reach matrices difference
                    print('impossible to reach local minimum, matrices seem to not converge')
        else:
            print('no minimum iterations for matrices found')

        return self
    
    def clean(self):
        del self.p_rna
        del self.p_mirna
        del self.p_illumina
        
        del self.p_rna_t1
        del self.p_mirna_t1
        del self.p_illumina_t1
        
        
        del self.w_rna
        del self.w_mirna
        del self.w_illumina
        
        return self

In [9]:
df_mirna.sort_values(by='case_id', inplace=True)
df_rna.sort_values(by='case_id', inplace=True)
df_illumina.sort_values(by='case_id', inplace=True)

sm = SimilarityNetworkFusion(df_mirna.drop_duplicates(subset='case_id'),
                            df_rna.drop_duplicates(subset='case_id'),
                            df_illumina.drop_duplicates(subset='case_id'), k=100).calculate_matrix()

Read file pickle for weights matrix of RNA
Read file pickle for weights matrix of miRNA
Read file pickle for weights matrix of Illumina


  2%|▏         | 7/449 [00:00<00:07, 63.00it/s]

Reading the file pickle for the p starting matrix RNA
Reading the file pickle for the p starting matrix miRNA
Reading the file pickle for the p starting matrix Illumina
Calculating S matrix for RNA...


100%|██████████| 449/449 [00:06<00:00, 72.17it/s]
  2%|▏         | 9/449 [00:00<00:05, 85.82it/s]

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 1.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 3.61447414e-31 0.00000000e+00 ... 0.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [8.81446815e-20 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 9.99999771e-01]]
Calculating S matrix for miRNA...


100%|██████████| 449/449 [00:05<00:00, 85.33it/s]
  2%|▏         | 10/449 [00:00<00:04, 91.35it/s]

[[9.53624077e-01 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 9.92785206e-01 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 8.89281659e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 8.27827171e-01
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 3.84302822e-06 0.00000000e+00 ... 0.00000000e+00
  9.99298906e-01 0.00000000e+00]
 [4.53047009e-04 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 7.72727170e-01]]
Calculating S matrix for Illumina...


100%|██████████| 449/449 [00:04<00:00, 90.22it/s]

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 1.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.69620492e-35 0.00000000e+00 ... 0.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [1.62184284e-53 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 1.00000000e+00]]





In [10]:
from scipy.spatial.distance import cdist

def modified_fit(sm, matrices_diff=None, max_iter=100):
        if matrices_diff is not None:
            sm.p_rna = sm.starting_p_rna
            sm.p_mirna = sm.starting_p_mirna
            sm.p_illumina = sm.starting_p_illumina
            for step in range(0, max_iter):
                sm.p_rna_t1 = sm.product_matrix(sm.s_rna, sm.sum_matrix_P(sm.p_mirna, sm.p_illumina))
                sm.p_mirna_t1 = sm.product_matrix(sm.s_mirna, sm.sum_matrix_P(sm.p_rna, sm.p_illumina))
                sm.p_illumina_t1 = sm.product_matrix(sm.s_illumina, sm.sum_matrix_P(sm.p_mirna, sm.p_rna))
               
                sm.p_rna = sm.p_rna_t1
                sm.p_mirna = sm.p_mirna_t1
                sm.p_illumina = sm.p_illumina_t1

                diff_matrix = 0
                for i in range(0, len(sm.p_rna)):
                    for j in range(i, len(sm.p_rna)):
                        diff_matrix += np.abs(sm.p_rna[i][j] - sm.p_mirna[i][j])
                        diff_matrix += np.abs(sm.p_illumina[i][j] - sm.p_mirna[i][j])
                        diff_matrix += np.abs(sm.p_illumina[i][j] - sm.p_rna[i][j])
                
                diff_matrix = diff_matrix**0.5
                print(step, ':', diff_matrix)
                
                #diff_matrix = np.abs(np.subtract(sm.p_rna, sm.p_mirna)) + np.abs(np.subtract(sm.p_rna, sm.p_illumina)) + np.abs(np.subtract(sm.p_mirna, sm.p_illumina))
                #diff_matrix= np.abs(np.mean(diff_matrix))
                if diff_matrix<=np.abs(matrices_diff):
                    print('number of iterations to reach difference: ', step)
                    break
                    
                if step == max_iter-1: ##impossible to reach matrices difference
                    print('impossible to reach indicated difference, try with a bigger difference value')
        else:
            print('no difference for matrices found')

        return sm

In [11]:
from scipy.spatial.distance import cdist

def rimodified_fit(sm, iters_to_min=None, max_iter=100):
        if iters_to_min is not None:
            sm.p_rna = sm.starting_p_rna
            sm.p_mirna = sm.starting_p_mirna
            sm.p_illumina = sm.starting_p_illumina
            count=0
            prev_diff=0
            for step in range(0, max_iter):
                sm.p_rna_t1 = sm.product_matrix(sm.s_rna, sm.sum_matrix_P(sm.p_mirna, sm.p_illumina))
                sm.p_mirna_t1 = sm.product_matrix(sm.s_mirna, sm.sum_matrix_P(sm.p_rna, sm.p_illumina))
                sm.p_illumina_t1 = sm.product_matrix(sm.s_illumina, sm.sum_matrix_P(sm.p_mirna, sm.p_rna))
               
                sm.p_rna = sm.p_rna_t1
                sm.p_mirna = sm.p_mirna_t1
                sm.p_illumina = sm.p_illumina_t1

                diff_matrix = 0
                for i in range(0, len(sm.p_rna)):
                    for j in range(i, len(sm.p_rna)):
                        diff_matrix += np.abs(sm.p_rna[i][j] - sm.p_mirna[i][j])
                        diff_matrix += np.abs(sm.p_illumina[i][j] - sm.p_mirna[i][j])
                        diff_matrix += np.abs(sm.p_illumina[i][j] - sm.p_rna[i][j])
                
                diff_matrix = diff_matrix**0.5
                print(step, ':', diff_matrix)
                
                #diff_matrix = np.abs(np.subtract(sm.p_rna, sm.p_mirna)) + np.abs(np.subtract(sm.p_rna, sm.p_illumina)) + np.abs(np.subtract(sm.p_mirna, sm.p_illumina))
                #diff_matrix= np.abs(np.mean(diff_matrix))
                
                #check if a local minimum is found
                if int(diff_matrix)==prev_diff:
                    count+=1
                    if count>=iters_to_min:
                        print('local minimum reached in ', step, 'iterations')
                        break
                else:
                    count=0
                    
                prev_diff = int(diff_matrix)
                    
                if step == max_iter-1: ##impossible to reach matrices difference
                    print('impossible to reach local minimum, matrices seem to not converge')
        else:
            print('no minimum for matrices found')

        return sm

In [12]:
modified_fit(sm, matrices_diff=3)

0 : 24.704514507625646
1 : 17.78984100520555
2 : 15.066080451351228
3 : 12.689120687546371
4 : 11.315589319645468
5 : 10.24231369125367
6 : 9.44235398614814
7 : 8.78570206906347
8 : 8.240973546764446
9 : 7.773818901610334
10 : 7.3677313865089
11 : 7.009009551888137
12 : 6.690024524945179
13 : 6.404656878701583
14 : 6.147804341666656
15 : 5.915859102406112
16 : 5.705810403224784
17 : 5.514771961181847
18 : 5.340583763391923
19 : 5.181557909593407
20 : 5.035850317792812
21 : 4.901905204428646
22 : 4.778347293973788
23 : 4.664000449524897
24 : 4.557793243560072
25 : 4.459008720056606
26 : 4.366909147409094
27 : 4.280945380161371
28 : 4.200521668182576
29 : 4.125150090209724
30 : 4.054329661880232
31 : 3.987781545340836
32 : 3.9250287313494496
33 : 3.86569990573929
34 : 3.809610627399268
35 : 3.7564164915859166
36 : 3.705924297344162
37 : 3.657944857545123
38 : 3.6121782361921784
39 : 3.568561752207915
40 : 3.526907440699425
41 : 3.487004892258268
42 : 3.448763099258188
43 : 3.412045603317

<__main__.SimilarityNetworkFusion at 0x7fb422e07100>

In [13]:
rimodified_fit(sm, iters_to_min=8)

0 : 24.704514507625646
1 : 17.78984100520555
2 : 15.066080451351228
3 : 12.689120687546371
4 : 11.315589319645468
5 : 10.24231369125367
6 : 9.44235398614814
7 : 8.78570206906347
8 : 8.240973546764446
9 : 7.773818901610334
10 : 7.3677313865089
11 : 7.009009551888137
12 : 6.690024524945179
13 : 6.404656878701583
14 : 6.147804341666656
15 : 5.915859102406112
16 : 5.705810403224784
17 : 5.514771961181847
18 : 5.340583763391923
19 : 5.181557909593407
20 : 5.035850317792812
21 : 4.901905204428646
22 : 4.778347293973788
23 : 4.664000449524897
24 : 4.557793243560072
25 : 4.459008720056606
26 : 4.366909147409094
27 : 4.280945380161371
28 : 4.200521668182576
29 : 4.125150090209724
local minimum reached in  29 iterations


<__main__.SimilarityNetworkFusion at 0x7fb422e07100>

In [14]:
sm.fit(num_iter=50)

[[5.00000000e-01 8.25633369e-07 7.15528204e-04 ... 6.34470201e-04
  1.35251292e-07 7.26824931e-04]
 [1.18754797e-05 5.00000000e-01 2.08871392e-03 ... 5.15053084e-05
  6.85732060e-05 9.16392303e-05]
 [2.44260150e-04 4.95725089e-05 5.00000000e-01 ... 1.09342885e-04
  4.41093969e-08 1.21400411e-03]
 ...
 [2.28026460e-04 1.28695121e-06 1.15116793e-04 ... 5.00000000e-01
  4.12789893e-09 8.23187354e-04]
 [2.80776889e-05 9.89717087e-04 2.68241019e-05 ... 2.38437728e-06
  5.00000000e-01 6.78049830e-04]
 [1.59593812e-04 1.39895833e-06 7.80872907e-04 ... 5.02933504e-04
  7.17180741e-07 4.99999781e-01]]
[[4.55179803e-01 1.76498731e-06 1.84978943e-04 ... 1.24834719e-05
  1.43710348e-06 1.76593685e-03]
 [7.47071483e-05 4.93055324e-01 1.13063207e-08 ... 2.62355134e-05
  3.62177844e-06 1.63835448e-04]
 [5.16663786e-06 1.03928356e-07 3.96126431e-01 ... 4.02644102e-05
  2.07129074e-08 2.05656236e-05]
 ...
 [1.39219342e-05 6.16473053e-07 5.71525351e-05 ... 3.60452441e-01
  7.53628317e-09 3.08356998e-05]

[[4.23900109e-01 1.28728485e-05 7.57710266e-04 ... 5.73643600e-04
  2.17401022e-06 2.70613127e-03]
 [1.50297971e-04 4.87624318e-01 1.20088104e-03 ... 9.40315765e-05
  5.11116469e-05 4.22698497e-04]
 [2.14579139e-04 3.33681286e-05 3.36165733e-01 ... 3.63789896e-04
  1.20901009e-07 9.19007110e-04]
 ...
 [2.25110796e-04 5.01014607e-06 3.87031545e-04 ... 2.81154366e-01
  8.31605000e-08 5.43499982e-04]
 [5.96316948e-05 7.46830801e-04 1.57167332e-05 ... 6.04733600e-06
  4.98766636e-01 3.66335527e-04]
 [1.40967171e-03 2.74879619e-05 6.50380394e-04 ... 3.86442818e-04
  7.78358214e-07 2.17458278e-01]]
[[3.98422817e-01 2.16360080e-05 8.71788244e-04 ... 7.06607926e-04
  2.89567860e-06 3.31127409e-03]
 [2.08525651e-04 4.83106618e-01 1.17777759e-03 ... 1.20310740e-04
  5.54462513e-05 5.82312027e-04]
 [2.51885348e-04 3.54258735e-05 2.88898782e-01 ... 5.63410011e-04
  1.75233360e-07 9.83503172e-04]
 ...
 [2.85881567e-04 7.85534798e-06 5.87867510e-04 ... 2.23437544e-01
  1.36261680e-07 6.03695504e-04]

[[3.41721110e-01 5.73015246e-05 1.07974318e-03 ... 1.08354783e-03
  5.13383450e-06 4.52985251e-03]
 [3.76664568e-04 4.71821898e-01 9.93820404e-04 ... 1.80108066e-04
  5.98284585e-05 1.02229501e-03]
 [3.26977402e-04 3.59903279e-05 2.01929162e-01 ... 1.22946902e-03
  3.14899976e-07 9.84395031e-04]
 ...
 [4.81137419e-04 1.81046419e-05 1.23491151e-03 ... 1.32195851e-01
  3.21870683e-07 8.13938342e-04]
 [1.12556687e-04 8.31598451e-04 1.39792074e-05 ... 1.56366543e-05
  4.97159376e-01 3.35905746e-04]
 [2.79571844e-03 1.20120015e-04 7.44781980e-04 ... 6.93793948e-04
  2.42216449e-06 7.63754577e-02]]
[[3.21029465e-01 7.46065692e-05 1.16059956e-03 ... 1.23960105e-03
  6.07056365e-06 4.91337391e-03]
 [4.46701384e-04 4.67377736e-01 9.37984584e-04 ... 2.05512474e-04
  6.21206390e-05 1.19750926e-03]
 [3.64876862e-04 3.68461911e-05 1.73873842e-01 ... 1.51365700e-03
  3.75639605e-07 9.94422954e-04]
 ...
 [5.69766055e-04 2.32077837e-05 1.50975854e-03 ... 1.05532275e-01
  4.18395968e-07 9.16408945e-04]

[[2.59323704e-01 1.52092530e-04 1.41182275e-03 ... 1.74675825e-03
  9.73948714e-06 5.82063324e-03]
 [7.05804294e-04 4.52171052e-01 7.79519117e-04 ... 2.99983573e-04
  7.02499683e-05 1.77403085e-03]
 [5.27536732e-04 4.20527833e-05 1.06463771e-01 ... 2.48161601e-03
  6.20208951e-07 1.05092380e-03]
 ...
 [9.08749136e-04 4.73510740e-05 2.44465324e-03 ... 5.20935053e-02
  9.25828721e-07 1.30541367e-03]
 [1.82396288e-04 8.89982262e-04 1.34218507e-05 ... 3.32028777e-05
  4.95095918e-01 3.99083065e-04]
 [4.10798076e-03 3.00015537e-04 8.47995992e-04 ... 1.20465852e-03
  6.68057384e-06 2.47293930e-02]]
[[2.43721319e-01 1.77319113e-04 1.47810207e-03 ... 1.87800402e-03
  1.08725332e-05 6.01241599e-03]
 [7.81780584e-04 4.47900073e-01 7.41678039e-04 ... 3.28130121e-04
  7.25760555e-05 1.92691818e-03]
 [5.79405692e-04 4.41510762e-05 9.20853974e-02 ... 2.73475006e-03
  7.00337579e-07 1.07337249e-03]
 ...
 [1.00733846e-03 5.55830603e-05 2.68976421e-03 ... 4.22989558e-02
  1.11291133e-06 1.41530107e-03]

[[2.15983098e-01 2.32738948e-04 1.60206548e-03 ... 2.11156454e-03
  1.32990510e-05 6.30230314e-03]
 [9.35709706e-04 4.39500720e-01 6.79871656e-04 ... 3.86534601e-04
  7.72361751e-05 2.20642680e-03]
 [6.91289267e-04 4.98200972e-05 7.03117416e-02 ... 3.17828343e-03
  8.90699327e-07 1.13071516e-03]
 ...
 [1.20216759e-03 7.45606280e-05 3.12126450e-03 ... 2.94334273e-02
  1.57881155e-06 1.62483267e-03]
 [2.30210060e-04 9.15882702e-04 1.47394436e-05 ... 4.77909836e-05
  4.93724412e-01 4.67587226e-04]
 [4.74153539e-03 4.35786944e-04 9.51732759e-04 ... 1.53388777e-03
  1.06205563e-05 1.48642668e-02]]
[[2.03056292e-01 2.61723408e-04 1.66183140e-03 ... 2.21987064e-03
  1.45590605e-05 6.42561731e-03]
 [1.01291977e-03 4.35343552e-01 6.52952950e-04 ... 4.16324196e-04
  7.95678370e-05 2.33812188e-03]
 [7.49145421e-04 5.31516288e-05 6.10762336e-02 ... 3.37968101e-03
  9.97447444e-07 1.16283994e-03]
 ...
 [1.29800542e-03 8.47768277e-05 3.31792351e-03 ... 2.44007353e-02
  1.84110741e-06 1.72560483e-03]

[[1.74720997e-01 3.39210672e-04 1.80222045e-03 ... 2.45427047e-03
  1.79187746e-05 6.65489692e-03]
 [1.20571656e-03 4.25147441e-01 6.03273297e-04 ... 4.93263847e-04
  8.54012207e-05 2.63088654e-03]
 [9.00442422e-04 6.40079834e-05 4.41150238e-02 ... 3.78971988e-03
  1.33089891e-06 1.25715513e-03]
 ...
 [1.52788917e-03 1.13547027e-04 3.72172724e-03 ... 1.64693421e-02
  2.64022755e-06 1.95758693e-03]
 [2.86325249e-04 9.38877610e-04 1.84576438e-05 ... 6.70510019e-05
  4.92128538e-01 5.51115737e-04]
 [5.30080506e-03 5.96581090e-04 1.10551937e-03 ... 1.87904805e-03
  1.60439065e-05 1.07107024e-02]]
[[1.64342402e-01 3.71311275e-04 1.85628292e-03 ... 2.53934221e-03
  1.93160418e-05 6.73045443e-03]
 [1.28243908e-03 4.21121020e-01 5.87817991e-04 ... 5.24616824e-04
  8.77349156e-05 2.73876459e-03]
 [9.62171847e-04 6.90715976e-05 3.85999682e-02 ... 3.93012939e-03
  1.48543916e-06 1.29801088e-03]
 ...
 [1.61681955e-03 1.25867171e-04 3.86098420e-03 ... 1.41377719e-02
  2.99952062e-06 2.04499706e-03]

[[1.22011902e-01 5.41171408e-04 2.10639699e-03 ... 2.88230975e-03
  2.68790872e-05 6.97653445e-03]
 [1.65660725e-03 4.01592316e-01 5.57049029e-04 ... 6.86622335e-04
  9.93957232e-05 3.18255279e-03]
 [1.27696453e-03 1.03379581e-04 2.13156799e-02 ... 4.39733274e-03
  2.56566908e-06 1.52844745e-03]
 ...
 [2.02360857e-03 1.96041136e-04 4.33433063e-03 ... 8.19038892e-03
  5.27718045e-06 2.42411864e-03]
 [3.80309044e-04 9.68489146e-04 3.10236206e-05 ... 1.04018104e-04
  4.89403460e-01 6.76017260e-04]
 [5.94546772e-03 8.61076345e-04 1.41990408e-03 ... 2.36834594e-03
  2.67592106e-05 8.87311835e-03]]
[[1.14889938e-01 5.76209272e-04 2.15365794e-03 ... 2.94013176e-03
  2.84781458e-05 7.01031234e-03]
 [1.72933860e-03 3.97783783e-01 5.57686830e-04 ... 7.19659137e-04
  1.01725182e-04 3.25770577e-03]
 [1.33998588e-03 1.11744802e-04 1.90325673e-02 ... 4.45927160e-03
  2.83965146e-06 1.57742635e-03]
 ...
 [2.09883065e-03 2.11331953e-04 4.39887620e-03 ... 7.54171135e-03
  5.81235953e-06 2.49155924e-03]

<__main__.SimilarityNetworkFusion at 0x7fb422e07100>

In [15]:
sm.iterations_fit(matrices_diff=3)

0 : 24.704514507625646
1 : 17.78984100520555
2 : 15.066080451351228
3 : 12.689120687546371
4 : 11.315589319645468
5 : 10.24231369125367
6 : 9.44235398614814
7 : 8.78570206906347
8 : 8.240973546764446
9 : 7.773818901610334
10 : 7.3677313865089
11 : 7.009009551888137
12 : 6.690024524945179
13 : 6.404656878701583
14 : 6.147804341666656
15 : 5.915859102406112
16 : 5.705810403224784
17 : 5.514771961181847
18 : 5.340583763391923
19 : 5.181557909593407
20 : 5.035850317792812
21 : 4.901905204428646
22 : 4.778347293973788
23 : 4.664000449524897
24 : 4.557793243560072
25 : 4.459008720056606
26 : 4.366909147409094
27 : 4.280945380161371
28 : 4.200521668182576
29 : 4.125150090209724
30 : 4.054329661880232
31 : 3.987781545340836
32 : 3.9250287313494496
33 : 3.86569990573929
34 : 3.809610627399268
35 : 3.7564164915859166
36 : 3.705924297344162
37 : 3.657944857545123
38 : 3.6121782361921784
39 : 3.568561752207915
40 : 3.526907440699425
41 : 3.487004892258268
42 : 3.448763099258188
43 : 3.412045603317

<__main__.SimilarityNetworkFusion at 0x7fb422e07100>

In [16]:
sm.local_minimum_fit(iters_to_min=8)

0 : 24.704514507625646
1 : 17.78984100520555
2 : 15.066080451351228
3 : 12.689120687546371
4 : 11.315589319645468
5 : 10.24231369125367
6 : 9.44235398614814
7 : 8.78570206906347
8 : 8.240973546764446
9 : 7.773818901610334
10 : 7.3677313865089
11 : 7.009009551888137
12 : 6.690024524945179
13 : 6.404656878701583
14 : 6.147804341666656
15 : 5.915859102406112
16 : 5.705810403224784
17 : 5.514771961181847
18 : 5.340583763391923
19 : 5.181557909593407
20 : 5.035850317792812
21 : 4.901905204428646
22 : 4.778347293973788
23 : 4.664000449524897
24 : 4.557793243560072
25 : 4.459008720056606
26 : 4.366909147409094
27 : 4.280945380161371
28 : 4.200521668182576
29 : 4.125150090209724
local minimum reached in  29 iterations


<__main__.SimilarityNetworkFusion at 0x7fb422e07100>

In [17]:
import plotly.graph_objects as go

def plot_starting_graphs(df, y_pred):
    G = nx.from_numpy_matrix(df)
    print(nx.info(G))
    nx.draw_networkx(G, with_labels=False)
    return

In [18]:
plot_starting_graphs(sm.p_rna, y_pred)

NameError: name 'y_pred' is not defined

# Loading the label

In [None]:
y_illumina = LabelEncoder().fit_transform(df_illumina.drop_duplicates(subset='case_id').loc[:, 'label'].transform(lambda x: str(x)))
y_mirna = LabelEncoder().fit_transform(df_mirna.drop_duplicates(subset='case_id').loc[:, 'label'].transform(lambda x:  str(x)))
y_rna = LabelEncoder().fit_transform(df_rna.drop_duplicates(subset='case_id').loc[:, 'label'].transform(lambda x: str(x)))

In [None]:
y_pred = SpectralClustering(n_clusters=3, affinity='precomputed').fit(sm.p_mirna).labels_

In [None]:
print('Rand Score:')
print('\tIllumina', adjusted_rand_score(y_illumina, y_pred))
print('\tMirna', adjusted_rand_score(y_mirna, y_pred))
print('\tRNA:', adjusted_rand_score(y_rna, y_pred))
print('\tmean:', (adjusted_rand_score(y_rna, y_pred) + adjusted_rand_score(y_illumina, y_pred) + adjusted_rand_score(y_mirna, y_pred))/3)

print('\n')
print('Silhouette score:')
print('\tIllumina', silhouette_score(sm.p_illumina, y_pred))
print('\tMirna', silhouette_score(sm.p_mirna, y_pred))
print('\tRNA:', silhouette_score(sm.p_rna, y_pred))

In [None]:
y_pred = KMeans(n_clusters=3).fit(sm.p_mirna).labels_
print('Rand Score:')
print('\tIllumina', adjusted_rand_score(y_illumina, y_pred))
print('\tMirna', adjusted_rand_score(y_mirna, y_pred))
print('\tRNA:', adjusted_rand_score(y_rna, y_pred))
print('\n')
print('Silhouette score:')
print('\tIllumina', silhouette_score(sm.p_illumina, y_pred))
print('\tMirna', silhouette_score(sm.p_mirna, y_pred))
print('\tRNA:', silhouette_score(sm.p_rna, y_pred))

In [None]:
#####SI POTREBBE UTILIZZARE COME CONDIZIONE DI TERMINAZIONE CON UN CERTO VALORE
diff_matrix = np.abs(np.subtract(sm.p_rna, sm.p_mirna), sm.p_illumina)
print(np.mean(diff_matrix))