In [103]:
import pandas as pd
import numpy as np
import json
import os
import networkx as nx
#import pickle5 as pickle

from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.preprocessing import LabelEncoder

from myclass.CleanMergeDataset import Clean_Merge_Dataset
from myclass.BonferroniTtest import Bonferroni_Ttest

In [104]:
if os.path.exists('final_dataset_common.json') is False:

    data_normal = pd.read_pickle('./data-ready/RNA_dataframe_normal').replace('/', '\\')
    data_tumor = pd.read_pickle('./data-ready/RNA_dataframe').replace('/', '\\')
    dataset_RNA, y_RNA, cases_id_RNA = Clean_Merge_Dataset(name='RNA').transform(data_normal, data_tumor)
    df_RNA = pd.concat([dataset_RNA, cases_id_RNA], axis=1)

    data_normal = pd.read_pickle('./data-ready/miRNA_dataframe_normal').replace('/', '\\')
    data_tumor = pd.read_pickle('./data-ready/miRNA_dataframe').replace('/', '\\')
    dataset_miRNA, y_miRNA, cases_id_miRNA= Clean_Merge_Dataset(name='miRNA').transform(data_normal, data_tumor)
    df_miRNA = pd.concat([dataset_miRNA, cases_id_miRNA], axis=1)

    data_normal = pd.read_pickle('./data-ready/illumina-27-450-normal').replace('/', '\\')
    data_tumor = pd.read_pickle('./data-ready/illumina450-27-tumor').replace('/', '\\')
    dataset_illumina, y_illumina, cases_id_illumina= Clean_Merge_Dataset(name='illumina').transform(data_normal, data_tumor)
    df_illumina = pd.concat([dataset_illumina, cases_id_illumina], axis=1)

    dataset_RNA = Bonferroni_Ttest(label_case_id_into_X=True, alpha=0.05).fit_transform(pd.concat([df_RNA, y_RNA], axis=1), y_RNA)
    dataset_miRNA = Bonferroni_Ttest(label_case_id_into_X=True, alpha=0.05).fit_transform(pd.concat([df_miRNA, y_miRNA], axis=1), y_miRNA)
    dataset_illumina = Bonferroni_Ttest(label_case_id_into_X=True, alpha=0.05).fit_transform(pd.concat([df_illumina, y_illumina], axis=1), y_illumina)

    cases_id = set(dataset_illumina['case_id']) & set(dataset_miRNA['case_id']) & set(dataset_RNA['case_id'])
    df_final_illumina = dataset_illumina.loc[dataset_illumina['case_id'].isin(cases_id)]
    df_final_rna = dataset_RNA.loc[dataset_RNA['case_id'].isin(cases_id)]
    df_final_mirna = dataset_miRNA.loc[dataset_miRNA['case_id'].isin(cases_id)]

    print(df_final_illumina.shape)
    print(df_final_rna.shape)
    print(df_final_mirna.shape)
    
    df_final_illumina.to_pickle('illumina_pickle.pkl')
    df_final_rna.to_pickle('rna_pickle.pkl')
    df_final_mirna.to_pickle('miRNA_pickle.pkl')
    
    my_dict = {
        'miRNA': df_final_mirna.to_dict(),
        'RNA': df_final_rna.to_dict(),
        'illumina': df_final_illumina.to_dict()
    }
    with open('final_dataset_common.json', 'w') as outfile:
        json.dump(my_dict, outfile)
    
    df_illumina = df_final_illumina.copy()
    df_mirna = df_final_mirna.copy()
    df_rna = df_final_rna.copy()
    
    del my_dict
    del df_final_illumina
    del df_final_rna
    del df_final_mirna
    del dataset_illumina
    del dataset_RNA
    del dataset_miRNA
else:
    df_illumina = pd.read_pickle('illumina_pickle.pkl')
    df_mirna = pd.read_pickle('miRNA_pickle.pkl')
    df_rna = pd.read_pickle('rna_pickle.pkl')
    #with open('illumina_pickle.pkl' ,'rb') as f:
        #df_illumina = pickle.load(f)
    #with open('mirna_pickle.pkl' ,'rb') as f:
        #df_mirna = pickle.load(f)
    #with open('rna_pickle.pkl' ,'rb') as f:
        #df_rna = pickle.load(f)

In [105]:
print(df_illumina.shape)
print(df_mirna.shape)
print(df_rna.shape)

(493, 15700)
(473, 237)
(508, 12965)


In [106]:
case_id_new = list()
for i, row in df_mirna.iterrows():
    case_id_new.append(row['case_id'] + '_' + str(row['label']))
df_mirna['case_id_new'] = case_id_new

case_id_new = list()
for i, row in df_rna.iterrows():
    case_id_new.append(row['case_id'] + '_' + str(row['label'])) 
local_df = pd.DataFrame(new_col)
df_rna['case_id_new'] = case_id_new

case_id_new = list()
for i, row in df_illumina.iterrows():
    case_id_new.append(row['case_id'] + '_' + str(row['label']))
df_illumina['case_id_new'] = case_id_new

In [107]:
cases_id = set(df_mirna['case_id_new']) & set(df_rna['case_id_new']) & set(df_illumina['case_id_new'])
df_illumina = df_illumina.loc[df_illumina['case_id_new'].isin(cases_id)]
df_rna = df_rna.loc[df_rna['case_id_new'].isin(cases_id)]
df_mirna = df_mirna.loc[df_mirna['case_id_new'].isin(cases_id)]
print(df_mirna.shape)
print(df_rna.shape)
print(df_illumina.shape)

(430, 238)
(430, 12966)
(430, 15701)


In [108]:
df_illumina.drop(columns=['case_id'], inplace=True, axis=1)
df_mirna.drop(columns=['case_id'], inplace=True, axis=1)
df_rna.drop(columns=['case_id'], inplace=True, axis=1)

In [109]:
import numpy as np
import pandas as pd
import os

from scipy.spatial.distance import pdist, squareform, cdist
from copy import deepcopy
from sklearn.preprocessing import StandardScaler

class SimilarityNetworkFusion:
    def __init__(self, df_mirna, df_rna, df_illumina, k=3, mu=0.3):
        
        self.cases_id = df_rna.loc[:, 'case_id_new']
        self.rna = df_rna.copy()
        self.mirna = df_mirna.copy()
        self.illumina = df_illumina.copy()
        
        self.k = k
        self.mu = mu
        self.check_columns()
    
    def calculate_matrix(self):
        self.dict_dist = self.calculate_sim_matrix()
        if hasattr(self, 'w_rna') is False:
            self.w_rna = self.__weights__(self.rna, 'RNA', save_matrix=True)
            self.w_mirna = self.__weights__(self.mirna, 'miRNA', save_matrix=True)
            self.w_illumina = self.__weights__(self.illumina, 'Illumina', save_matrix=True)
        
        if hasattr(self, 'p_rna') is False:
            self.starting_p_rna = self.P_matrix(self.w_rna.to_numpy().tolist(), self.cases_id.shape[0], 'RNA', save_matrix=True)
            self.starting_p_mirna = self.P_matrix(self.w_mirna.to_numpy().tolist(), self.cases_id.shape[0], 'miRNA', save_matrix=True)
            self.starting_p_illumina = self.P_matrix(self.w_illumina.to_numpy().tolist(), self.cases_id.shape[0], 'Illumina', save_matrix=True)
                
        self.s_rna = self.S_matrix(self.w_rna.to_numpy().tolist(), self.cases_id.shape[0], 'RNA')
        self.s_mirna = self.S_matrix(self.w_mirna.to_numpy().tolist(), self.cases_id.shape[0], 'miRNA')
        self.s_illumina = self.S_matrix(self.w_illumina.to_numpy().tolist(), self.cases_id.shape[0], 'Illumina')
        
        return self
    
    def calculate_sim_matrix(self):
        tot = 0
        distance = 'euclidean'
        tot += pdist(self.rna, distance)
        tot += pdist(self.mirna, distance)
        tot += pdist(self.illumina, distance)
        tot = tot/3 
        df_dist = pd.DataFrame(columns=self.cases_id, index=self.cases_id, data=squareform(tot))
        
        return df_dist

    def __weights__(self, dataset, name, save_matrix=False):
        
        if 'weights_matrix_'+name+'.pkl' in os.listdir('.'):
            print('Read file pickle for weights matrix of {}'.format(name))
            weights = pd.read_pickle('weights_matrix_'+name+'.pkl')
            #dist = pdist(dataset, 'euclidean')
            #df_dist = pd.DataFrame(columns=self.cases_id, index=self.cases_id, data=squareform(dist))
            #self.dict_dist[name] = df_dist.copy()
            return weights
        
        print('Calculating weights for {}...'.format(name))
        df = pd.DataFrame(columns=self.cases_id, data=dataset.T.values)
        
        #calculate euclidean distance
        dist = pdist(dataset, 'euclidean')
        df_dist = pd.DataFrame(columns=self.cases_id, index=self.cases_id, data=squareform(dist))
        weights = pd.DataFrame(columns=self.cases_id, index=self.cases_id, data=[])
                
        for i, patient_i in enumerate(tqdm(self.cases_id)):
            for patient_j in self.cases_id.iloc[i:]:
                    tokK_mean_i = np.sort(df_dist.loc[patient_i, :].to_numpy())[:self.k].mean()
                    topK_mean_j = np.sort(df_dist.loc[patient_j, :].to_numpy())[:self.k].mean()
                    
                    eps = (tokK_mean_i + tokK_mean_i + df_dist.loc[patient_i, patient_j])/3

                    weights.loc[patient_i, patient_j] = np.exp(-(df_dist.loc[patient_i, patient_j]**2/(eps*self.mu)))
                    weights.loc[patient_j, patient_i] = np.exp(-(df_dist.loc[patient_j, patient_i]**2/(eps*self.mu)))
        if save_matrix:
            weights.to_pickle('weights_matrix_'+name+'.pkl')
        #self.dict_dist[name] = df_dist.copy()
        return weights       
    
    def check_columns(self):
        scaler = MinMaxScaler()
        if 'label' in self.mirna.columns:
            self.mirna.drop(['label'], axis=1, inplace=True)
        if 'case_id_new' in self.mirna.columns:
            self.mirna.drop(['case_id_new'], axis=1, inplace=True)
            
        if 'label' in self.rna.columns:
            self.rna.drop(['label'], axis=1, inplace=True)
        if 'case_id_new' in self.rna.columns:
            self.rna.drop(['case_id_new'], axis=1, inplace=True)
            
        if 'label' in self.illumina.columns:
            self.illumina.drop(['label'], axis=1, inplace=True)
        if 'case_id_new' in self.illumina.columns:
            self.illumina.drop(['case_id_new'], axis=1, inplace=True)
            
        self.mirna = pd.DataFrame(scaler.fit_transform(self.mirna))
        self.rna = pd.DataFrame(scaler.fit_transform(self.rna))
        self.illumina = pd.DataFrame(scaler.fit_transform(self.illumina))

        return


    def find_k_neighbors(self, row, i, k=None): 
        row=deepcopy(row)
        #case of P matrix
        if k==None:
            del row[i]  #delete element of the same column of row index
            return row

        #case of S (find k elements with minimum distance value of W[i][j])
        else:
            k_neighbors_index=[]
            neigh = 0
            max_value = max(row)
            for j in range(0, len(row)):
                if j!=i:
                    min_index = row.index(min(row))
                    k_neighbors_index.append(min_index)
                    neigh+=1
                    row[min_index] = max_value
                    if neigh == k:
                        return k_neighbors_index


    def P_matrix(self, W, n_case_id, name, save_matrix=False):
        if 'pStarting_matrix_'+name+'.pkl' in os.listdir('.'):
            print('Reading the file pickle for the p starting matrix {}'.format(name))
            df_p = pd.read_pickle('pStarting_matrix_'+name+'.pkl')
            return df_p.to_numpy()
            
        print('Calculating P matrix for {}...'.format(name))
        P=[]
        for i in tqdm(range(0, n_case_id)):
            row=[]
            for j in range(0,n_case_id):
                if i==j:
                    row.append(1/2)

                else:
                    k_neighbors = self.find_k_neighbors(W[i], i)
                    denominator = 2*sum(k_neighbors)
                    row.append(W[i][j]/denominator)
            P.append(row)
        #print(np.array(P))
        
        if save_matrix:
            df_P = pd.DataFrame(np.array(P))
            df_P.to_pickle('pStarting_matrix_'+name+'.pkl')
        return np.array(P)

    def S_matrix(self, W, n_case_id, name):
        print('Calculating S matrix for {}...'.format(name))
        S=[]
        for i in tqdm(range(0, n_case_id)):
            S_row=[]
            neighbors_indeces = self.find_k_neighbors(self.dict_dist.iloc[i,:].to_numpy().tolist(), i, self.k)
            for j in range(0,n_case_id):
                if j not in neighbors_indeces:
                    S_row.append(0)

                else:
                    np_row = np.array(W[i])
                    denominator = sum(np_row[neighbors_indeces])
                    S_row.append(W[i][j]/denominator)
            S.append(S_row)
        print(np.array(S))
        return np.array(S)
    
    def product_matrix(self, S_matrix, P_matrix):
        result = np.dot(S_matrix, P_matrix)
        result = np.dot(result, S_matrix.T)
        return result
    
    def sum_matrix_P(self, P1, P2):
        return np.add(P1,P2)/2
    
    def fit(self, num_iter=None):
        if num_iter is not None:
            self.p_rna = self.starting_p_rna.copy()
            self.p_mirna = self.starting_p_mirna.copy()
            self.p_illumina = self.starting_p_illumina.copy()
            for i in range(0, num_iter):
                self.p_rna_t1 = self.product_matrix(self.s_rna, self.sum_matrix_P(self.p_mirna, self.p_illumina))
                self.p_mirna_t1 = self.product_matrix(self.s_mirna, self.sum_matrix_P(self.p_rna, self.p_illumina))
                self.p_illumina_t1 = self.product_matrix(self.s_illumina, self.sum_matrix_P(self.p_mirna, self.p_rna))
                #print(self.p_rna_t1)
                #print(self.p_mirna_t1)
                #print(self.p_illumina_t1)
                self.p_rna = self.p_rna_t1
                self.p_mirna = self.p_mirna_t1
                self.p_illumina = self.p_illumina_t1
        else:
            print('ciao')

        return self
    
   
    def iterations_fit(self, matrices_diff=None, max_iter=100):
        if matrices_diff is not None:
            self.p_rna = self.starting_p_rna.copy()
            self.p_mirna = self.starting_p_mirna.copy()
            self.p_illumina = self.starting_p_illumina.copy()
            
            for step in range(0, max_iter):
                self.p_rna_t1 = self.product_matrix(self.s_rna, self.sum_matrix_P(self.p_mirna, self.p_illumina))
                self.p_mirna_t1 = self.product_matrix(self.s_mirna, self.sum_matrix_P(self.p_rna, self.p_illumina))
                self.p_illumina_t1 = self.product_matrix(self.s_illumina, self.sum_matrix_P(self.p_mirna, self.p_rna))
               
                self.p_rna = self.p_rna_t1
                self.p_mirna = self.p_mirna_t1
                self.p_illumina = self.p_illumina_t1

                diff_matrix = 0
                for i in range(0, len(self.p_rna)):
                    for j in range(i, len(self.p_rna)):
                        diff_matrix += np.abs(self.p_rna[i][j] - self.p_mirna[i][j])
                        diff_matrix += np.abs(self.p_illumina[i][j] - self.p_mirna[i][j])
                        diff_matrix += np.abs(self.p_illumina[i][j] - self.p_rna[i][j])
                
                diff_matrix = diff_matrix**0.5
                print(step, ':', diff_matrix)
                
                #diff_matrix = np.abs(np.subtract(self.p_rna, self.p_mirna)) + np.abs(np.subtract(self.p_rna, self.p_illumina)) + np.abs(np.subtract(self.p_mirna, self.p_illumina))
                #diff_matrix= np.abs(np.mean(diff_matrix))
                if diff_matrix<=np.abs(matrices_diff):
                    print('number of iterations to reach difference: ', step)
                    break
                    
                if step == max_iter-1: ##impossible to reach matrices difference
                    print('impossible to reach indicated difference, try with a bigger difference value')
        else:
            print('no difference for matrices found')

        return self
    
    def local_minimum_fit(self, iters_to_min=None, max_iter=100):
        if iters_to_min is not None:
            self.p_rna = self.starting_p_rna.copy()
            self.p_mirna = self.starting_p_mirna.copy()
            self.p_illumina = self.starting_p_illumina.copy()
            count=0
            prev_diff=0
            for step in range(0, max_iter):
                self.p_rna_t1 = self.product_matrix(self.s_rna, self.sum_matrix_P(self.p_mirna, self.p_illumina))
                self.p_mirna_t1 = self.product_matrix(self.s_mirna, self.sum_matrix_P(self.p_rna, self.p_illumina))
                self.p_illumina_t1 = self.product_matrix(self.s_illumina, self.sum_matrix_P(self.p_mirna, self.p_rna))
               
                self.p_rna = self.p_rna_t1
                self.p_mirna = self.p_mirna_t1
                self.p_illumina = self.p_illumina_t1

                diff_matrix = 0
                for i in range(0, len(self.p_rna)):
                    for j in range(i, len(self.p_rna)):
                        diff_matrix += np.abs(self.p_rna[i][j] - self.p_mirna[i][j])
                        diff_matrix += np.abs(self.p_illumina[i][j] - self.p_mirna[i][j])
                        diff_matrix += np.abs(self.p_illumina[i][j] - self.p_rna[i][j])
                
                diff_matrix = diff_matrix**0.5
                print(step, ':', diff_matrix)
                
                #diff_matrix = np.abs(np.subtract(self.p_rna, self.p_mirna)) + np.abs(np.subtract(self.p_rna, self.p_illumina)) + np.abs(np.subtract(self.p_mirna, self.p_illumina))
                #diff_matrix= np.abs(np.mean(diff_matrix))
                
                #check if a local minimum is found
                if int(diff_matrix)==prev_diff:
                    count+=1
                    if count>=iters_to_min:
                        print('local minimum reached in ', step, 'iterations')
                        break
                else:
                    count=0
                    
                prev_diff = int(diff_matrix)
                    
                if step == max_iter-1: ##impossible to reach matrices difference
                    print('impossible to reach local minimum, matrices seem to not converge')
        else:
            print('no minimum iterations for matrices found')

        return self
    
    def clean(self):
        del self.p_rna
        del self.p_mirna
        del self.p_illumina
        
        del self.p_rna_t1
        del self.p_mirna_t1
        del self.p_illumina_t1
        
        
        del self.w_rna
        del self.w_mirna
        del self.w_illumina
        
        return self

In [110]:
df_mirna.sort_values(by='case_id_new', inplace=True)
df_rna.sort_values(by='case_id_new', inplace=True)
df_illumina.sort_values(by='case_id_new', inplace=True)

sm = SimilarityNetworkFusion(df_mirna,
                            df_rna,
                            df_illumina, k=100).calculate_matrix()

Calculating weights for RNA...


100%|██████████| 430/430 [02:12<00:00,  3.26it/s]
  0%|          | 0/430 [00:00<?, ?it/s]

Calculating weights for miRNA...


100%|██████████| 430/430 [01:57<00:00,  3.66it/s]


Calculating weights for Illumina...


100%|██████████| 430/430 [01:25<00:00,  5.05it/s]
  0%|          | 0/430 [00:00<?, ?it/s]

Calculating P matrix for RNA...


100%|██████████| 430/430 [04:18<00:00,  1.67it/s]
  0%|          | 0/430 [00:00<?, ?it/s]

Calculating P matrix for miRNA...


100%|██████████| 430/430 [04:10<00:00,  1.71it/s]
  0%|          | 0/430 [00:00<?, ?it/s]

Calculating P matrix for Illumina...


100%|██████████| 430/430 [03:53<00:00,  1.84it/s]
  2%|▏         | 8/430 [00:00<00:05, 78.99it/s]

Calculating S matrix for RNA...


100%|██████████| 430/430 [00:05<00:00, 80.28it/s]
  2%|▏         | 9/430 [00:00<00:05, 82.32it/s]

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 5.00270315e-39 ... 1.00000000e+00
  8.80181522e-36 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [0.00000000e+00 3.83991750e-31 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 1.00000000e+00]]
Calculating S matrix for miRNA...


100%|██████████| 430/430 [00:05<00:00, 79.12it/s]
  2%|▏         | 8/430 [00:00<00:05, 74.06it/s]

[[9.47083016e-01 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 9.92648358e-01 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 8.63967067e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 4.89429933e-08 ... 9.99985429e-01
  3.63694198e-06 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  7.94409972e-01 0.00000000e+00]
 [0.00000000e+00 2.84072813e-06 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 9.99285037e-01]]
Calculating S matrix for Illumina...


100%|██████████| 430/430 [00:06<00:00, 70.75it/s]

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 1.15539506e-55 ... 1.00000000e+00
  2.37360213e-45 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [0.00000000e+00 7.92269821e-36 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 1.00000000e+00]]





In [111]:
sm.fit(num_iter=50)

<__main__.SimilarityNetworkFusion at 0x7fb37c81f520>

In [112]:
sm.iterations_fit(matrices_diff=3)

0 : 24.2675195095318
1 : 17.570515783431524
2 : 14.80683273079802
3 : 12.493153185378894
4 : 11.074258315168379
5 : 9.991503848137087
6 : 9.162455005684473
7 : 8.496660814992122
8 : 7.940946193455418
9 : 7.471442703250472
10 : 7.064576044267386
11 : 6.711158298185656
12 : 6.399146930255973
13 : 6.12300695737132
14 : 5.87649912536672
15 : 5.655608169946378
16 : 5.457028002568043
17 : 5.277464475209049
18 : 5.114030904828196
19 : 4.964903772046636
20 : 4.828368256090836
21 : 4.70282571778883
22 : 4.58695764399404
23 : 4.479673247134771
24 : 4.380046991528198
25 : 4.287195869935415
26 : 4.200543073611898
27 : 4.119674048845961
28 : 4.044108611443047
29 : 3.973136710288096
30 : 3.9062743117050673
31 : 3.8431890613172137
32 : 3.7834511722968065
33 : 3.726630731469131
34 : 3.6725098691808684
35 : 3.620878792832157
36 : 3.571529686161098
37 : 3.5242787151270294
38 : 3.4789086727142555
39 : 3.4352768345270377
40 : 3.3933481437559823
41 : 3.352930012875422
42 : 3.3138439294567967
43 : 3.2760243

<__main__.SimilarityNetworkFusion at 0x7fb37c81f520>

In [116]:
sm.local_minimum_fit(iters_to_min=8)

0 : 24.2675195095318
1 : 17.570515783431524
2 : 14.80683273079802
3 : 12.493153185378894
4 : 11.074258315168379
5 : 9.991503848137087
6 : 9.162455005684473
7 : 8.496660814992122
8 : 7.940946193455418
9 : 7.471442703250472
10 : 7.064576044267386
11 : 6.711158298185656
12 : 6.399146930255973
13 : 6.12300695737132
14 : 5.87649912536672
15 : 5.655608169946378
16 : 5.457028002568043
17 : 5.277464475209049
18 : 5.114030904828196
19 : 4.964903772046636
20 : 4.828368256090836
21 : 4.70282571778883
22 : 4.58695764399404
23 : 4.479673247134771
24 : 4.380046991528198
25 : 4.287195869935415
26 : 4.200543073611898
27 : 4.119674048845961
local minimum reached in  27 iterations


<__main__.SimilarityNetworkFusion at 0x7fb37c81f520>

In [None]:
import plotly.graph_objects as go

def plot_starting_graphs(df, y_pred):
    G = nx.from_numpy_matrix(df)
    print(nx.info(G))
    nx.draw_networkx(G, with_labels=False)
    return

In [None]:
plot_starting_graphs(sm.p_rna, y_pred)

# Loading the label

In [117]:
y_illumina = LabelEncoder().fit_transform(df_illumina.loc[:, 'label'].transform(lambda x: str(x)))
y_mirna = LabelEncoder().fit_transform(df_mirna.loc[:, 'label'].transform(lambda x:  str(x)))
y_rna = LabelEncoder().fit_transform(df_rna.loc[:, 'label'].transform(lambda x: str(x)))

In [118]:
y_pred = SpectralClustering(n_clusters=3, affinity='precomputed').fit(sm.p_mirna).labels_

  return f(**kwargs)


In [119]:
print('Rand Score:')
print('\tIllumina', adjusted_rand_score(y_illumina, y_pred))
print('\tMirna', adjusted_rand_score(y_mirna, y_pred))
print('\tRNA:', adjusted_rand_score(y_rna, y_pred))
print('\tmean:', (adjusted_rand_score(y_rna, y_pred) + adjusted_rand_score(y_illumina, y_pred) + adjusted_rand_score(y_mirna, y_pred))/3)

print('\n')
print('Silhouette score:')
print('\tIllumina', silhouette_score(sm.p_illumina, y_pred))
print('\tMirna', silhouette_score(sm.p_mirna, y_pred))
print('\tRNA:', silhouette_score(sm.p_rna, y_pred))

Rand Score:
	Illumina 0.8808569143992506
	Mirna 0.8808569143992506
	RNA: 0.8808569143992506
	mean: 0.8808569143992506


Silhouette score:
	Illumina 0.20672284127582458
	Mirna 0.2103121048452716
	RNA: 0.20672284127369672


In [None]:
y_pred = KMeans(n_clusters=3).fit(sm.p_mirna).labels_
print('Rand Score:')
print('\tIllumina', adjusted_rand_score(y_illumina, y_pred))
print('\tMirna', adjusted_rand_score(y_mirna, y_pred))
print('\tRNA:', adjusted_rand_score(y_rna, y_pred))
print('\n')
print('Silhouette score:')
print('\tIllumina', silhouette_score(sm.p_illumina, y_pred))
print('\tMirna', silhouette_score(sm.p_mirna, y_pred))
print('\tRNA:', silhouette_score(sm.p_rna, y_pred))

In [None]:
#####SI POTREBBE UTILIZZARE COME CONDIZIONE DI TERMINAZIONE CON UN CERTO VALORE
diff_matrix = np.abs(np.subtract(sm.p_rna, sm.p_mirna), sm.p_illumina)
print(np.mean(diff_matrix))

**Prove con geni di ATLAS**
da aggiungere il controllo su case_id_new

In [None]:
data_normal = pd.read_pickle('./data-ready/RNA_dataframe_normal').replace('/', '\\')
data_tumor = pd.read_pickle('./data-ready/RNA_dataframe').replace('/', '\\')
dataset_RNA, y_RNA, cases_id_RNA = Clean_Merge_Dataset(name='RNA').transform(data_normal, data_tumor)
dataset_RNA = pd.concat([dataset_RNA, cases_id_RNA, y_RNA], axis=1)

data_normal = pd.read_pickle('./data-ready/miRNA_dataframe_normal').replace('/', '\\')
data_tumor = pd.read_pickle('./data-ready/miRNA_dataframe').replace('/', '\\')
dataset_miRNA, y_miRNA, cases_id_miRNA= Clean_Merge_Dataset(name='miRNA').transform(data_normal, data_tumor)
df_miRNA = pd.concat([dataset_miRNA, cases_id_miRNA], axis=1)

data_normal = pd.read_pickle('./data-ready/illumina-27-450-normal').replace('/', '\\')
data_tumor = pd.read_pickle('./data-ready/illumina450-27-tumor').replace('/', '\\')
dataset_illumina, y_illumina, cases_id_illumina= Clean_Merge_Dataset(name='illumina').transform(data_normal, data_tumor)
df_illumina = pd.concat([dataset_illumina, cases_id_illumina], axis=1)

dataset_miRNA = Bonferroni_Ttest(label_case_id_into_X=True, alpha=0.05).fit_transform(pd.concat([df_miRNA, y_miRNA], axis=1), y_miRNA)
dataset_illumina = Bonferroni_Ttest(label_case_id_into_X=True, alpha=0.05).fit_transform(pd.concat([df_illumina, y_illumina], axis=1), y_illumina)

cases_id = set(dataset_illumina['case_id']) & set(dataset_miRNA['case_id']) & set(dataset_RNA['case_id'])
df_final_illumina = dataset_illumina.loc[dataset_illumina['case_id'].isin(cases_id)]
df_final_rna = dataset_RNA.loc[dataset_RNA['case_id'].isin(cases_id)]
df_final_mirna = dataset_miRNA.loc[dataset_miRNA['case_id'].isin(cases_id)]

print(df_final_illumina.shape)
print(df_final_rna.shape)
print(df_final_mirna.shape)

df_illumina = df_final_illumina.copy()
df_mirna = df_final_mirna.copy()
df_rna = df_final_rna.copy()

with open('all_ensgs_no_version.json') as f:
    genes = json.load(f)
print('Number of genes available:', len(genes))
print('Starting shape RNA:',df_rna.shape)
columns = [col.split('.')[0] for col in df_rna.columns]
columns = [original for gene, original in zip(columns, df_rna.columns) if gene in genes]
columns.append('case_id')
columns.append('label')
df_rna = df_rna.loc[:, columns]
print('Final shape RNA:', df_rna.shape)

In [None]:
df_mirna.sort_values(by='case_id', inplace=True)
df_rna.sort_values(by='case_id', inplace=True)
df_illumina.sort_values(by='case_id', inplace=True)

sm = SimilarityNetworkFusion(df_mirna.drop_duplicates(subset='case_id'),
                            df_rna.drop_duplicates(subset='case_id'),
                            df_illumina.drop_duplicates(subset='case_id'), k=100).calculate_matrix()

In [None]:
sm.local_minimum_fit(iters_to_min=8)
#sm.iterations_fit(matrices_diff=3)

In [None]:
y_illumina = LabelEncoder().fit_transform(df_illumina.drop_duplicates(subset='case_id').loc[:, 'label'].transform(lambda x: str(x)))
y_mirna = LabelEncoder().fit_transform(df_mirna.drop_duplicates(subset='case_id').loc[:, 'label'].transform(lambda x:  str(x)))
y_rna = LabelEncoder().fit_transform(df_rna.drop_duplicates(subset='case_id').loc[:, 'label'].transform(lambda x: str(x)))

y_pred = SpectralClustering(n_clusters=3, affinity='precomputed').fit(sm.p_mirna).labels_

print('Rand Score:')
print('\tIllumina', adjusted_rand_score(y_illumina, y_pred))
print('\tMirna', adjusted_rand_score(y_mirna, y_pred))
print('\tRNA:', adjusted_rand_score(y_rna, y_pred))
print('\tmean:', (adjusted_rand_score(y_rna, y_pred) + adjusted_rand_score(y_illumina, y_pred) + adjusted_rand_score(y_mirna, y_pred))/3)

print('\n')
print('Silhouette score:')
print('\tIllumina', silhouette_score(sm.p_illumina, y_pred))
print('\tMirna', silhouette_score(sm.p_mirna, y_pred))
print('\tRNA:', silhouette_score(sm.p_rna, y_pred))