In [1]:
import pandas as pd
import numpy as np
import json
import os

from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

from myclass.CleanMergeDataset import Clean_Merge_Dataset
from myclass.BonferroniTtest import Bonferroni_Ttest

In [2]:
if os.path.exists('final_dataset_common.json') is False:

    data_normal = pd.read_pickle('./data-ready/RNA_dataframe_normal').replace('/', '\\')
    data_tumor = pd.read_pickle('./data-ready/RNA_dataframe').replace('/', '\\')
    dataset_RNA, y_RNA, cases_id_RNA = Clean_Merge_Dataset(name='RNA').transform(data_normal, data_tumor)
    df_RNA = pd.concat([dataset_RNA, cases_id_RNA], axis=1)

    data_normal = pd.read_pickle('./data-ready/miRNA_dataframe_normal').replace('/', '\\')
    data_tumor = pd.read_pickle('./data-ready/miRNA_dataframe').replace('/', '\\')
    dataset_miRNA, y_miRNA, cases_id_miRNA= Clean_Merge_Dataset(name='miRNA').transform(data_normal, data_tumor)
    df_miRNA = pd.concat([dataset_miRNA, cases_id_miRNA], axis=1)

    data_normal = pd.read_pickle('./data-ready/illumina-27-450-normal').replace('/', '\\')
    data_tumor = pd.read_pickle('./data-ready/illumina450-27-tumor').replace('/', '\\')
    dataset_illumina, y_illumina, cases_id_illumina= Clean_Merge_Dataset(name='illumina').transform(data_normal, data_tumor)
    df_illumina = pd.concat([dataset_illumina, cases_id_illumina], axis=1)

    dataset_RNA = Bonferroni_Ttest(label_case_id_into_X=True, alpha=0.05).fit_transform(pd.concat([df_RNA, y_RNA], axis=1), y_RNA)
    dataset_miRNA = Bonferroni_Ttest(label_case_id_into_X=True, alpha=0.05).fit_transform(pd.concat([df_miRNA, y_miRNA], axis=1), y_miRNA)
    dataset_illumina = Bonferroni_Ttest(label_case_id_into_X=True, alpha=0.05).fit_transform(pd.concat([df_illumina, y_illumina], axis=1), y_illumina)

    cases_id = set(dataset_illumina['case_id']) & set(dataset_miRNA['case_id']) & set(dataset_RNA['case_id'])
    df_final_illumina = dataset_illumina.loc[dataset_illumina['case_id'].isin(cases_id)]
    df_final_rna = dataset_RNA.loc[dataset_RNA['case_id'].isin(cases_id)]
    df_final_mirna = dataset_miRNA.loc[dataset_miRNA['case_id'].isin(cases_id)]

    print(df_final_illumina.shape)
    print(df_final_rna.shape)
    print(df_final_mirna.shape)
    
    df_final_illumina.to_pickle('illumina_pickle.pkl')
    df_final_rna.to_pickle('rna_pickle.pkl')
    df_final_mirna.to_pickle('miRNA_pickle.pkl')
    
    my_dict = {
        'miRNA': df_final_mirna.to_dict(),
        'RNA': df_final_rna.to_dict(),
        'illumina': df_final_illumina.to_dict()
    }
    with open('final_dataset_common.json', 'w') as outfile:
        json.dump(my_dict, outfile)
    
    df_illumina = df_final_illumina.copy()
    df_mirna = df_final_mirna.copy()
    df_rna = df_final_rna.copy()
    
    del my_dict
    del df_final_illumina
    del df_final_rna
    del df_final_mirna
    del dataset_illumina
    del dataset_RNA
    del dataset_miRNA
else:
    df_illumina = pd.read_pickle('illumina_pickle.pkl')
    df_mirna = pd.read_pickle('miRNA_pickle.pkl')
    df_rna = pd.read_pickle('rna_pickle.pkl')

In [3]:
import numpy as np
import pandas as pd

from scipy.spatial.distance import pdist, squareform
from copy import deepcopy
from sklearn.preprocessing import StandardScaler

class SimilarityNetworkFusion:
    def __init__(self, df_mirna, df_rna, df_illumina, k=3):
        
        self.cases_id = df_rna.loc[:, 'case_id']
        self.rna = df_rna.copy()
        self.mirna = df_mirna.copy()
        self.illumina = df_illumina.copy()
        
        self.k = k
        self.check_columns()
    
    def calculate_matrix(self):
        self.w_rna = self.__weights__(self.rna, 'RNA')
        self.w_mirna = self.__weights__(self.mirna, 'miRNA')
        self.w_illumina = self.__weights__(self.illumina, 'Illumina')
        
        self.p_rna = self.P_matrix(self.w_rna.to_numpy().tolist(), self.cases_id.shape[0], 'RNA')
        self.p_mirna = self.P_matrix(self.w_mirna.to_numpy().tolist(), self.cases_id.shape[0], 'miRNA')
        self.p_illumina = self.P_matrix(self.w_illumina.to_numpy().tolist(), self.cases_id.shape[0], 'Illumina')
        
        self.s_rna = self.S_matrix(self.w_rna.to_numpy().tolist(), self.cases_id.shape[0], 'RNA')
        self.s_mirna = self.S_matrix(self.w_mirna.to_numpy().tolist(), self.cases_id.shape[0], 'miRNA')
        self.s_illumina = self.S_matrix(self.w_illumina.to_numpy().tolist(), self.cases_id.shape[0], 'Illumina')
        
        return self
        

    def __weights__(self, dataset, name):
        print('Calculating weights for {}...'.format(name))
        df = pd.DataFrame(columns=self.cases_id, data=dataset.T.values)
        
        #calculate euclidean distance
        dist = pdist(dataset, 'euclidean')
        df_dist = pd.DataFrame(columns=self.cases_id, index=self.cases_id, data=squareform(dist))
        weights = df.cov()**2
                
        for i, patient_i in enumerate(tqdm(self.cases_id)):
            for patient_j in self.cases_id.iloc[i:]:
                    topK_i_mean = np.sort(df_dist.loc[patient_i, :].to_numpy())[:self.k].mean()
                    topK_j_mean = np.sort(df_dist.loc[patient_j, :].to_numpy())[:self.k].mean()
                    
                    mean = (topK_i_mean + topK_j_mean)/2

                    weights.loc[patient_i, patient_j] = np.exp(-(weights.loc[patient_i, patient_j]/mean))
                    weights.loc[patient_j, patient_i] = np.exp(-(weights.loc[patient_j, patient_i]/mean))
                    
        return weights        
    
    def check_columns(self):
        scaler = StandardScaler()
        if 'label' in self.mirna.columns:
            self.mirna.drop(['label'], axis=1, inplace=True)
        if 'case_id' in self.mirna.columns:
            self.mirna.drop(['case_id'], axis=1, inplace=True)
            
        if 'label' in self.rna.columns:
            self.rna.drop(['label'], axis=1, inplace=True)
        if 'case_id' in self.rna.columns:
            self.rna.drop(['case_id'], axis=1, inplace=True)
            
        if 'label' in self.illumina.columns:
            self.illumina.drop(['label'], axis=1, inplace=True)
        if 'case_id' in self.illumina.columns:
            self.illumina.drop(['case_id'], axis=1, inplace=True)
            
        self.mirna = pd.DataFrame(scaler.fit_transform(self.mirna))
        self.rna = pd.DataFrame(scaler.fit_transform(self.rna))
        self.illumina = pd.DataFrame(scaler.fit_transform(self.illumina))

        return


    def find_k_neighbors(self, row, i, k=None): 
        row=deepcopy(row)
        #case of P matrix
        if k==None:
            del row[i]  #delete element of the same column of row index
            return row

        #case of S (find k elements with minimum distance value of W[i][j])
        else:
            k_neighbors_index=[]
            neigh = 0
            max_value = max(row)
            for j in range(0, len(row)):
                if j!=i:
                    min_index = row.index(min(row))
                    k_neighbors_index.append(min_index)
                    neigh+=1
                    row[min_index] = max_value
                    if neigh == k:
                        return k_neighbors_index


    def P_matrix(self, W, n_case_id, name):
        print('Calculating P matrix for {}...'.format(name))
        P=[]
        for i in tqdm(range(0, n_case_id)):
            row=[]
            for j in range(0,n_case_id):
                if i==j:
                    row.append(1/2)

                else:
                    k_neighbors = self.find_k_neighbors(W[i], i)
                    denominator = 2*sum(k_neighbors)
                    row.append(W[i][j]/denominator)
            P.append(row)
        print(np.array(P))
        return np.array(P)

    def S_matrix(self, W, n_case_id, name):
        print('Calculating S matrix for {}...'.format(name))
        S=[]
        for i in tqdm(range(0, n_case_id)):
            S_row=[]
            neighbors_indeces = self.find_k_neighbors(W[i], i, self.k)
            for j in range(0,n_case_id):
                if j not in neighbors_indeces:
                    S_row.append(0)

                else:
                    np_row = np.array(W[i])
                    denominator = sum(np_row[neighbors_indeces])
                    S_row.append(W[i][j]/denominator)
            S.append(S_row)
        print(np.array(S))
        return np.array(S)
    
    def product_matrix(self, S_matrix, P_matrix):
        result = np.dot(S_matrix, P_matrix)
        result = np.dot(result, S_matrix.T)
        return result
    
    def sum_matrix_P(self, P1, P2):
        return np.add(P1,P2)/2
    
    def fit(self, num_iter=None):
        if num_iter is not None:
            for i in range(0, num_iter):
                self.p_rna_t1 = self.product_matrix(self.s_rna, self.sum_matrix_P(self.p_mirna, self.p_illumina))
                self.p_mirna_t1 = self.product_matrix(self.s_mirna, self.sum_matrix_P(self.p_rna, self.p_illumina))
                self.p_illumina_t1 = self.product_matrix(self.s_illumina, self.sum_matrix_P(self.p_mirna, self.p_rna))
                print(self.p_rna_t1)
                print(self.p_mirna_t1)
                print(self.p_illumina_t1)
                self.p_rna = self.p_rna_t1
                self.p_mirna = self.p_mirna_t1
                self.p_illumina = self.p_illumina_t1
        else:
            print('ciao')
        return

In [4]:
sm = SimilarityNetworkFusion(df_mirna.drop_duplicates(subset='case_id'),
                            df_rna.drop_duplicates(subset='case_id'),
                            df_illumina.drop_duplicates(subset='case_id'), k=100).calculate_matrix()

Calculating weights for RNA...


100%|████████████████████████████████████████████████████████████████████████████████| 449/449 [02:01<00:00,  3.68it/s]
  0%|                                                                                          | 0/449 [00:00<?, ?it/s]

Calculating weights for miRNA...


100%|████████████████████████████████████████████████████████████████████████████████| 449/449 [01:58<00:00,  3.80it/s]


Calculating weights for Illumina...


100%|████████████████████████████████████████████████████████████████████████████████| 449/449 [01:52<00:00,  4.00it/s]
  0%|                                                                                          | 0/449 [00:00<?, ?it/s]

Calculating P matrix for RNA...


100%|████████████████████████████████████████████████████████████████████████████████| 449/449 [01:34<00:00,  4.75it/s]
  0%|                                                                                          | 0/449 [00:00<?, ?it/s]

[[0.5        0.00111613 0.00111616 ... 0.00111617 0.00111616 0.00111615]
 [0.00111608 0.5        0.00111605 ... 0.00111612 0.00111609 0.00111612]
 [0.00111609 0.00111603 0.5        ... 0.00111609 0.00111609 0.00111607]
 ...
 [0.00111645 0.00111645 0.00111644 ... 0.5        0.00111207 0.00111111]
 [0.00111638 0.00111636 0.00111638 ... 0.001112   0.5        0.00111281]
 [0.00111641 0.00111642 0.00111639 ... 0.00111109 0.00111285 0.5       ]]
Calculating P matrix for miRNA...


100%|████████████████████████████████████████████████████████████████████████████████| 449/449 [03:51<00:00,  1.94it/s]
  0%|                                                                                          | 0/449 [00:00<?, ?it/s]

[[0.5        0.0011168  0.00111708 ... 0.00111707 0.00111705 0.00111704]
 [0.00111611 0.5        0.00111638 ... 0.00111556 0.00111639 0.00111632]
 [0.00111654 0.00111654 0.5        ... 0.00111653 0.00111528 0.0011158 ]
 ...
 [0.00111952 0.00111869 0.00111951 ... 0.5        0.00111427 0.00111736]
 [0.00111705 0.00111707 0.0011158  ... 0.00111183 0.5        0.00111393]
 [0.00111676 0.00111672 0.00111604 ... 0.00111463 0.00111365 0.5       ]]
Calculating P matrix for Illumina...


100%|████████████████████████████████████████████████████████████████████████████████| 449/449 [01:35<00:00,  4.69it/s]
  4%|███▍                                                                             | 19/449 [00:00<00:04, 92.99it/s]

[[0.5        0.00110835 0.0011044  ... 0.00111601 0.00111652 0.00111688]
 [0.00110833 0.5        0.00110883 ... 0.00111597 0.00111677 0.00111675]
 [0.00110431 0.00110876 0.5        ... 0.00111605 0.00111645 0.00111679]
 ...
 [0.00111543 0.00111541 0.00111557 ... 0.5        0.00111612 0.00111627]
 [0.00111591 0.00111618 0.00111593 ... 0.00111609 0.5        0.00111596]
 [0.00111613 0.00111602 0.00111614 ... 0.0011161  0.00111582 0.5       ]]
Calculating S matrix for RNA...


100%|████████████████████████████████████████████████████████████████████████████████| 449/449 [00:04<00:00, 91.01it/s]
  2%|█▊                                                                               | 10/449 [00:00<00:05, 87.52it/s]

[[0.00991924 0.         0.         ... 0.         0.         0.        ]
 [0.         0.00990804 0.01000167 ... 0.         0.         0.        ]
 [0.         0.01000086 0.00993474 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.00992819 0.00997236 0.00996382]
 [0.         0.         0.         ... 0.0099709  0.00993518 0.00997817]
 [0.         0.         0.         ... 0.00996309 0.00997889 0.00993346]]
Calculating S matrix for miRNA...


100%|████████████████████████████████████████████████████████████████████████████████| 449/449 [00:04<00:00, 90.96it/s]
  2%|█▊                                                                               | 10/449 [00:00<00:04, 97.86it/s]

[[0.00930747 0.         0.         ... 0.         0.         0.        ]
 [0.         0.00937907 0.         ... 0.01000825 0.         0.        ]
 [0.         0.         0.0091106  ... 0.         0.01001101 0.0100157 ]
 ...
 [0.         0.         0.         ... 0.00973063 0.01005923 0.        ]
 [0.         0.         0.         ... 0.00998334 0.00969212 0.01000224]
 [0.         0.         0.         ... 0.01000951 0.01000067 0.00906763]]
Calculating S matrix for Illumina...


100%|████████████████████████████████████████████████████████████████████████████████| 449/449 [00:04<00:00, 91.30it/s]

[[0.00995304 0.0099472  0.00991175 ... 0.         0.         0.        ]
 [0.00994608 0.0099561  0.00995059 ... 0.         0.         0.        ]
 [0.0099095  0.00994946 0.00995295 ... 0.         0.         0.        ]
 ...
 [0.00999836 0.00999818 0.00999962 ... 0.00991955 0.         0.        ]
 [0.01000161 0.         0.01000182 ... 0.         0.00990241 0.01000205]
 [0.         0.01000163 0.         ... 0.         0.00999984 0.00991563]]





In [5]:
sm.fit(num_iter=3)

[[0.00610495 0.00251315 0.0022637  ... 0.00286328 0.00291298 0.00291318]
 [0.00251327 0.00610479 0.00201347 ... 0.00251384 0.00281318 0.0022642 ]
 [0.00226366 0.00201331 0.00610497 ... 0.0015649  0.00161476 0.00156487]
 ...
 [0.00286331 0.00251375 0.00156497 ... 0.00610486 0.00485619 0.00535554]
 [0.00291309 0.00281316 0.00161491 ... 0.00485626 0.00610479 0.00445685]
 [0.00291321 0.00226411 0.00156494 ... 0.00535554 0.00445678 0.00610487]]
[[0.00610517 0.00276361 0.0019657  ... 0.00246985 0.00226497 0.00266535]
 [0.00276361 0.00610512 0.00226552 ... 0.00275871 0.00241357 0.00196565]
 [0.0019657  0.00226552 0.00610533 ... 0.00211807 0.0032114  0.00281018]
 ...
 [0.00246986 0.00275872 0.00211808 ... 0.00610605 0.00439818 0.00370367]
 [0.00226499 0.00241359 0.00321142 ... 0.0043982  0.00610496 0.00440505]
 [0.00266538 0.00196568 0.00281021 ... 0.00370369 0.00440506 0.00610532]]
[[0.00610494 0.0049046  0.0059048  ... 0.00430768 0.00341196 0.00181446]
 [0.00490457 0.00610494 0.00490468 ... 

In [None]:
sm.p_rna

In [None]:
sm.p_illumina 

In [None]:
sm.p_mirna