# Index
* [Load data](#load-data)
* [Way to fit the data](#way-to-fit)
    * [Standard approach](#standard-approach)
    * [Difference between matrices](#difference-matrix)
    * [Local minimum approach](#local-minimum)
* [Clustering and results](#Clustering-and-results)
    * [Analysis for standard fit approach](#Analysis-for-standard-fit-approach)
    * [Analysis for local minimum fit approach](#Analysis-for-local-minimum-fit-approach)
    * [Analysis for difference matrix fit approach](#Analysis-for-difference-matrix-fit-approach)
* [Clinical data](#Clinical-data)
* [Clustering with K-Means algorithm](#Clustering-with-K-Means-algorithm)
* [Genes from TCGA](#Genes-from-TCGA)

In [1]:
import pandas as pd
import numpy as np
import json
import os
import networkx as nx
import plotly.express as px
import plotly.graph_objects as go

from tqdm import tqdm
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.preprocessing import LabelEncoder

from myclass.CleanMergeDataset import Clean_Merge_Dataset
from myclass.BonferroniTtest import Bonferroni_Ttest
from myclass.ExtractClinicalCase import ExtractClinicalCase
from myclass.SimilarityNetworkFusion import SimilarityNetworkFusion

<a id='load-data'></a>
# Load data
Loading the datasets, clean them through class *Clean_Merge_Dataset*, reduce the number of features with *Bonferroni_Ttest* and cosider only the cases_id common in the three omnics.

In [2]:
if os.path.exists('./data-ready/final_dataset_common.json') is False:

    data_normal = pd.read_pickle('./data-ready/RNA_dataframe_normal').replace('/', '\\')
    data_tumor = pd.read_pickle('./data-ready/RNA_dataframe').replace('/', '\\')
    dataset_RNA, y_RNA, cases_id_RNA = Clean_Merge_Dataset(name='RNA').transform(data_normal, data_tumor)
    df_RNA = pd.concat([dataset_RNA, cases_id_RNA], axis=1)

    data_normal = pd.read_pickle('./data-ready/miRNA_dataframe_normal').replace('/', '\\')
    data_tumor = pd.read_pickle('./data-ready/miRNA_dataframe').replace('/', '\\')
    dataset_miRNA, y_miRNA, cases_id_miRNA= Clean_Merge_Dataset(name='miRNA').transform(data_normal, data_tumor)
    df_miRNA = pd.concat([dataset_miRNA, cases_id_miRNA], axis=1)

    data_normal = pd.read_pickle('./data-ready/illumina-27-450-normal').replace('/', '\\')
    data_tumor = pd.read_pickle('./data-ready/illumina450-27-tumor').replace('/', '\\')
    dataset_illumina, y_illumina, cases_id_illumina= Clean_Merge_Dataset(name='illumina').transform(data_normal, data_tumor)
    df_illumina = pd.concat([dataset_illumina, cases_id_illumina], axis=1)

    dataset_RNA = Bonferroni_Ttest(label_case_id_into_X=True, alpha=0.05).fit_transform(pd.concat([df_RNA, y_RNA], axis=1), y_RNA)
    dataset_miRNA = Bonferroni_Ttest(label_case_id_into_X=True, alpha=0.05).fit_transform(pd.concat([df_miRNA, y_miRNA], axis=1), y_miRNA)
    dataset_illumina = Bonferroni_Ttest(label_case_id_into_X=True, alpha=0.05).fit_transform(pd.concat([df_illumina, y_illumina], axis=1), y_illumina)

    cases_id = set(dataset_illumina['case_id']) & set(dataset_miRNA['case_id']) & set(dataset_RNA['case_id'])
    df_final_illumina = dataset_illumina.loc[dataset_illumina['case_id'].isin(cases_id)]
    df_final_rna = dataset_RNA.loc[dataset_RNA['case_id'].isin(cases_id)]
    df_final_mirna = dataset_miRNA.loc[dataset_miRNA['case_id'].isin(cases_id)]

    df_final_illumina.to_pickle('./data-ready/illumina_pickle.pkl')
    df_final_rna.to_pickle('./data-ready/rna_pickle.pkl')
    df_final_mirna.to_pickle('./data-ready/miRNA_pickle.pkl')
    
    my_dict = {
        'miRNA': df_final_mirna.to_dict(),
        'RNA': df_final_rna.to_dict(),
        'illumina': df_final_illumina.to_dict()
    }
    with open('final_dataset_common.json', 'w') as outfile:
        json.dump(my_dict, outfile)
    
    df_illumina = df_final_illumina.copy()
    df_mirna = df_final_mirna.copy()
    df_rna = df_final_rna.copy()
    
    del my_dict
    del df_final_illumina
    del df_final_rna
    del df_final_mirna
    del dataset_illumina
    del dataset_RNA
    del dataset_miRNA
else:
    df_illumina = pd.read_pickle('./data-ready/illumina_pickle.pkl')
    df_mirna = pd.read_pickle('./data-ready/miRNA_pickle.pkl')
    df_rna = pd.read_pickle('./data-ready/rna_pickle.pkl')
   

In [3]:
print("Illumina's shape {}".format(df_illumina.shape))
print("miRNA's shape {}".format(df_mirna.shape))
print("RNA's shape {}".format(df_rna.shape))

Illumina's shape (493, 15700)
miRNA's shape (473, 237)
RNA's shape (508, 12965)


Creating a new features that is composed by the *case_id* plus the *label*, in this way we consider only the case_id with the same label.
Drop the case_id because now we consider only the *new* feature composed by case_id with the label.

In [4]:
case_id_new = list()
for i, row in df_mirna.iterrows():
    case_id_new.append(row['case_id'] + '_' + str(row['label']))
df_mirna['case_id_new'] = case_id_new

case_id_new = list()
for i, row in df_rna.iterrows():
    case_id_new.append(row['case_id'] + '_' + str(row['label'])) 
df_rna['case_id_new'] = case_id_new

case_id_new = list()
for i, row in df_illumina.iterrows():
    case_id_new.append(row['case_id'] + '_' + str(row['label']))
df_illumina['case_id_new'] = case_id_new

cases_id = set(df_mirna['case_id_new']) & set(df_rna['case_id_new']) & set(df_illumina['case_id_new'])
df_illumina = df_illumina.loc[df_illumina['case_id_new'].isin(cases_id)]
df_rna = df_rna.loc[df_rna['case_id_new'].isin(cases_id)]
df_mirna = df_mirna.loc[df_mirna['case_id_new'].isin(cases_id)]

df_illumina.drop(columns=['case_id'], inplace=True, axis=1)
df_mirna.drop(columns=['case_id'], inplace=True, axis=1)
df_rna.drop(columns=['case_id'], inplace=True, axis=1)

print("Final illumina's shape {}".format(df_illumina.shape))
print("Final miRNA's shape {}".format(df_mirna.shape))
print("Final RNA's shape {}".format(df_rna.shape))

Final illumina's shape (430, 15700)
Final miRNA's shape (430, 237)
Final RNA's shape (430, 12965)


Before to elaborate the datasets into *SimilarityNetworkFusion* class, we order the datasets by *case_id_new*, this operation helps us with the comparison *case_id - cluster assigned*.

In [5]:
df_mirna.sort_values(by='case_id_new', inplace=True)
df_rna.sort_values(by='case_id_new', inplace=True)
df_illumina.sort_values(by='case_id_new', inplace=True)

snf = SimilarityNetworkFusion(df_mirna,
                            df_rna,
                            df_illumina, k=100).calculate_matrix()

Read file pickle for weights matrix of RNA
Read file pickle for weights matrix of miRNA
Read file pickle for weights matrix of Illumina


  0%|          | 0/430 [00:00<?, ?it/s]

Reading the file pickle for the p starting matrix RNA
Reading the file pickle for the p starting matrix miRNA
Reading the file pickle for the p starting matrix Illumina
Calculating S matrix for RNA...


100%|██████████| 430/430 [00:09<00:00, 43.68it/s]
  1%|          | 3/430 [00:00<00:14, 29.79it/s]

Calculating S matrix for miRNA...


100%|██████████| 430/430 [00:09<00:00, 44.75it/s]
  1%|          | 4/430 [00:00<00:11, 38.69it/s]

Calculating S matrix for Illumina...


100%|██████████| 430/430 [00:08<00:00, 49.08it/s]


<a id="way-to-fit"></a>
# Way to fit
In this section we show differents way that we have developed to fit the matrix


## Standard approach
This is the standard approach used in *Similarity Network Fusion* algorithm, itereates and update the matrix P per *num_iter* times.
The update of the matrix is calculated as follow:
$$1)  P_{t+1}^{(1)} = S^{(1)} * P_{t}^{(2)} * S^{T(1)}$$
$$2)  P_{t+1}^{(2)} = S^{(2)} * P_{t}^{(1)} * S^{T(2)}$$


In [None]:
snf.fit(num_iter=50)

## Difference between the matrices
Execute the updating of the matrices P in this way:
$$1)  P_{t+1}^{(1)} = S^{(1)} * P_{t}^{(2)} * S^{T(1)}$$
$$2)  P_{t+1}^{(2)} = S^{(2)} * P_{t}^{(1)} * S^{T(2)}$$

The matrix *(2)* in the first row and the matrix *(1)* in the second as the mean between the matrices P of the other 2 omnics.
The function stops when the difference between the matrices P are equal to the parameter matrices_diff passed.
If the differences isn't reached after a num_iteration, it stops automatically.

The difference is calculated as follow:
$$ diff\_matrix =  |P^{(1)} - P^{(2)}|$$

In [29]:
snf.iterations_fit(matrices_diff=8)

0 : 22.30666228921337
1 : 16.242586217975525
2 : 13.969165727060496
3 : 11.848278588536365
4 : 10.57803885332637
5 : 9.569871396814849
6 : 8.807148767515567
7 : 8.182963719297721
8 : 7.663138002957942
number of iterations to reach difference:  8


<myclass.SimilarityNetworkFusion.SimilarityNetworkFusion at 0x7f9606b4b8e0>

## Local minimum approach
Execute the updating of the matrices P in this way:
$$1)  P_{t+1}^{(1)} = S^{(1)} * P_{t}^{(2)} * S^{T(1)}$$
$$2)  P_{t+1}^{(2)} = S^{(2)} * P_{t}^{(1)} * S^{T(2)}$$

The matrix *(2)* in the first row and the matrix *(1)* in the second as the mean between the matrices P of the other 2 omnics.
The function stops when it reaches the local minum (the lowest value that is repeated for *iters_to_min* times consecutively).
If the local minimum isn't reached after a num_iteration, it stops automatically.

In [None]:
snf.local_minimum_fit(iters_to_min=4)

# Clustering and results
In this section we use *SpectralClustering* algorithm to calculate the affinity of one patient in one case of the tumor's type.
The metrics used to analyze the reults are:
* *Rand Score:*
$$ R = \frac{a + b}{a + b + c + d} = \frac{a + b}{\binom{n}{2}}$$
    * *a*: number of elements that are in the same subset in the original clusters and in the retrieved one

    * *b*: number of elements that are in different subsets in the original clusters and in the retrieved one

    * *c*: number of elements that are in different subsets in the original clusters but in the same subset in the retrieved one

    * *d*: number of elements that are in the same subset in the original clusters but in different subsets in the retrieved one

* *Silhouette Score:*
$$ s(i) = \frac{b(i) - a(i)}{max\{a(i), b(i)\}}$$
with:
    * *a(i) average intra-cluster distance:*
$$ a(i) = \frac{1}{|C_i| - 1} \sum_{j \in C_i, i\neq j} d(i,j)$$
    * *b(i) average inter-cluster distance:*
$$ b(i) = min_{k \neq i}\frac{1}{|C_k|} \sum_{j \in C_k} d(i,j)$$


In [30]:
y_illumina = LabelEncoder().fit_transform(df_illumina.loc[:, 'label'].transform(lambda x: str(x)))
y_mirna = LabelEncoder().fit_transform(df_mirna.loc[:, 'label'].transform(lambda x:  str(x)))
y_rna = LabelEncoder().fit_transform(df_rna.loc[:, 'label'].transform(lambda x: str(x)))

In [31]:
y_pred = SpectralClustering(n_clusters=3, affinity='precomputed').fit(snf.p_mirna).labels_


Array is not symmetric, and will be converted to symmetric by average with its transpose.



In [32]:
print('Rand Score:')
print('\tIllumina', adjusted_rand_score(y_illumina, y_pred))
print('\tMirna', adjusted_rand_score(y_mirna, y_pred))
print('\tRNA:', adjusted_rand_score(y_rna, y_pred))
print('\tmean:', (adjusted_rand_score(y_rna, y_pred) + adjusted_rand_score(y_illumina, y_pred) + adjusted_rand_score(y_mirna, y_pred))/3)

print('\n')
print('Silhouette score:')
print('\tIllumina', silhouette_score(snf.p_illumina, y_pred))
print('\tMirna', silhouette_score(snf.p_mirna, y_pred))
print('\tRNA:', silhouette_score(snf.p_rna, y_pred))

Rand Score:
	Illumina 0.8635112422462753
	Mirna 0.8635112422462753
	RNA: 0.8635112422462753
	mean: 0.8635112422462753


Silhouette score:
	Illumina 0.11278570119948149
	Mirna 0.13214156077249115
	RNA: 0.11277628262018903


## Analysis for standard fit approach

In [None]:
rand_scores = list()
sil_scores = list()
for num_iter in np.arange(10, 110, 10):
    snf.fit(num_iter=num_iter)
    y_pred = SpectralClustering(n_clusters=3, affinity='precomputed').fit(snf.p_mirna).labels_
    mean_rand = (adjusted_rand_score(y_rna, y_pred) + adjusted_rand_score(y_illumina, y_pred) + adjusted_rand_score(y_mirna, y_pred))/3
    mean_sil = (silhouette_score(snf.p_illumina, y_pred) + silhouette_score(snf.p_mirna, y_pred) + silhouette_score(snf.p_rna, y_pred))/3
    rand_scores.append(mean_rand)
    sil_scores.append(mean_sil)

steps = np.arange(10, 110, 10)
fig = go.Figure()
fig.add_trace(go.Scatter(x=steps, y=rand_scores, name='Rand_Score'))
fig.add_trace(go.Scatter(x=steps, y=sil_scores, name='Silhouette'))
fig.update_layout(title="Silhouette and Rand score for number of iterations",
                 xaxis_title="Number of iterations",
                 yaxis_title="Metric Score"
                 )
fig.show()

## Analysis for local minimum fit approach


In [None]:
rand_scores = list()
sil_scores = list()
for num_iter in np.arange(2, 11, 1):
    snf.local_minimum_fit(iters_to_min=num_iter, plot_step=False)
    y_pred = SpectralClustering(n_clusters=3, affinity='precomputed').fit(snf.p_mirna).labels_
    mean_rand = (adjusted_rand_score(y_rna, y_pred) + adjusted_rand_score(y_illumina, y_pred) + adjusted_rand_score(y_mirna, y_pred))/3
    mean_sil = (silhouette_score(snf.p_illumina, y_pred) + silhouette_score(snf.p_mirna, y_pred) + silhouette_score(snf.p_rna, y_pred))/3
    rand_scores.append(mean_rand)
    sil_scores.append(mean_sil)

steps = np.arange(2, 11, 1)
fig = go.Figure()
fig.add_trace(go.Scatter(x=steps, y=rand_scores, name='Rand_Score'))
fig.add_trace(go.Scatter(x=steps, y=sil_scores, name='Silhouette'))
fig.update_layout(title="Silhouette and Rand score for number of minimum repetitions",
                 xaxis_title="Number of minimum",
                 yaxis_title="Metric Score",
                 )
fig.show()

## Analysis for difference matrix fit approach

In [None]:
rand_scores = list()
sil_scores = list()
for conver in np.arange(2, 11, 1):
    snf.iterations_fit(matrices_diff=conver, plot_step=False)
    y_pred = SpectralClustering(n_clusters=3, affinity='precomputed').fit(snf.p_mirna).labels_
    mean_rand = (adjusted_rand_score(y_rna, y_pred) + adjusted_rand_score(y_illumina, y_pred) + adjusted_rand_score(y_mirna, y_pred))/3
    mean_sil = (silhouette_score(snf.p_illumina, y_pred) + silhouette_score(snf.p_mirna, y_pred) + silhouette_score(snf.p_rna, y_pred))/3
    rand_scores.append(mean_rand)
    sil_scores.append(mean_sil)

steps = np.arange(2, 11, 1)
fig = go.Figure()
fig.add_trace(go.Scatter(x=steps, y=rand_scores, name='Rand_Score'))
fig.add_trace(go.Scatter(x=steps, y=sil_scores, name='Silhouette'))
fig.update_layout(title="Silhouette and Rand score for difference matrix value",
                 xaxis_title="Difference value matrix",
                 yaxis_title="Metric Score",
                 )
fig.show()

# Clinical data

In [45]:
cases_id = [el.split('_')[0] for el in df_rna['case_id_new']]
df_clinical_case = ExtractClinicalCase(cases_id).get_df_clinical_case()

df_clinical_case.sort_values(by='case_id', inplace=True)
df = pd.DataFrame()
df['case_id'] = cases_id
df['label'] = y_pred
clinica_cases_id = [el.replace('\n', '') for el in df_clinical_case['case_id']]

label = df.loc[df['case_id'].isin(clinica_cases_id)]['label']
df_clinical_case['label'] = label

In [46]:
df_clinical_case.head()

Unnamed: 0,case_id,tumor_stage,prior_malignancy,age_at_diagnosis,morphology,label
386,00fd9306-4a68-49ab-a768-e5fed126a765,stage iib,no,50.0,8070/3,1
355,01417822-b608-4934-8fe0-594315212be5,stage iia,no,50.0,8070/3,0
189,0232d299-4cdf-4fd7-9a5e-8d13c208b40c,stage iiia,no,50.0,8140/3,0
124,028e99e9-5b9a-4954-bb6e-6d4709a3cea8,stage ib,no,70.0,8252/3,1
262,02ba8600-9d3d-489d-9f0b-f1c59085ecd6,stage iv,no,60.0,8070/3,0


In [48]:
dict_ = {}
df = pd.DataFrame()
# Consideriamo 
for l in range(0, 3):
    dict_['Cluster'] = l
    for el in set(df_clinical_case['prior_malignancy']):
        el_cluster = df_clinical_case[df_clinical_case['label'] == l]
        count = el_cluster[el_cluster['prior_malignancy'] == el]['case_id'].count()
        dict_[el] = count
    
    df = df.append(pd.DataFrame(dict_, index=[0]))

Unnamed: 0,Cluster,no,yes
0,0,166,21
0,1,201,37
0,2,2,0


In [54]:
fig = go.Figure(data=[
    go.Bar(name='yes', x=df['Cluster'], y=df['yes']),
    go.Bar(name='no', x=df['Cluster'], y=df['no'])
])
# Change the bar mode
fig.update_layout(barmode='stack',
                  title='Prior Malignancy Distribution',
                  xaxis_title="Clusters",
                  yaxis_title="Count")
fig.show()

In [72]:
dict_ = {}
df = pd.DataFrame()
df_clinical_case.dropna(inplace=True)
# Consideriamo 
for l in range(0, 3):
    dict_['Cluster'] = l
    for el in set(df_clinical_case['age_at_diagnosis']):
        el_cluster = df_clinical_case[df_clinical_case['label'] == l]
        count = el_cluster[el_cluster['age_at_diagnosis'] == el]['case_id'].count()
        dict_[el] = count
    
    df = df.append(pd.DataFrame(dict_, index=[0]))

df.head()

Unnamed: 0,Cluster,70.0,40.0,80.0,50.0,60.0,30.0
0,0,80,5,10,37,55,0
0,1,84,18,13,44,77,1
0,2,0,0,1,0,1,0


In [73]:
fig = go.Figure()

for key in df.columns[:1]:
    fig = fig.add_trace(
        go.Bar(name=key, x=df['Cluster'], y=df[key])
    )
    
# Change the bar mode
fig.update_layout(barmode='stack',
                  title='Age diagnosis',
                  xaxis_title="Clusters",
                  yaxis_title="Count")
fig.show()

## Clustering with K-Means algorithm
Try to apply a difference clustering tecnique. We decide for the *K-means* algorithm

In [None]:
y_pred = KMeans(n_clusters=3).fit(snf.p_mirna).labels_
print('Rand Score:')
print('\tIllumina', adjusted_rand_score(y_illumina, y_pred))
print('\tMirna', adjusted_rand_score(y_mirna, y_pred))
print('\tRNA:', adjusted_rand_score(y_rna, y_pred))
print('\n')
print('Silhouette score:')
print('\tIllumina', silhouette_score(snf.p_illumina, y_pred))
print('\tMirna', silhouette_score(snf.p_mirna, y_pred))
print('\tRNA:', silhouette_score(snf.p_rna, y_pred))

# Genes from TCGA
In this section we try to have a major impact on the the result selecting only the genes of **The Cancer Genome Atlas**.
This is possible only on the *RNA* omnic.

In [None]:
with open('all_ensgs_no_version.json') as f:
    genes = json.load(f)
print('Number of genes available:', len(genes))
print('Starting shape RNA:',df_rna.shape)
columns = [col.split('.')[0] for col in df_rna.columns]
columns = [original for gene, original in zip(columns, df_rna.columns) if gene in genes]
columns.append('case_id_new')
columns.append('label')
df_rna = df_rna.loc[:, columns]
print('Final shape of RNA:', df_rna.shape)

In [None]:
cases_id = set(df_mirna['case_id_new']) & set(df_rna['case_id_new']) & set(df_illumina['case_id_new'])
df_illumina = df_illumina.loc[df_illumina['case_id_new'].isin(cases_id)]
df_rna = df_rna.loc[df_rna['case_id_new'].isin(cases_id)]
df_mirna = df_mirna.loc[df_mirna['case_id_new'].isin(cases_id)]

print("Final illumina's shape {}".format(df_illumina.shape))
print("Final miRNA's shape {}".format(df_mirna.shape))
print("Final RNA's shape {}".format(df_rna.shape))

In [None]:
df_mirna.sort_values(by='case_id_new', inplace=True)
df_rna.sort_values(by='case_id_new', inplace=True)
df_illumina.sort_values(by='case_id_new', inplace=True)

snf = SimilarityNetworkFusion(df_mirna,
                            df_rna,
                            df_illumina, k=100).calculate_matrix()

In [None]:
snf.local_minimum_fit(iters_to_min=4)
#sm.iterations_fit(matrices_diff=3)

In [None]:
y_illumina = LabelEncoder().fit_transform(df_illumina.loc[:, 'label'].transform(lambda x: str(x)))
y_mirna = LabelEncoder().fit_transform(df_mirna.loc[:, 'label'].transform(lambda x:  str(x)))
y_rna = LabelEncoder().fit_transform(df_rna.loc[:, 'label'].transform(lambda x: str(x)))

y_pred = SpectralClustering(n_clusters=3, affinity='precomputed').fit(snf.p_mirna).labels_

print('Rand Score:')
print('\tIllumina', adjusted_rand_score(y_illumina, y_pred))
print('\tMirna', adjusted_rand_score(y_mirna, y_pred))
print('\tRNA:', adjusted_rand_score(y_rna, y_pred))
print('\tmean:', (adjusted_rand_score(y_rna, y_pred) + adjusted_rand_score(y_illumina, y_pred) + adjusted_rand_score(y_mirna, y_pred))/3)

print('\n')
print('Silhouette score:')
print('\tIllumina', silhouette_score(snf.p_illumina, y_pred))
print('\tMirna', silhouette_score(snf.p_mirna, y_pred))
print('\tRNA:', silhouette_score(snf.p_rna, y_pred))