In [1]:
%load_ext autoreload
%autoreload 2

from code_py.DIAMOnD import *
from code_py.backbone import Human_Genes_Graph_Analysis
import markov_clustering as mc
from joblib import Parallel, delayed
from tqdm import tqdm

# Project root handed to Human_Genes_Graph_Analysis below. It can be overridden
# with the BI_PROJECT_DIR environment variable so the notebook also runs on
# machines other than the original author's; the historical absolute path is
# kept as the fallback, so the default behaviour is unchanged.
import os

sys_path = os.path.join(
    os.environ.get(
        "BI_PROJECT_DIR",
        '/Users/alessandroquattrociocchi/Git/BI/Final_project/',
    ),
    "",  # joining with an empty component guarantees a trailing separator
)


#  Part 1: Preprocessing

In [2]:
# Identifier of the disease under study (the disease-gene query further down
# reports it as "Autism Spectrum Disorders").
disease_code = "C1510586"

# Analysis helper bound to the project directory and the selected disease.
hga = Human_Genes_Graph_Analysis(sys_path, disease_ID=disease_code)

# Create the empty dataframe named after the disease code.
# NOTE(review): presumably this is the table the later return_metrics calls
# fill in -- confirm in code_py/backbone.py.
hga.create_empty_dataframe(name=disease_code)

#### 1.1 -- Filtering and removing self-loops

In [3]:
# Clean the BioGRID dataset: keep Homo sapiens only, drop duplicated entries
# and discard self-loops.
hs_putative_genes = hga.preprocessing_dataset(
    homo_sap=True,
    drop_duplicates=True,
    remove_self_loops=True,
)

#### 1.2 -- Filtering Disease Genes

In [4]:
# Query the genes associated with the selected disease. The call returns two
# objects, bound here as a dataframe and a plain gene collection
# (presumably gene symbols -- TODO confirm against query_disease_genes).
hs_disease_genes_df, hs_disease_genes = hga.query_disease_genes()

Found 85 disease genes in Autism Spectrum Disorders


#### 1.3 -- Creating the LCC sub-graph and adjacency matrix

In [5]:
# Build the networkx graph from the filtered PPI dataset and get back the
# sub-graph, adjacency matrix, nodes and edges of its LCC.
(
    pgenes_sub_graph,
    pgenes_adj,
    pnodes,
    pedges,
) = hga.LCC_to_adj(hs_putative_genes)

19618
Graph with 19618 nodes and 665061 edges


#### 1.4 -- Cross Validation

In [6]:
# Split the disease genes into 5 shuffled train/test folds.
# NOTE(review): no random seed is passed here, so the folds may differ between
# runs -- confirm whether KFold_CV fixes one internally.
ds_genes_train, ds_genes_test = hga.KFold_CV(
    hs_disease_genes,
    n_folds=5,
    shuffle_flag=True,
)

# Part 2: Algorithms  

### 2.1 -- MCL Algorithm 

In [None]:
# Apply the MCL algorithm once per inflation value (3 parallel workers) and
# persist the list of results under the name 'MLC_modularity'.
#
# The inflation grid is 1.5, 1.6, ..., 2.7 (13 values). It is built with
# np.linspace rather than np.arange(1.5, 2.7, 0.1): with a non-integer step,
# whether arange includes the end point depends on floating-point rounding, so
# the number of grid points was accidental. Rounding to one decimal keeps the
# inflation values clean (1.8 rather than 1.8000000000000003).
inflation_grid = np.round(np.linspace(1.5, 2.7, 13), 1)
results = Parallel(n_jobs=3)(
    delayed(hga.MCL)(pgenes_adj, inflation) for inflation in tqdm(inflation_grid)
)
hga.list_to_pikle(results, 'MLC_modularity')

In [16]:
# Load the stored MCL results and print one line per inflation value.
#
# The 13-point grid (1.5 ... 2.7) is spelled out with np.linspace instead of
# np.arange(1.5, 2.7, 0.1): with a float step, whether arange includes the end
# point depends on rounding error, which could silently change the number of
# printed rows. The enumerate pair is unpacked instead of indexed.
results_list_from_pkl = hga.read_pickle_list('MLC_modularity')
for idx, inflation in enumerate(np.linspace(1.5, 2.7, 13)):
    print("inflation:", round(inflation, 2), "modularity:", results_list_from_pkl[idx])
    

inflation: 1.5 modularity: 0.7318793908083768
inflation: 1.6 modularity: 0.7664038191128469
inflation: 1.7 modularity: 0.7948113431834943
inflation: 1.8 modularity: 0.8254253707325344
inflation: 1.9 modularity: 0.8034790682066575
inflation: 2.0 modularity: 0.7514658577401434
inflation: 2.1 modularity: 0.6951036602617071
inflation: 2.2 modularity: 0.641935117124079
inflation: 2.3 modularity: 0.5919604667312881
inflation: 2.4 modularity: 0.5541978925627196
inflation: 2.5 modularity: 0.5206722692342249
inflation: 2.6 modularity: 0.49161128642816465
inflation: 2.7 modularity: 0.46866783664642075


### 2.1.1 -- Creating Clusters

In [7]:
# Re-run MCL with the inflation that scored the highest modularity in the
# sweep above (about 0.825 at 1.8) and extract the resulting clusters.
best_inflation = 1.8
result = mc.run_mcl(pgenes_adj, inflation=best_inflation)
clusters = mc.get_clusters(result)
print(f"{len(clusters)} of clusters obtained with inflation of {best_inflation}")

2106 of clusters obtained with inflation of 1.8


In [8]:
# Evaluate the MCL clusters against the training folds; the first returned
# value is not needed in this notebook.
# NOTE(review): the numbers printed after "-->" are presumably enrichment
# p-values -- confirm in MLC_eval.
_, enriched_genes, enriched_cluster_ID = hga.MLC_eval(
    pgenes_sub_graph,
    ds_genes_train,
    clusters,
)

Fold number:  0
5 disease genes in cluster 0 --> 0.144678
19 disease genes in cluster 2 --> 0.077192
3 disease genes in cluster 11 --> 0.216522
Fold number:  1
4 disease genes in cluster 0 --> 0.198559
20 disease genes in cluster 2 --> 0.058601
3 disease genes in cluster 11 --> 0.216522
Fold number:  2
4 disease genes in cluster 0 --> 0.198559
24 disease genes in cluster 2 --> 0.009865
3 disease genes in cluster 11 --> 0.216522
Fold number:  3
5 disease genes in cluster 0 --> 0.144678
21 disease genes in cluster 2 --> 0.041493
3 disease genes in cluster 11 --> 0.216522
Fold number:  4
6 disease genes in cluster 0 --> 0.086389
20 disease genes in cluster 2 --> 0.058601
4 disease genes in cluster 11 --> 0.197477
The index of the enriched cluster found using MLC is:  [2]


In [9]:
# Score the enriched cluster(s) found above using the held-out folds.
# NOTE(review): the recorded output reports FN: 19592, far more than the 85
# disease genes found earlier -- confirm how MCL_evaluation_metrics counts
# false negatives.
hga.MCL_evaluation_metrics(
    pgenes_sub_graph,
    ds_genes_test,
    hs_disease_genes,
    clusters,
    enriched_cluster_ID,
)

TP: 26 --- FP: 2080 --- FN: 19592
Precision: 1.23 --- Recall: 0.13 --- F1 Score: 0.24


### 2.2 -- DIAMOnD Algorithm

In [10]:
# Cross-validated metrics for DIAMOnD; only the columns belonging to this
# algorithm are displayed.
results_df = hga.return_metrics(
    "DIAMOnD",
    pgenes_sub_graph,
    hs_disease_genes,
    ds_genes_train,
    ds_genes_test,
    save_dataframe=True,
)
results_df[["@", "Metric", "DIAMOnD"]]

DIAMOnD(): ignoring 3 of 68 seed genes that are not in the network
DIAMOnD(): ignoring 3 of 68 seed genes that are not in the network
DIAMOnD(): ignoring 3 of 68 seed genes that are not in the network
DIAMOnD(): ignoring 3 of 68 seed genes that are not in the network


Unnamed: 0,@,Metric,DIAMOnD
0,,P,1.6 ± 1.67
1,50,R,4.71 ± 4.92
2,,F1,3.98 ± 1.72
3,,nDCG,1.96 ± 2.02
4,,P,5.0 ± 6.85
5,n/10,R,2.35 ± 3.22
6,,F1,8.0 ± 0.0
7,,nDCG,4.33 ± 6.07
8,,P,1.9 ± 2.61
9,n/4,R,2.35 ± 3.22


### 2.3 -- DiaBLE Algorithm

In [11]:
# Cross-validated metrics for DiaBLE; only the columns belonging to this
# algorithm are displayed.
results_df = hga.return_metrics(
    "DiaBLE",
    pgenes_sub_graph,
    hs_disease_genes,
    ds_genes_train,
    ds_genes_test,
    save_dataframe=True,
)
results_df[["@", "Metric", "DiaBLE"]]

DiaBLE(): ignoring 3 of 68 seed genes that are not in the network
DiaBLE(): ignoring 3 of 68 seed genes that are not in the network
DiaBLE(): ignoring 3 of 68 seed genes that are not in the network
DiaBLE(): ignoring 3 of 68 seed genes that are not in the network


Unnamed: 0,@,Metric,DiaBLE
0,,P,2.0 ± 1.41
1,50,R,5.88 ± 4.16
2,,F1,3.73 ± 1.49
3,,nDCG,2.45 ± 1.7
4,,P,7.5 ± 6.85
5,n/10,R,3.53 ± 3.22
6,,F1,8.0 ± 0.0
7,,nDCG,5.93 ± 5.68
8,,P,2.86 ± 2.61
9,n/4,R,3.53 ± 3.22


### 2.4 -- Cytoscape

In [12]:
# Cross-validated metrics for the Cytoscape method; only its columns are shown.
# NOTE(review): the recorded output is 0.0 ± 0.0 for every metric -- confirm
# that the Cytoscape predictions are actually being loaded and matched.
results_df = hga.return_metrics(
    "Cytoscape",
    pgenes_sub_graph,
    hs_disease_genes,
    ds_genes_train,
    ds_genes_test,
    save_dataframe=True,
)
results_df[["@", "Metric", "Cytoscape"]]

Unnamed: 0,@,Metric,Cytoscape
0,,P,0.0 ± 0.0
1,50,R,0.0 ± 0.0
2,,F1,0.0 ± 0.0
3,,nDCG,0.0 ± 0.0
4,,P,0.0 ± 0.0
5,n/10,R,0.0 ± 0.0
6,,F1,0.0 ± 0.0
7,,nDCG,0.0 ± 0.0
8,,P,0.0 ± 0.0
9,n/4,R,0.0 ± 0.0


### 2.5 -- Random Walk with Restart

In [13]:
# Cross-validated metrics for Random Walk with Restart; only its columns are shown.
# NOTE(review): the recorded output is 0.0 ± 0.0 for every metric -- confirm
# that the RWR ranking is being produced and matched against the test genes.
results_df = hga.return_metrics(
    "RWR",
    pgenes_sub_graph,
    hs_disease_genes,
    ds_genes_train,
    ds_genes_test,
    save_dataframe=True,
)
results_df[["@", "Metric", "RWR"]]

Unnamed: 0,@,Metric,RWR
0,,P,0.0 ± 0.0
1,50,R,0.0 ± 0.0
2,,F1,0.0 ± 0.0
3,,nDCG,0.0 ± 0.0
4,,P,0.0 ± 0.0
5,n/10,R,0.0 ± 0.0
6,,F1,0.0 ± 0.0
7,,nDCG,0.0 ± 0.0
8,,P,0.0 ± 0.0
9,n/4,R,0.0 ± 0.0


# Part 3: Extended Validation

In [14]:
# DIAMOnD again, this time with extended validation switched on; the result
# lands in the "DIAMOnD Ext" column, which is the one displayed.
results_df = hga.return_metrics(
    "DIAMOnD",
    pgenes_sub_graph,
    hs_disease_genes,
    ds_genes_train,
    ds_genes_test,
    save_dataframe=True,
    extended_val=True,
)
results_df[["@", "Metric", "DIAMOnD Ext"]]

DIAMOnD(): ignoring 3 of 68 seed genes that are not in the network
DIAMOnD(): ignoring 3 of 68 seed genes that are not in the network
DIAMOnD(): ignoring 3 of 68 seed genes that are not in the network
DIAMOnD(): ignoring 3 of 68 seed genes that are not in the network
DIAMOnD Ext


Unnamed: 0,@,Metric,DIAMOnD Ext
0,,P,5.2 ± 3.35
1,50,R,1.21 ± 0.78
2,,F1,2.46 ± 0.73
3,,nDCG,6.6 ± 4.94
4,,P,2.8 ± 1.75
5,n/10,R,1.4 ± 0.87
6,,F1,2.33 ± 0.6
7,,nDCG,4.17 ± 2.95
8,,P,1.87 ± 0.75
9,n/4,R,2.34 ± 0.94


In [15]:
# DiaBLE with extended validation; the "DiaBLE Ext" column is displayed.
# NOTE(review): the recorded output of this cell is identical, value for
# value, to the "DIAMOnD Ext" column -- confirm that return_metrics really
# runs DiaBLE when extended_val=True.
results_df = hga.return_metrics(
    "DiaBLE",
    pgenes_sub_graph,
    hs_disease_genes,
    ds_genes_train,
    ds_genes_test,
    save_dataframe=True,
    extended_val=True,
)
results_df[["@", "Metric", "DiaBLE Ext"]]

DiaBLE(): ignoring 3 of 68 seed genes that are not in the network
DiaBLE(): ignoring 3 of 68 seed genes that are not in the network
DiaBLE(): ignoring 3 of 68 seed genes that are not in the network
DiaBLE(): ignoring 3 of 68 seed genes that are not in the network
DiaBLE Ext


Unnamed: 0,@,Metric,DiaBLE Ext
0,,P,5.2 ± 3.35
1,50,R,1.21 ± 0.78
2,,F1,2.46 ± 0.73
3,,nDCG,6.6 ± 4.94
4,,P,2.8 ± 1.75
5,n/10,R,1.4 ± 0.87
6,,F1,2.33 ± 0.6
7,,nDCG,4.17 ± 2.95
8,,P,1.87 ± 0.75
9,n/4,R,2.34 ± 0.94


In [16]:
# Cytoscape method with extended validation; the "Cytoscape Ext" column is
# displayed.
results_df = hga.return_metrics(
    "Cytoscape",
    pgenes_sub_graph,
    hs_disease_genes,
    ds_genes_train,
    ds_genes_test,
    save_dataframe=True,
    extended_val=True,
)
results_df[["@", "Metric", "Cytoscape Ext"]]

Cytoscape Ext


Unnamed: 0,@,Metric,Cytoscape Ext
0,,P,4.4 ± 8.76
1,50,R,1.03 ± 2.05
2,,F1,4.17 ± 4.82
3,,nDCG,4.2 ± 8.39
4,,P,2.62 ± 4.35
5,n/10,R,1.31 ± 2.18
6,,F1,2.91 ± 3.43
7,,nDCG,2.91 ± 5.1
8,,P,2.62 ± 3.01
9,n/4,R,3.27 ± 3.75


In [17]:
# Random Walk with Restart with extended validation; the "RWR Ext" column is
# displayed.
results_df = hga.return_metrics(
    "RWR",
    pgenes_sub_graph,
    hs_disease_genes,
    ds_genes_train,
    ds_genes_test,
    save_dataframe=True,
    extended_val=True,
)
results_df[["@", "Metric", "RWR Ext"]]

RWR Ext


Unnamed: 0,@,Metric,RWR Ext
0,,P,18.0 ± 8.25
1,50,R,4.2 ± 1.93
2,,F1,6.81 ± 3.12
3,,nDCG,17.39 ± 7.62
4,,P,12.52 ± 4.21
5,n/10,R,6.26 ± 2.11
6,,F1,8.34 ± 2.81
7,,nDCG,13.5 ± 4.4
8,,P,6.29 ± 1.66
9,n/4,R,7.84 ± 2.08


In [18]:
# Pass the comparison table (all algorithms, with and without extended
# validation) to the HTML export helper, then display it as the cell output.
# NOTE(review): presumably dataframe_to_html writes a file under the project
# directory -- confirm the destination in code_py/backbone.py.
hga.dataframe_to_html(results_df)
results_df

Unnamed: 0,@,Metric,DIAMOnD,DiaBLE,Cytoscape,RWR,DIAMOnD Ext,DiaBLE Ext,Cytoscape Ext,RWR Ext
0,,P,1.6 ± 1.67,2.0 ± 1.41,0.0 ± 0.0,0.0 ± 0.0,5.2 ± 3.35,5.2 ± 3.35,4.4 ± 8.76,18.0 ± 8.25
1,50,R,4.71 ± 4.92,5.88 ± 4.16,0.0 ± 0.0,0.0 ± 0.0,1.21 ± 0.78,1.21 ± 0.78,1.03 ± 2.05,4.2 ± 1.93
2,,F1,3.98 ± 1.72,3.73 ± 1.49,0.0 ± 0.0,0.0 ± 0.0,2.46 ± 0.73,2.46 ± 0.73,4.17 ± 4.82,6.81 ± 3.12
3,,nDCG,1.96 ± 2.02,2.45 ± 1.7,0.0 ± 0.0,0.0 ± 0.0,6.6 ± 4.94,6.6 ± 4.94,4.2 ± 8.39,17.39 ± 7.62
4,,P,5.0 ± 6.85,7.5 ± 6.85,0.0 ± 0.0,0.0 ± 0.0,2.8 ± 1.75,2.8 ± 1.75,2.62 ± 4.35,12.52 ± 4.21
5,n/10,R,2.35 ± 3.22,3.53 ± 3.22,0.0 ± 0.0,0.0 ± 0.0,1.4 ± 0.87,1.4 ± 0.87,1.31 ± 2.18,6.26 ± 2.11
6,,F1,8.0 ± 0.0,8.0 ± 0.0,0.0 ± 0.0,0.0 ± 0.0,2.33 ± 0.6,2.33 ± 0.6,2.91 ± 3.43,8.34 ± 2.81
7,,nDCG,4.33 ± 6.07,5.93 ± 5.68,0.0 ± 0.0,0.0 ± 0.0,4.17 ± 2.95,4.17 ± 2.95,2.91 ± 5.1,13.5 ± 4.4
8,,P,1.9 ± 2.61,2.86 ± 2.61,0.0 ± 0.0,0.0 ± 0.0,1.87 ± 0.75,1.87 ± 0.75,2.62 ± 3.01,6.29 ± 1.66
9,n/4,R,2.35 ± 3.22,3.53 ± 3.22,0.0 ± 0.0,0.0 ± 0.0,2.34 ± 0.94,2.34 ± 0.94,3.27 ± 3.75,7.84 ± 2.08


# Part 4: Enrichment Analysis 

In [19]:
# Run DIAMOnD in its DiaBLE variant (DiaBLE=True), seeded with the full
# disease-gene set rather than a training fold, asking for 200 added nodes.
added_nodes, predicted_nodes = DIAMOnD(
    G_original=pgenes_sub_graph,
    seed_genes=hs_disease_genes,
    max_number_of_added_nodes=200,
    alpha=1,
    DiaBLE=True,
)

DiaBLE(): ignoring 3 of 85 seed genes that are not in the network


In [20]:
# Display the genes predicted by the DiaBLE run above (full list, as used for
# the enrichment analysis of this part).
predicted_nodes



['NLGN1',
 'NLGN2',
 'DLG2',
 'DLG3',
 'DLGAP1',
 'GRIN2A',
 'DLGAP2',
 'ZBTB7A',
 'KHDRBS1',
 'GUCY1A2',
 'DLG1',
 'LIN7A',
 'KCNJ12',
 'KCNJ4',
 'DLGAP4',
 'CASK',
 'LIN7B',
 'LIN7C',
 'APBA1',
 'PXDC1',
 'ARHGEF26',
 'MPP6',
 'MPP2',
 'SNTB2',
 'BAI1',
 'SNTA1',
 'KCNA4',
 'DMD',
 'SNTB1',
 'NOS1',
 'DTNB',
 'UTRN',
 'DTNA',
 'SNTG1',
 'PLEKHA2',
 'TAGAP',
 'CTNNAL1',
 'ADRA1D',
 'TNS1',
 'SPZ1',
 'TNS3',
 'TENC1',
 'MPP7',
 'KIF26B',
 'MTMR2',
 'ERBB4',
 'DUSP10',
 'MPP3',
 'MPDZ',
 'MPP5',
 'EPB41L4A',
 'INADL',
 'C15orf59',
 'ZGPAT',
 'WWC1',
 'RHPN1',
 'GRIN1',
 'GRIN3A',
 'NETO1',
 'F8A1',
 'MAGI2',
 'CRHR1',
 'DLGAP3',
 'SHANK2',
 'RPS6KA1',
 'ADRB1',
 'ERBB2IP',
 'GRIN2C',
 'CTNNA1',
 'C11orf52',
 'ABLIM1',
 'FLOT1',
 'EPB41',
 'KRAS',
 'PLEKHA1',
 'OCLN',
 'PTPN13',
 'PARD3',
 'FAM171B',
 'MLLT4',
 'FAM171A1',
 'LYN',
 'RHOB',
 'USP6NL',
 'KIAA0754',
 'PLCH1',
 'EFR3B',
 'DNAJC5',
 'CXADR',
 'FRS2',
 'MARCKS',
 'PKP4',
 'SCRIB',
 'CAV1',
 'PHACTR4',
 'ZDHHC5',
 'GJA1',
 'ANK