In [2]:
%load_ext autoreload
%autoreload 2

from code_py.DIAMOnD import *
from code_py.backbone import Human_Genes_Graph_Analysis
import markov_clustering as mc
from joblib import Parallel, delayed
from tqdm import tqdm

sys_path = '/Users/alessandroquattrociocchi/Git/BI/Final_project/'


#  Part 1: Preprocessing

In [3]:
disease_code = "C1510586"
hga = Human_Genes_Graph_Analysis(sys_path,disease_ID = disease_code)
hga.create_empty_dataframe(name = disease_code)

#### 1.1 -- Filtering and removing self-loops

In [4]:
# Preprocessing the Biogrid dataset by filtering Homo Sapiens, removing duplicated and self loops
hs_putative_genes = hga.preprocessing_dataset(homo_sap=True,drop_duplicates=True,remove_self_loops=True)

#### 1.2 -- Filtering Disease Genes

In [5]:
hs_disease_genes_df,hs_disease_genes = hga.query_disease_genes()

Found 85 disease genes in Autism Spectrum Disorders


#### 1.3 -- Creating LCC sub-graph,adjacency matrix,

In [6]:
# We are creating the graph with nx from the from the filtered PPI dataset and returning the sub graph, adj matrix, nodes and edges of LCC 
pgenes_sub_graph,pgenes_adj,pnodes,pedges = hga.LCC_to_adj(hs_putative_genes)

# of connected components: 1
19618
Graph with 19618 nodes and 665061 edges


#### 1.3 -- Cross Validation

In [9]:
ds_genes_train,ds_genes_test = hga.KFold_CV(hs_disease_genes,n_folds=5,shuffle_flag=True)

#### 1.4 -- Saving Seed Genes to files for Cytoscape

In [33]:
for i in range(len(ds_genes_test)):
    #np.savetxt("cytoscape/test"+str(disease_code)+str(i)+".csv", 
    #        ds_genes_test[i],
    #        delimiter =", ", 
    #        fmt ='% s')
    np.savetxt("cytoscape/"+str(disease_code)+"/train"+str(disease_code)+"_"+str(i)+".csv", 
            ds_genes_train[i],
            delimiter =", ", 
            fmt ='% s')

# Part 2: Algorithms  

### 2.1 -- MCL Algorithm 

In [None]:
# Applying MLC Algoritm by given inflation range (1.5, 2.7, step = 0.1)
results = Parallel(n_jobs=3)(delayed(hga.MCL)(pgenes_adj,i) for i in tqdm(np.arange(1.5,2.7,0.1)))
hga.list_to_pikle(results,'MLC_modularity')

In [16]:
results_list_from_pkl = hga.read_pickle_list('MLC_modularity')
for i in enumerate(np.arange(1.5,2.7,0.1)):
    print("inflation:", round(i[1],2), "modularity:", results_list_from_pkl[i[0]])
    

inflation: 1.5 modularity: 0.7318793908083768
inflation: 1.6 modularity: 0.7664038191128469
inflation: 1.7 modularity: 0.7948113431834943
inflation: 1.8 modularity: 0.8254253707325344
inflation: 1.9 modularity: 0.8034790682066575
inflation: 2.0 modularity: 0.7514658577401434
inflation: 2.1 modularity: 0.6951036602617071
inflation: 2.2 modularity: 0.641935117124079
inflation: 2.3 modularity: 0.5919604667312881
inflation: 2.4 modularity: 0.5541978925627196
inflation: 2.5 modularity: 0.5206722692342249
inflation: 2.6 modularity: 0.49161128642816465
inflation: 2.7 modularity: 0.46866783664642075


### 2.1.1 -- Creating Clusters

In [8]:
best_inflation = 1.8
result = mc.run_mcl(pgenes_adj, inflation=best_inflation)
clusters = mc.get_clusters(result)
print(str(len(clusters))+" of clusters obtained with inflation of "+str(best_inflation))

2106 of clusters obtained with inflation of 1.8


In [9]:
_, enriched_genes,enriched_cluster_ID = hga.MLC_eval(pgenes_sub_graph,ds_genes_train,clusters)

Fold number:  0
5 disease genes in cluster 0 --> 0.144678
19 disease genes in cluster 2 --> 0.077192
3 disease genes in cluster 11 --> 0.216522
Fold number:  1
4 disease genes in cluster 0 --> 0.198559
20 disease genes in cluster 2 --> 0.058601
3 disease genes in cluster 11 --> 0.216522
Fold number:  2
4 disease genes in cluster 0 --> 0.198559
24 disease genes in cluster 2 --> 0.009865
3 disease genes in cluster 11 --> 0.216522
Fold number:  3
5 disease genes in cluster 0 --> 0.144678
21 disease genes in cluster 2 --> 0.041493
3 disease genes in cluster 11 --> 0.216522
Fold number:  4
6 disease genes in cluster 0 --> 0.086389
20 disease genes in cluster 2 --> 0.058601
4 disease genes in cluster 11 --> 0.197477
The index of the enriched cluster found using MLC is:  [2]


In [10]:
hga.MCL_evaluation_metrics(pgenes_sub_graph,ds_genes_test,hs_disease_genes,clusters,enriched_cluster_ID)

TP: 26 --- FP: 2080 --- FN: 19592
Precision: 1.23 --- Recall: 0.13 --- F1 Score: 0.24


### 2.2 -- DIAMOnD Algorithm

In [85]:
results_df = hga.return_metrics("DIAMOnD", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=False)
results_df[["@",'Metric',"DIAMOnD"]]

DIAMOnD(): ignoring 3 of 68 seed genes that are not in the network
DIAMOnD(): ignoring 3 of 68 seed genes that are not in the network
DIAMOnD(): ignoring 3 of 68 seed genes that are not in the network
DIAMOnD(): ignoring 3 of 68 seed genes that are not in the network


Unnamed: 0,@,Metric,DIAMOnD
0,,P,33.72 ± 31.56
1,50,R,4.71 ± 4.92
2,,F1,13.7 ± 5.49
3,,nDCG,1.96 ± 2.02
4,,P,21.33 ± 29.21
5,n/10,R,2.35 ± 3.22
6,,F1,10.6 ± 0.0
7,,nDCG,4.33 ± 6.07
8,,P,20.49 ± 28.05
9,n/4,R,2.35 ± 3.22


### 2.3 -- DiaBLE Algorithm

In [93]:
results_df = hga.return_metrics("DiaBLE", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=False)
results_df[["@",'Metric',"DiaBLE"]]

DiaBLE(): ignoring 3 of 68 seed genes that are not in the network
DiaBLE(): ignoring 3 of 68 seed genes that are not in the network
DiaBLE(): ignoring 3 of 68 seed genes that are not in the network
DiaBLE(): ignoring 3 of 68 seed genes that are not in the network


Unnamed: 0,@,Metric,DiaBLE
0,,P,43.82 ± 25.58
1,50,R,5.88 ± 4.16
2,,F1,12.91 ± 4.75
3,,nDCG,2.45 ± 1.7
4,,P,32.0 ± 29.21
5,n/10,R,3.53 ± 3.22
6,,F1,10.6 ± 0.0
7,,nDCG,5.93 ± 5.68
8,,P,30.73 ± 28.05
9,n/4,R,3.53 ± 3.22


### 2.4 -- Cytoscape

In [10]:
results_df =hga.return_metrics("Cytoscape", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=False)
results_df[["@",'Metric',"Cytoscape"]]

Unnamed: 0,@,Metric,Cytoscape
0,,P,0.0 ± 0.0
1,50,R,0.0 ± 0.0
2,,F1,0.0 ± 0.0
3,,nDCG,0.0 ± 0.0
4,,P,0.0 ± 0.0
5,n/10,R,0.0 ± 0.0
6,,F1,0.0 ± 0.0
7,,nDCG,0.0 ± 0.0
8,,P,0.0 ± 0.0
9,n/4,R,0.0 ± 0.0


### 2.6 -- Random Walk with Restart 

In [11]:
results_df = hga.return_metrics("RWR", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=False)
results_df[["@",'Metric',"RWR"]]

Unnamed: 0,@,Metric,RWR
0,,P,0.0 ± 0.0
1,50,R,0.0 ± 0.0
2,,F1,0.0 ± 0.0
3,,nDCG,0.0 ± 0.0
4,,P,0.0 ± 0.0
5,n/10,R,0.0 ± 0.0
6,,F1,0.0 ± 0.0
7,,nDCG,0.0 ± 0.0
8,,P,0.0 ± 0.0
9,n/4,R,0.0 ± 0.0


# Part 3: Extented Validation  

In [None]:
results_df = hga.return_metrics("DIAMOnD", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=False, extended_val = True)

In [None]:
hga.return_metrics("DiaBLE", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=False, extended_val = True)

In [None]:
results_df = hga.return_metrics("Cytoscape", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=False, extended_val = True)
results_df[["@",'Metric',"Cytoscape Ext"]]

In [None]:
hga.return_metrics("RWR", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=False, extended_val = True)

In [97]:
hga.dataframe_to_html(results_df)
results_df

Unnamed: 0,@,Metric,DIAMOnD,DiaBLE
0,,P,33.72 ± 31.56,43.82 ± 25.58
1,50,R,4.71 ± 4.92,5.88 ± 4.16
2,,F1,13.7 ± 5.49,12.91 ± 4.75
3,,nDCG,1.96 ± 2.02,2.45 ± 1.7
4,,P,21.33 ± 29.21,32.0 ± 29.21
5,n/10,R,2.35 ± 3.22,3.53 ± 3.22
6,,F1,10.6 ± 0.0,10.6 ± 0.0
7,,nDCG,4.33 ± 6.07,5.93 ± 5.68
8,,P,20.49 ± 28.05,30.73 ± 28.05
9,n/4,R,2.35 ± 3.22,3.53 ± 3.22


In [98]:
hga.dataframe_to_latex(results_df)

\begin{tabular}{llll}
\toprule
    @ & Metric &       DIAMOnD &        DiaBLE \\
\midrule
      &      P & 33.72 ± 31.56 & 43.82 ± 25.58 \\
   50 &      R &   4.71 ± 4.92 &   5.88 ± 4.16 \\
      &     F1 &   13.7 ± 5.49 &  12.91 ± 4.75 \\
      &   nDCG &   1.96 ± 2.02 &    2.45 ± 1.7 \\
      &      P & 21.33 ± 29.21 &  32.0 ± 29.21 \\
 n/10 &      R &   2.35 ± 3.22 &   3.53 ± 3.22 \\
      &     F1 &    10.6 ± 0.0 &    10.6 ± 0.0 \\
      &   nDCG &   4.33 ± 6.07 &   5.93 ± 5.68 \\
      &      P & 20.49 ± 28.05 & 30.73 ± 28.05 \\
  n/4 &      R &   2.35 ± 3.22 &   3.53 ± 3.22 \\
      &     F1 &   10.55 ± 0.0 &   10.55 ± 0.0 \\
      &   nDCG &    2.36 ± 3.3 &   3.23 ± 3.09 \\
      &      P & 33.79 ± 31.63 & 43.91 ± 25.64 \\
  n/2 &      R &   4.71 ± 4.92 &   5.88 ± 4.16 \\
      &     F1 &  13.71 ± 5.49 &  12.92 ± 4.75 \\
      &   nDCG &    2.2 ± 2.28 &   2.75 ± 1.92 \\
      &      P &  33.56 ± 31.4 & 43.62 ± 25.46 \\
    n &      R &   4.71 ± 4.92 &   5.88 ± 4.16 \\
      &   

# Part 4: Enrichment Analysis 

In [57]:
added_nodes, predicted_nodes = DIAMOnD(G_original=pgenes_sub_graph,
                seed_genes=hs_disease_genes,
                max_number_of_added_nodes=115,alpha=1)

DIAMOnD(): ignoring 3 of 85 seed genes that are not in the network


In [64]:
set(predicted_nodes+hs_disease_genes)

{'ABLIM1',
 'ADRA1D',
 'ADRB1',
 'ANKRD26',
 'APBA1',
 'ARHGEF26',
 'BAI1',
 'C11orf52',
 'C15orf59',
 'CASK',
 'CAV1',
 'CDC42BPA',
 'CRHR1',
 'CTNNA1',
 'CTNNAL1',
 'CXADR',
 'DIRAS3',
 'DLG1',
 'DLG2',
 'DLG3',
 'DLG5',
 'DLGAP1',
 'DLGAP2',
 'DLGAP3',
 'DLGAP4',
 'DMD',
 'DNAJC5',
 'DTNA',
 'DTNB',
 'DUSP10',
 'EFR3A',
 'EFR3B',
 'EPB41',
 'EPB41L4A',
 'ERBB2IP',
 'ERBB4',
 'F8A1',
 'FAM171A1',
 'FAM171A2',
 'FAM171B',
 'FLOT1',
 'FRS2',
 'GAB1',
 'GJA1',
 'GPRIN3',
 'GRIN1',
 'GRIN2A',
 'GRIN2C',
 'GRIN3A',
 'GUCY1A2',
 'INADL',
 'KCNA4',
 'KCNJ12',
 'KCNJ4',
 'KHDRBS1',
 'KIAA0754',
 'KIF26B',
 'KRAS',
 'LCK',
 'LIN7A',
 'LIN7B',
 'LIN7C',
 'LLGL1',
 'LPHN2',
 'LYN',
 'MAGI2',
 'MARCKS',
 'MARK2',
 'MARK3',
 'MLLT4',
 'MPDZ',
 'MPP2',
 'MPP3',
 'MPP5',
 'MPP6',
 'MPP7',
 'MTMR2',
 'NETO1',
 'NLGN1',
 'NLGN2',
 'NOS1',
 'OCLN',
 'PALM',
 'PARD3',
 'PHACTR4',
 'PKP4',
 'PLCH1',
 'PLEKHA1',
 'PLEKHA2',
 'PSD3',
 'PTPN13',
 'PXDC1',
 'RAB35',
 'RHOB',
 'RHPN1',
 'RPS6KA1',
 'SCRIB',


In [63]:
for i in set(predicted_nodes+hs_disease_genes):
    print(i)

GPRIN3
ADRB1
DTNB
LLGL1
MPP3
WWC1
DMD
ZGPAT
OCLN
CTNNA1
DLG3
DLG2
PTPN13
DLG5
DLGAP2
ZBTB7A
BAI1
LIN7B
STX7
PALM
GAB1
FAM171A1
SNTA1
ADRA1D
KCNJ4
GRIN2C
KCNJ12
CXADR
MARCKS
PLEKHA1
DUSP10
ZDHHC5
RAB35
CAV1
ABLIM1
PSD3
FAM171B
RHPN1
EFR3B
SPZ1
TNS1
MLLT4
PARD3
C11orf52
MARK2
MPDZ
LYN
KIF26B
CASK
NOS1
TAGAP
TNS3
MPP6
F8A1
NLGN2
LCK
SCRIB
KRAS
SNTB2
FAM171A2
DLGAP4
DIRAS3
PLEKHA2
INADL
ANKRD26
ARHGEF26
GRIN2A
EFR3A
PLCH1
CDC42BPA
DNAJC5
APBA1
KHDRBS1
LIN7C
RHOB
DTNA
ERBB4
ERBB2IP
SNTB1
CTNNAL1
MARK3
GUCY1A2
KIAA0754
RPS6KA1
TENC1
USP6NL
GJA1
SHANK2
DLGAP3
MAGI2
PHACTR4
UTRN
NETO1
PXDC1
GRIN1
EPB41L4A
C15orf59
PKP4
DLGAP1
MPP2
GRIN3A
MTMR2
EPB41
FLOT1
DLG1
KCNA4
NLGN1
LIN7A
FRS2
SNTG1
CRHR1
ZFPL1
LPHN2
MPP5
MPP7
