In [1]:
%load_ext autoreload
%autoreload 2

from code_py.DIAMOnD import *
from code_py.backbone import Human_Genes_Graph_Analysis
import markov_clustering as mc
from joblib import Parallel, delayed
import statistics 
from tqdm import tqdm
from prettytable import PrettyTable

sys_path = '/Users/alessandroquattrociocchi/Git/BI/Final_project/'


#  Part 1: Preprocessing

In [2]:
disease_code = "C1510586"
hga = Human_Genes_Graph_Analysis(sys_path,disease_ID = disease_code)

#### 1.1 -- Filtering and removing self-loops

In [3]:
# Preprocessing the Biogrid dataset by filtering Homo Sapiens, removing duplicated and self loops
hs_putative_genes = hga.preprocessing_dataset(homo_sap=True,drop_duplicates=True,remove_self_loops=True)

Number of putative genes: 889884


#### 1.2 -- Filtering Disease Genes

In [4]:
hs_disease_genes_df,hs_disease_genes = hga.query_disease_genes()

Found 85 disease genes in Autism Spectrum Disorders


#### 1.3 -- Creating LCC sub-graph,adjacency matrix,

In [5]:
# We are creating the graph with nx from the from the filtered PPI dataset and returning the sub graph, adj matrix, nodes and edges of LCC 
pgenes_sub_graph,pgenes_adj,pnodes,pedges = hga.LCC_to_adj(hs_putative_genes)

# of connected components: 1
19618
Graph with 19618 nodes and 665061 edges


#### 1.3 -- Cross Validation

In [6]:
ds_genes_train,ds_genes_test = hga.KFold_CV(hs_disease_genes,n_folds=5,shuffle_flag=True)

#### 1.4 -- Saving Seed Genes to files for Cytoscape

In [33]:
for i in range(len(ds_genes_test)):
    #np.savetxt("cytoscape/test"+str(disease_code)+str(i)+".csv", 
    #        ds_genes_test[i],
    #        delimiter =", ", 
    #        fmt ='% s')
    np.savetxt("cytoscape/"+str(disease_code)+"/train"+str(disease_code)+"_"+str(i)+".csv", 
            ds_genes_train[i],
            delimiter =", ", 
            fmt ='% s')

# Part 2: Algorithms  

### 2.1 -- MCL Algorithm 

In [14]:
# Applying MLC Algoritm by given inflation range (1.5, 2.7, step = 0.1)
results = Parallel(n_jobs=3)(delayed(hga.MCL)(pgenes_adj,i) for i in tqdm(np.arange(1.5,2.7,0.1)))


100%|██████████| 13/13 [7:31:19<00:00, 2083.00s/it] 


In [15]:
hga.list_to_pikle(results,'MLC_modularity')

In [16]:
results_list_from_pkl = hga.read_pickle_list('MLC_modularity')
for i in enumerate(np.arange(1.5,2.7,0.1)):
    print("inflation:", round(i[1],2), "modularity:", results_list_from_pkl[i[0]])
    

inflation: 1.5 modularity: 0.7318793908083768
inflation: 1.6 modularity: 0.7664038191128469
inflation: 1.7 modularity: 0.7948113431834943
inflation: 1.8 modularity: 0.8254253707325344
inflation: 1.9 modularity: 0.8034790682066575
inflation: 2.0 modularity: 0.7514658577401434
inflation: 2.1 modularity: 0.6951036602617071
inflation: 2.2 modularity: 0.641935117124079
inflation: 2.3 modularity: 0.5919604667312881
inflation: 2.4 modularity: 0.5541978925627196
inflation: 2.5 modularity: 0.5206722692342249
inflation: 2.6 modularity: 0.49161128642816465
inflation: 2.7 modularity: 0.46866783664642075


### 2.1.1 -- Creating Clusters

In [7]:
best_inflation = 1.8
result = mc.run_mcl(pgenes_adj, inflation=best_inflation)
clusters = mc.get_clusters(result)
print(str(len(clusters))+" of clusters obtained with inflation of "+str(best_inflation))

2106 of clusters obtained with inflation of 1.8


In [10]:
_, enriched_genes,enriched_cluster_ID = hga.MLC_eval(pgenes_sub_graph,ds_genes_train,clusters)

Fold number:  0
16 disease genes in cluster 0 --> 0.075075
117 disease genes in cluster 2 --> 2e-05
7 disease genes in cluster 6 --> 0.13986
4 disease genes in cluster 7 --> 0.0954
21 disease genes in cluster 11 --> 0.080552
5 disease genes in cluster 15 --> 0.178167
3 disease genes in cluster 27 --> 0.214784
3 disease genes in cluster 46 --> 0.015384
16 disease genes in cluster 53 --> 0.100427
13 disease genes in cluster 67 --> 0.041879
6 disease genes in cluster 68 --> 0.144008
6 disease genes in cluster 82 --> 0.142727
4 disease genes in cluster 111 --> 0.19804
3 disease genes in cluster 198 --> 0.003711
3 disease genes in cluster 224 --> 0.201165
3 disease genes in cluster 290 --> 0.014835
Fold number:  1
21 disease genes in cluster 0 --> 0.082933
120 disease genes in cluster 2 --> 5e-06
5 disease genes in cluster 6 --> 0.087203
23 disease genes in cluster 11 --> 0.056081
4 disease genes in cluster 15 --> 0.18091
3 disease genes in cluster 27 --> 0.214784
6 disease genes in cluster

In [11]:
hga.MCL_evaluation_metrics(pgenes_sub_graph,ds_genes_test,hs_disease_genes,clusters,enriched_cluster_ID)

TP: 151 --- FP: 1955 --- FN: 19467
Precision: 0.0717 --- Recall: 0.007697 --- F1 Score: 0.013902


### 2.2 -- DIAMOnD Algorithm

In [7]:
hga.return_metrics("DIAMOnD", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=True)

DIAMOnD(): ignoring 5 of 111 seed genes that are not in the network
DIAMOnD(): ignoring 3 of 111 seed genes that are not in the network
DIAMOnD(): ignoring 4 of 111 seed genes that are not in the network
DIAMOnD(): ignoring 4 of 111 seed genes that are not in the network
DIAMOnD(): ignoring 4 of 112 seed genes that are not in the network
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
Precision at 50: 0.001802 ± 0.004029
Precision at n/10: 0.001802 ± 0.004029
Precision at n/4: 0.001802 ± 0.004029
Precision at n/2: 0.003604 ± 0.004934
Precision at n: 0.010795 ± 0.004038
Recall at 50: 0.007143 ± 0.015972
Recall at n/10: 0.007143 ± 0.015972
Recall at n/4: 0.007143 ± 0.015972
Recall at n/2: 0.014286 ± 0.019562
Recall at n: 0.043122 ± 0.015834
F1 Score at 50: 0.014388
F1 Score at n/10: 0.014388
F1 Score at n/4: 0.014388
F1 Score at

### 2.3 -- DiaBLE Algorithm

In [8]:
hga.return_metrics("DiaBLE", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=True)

DiaBLE(): ignoring 5 of 111 seed genes that are not in the network
DiaBLE(): ignoring 3 of 111 seed genes that are not in the network
DiaBLE(): ignoring 4 of 111 seed genes that are not in the network
DiaBLE(): ignoring 4 of 111 seed genes that are not in the network
DiaBLE(): ignoring 4 of 112 seed genes that are not in the network
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
Precision at 50: 0.001802 ± 0.004029
Precision at n/10: 0.001802 ± 0.004029
Precision at n/4: 0.001802 ± 0.004029
Precision at n/2: 0.003604 ± 0.004934
Precision at n: 0.010795 ± 0.004038
Recall at 50: 0.007143 ± 0.015972
Recall at n/10: 0.007143 ± 0.015972
Recall at n/4: 0.007143 ± 0.015972
Recall at n/2: 0.014286 ± 0.019562
Recall at n: 0.043122 ± 0.015834
F1 Score at 50: 0.014388
F1 Score at n/10: 0.014388
F1 Score at n/4: 0.014388
F1 Score at n/2:

### 2.4 -- Cytoscape

In [9]:
hga.return_metrics("cytoscape", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=True)

zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
Precision at 50: 0.0 ± 0.0
Precision at n/10: 0.0 ± 0.0
Precision at n/4: 0.0 ± 0.0
Precision at n/2: 0.0 ± 0.0
Precision at n: 0.001786 ± 0.003993
Recall at 50: 0.0 ± 0.0
Recall at n/10: 0.0 ± 0.0
Recall at n/4: 0.0 ± 0.0
Recall at n/2: 0.0 ± 0.0
Recall at n: 0.007407 ± 0.016563
No values to record F1
No values to record F1
No values to record F1
No values to record F1
No values to record F1
nDCG at 50: 0.0 ± 0.0
nDCG at n/10: 0.0 ± 0.0
nDCG at n/4: 0.0 ± 0.0
nDCG at n/2: 0.0 ± 0.0
nDCG at n: 0.001159 ± 0.002591


### 2.6 -- Random Walk with Restart 

In [10]:
hga.return_metrics("rwr", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=True)

Precision at 50: 0.0 ± 0.0
Precision at n/10: 0.0 ± 0.0
Precision at n/4: 0.0 ± 0.0
Precision at n/2: 0.0 ± 0.0
Precision at n: 0.59 ± 0.81
Recall at 50: 0.0 ± 0.0
Recall at n/10: 0.0 ± 0.0
Recall at n/4: 0.0 ± 0.0
Recall at n/2: 0.0 ± 0.0
Recall at n: 2.35 ± 3.22
No values to record F1
No values to record F1
No values to record F1
No values to record F1
F1 Score at n: 2.35 ± 0.0
nDCG at 50: 0.0 ± 0.0
nDCG at n/10: 0.0 ± 0.0
nDCG at n/4: 0.0 ± 0.0
nDCG at n/2: 0.0 ± 0.0
nDCG at n: 0.35 ± 0.47


# Part 3: Extented Validation  

In [7]:
hga.return_metrics("DIAMOnD", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=True, extended_val = True)

DIAMOnD(): ignoring 5 of 111 seed genes that are not in the network
DIAMOnD(): ignoring 3 of 111 seed genes that are not in the network
DIAMOnD(): ignoring 4 of 111 seed genes that are not in the network
DIAMOnD(): ignoring 4 of 111 seed genes that are not in the network
DIAMOnD(): ignoring 4 of 112 seed genes that are not in the network
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
Precision at 50: 0.001563 ± 0.003494
Precision at n/10: 0.001563 ± 0.003494
Precision at n/4: 0.001563 ± 0.003494
Precision at n/2: 0.001563 ± 0.003494
Precision at n: 0.001563 ± 0.003494
Recall at 50: 0.00625 ± 0.013975
Recall at n/10: 0.00625 ± 0.013975
Recall at n/4: 0.00625 ± 0.013975
Recall at n/2: 0.00625 ± 0.013975
Recall at n: 0.00625 ± 0.013975
F1 Score at 50: 0.0125
F

In [None]:
hga.return_metrics("DiaBLE", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=True, extended_val = True)

DiaBLE(): ignoring 5 of 111 seed genes that are not in the network
DiaBLE(): ignoring 3 of 111 seed genes that are not in the network
DiaBLE(): ignoring 4 of 111 seed genes that are not in the network
DiaBLE(): ignoring 4 of 111 seed genes that are not in the network
DiaBLE(): ignoring 4 of 112 seed genes that are not in the network
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
Precision at 50: 0.001563 ± 0.003494
Precision at n/10: 0.001563 ± 0.003494
Precision at n/4: 0.001563 ± 0.003494
Precision at n/2: 0.001563 ± 0.003494
Precision at n: 0.001563 ± 0.003494
Recall at 50: 0.00625 ± 0.013975
Recall at n/10: 0.00625 ± 0.013975
Recall at n/4: 0.00625 ± 0.013975
Recall at n/2: 0.00625 ± 0.013975
Recall at n: 0.00625 ± 0.013975
F1 Score at 50: 0.0125
F1 Sco

In [8]:
hga.return_metrics("cytoscape", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=True, extended_val = True)

zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
zero division
Precision at 50: 0.0 ± 0.0
Precision at n/10: 0.0 ± 0.0
Precision at n/4: 0.0 ± 0.0
Precision at n/2: 0.0 ± 0.0
Precision at n: 0.0 ± 0.0
Recall at 50: 0.0 ± 0.0
Recall at n/10: 0.0 ± 0.0
Recall at n/4: 0.0 ± 0.0
Recall at n/2: 0.0 ± 0.0
Recall at n: 0.0 ± 0.0
No values to record F1
No values to record F1
No values to record F1
No values to record F1
No values to record F1
nDCG at 50: 0.0 ± 0.0
nDCG at n/10: 0.0 ± 0.0
nDCG at n/4: 0.0 ± 0.0
nDCG at n/2: 0.0 ± 0.0
nDCG at n: 0.0 ± 0.0


In [9]:
hga.return_metrics("rwr", pgenes_sub_graph, hs_disease_genes, ds_genes_train, ds_genes_test, print_flag=True, extended_val = True)

Precision at 50: 0.075 ± 0.021108
Precision at n/10: 0.028125 ± 0.011848
Precision at n/4: 0.053125 ± 0.021679
Precision at n/2: 0.120313 ± 0.017116
Precision at n: 0.159375 ± 0.017986
Recall at 50: 0.3 ± 0.084433
Recall at n/10: 0.1125 ± 0.047393
Recall at n/4: 0.2125 ± 0.086715
Recall at n/2: 0.48125 ± 0.068465
Recall at n: 0.6375 ± 0.071943
F1 Score at 50: 0.12 ± 0.033773
F1 Score at n/10: 0.045 ± 0.018957
F1 Score at n/4: 0.085 ± 0.034686
F1 Score at n/2: 0.1925 ± 0.027386
F1 Score at n: 0.255 ± 0.028777
nDCG at 50: 0.186422 ± 0.051823
nDCG at n/10: 0.204807 ± 0.066245
nDCG at n/4: 0.171288 ± 0.053476
nDCG at n/2: 0.188087 ± 0.035149
nDCG at n: 0.13987 ± 0.022769


# Part 4: Enrichment Analysis 

In [57]:
added_nodes, predicted_nodes = DIAMOnD(G_original=pgenes_sub_graph,
                seed_genes=hs_disease_genes,
                max_number_of_added_nodes=115,alpha=1)

DIAMOnD(): ignoring 3 of 85 seed genes that are not in the network


In [64]:
set(predicted_nodes+hs_disease_genes)

{'ABLIM1',
 'ADRA1D',
 'ADRB1',
 'ANKRD26',
 'APBA1',
 'ARHGEF26',
 'BAI1',
 'C11orf52',
 'C15orf59',
 'CASK',
 'CAV1',
 'CDC42BPA',
 'CRHR1',
 'CTNNA1',
 'CTNNAL1',
 'CXADR',
 'DIRAS3',
 'DLG1',
 'DLG2',
 'DLG3',
 'DLG5',
 'DLGAP1',
 'DLGAP2',
 'DLGAP3',
 'DLGAP4',
 'DMD',
 'DNAJC5',
 'DTNA',
 'DTNB',
 'DUSP10',
 'EFR3A',
 'EFR3B',
 'EPB41',
 'EPB41L4A',
 'ERBB2IP',
 'ERBB4',
 'F8A1',
 'FAM171A1',
 'FAM171A2',
 'FAM171B',
 'FLOT1',
 'FRS2',
 'GAB1',
 'GJA1',
 'GPRIN3',
 'GRIN1',
 'GRIN2A',
 'GRIN2C',
 'GRIN3A',
 'GUCY1A2',
 'INADL',
 'KCNA4',
 'KCNJ12',
 'KCNJ4',
 'KHDRBS1',
 'KIAA0754',
 'KIF26B',
 'KRAS',
 'LCK',
 'LIN7A',
 'LIN7B',
 'LIN7C',
 'LLGL1',
 'LPHN2',
 'LYN',
 'MAGI2',
 'MARCKS',
 'MARK2',
 'MARK3',
 'MLLT4',
 'MPDZ',
 'MPP2',
 'MPP3',
 'MPP5',
 'MPP6',
 'MPP7',
 'MTMR2',
 'NETO1',
 'NLGN1',
 'NLGN2',
 'NOS1',
 'OCLN',
 'PALM',
 'PARD3',
 'PHACTR4',
 'PKP4',
 'PLCH1',
 'PLEKHA1',
 'PLEKHA2',
 'PSD3',
 'PTPN13',
 'PXDC1',
 'RAB35',
 'RHOB',
 'RHPN1',
 'RPS6KA1',
 'SCRIB',


In [63]:
for i in set(predicted_nodes+predicted_nodes):
    print(i)

GPRIN3
ADRB1
DTNB
LLGL1
MPP3
WWC1
DMD
ZGPAT
OCLN
CTNNA1
DLG3
DLG2
PTPN13
DLG5
DLGAP2
ZBTB7A
BAI1
LIN7B
STX7
PALM
GAB1
FAM171A1
SNTA1
ADRA1D
KCNJ4
GRIN2C
KCNJ12
CXADR
MARCKS
PLEKHA1
DUSP10
ZDHHC5
RAB35
CAV1
ABLIM1
PSD3
FAM171B
RHPN1
EFR3B
SPZ1
TNS1
MLLT4
PARD3
C11orf52
MARK2
MPDZ
LYN
KIF26B
CASK
NOS1
TAGAP
TNS3
MPP6
F8A1
NLGN2
LCK
SCRIB
KRAS
SNTB2
FAM171A2
DLGAP4
DIRAS3
PLEKHA2
INADL
ANKRD26
ARHGEF26
GRIN2A
EFR3A
PLCH1
CDC42BPA
DNAJC5
APBA1
KHDRBS1
LIN7C
RHOB
DTNA
ERBB4
ERBB2IP
SNTB1
CTNNAL1
MARK3
GUCY1A2
KIAA0754
RPS6KA1
TENC1
USP6NL
GJA1
SHANK2
DLGAP3
MAGI2
PHACTR4
UTRN
NETO1
PXDC1
GRIN1
EPB41L4A
C15orf59
PKP4
DLGAP1
MPP2
GRIN3A
MTMR2
EPB41
FLOT1
DLG1
KCNA4
NLGN1
LIN7A
FRS2
SNTG1
CRHR1
ZFPL1
LPHN2
MPP5
MPP7
