In [1]:
from code_py.DIAMOnD import *
from code_py.backbone import Human_Genes_Graph_Analysis
import markov_clustering as mc
from joblib import Parallel, delayed
import statistics 
from tqdm import tqdm

sys_path = '/Users/alessandroquattrociocchi/Git/BI/Final_project/'


#  Part 1: Preprocessing

In [2]:
disease_code = "C3714756"
hga = Human_Genes_Graph_Analysis(sys_path,disease_ID = disease_code)

#### 1.1 -- Filtering and revoving self-loops

In [3]:
# Preprocessing the Biogrid dataset by filtering Homo Sapiens, removing duplicated and self loops
hs_putative_genes = hga.preprocessing_dataset(homo_sap=True,drop_duplicates=True,remove_self_loops=True)


Number of putative genes: 889884


#### 1.2 -- Filtering Disease Genes

In [4]:
hs_disease_genes_df,hs_disease_genes = hga.query_disease_genes()

Found 447 disease genes in Intellectual Disability


#### 1.3 -- Creating LCC sub-graph,adjacency matrix,

In [5]:
# We are creating the graph with nx from the from the filtered PPI dataset and returning the sub graph, adj matrix, nodes and edges of LCC 
pgenes_sub_graph,pgenes_adj,pnodes,pedges = hga.LCC_to_adj(hs_putative_genes)

# of connected components: 1
19618
Graph with 19618 nodes and 665061 edges


#### 1.3 -- Cross Validation

In [17]:
ds_genes_train,ds_genes_test = hga.KFold_CV(hs_disease_genes,n_folds=5,shuffle_flag=True)

# Part 2: Algorithms  

### 2.1 -- MCL Algorithm 

In [10]:
# Applying MLC Algoritm by given inflation range (1.5, 2.7, step = 0.1)

results = Parallel(n_jobs=3)(delayed(hga.MCL)(pgenes_adj,i) for i in tqdm(np.arange(1.8,2.7,0.1)))
hga.list_to_pikle(results,'MLC_modularity_'+disease_code)


100%|██████████| 9/9 [40:34<00:00, 270.51s/it]


In [16]:
results_list_from_pkl = hga.read_pickle_list('MLC_modularity_'+disease_code)
for i in enumerate(np.arange(1.8,2.7,0.1)):
    print("inflation:", round(i[1],2), "modularity:", results_list_from_pkl[i[0]])
    

inflation: 1.8 modularity: 1
inflation: 1.9 modularity: 2
inflation: 2.0 modularity: 3
inflation: 2.1 modularity: 4
inflation: 2.2 modularity: 5
inflation: 2.3 modularity: 6
inflation: 2.4 modularity: 7
inflation: 2.5 modularity: 8
inflation: 2.6 modularity: 9


### 2.1.1 -- Creating Clusters

In [20]:
result = mc.run_mcl(pgenes_adj, inflation=1.8)
clusters = mc.get_clusters(result)
print(len(clusters))

2106


In [22]:
_, enriched_genes = hga.MLC_eval(pgenes_sub_graph,ds_genes_train,clusters)

Fold number:  0
18 disease genes in cluster 0 --> 0.092752
111 disease genes in cluster 2 --> 0.000252
6 disease genes in cluster 6 --> 0.119593
3 disease genes in cluster 7 --> 0.187683
23 disease genes in cluster 11 --> 0.056081
5 disease genes in cluster 15 --> 0.178167
4 disease genes in cluster 27 --> 0.131161
4 disease genes in cluster 46 --> 0.035018
20 disease genes in cluster 53 --> 0.06762
15 disease genes in cluster 67 --> 0.014179
4 disease genes in cluster 68 --> 0.182145
5 disease genes in cluster 82 --> 0.177803
3 disease genes in cluster 111 --> 0.202044
3 disease genes in cluster 198 --> 0.003711
4 disease genes in cluster 224 --> 0.198123
3 disease genes in cluster 290 --> 0.014835
Fold number:  1
24 disease genes in cluster 0 --> 0.047008
119 disease genes in cluster 2 --> 8e-06
4 disease genes in cluster 6 --> 0.052714
4 disease genes in cluster 7 --> 0.0954
17 disease genes in cluster 11 --> 0.088865
8 disease genes in cluster 46 --> 0.136309
18 disease genes in cl

### 2.2 -- DIAMOnD Algorithm

In [23]:
precision = []
recall = []
f1_score = []
for i in range(0,5):
    added_nodes, predicted_nodes = DIAMOnD(G_original=pgenes_sub_graph,
                            seed_genes=ds_genes_train[i],
                            max_number_of_added_nodes=len(ds_genes_train[i]),alpha=1)
    #TP numero di geni che effettivamente sono disease genes
    TP = len(set(predicted_nodes).intersection(set(ds_genes_test[i])))
    #numero di geni riportati come veri ma che non sono disease genes 
    FP = len(ds_genes_train[i]) - TP
    FN = len(ds_genes_test[i]) - TP
    precision.append(TP/(TP+FP))
    recall.append((TP)/(TP+FN))
    try:
        f1_score.append((2*precision[i]*recall[i])/(precision[i]+recall[i]))
    except:
        print("zero division")


print("Precision: " + str(round(statistics.mean(precision),6)) + " ± " +str(round(statistics.stdev(precision),6)))
print("Recall: " + str(round(statistics.mean(recall),6)) + " ± " +str(round(statistics.stdev(recall),6)))
print("F1 Score: " + str(round(statistics.mean(f1_score),6)) + " ± " +str(round(statistics.stdev(f1_score),6)))


DIAMOnD(): ignoring 15 of 357 seed genes that are not in the network
DIAMOnD(): ignoring 13 of 357 seed genes that are not in the network
DIAMOnD(): ignoring 15 of 358 seed genes that are not in the network
DIAMOnD(): ignoring 13 of 358 seed genes that are not in the network
DIAMOnD(): ignoring 12 of 358 seed genes that are not in the network
Precision: 0.016212 ± 0.007222
Recall: 0.064994 ± 0.029224
F1 Score: 0.025951 ± 0.011581


### 2.3 -- DiaBLE Algorithm

In [24]:
precision_diable = []
recall_diable = []
f1_score_diable = []
for i in range(0,5):
    added_nodes, predicted_nodes = DIAMOnD(G_original=pgenes_sub_graph,
                                           seed_genes=ds_genes_train[i],
                                           max_number_of_added_nodes=len(ds_genes_train[i]),
                                           alpha=1,DiaBLE=True)
                                           
    #TP numero di geni che effettivamente sono disease genes
    TP = len(set(predicted_nodes).intersection(set(ds_genes_test[i])))
    #numero di geni riportati come veri ma che non sono disease genes 
    FP = len(ds_genes_train[i]) - TP
    #numero di geni che 
    FN = len(ds_genes_train[i]) - TP
    precision_diable.append(TP/(TP+FP))
    recall_diable.append((TP)/(TP+FN))
    try:
        f1_score_diable.append((2*precision[i]*recall[i])/(precision[i]+recall[i]))
    except:
        print("zero division")


print("Precision: " + str(round(statistics.mean(precision_diable),6)) + " ± " +str(round(statistics.stdev(precision_diable),6)))
print("Recall: " + str(round(statistics.mean(recall_diable),6)) + " ± " +str(round(statistics.stdev(recall_diable),6)))
print("F1 Score: " + str(round(statistics.mean(f1_score_diable),6)) + " ± " +str(round(statistics.stdev(f1_score_diable),6)))


DIAMOnD(): ignoring 15 of 357 seed genes that are not in the network
DIAMOnD(): ignoring 13 of 357 seed genes that are not in the network
DIAMOnD(): ignoring 15 of 358 seed genes that are not in the network
DIAMOnD(): ignoring 13 of 358 seed genes that are not in the network
DIAMOnD(): ignoring 12 of 358 seed genes that are not in the network
Precision: 0.016212 ± 0.007222
Recall: 0.016212 ± 0.007222
F1 Score: 0.025951 ± 0.011581


### 2.6 -- Random Walk with Restart 

In [None]:
for fold in tqdm(range(0,5)):
    rwr_enriched_genes = hga.RWR(pgenes_sub_graph,ds_genes_train[fold],max_print_items=0)
    break