In [1]:
from code_py.DIAMOnD import *
from code_py.backbone import *

#  Part 1: Preprocessing

In [2]:
# Shared helper object for the whole notebook: the cells below call its
# preprocessing, disease-gene query, LCC extraction, MCL, cross-validation
# and RWR methods.
disease_genes_graph = Disease_Genes_Graph()

#### 1.1 -- Filtering and removing self-loops

In [3]:
# Clean the BioGRID dataset: keep only Homo sapiens entries, then drop
# duplicated rows and self-loops.
hs_putative_genes = disease_genes_graph.preprocessing_dataset(
    homo_sap=True,
    drop_duplicates=True,
    remove_self_loops=True,
)


Number of putative genes: 718129


#### 1.2 -- Filtering Disease Genes

In [4]:
# Look up the genes annotated to disease ID C1510586 (reported in the output
# below as Autism Spectrum Disorders).
# NOTE(review): the call returns a pair; presumably a DataFrame plus a plain
# collection of gene identifiers -- confirm against query_disease_genes.
hs_disease_genes_df,hs_disease_genes = disease_genes_graph.query_disease_genes(diseaseId="C1510586")

Found 85 disease genes in Autism Spectrum Disorders


#### 1.3 -- Creating the LCC sub-graph and adjacency matrix

In [5]:
# Build a networkx graph from the filtered PPI dataset and extract its largest
# connected component (LCC): the sub-graph, its adjacency matrix, and its
# nodes and edges.
(
    pgenes_sub_graph,
    pgenes_adj,
    pnodes,
    pedges,
) = disease_genes_graph.LCC_to_adj(hs_putative_genes)

# of connected components: 1
19759
Graph with 19759 nodes and 675415 edges


# Part 2: Algorithms  

### 2.1 -- MCL Algorithm

In [10]:
# Apply the MCL (Markov Clustering) algorithm over a grid of inflation values:
# 1.8, 1.9, ..., 2.6 (9 values, step 0.1).
from joblib import Parallel, delayed

# np.linspace fixes the number of grid points explicitly. With a float step,
# np.arange derives the count from (stop - start) / step, so whether the last
# value is included depends on floating-point rounding.
inflation_grid = np.linspace(1.8, 2.6, 9)

# Run up to three MCL evaluations concurrently. joblib returns the per-call
# results in the same order as the inputs, so results[k] belongs to
# inflation_grid[k]; the next cell reports these values as modularity scores.
results = Parallel(n_jobs=3)(
    delayed(disease_genes_graph.MCL)(pgenes_adj, inflation)
    for inflation in tqdm(inflation_grid)
)

100%|██████████| 9/9 [40:34<00:00, 270.51s/it]


In [25]:
# Report the value obtained for each inflation setting of the sweep above;
# results is indexed by position in the same inflation grid.
for position, inflation in enumerate(np.arange(1.8, 2.7, 0.1)):
    modularity = results[position]
    print("inflation:", round(inflation, 2), "modularity:", modularity)

inflation: 1.8 modularity: 0.8364189644177686
inflation: 1.9 modularity: 0.8095275561789396
inflation: 2.0 modularity: 0.7560269862922929
inflation: 2.1 modularity: 0.7001738612779913
inflation: 2.2 modularity: 0.645825770553926
inflation: 2.3 modularity: 0.5957726942467184
inflation: 2.4 modularity: 0.5549960786780302
inflation: 2.5 modularity: 0.5213876838864968
inflation: 2.6 modularity: 0.4930706526371195


In [40]:
# Re-run MCL on the LCC adjacency matrix with inflation=1.8, the setting with
# the highest modularity in the sweep output above.
result = mc.run_mcl(pgenes_adj, inflation=1.8)
# Turn the MCL result into a collection of clusters.
clusters = mc.get_clusters(result)
# Number of clusters found.
print(len(clusters))

2040


### 2.2 -- DIAMOnD Algorithm

In [6]:
# Creating train and validation folds for Cross Validation
St_train,Sp_val=disease_genes_graph.KFold_CV(hs_disease_genes)

In [7]:
# Run DIAMOnD on the LCC sub-graph, seeded with the first training fold, and
# ask for at most 50 added nodes (alpha=1).
added_nodes, predicted_nodes = DIAMOnD(
    G_original=pgenes_sub_graph,
    seed_genes=St_train[0],
    max_number_of_added_nodes=50,
    alpha=1,
)

DIAMOnD(): ignoring 3 of 76 seed genes that are not in the network


### 2.6 -- Random Walk with Restart 

In [6]:
# Run Random Walk with Restart on the LCC sub-graph. The output below lists
# gene symbols with their scores.
# NOTE(review): the return value is discarded here; confirm whether RWR also
# returns the ranking so it can be kept for later evaluation.
disease_genes_graph.RWR(pgenes_sub_graph)


TRIM25 0.0036780463638231295
KIAA1429 0.002553971392189628
DDX58 0.0022945645479153136
DDX39A 0.0022317654596454156
APP 0.0020135037320078986
HNRNPL 0.0018565978542992826
HNRNPH1 0.0016549605248707682
TP53 0.001301421510198727
KRAS 0.0012649494484642152
ELAVL1 0.0012018688118047975
