# Section 3.3 Investigating global trends of disease-specific co-expression networks at the edge level

In [1]:
import getpass
import json
import os
import sys
import time
from collections import Counter
from itertools import chain

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pyobo
import seaborn as sns
from network_utils import edge_file_path, create_network_from_edge_file, most_common, below_cutoff, \
    load_interactome, above_cutoff, remove_stats, percent_edges_in_other_edges, top_edges_subgraph, \
    load_STRING, load_HIPPIE
from tqdm import tqdm

In [2]:
# Show the login name of the user running this kernel.
getpass.getuser()

'rfigueiredo'

In [3]:
# Show the Python interpreter version string used for this run.
sys.version

'3.9.1 (v3.9.1:1e5d33e9b9, Dec  7 2020, 12:10:52) \n[Clang 6.0 (clang-600.0.57)]'

In [4]:
# Show the local date and time at which this cell was executed.
time.asctime()

'Mon Apr 19 16:36:55 2021'

## Load data
#### Download the data from https://doi.org/10.5281/zenodo.4700652 and place the folder 'data/' in the root of the repository. If you place it elsewhere, set the variable data_dir below to the full path of 'data/'.

#### optional: select your desired location of the output figures

In [5]:
# Location of the downloaded 'data/' folder; change this if it is not structured as instructed
data_dir = "../data"


# Optional: folder for the output figures (defaults to a folder in the user's home directory);
# it is created if it does not exist yet
figures_dir = os.path.expanduser(os.path.join("~", "coexpath_figures"))
os.makedirs(figures_dir, exist_ok=True)

##### Load co-expression networks

In [6]:
# Build one network object per entry of the co-expression network folder, keyed by entry name.
coexpr_nets_dir = os.path.join(data_dir, "final_coexprNets")
network_dict = {}
for doid in tqdm(os.listdir(coexpr_nets_dir), desc="Creating/loading network objects"):
    if doid == ".DS_Store":
        # Not a network entry; skip it
        continue
    network_dict[doid] = create_network_from_edge_file(edge_file_path(coexpr_nets_dir, doid), doid)

Creating/loading network objects: 100%|██████████| 64/64 [03:44<00:00,  3.50s/it]


##### Load cluster assignments

In [7]:
# Read the cluster -> list-of-DOIDs assignment from the JSON file in misc_data
cluster_file_path = os.path.join(data_dir, "misc_data", 'doid_group_clusters_dict.json')
with open(cluster_file_path, 'r') as cluster_file:
    doids_per_cluster = json.load(cluster_file)

# Invert it into a DOID -> cluster lookup, leaving out the "Normal" cluster
doid_cluster_mapping = {}
for cluster, doid_list in doids_per_cluster.items():
    if cluster == "Normal":
        continue
    for doid in doid_list:
        doid_cluster_mapping[doid] = cluster

##### Load disease name mappings

In [8]:
# Disease Ontology ('doid') lookup tables obtained through pyobo.
# Judging by the pyobo function names: identifier -> name and name -> identifier
# NOTE(review): pyobo may need to download/cache the ontology on first use - confirm
doid_name_mapping = pyobo.get_id_name_mapping('doid')
doid_id_mapping = pyobo.get_name_id_mapping('doid')

##### Load most common edges.
We computed this on a separate machine because it is very computationally intensive, and saved the top 100,000 edges.
Alternatively, comment out the cell directly below, then uncomment and run the cell after it to compute the most common edges in this notebook.

In [9]:
# Load the precomputed list of most common edges from the JSON file in misc_data
most_common_edges_path = os.path.join(data_dir, "misc_data", "100000most_common_edges.json")
with open(most_common_edges_path, 'r') as edges_file:
    most_common_edges = json.load(edges_file)

In [10]:
# most_common_edges = most_common(network_dict.values(), comparison="edges")

##### Load interactome network

In [11]:
# Load the interactome from the dated TSV file in the data directory.
# Later cells call .has_edge() and .edges() on the result, so it is presumably a NetworkX graph.
interactome = load_interactome(os.path.join(data_dir, "interactome", "interactome_18_01_2021.tsv"))

##### Load STRING network

In [12]:
# Load the STRING v11.0 physical links file (9606) directly from a remote URL.
# NOTE(review): this presumably needs network access on every run - confirm whether load_STRING caches
STRING = load_STRING("https://stringdb-static.org/download/protein.physical.links.v11.0/9606.protein.physical.links.v11.0.txt.gz")

##### Load HIPPIE network

In [13]:
# Load the HIPPIE network directly from a remote URL.
# NOTE(review): the URL points at 'hippie_current.txt', which looks unversioned, so results may
# change between runs; consider pinning a specific release - confirm
HIPPIE = load_HIPPIE("http://cbdm-01.zdv.uni-mainz.de/~mschaefer/hippie/hippie_current.txt")

2208 edges skipped due to discontinued gene ids


## Edge stats

In [14]:
# Report how many of the most common edges pass each disease-count cutoff
for min_diseases in (44, 32):
    n_edges_above = len(above_cutoff(most_common_edges, min_diseases))
    print("There are", n_edges_above, "edges in at least", min_diseases, "diseases")

There are 21 edges in at least 44 diseases
There are 202 edges in at least 32 diseases


#### Among the most common edges in all diseases (top X)... what proportion can be found in the interactome, STRING, and HIPPIE? 

In [15]:
# All result dicts are keyed by the cutoff i (number of top most-common edges considered).
overlap_with_interactome, overlap_with_STRING, overlap_with_HIPPIE = {}, {}, {}
percent_top_edges_in_interactome, percent_top_edges_in_STRING, percent_top_edges_in_HIPPIE = {}, {}, {}
top_10_genes_in_overlap_int, top_10_genes_in_overlap_str, top_10_genes_in_overlap_hip = {}, {}, {}

for i in tqdm(range(1000, 10001, 1000)):
    edges = remove_stats(most_common_edges[:i])
    n_edges = len(edges)

    # The interactome is queried in both orientations; STRING and HIPPIE only as given
    overlap_int = [e for e in edges if interactome.has_edge(*e) or interactome.has_edge(*e[::-1])]
    overlap_str = [e for e in edges if STRING.has_edge(*e)]
    overlap_hip = [e for e in edges if HIPPIE.has_edge(*e)]

    overlap_with_interactome[i] = overlap_int
    overlap_with_STRING[i] = overlap_str
    overlap_with_HIPPIE[i] = overlap_hip

    # Share of the top-i edges found in each reference network, stored as a formatted string
    percent_top_edges_in_interactome[i] = f"{len(overlap_int)/n_edges:.2%}"
    percent_top_edges_in_STRING[i] = f"{len(overlap_str)/n_edges:.2%}"
    percent_top_edges_in_HIPPIE[i] = f"{len(overlap_hip)/n_edges:.2%}"

    # The ten nodes that occur most often as endpoints of the overlapping edges
    top_10_genes_in_overlap_int[i] = Counter(chain.from_iterable(overlap_int)).most_common(10)
    top_10_genes_in_overlap_str[i] = Counter(chain.from_iterable(overlap_str)).most_common(10)
    top_10_genes_in_overlap_hip[i] = Counter(chain.from_iterable(overlap_hip)).most_common(10)

100%|██████████| 10/10 [00:00<00:00, 70.63it/s]


In [16]:
# Print, per cutoff, the share of top edges that is also present in the interactome
for top_n, pct in percent_top_edges_in_interactome.items():
    print("percent of top", top_n, "edges that are also in interactome:", pct)
    # print("\ttop 10 genes (gene, #connections):", top_10_genes_in_overlap_int[top_n]) # TODO uncomment if you want to display the top genes from the overlap

percent of top 1000 edges that are also in interactome: 4.00%
percent of top 2000 edges that are also in interactome: 3.20%
percent of top 3000 edges that are also in interactome: 2.90%
percent of top 4000 edges that are also in interactome: 2.53%
percent of top 5000 edges that are also in interactome: 2.32%
percent of top 6000 edges that are also in interactome: 2.08%
percent of top 7000 edges that are also in interactome: 1.94%
percent of top 8000 edges that are also in interactome: 1.85%
percent of top 9000 edges that are also in interactome: 1.70%
percent of top 10000 edges that are also in interactome: 1.66%


In [17]:
# Print, per cutoff, the share of top edges that is also present in STRING
for top_n, pct in percent_top_edges_in_STRING.items():
    print("percent of top", top_n, "edges that are also in STRING:", pct)
    # print("\ttop 10 genes (gene, #connections):", top_10_genes_in_overlap_str[top_n]) # TODO uncomment if you want to display the top genes from the overlap

percent of top 1000 edges that are also in STRING: 57.40%
percent of top 2000 edges that are also in STRING: 52.10%
percent of top 3000 edges that are also in STRING: 50.23%
percent of top 4000 edges that are also in STRING: 46.48%
percent of top 5000 edges that are also in STRING: 42.06%
percent of top 6000 edges that are also in STRING: 38.38%
percent of top 7000 edges that are also in STRING: 37.16%
percent of top 8000 edges that are also in STRING: 34.15%
percent of top 9000 edges that are also in STRING: 32.06%
percent of top 10000 edges that are also in STRING: 31.87%


In [18]:
# Print, per cutoff, the share of top edges that is also present in HIPPIE
for top_n, pct in percent_top_edges_in_HIPPIE.items():
    print("percent of top", top_n, "edges that are also in HIPPIE:", pct)
    # print("\ttop 10 genes (gene, #connections):", top_10_genes_in_overlap_hip[top_n]) # TODO uncomment if you want to display the top genes from the overlap

percent of top 1000 edges that are also in HIPPIE: 6.70%
percent of top 2000 edges that are also in HIPPIE: 6.05%
percent of top 3000 edges that are also in HIPPIE: 5.40%
percent of top 4000 edges that are also in HIPPIE: 4.72%
percent of top 5000 edges that are also in HIPPIE: 4.44%
percent of top 6000 edges that are also in HIPPIE: 3.87%
percent of top 7000 edges that are also in HIPPIE: 3.57%
percent of top 8000 edges that are also in HIPPIE: 3.35%
percent of top 9000 edges that are also in HIPPIE: 3.20%
percent of top 10000 edges that are also in HIPPIE: 3.05%


---
#### Among the most common edges in all diseases (top X)... what proportion can be found in the normal network?
#### ---> of those, what proportion can be found in the interactome, STRING, and HIPPIE?

In [19]:
# Edges of the normal network, ordered by descending absolute weight
# (edges without a 'weight' attribute are treated as weight 1)
sorted_normal_edges = list(network_dict["normal"].edges(data=True))
sorted_normal_edges.sort(key=lambda edge_with_data: abs(edge_with_data[2].get('weight', 1)), reverse=True)

In [20]:
# For each cutoff i, find which of the top-i most common disease edges are also edges of
# top_edges_subgraph(i, sorted_normal_edges) (presumably the graph of the i first, i.e.
# highest-|weight|, normal edges). All result dicts are keyed by i.
overlap_with_normal = {}
percent_top_edges_in_normal = {}
top_10_genes_in_overlap = {}
for i in tqdm(range(1000,10001,1000)):
    edges = remove_stats(most_common_edges[:i])
    # Build the normal-network subgraph ONCE per cutoff. It only depends on i and
    # sorted_normal_edges, so rebuilding it for every single edge (as before) repeated the
    # same expensive construction thousands of times per iteration.
    normal_top_subgraph = top_edges_subgraph(i, sorted_normal_edges)
    overlap = [edge for edge in edges if normal_top_subgraph.has_edge(*edge)]
    overlap_with_normal[i] = overlap
    # Share of the top-i disease edges found in the normal subgraph, stored as a formatted string
    percent_top_edges_in_normal[i] = f"{len(overlap)/len(edges):.2%}"
    # The ten nodes that occur most often as endpoints of the overlapping edges
    nodes = [node for edge in overlap for node in edge]
    counter_obj = Counter(nodes)
    top_10_genes_in_overlap[i] = counter_obj.most_common(10)

100%|██████████| 10/10 [2:50:03<00:00, 1020.31s/it]


In [21]:
# Per cutoff: share of top edges shared with the normal network, and the share of those
# shared edges that percent_edges_in_other_edges finds in each reference network
for top_n, pct_normal in percent_top_edges_in_normal.items():
    shared_with_normal = overlap_with_normal[top_n]
    pct_int = percent_edges_in_other_edges(shared_with_normal, interactome.edges())
    pct_str = percent_edges_in_other_edges(shared_with_normal, STRING.edges())
    pct_hip = percent_edges_in_other_edges(shared_with_normal, HIPPIE.edges())
    print(top_n, "edges: overlap with normal network:", pct_normal,
          f"\n\toverlap of those with interactome:\t{pct_int:.2%}",
          f"\n\toverlap of those with STRING:\t{pct_str:.2%}",
          f"\n\toverlap of those with HIPPIE:\t{pct_hip:.2%}")

    # print("\ttop 10 genes:", top_10_genes_in_overlap[top_n]) # TODO uncomment if you want to display the top genes from the overlap

1000 edges: overlap with normal network: 19.20% 
	overlap of those with interactome:	7.81% 
	overlap of those with STRING:	30.21% 
	overlap of those with HIPPIE:	8.33%
2000 edges: overlap with normal network: 19.85% 
	overlap of those with interactome:	7.05% 
	overlap of those with STRING:	43.58% 
	overlap of those with HIPPIE:	9.57%
3000 edges: overlap with normal network: 19.43% 
	overlap of those with interactome:	5.66% 
	overlap of those with STRING:	46.66% 
	overlap of those with HIPPIE:	9.09%
4000 edges: overlap with normal network: 18.93% 
	overlap of those with interactome:	5.02% 
	overlap of those with STRING:	47.16% 
	overlap of those with HIPPIE:	8.85%
5000 edges: overlap with normal network: 18.12% 
	overlap of those with interactome:	4.64% 
	overlap of those with STRING:	47.24% 
	overlap of those with HIPPIE:	8.28%
6000 edges: overlap with normal network: 17.65% 
	overlap of those with interactome:	4.44% 
	overlap of those with STRING:	45.70% 
	overlap of those with HIPPIE

---
#### Among the most common edges in all diseases that are not in the normal network (top X)... what proportion can be found in the interactome, STRING, and HIPPIE?

In [22]:
# Keep, in ranking order, every most-common edge that is not among the edges shared with the
# normal network at the 10000 cutoff.
# NOTE(review): overlap_with_normal[10000] only covers the top 10000 most common edges, so
# lower-ranked edges are never checked against the normal network - confirm this is intended.
normal_shared_edges = overlap_with_normal[10000]
edges_only_in_diseases = []
for edge, _edge_stat in most_common_edges:
    if edge in normal_shared_edges:
        continue
    edges_only_in_diseases.append(edge)

In [23]:
# All result dicts are keyed by the cutoff i (number of top disease-only edges considered).
overlap_with_disease_edges_int, overlap_with_disease_edges_str, overlap_with_disease_edges_hip = {}, {}, {}
percent_top_edges_in_diseases_int, percent_top_edges_in_diseases_str, percent_top_edges_in_diseases_hip = {}, {}, {}

for i in tqdm(range(1000, 10001, 1000)):
    edges = edges_only_in_diseases[:i]
    n_edges = len(edges)

    # The interactome is queried in both orientations; STRING and HIPPIE only as given
    overlap_int = [e for e in edges if interactome.has_edge(*e) or interactome.has_edge(*e[::-1])]
    overlap_str = [e for e in edges if STRING.has_edge(*e)]
    overlap_hip = [e for e in edges if HIPPIE.has_edge(*e)]

    overlap_with_disease_edges_int[i] = overlap_int
    overlap_with_disease_edges_str[i] = overlap_str
    overlap_with_disease_edges_hip[i] = overlap_hip

    # Share of the top-i disease-only edges found in each reference network, as a formatted string
    percent_top_edges_in_diseases_int[i] = f"{len(overlap_int)/n_edges:.2%}"
    percent_top_edges_in_diseases_str[i] = f"{len(overlap_str)/n_edges:.2%}"
    percent_top_edges_in_diseases_hip[i] = f"{len(overlap_hip)/n_edges:.2%}"

100%|██████████| 10/10 [00:00<00:00, 49.28it/s]


In [24]:
# Print, per cutoff, the share of disease-only edges that is also present in the interactome
for top_n, pct in percent_top_edges_in_diseases_int.items():
    print("percent of top", top_n, "disease-specific edges that are also in interactome:", pct)

percent of top 1000 disease-specific edges that are also in interactome: 2.20%
percent of top 2000 disease-specific edges that are also in interactome: 2.30%
percent of top 3000 disease-specific edges that are also in interactome: 2.03%
percent of top 4000 disease-specific edges that are also in interactome: 1.73%
percent of top 5000 disease-specific edges that are also in interactome: 1.46%
percent of top 6000 disease-specific edges that are also in interactome: 1.45%
percent of top 7000 disease-specific edges that are also in interactome: 1.24%
percent of top 8000 disease-specific edges that are also in interactome: 1.20%
percent of top 9000 disease-specific edges that are also in interactome: 1.17%
percent of top 10000 disease-specific edges that are also in interactome: 1.05%


In [25]:
# Print, per cutoff, the share of disease-only edges that is also present in STRING
for top_n, pct in percent_top_edges_in_diseases_str.items():
    print("percent of top", top_n, "disease-specific edges that are also in STRING:", pct)

percent of top 1000 disease-specific edges that are also in STRING: 53.80%
percent of top 2000 disease-specific edges that are also in STRING: 48.55%
percent of top 3000 disease-specific edges that are also in STRING: 42.37%
percent of top 4000 disease-specific edges that are also in STRING: 39.02%
percent of top 5000 disease-specific edges that are also in STRING: 33.30%
percent of top 6000 disease-specific edges that are also in STRING: 33.47%
percent of top 7000 disease-specific edges that are also in STRING: 28.70%
percent of top 8000 disease-specific edges that are also in STRING: 29.35%
percent of top 9000 disease-specific edges that are also in STRING: 27.89%
percent of top 10000 disease-specific edges that are also in STRING: 25.13%


In [26]:
# Print, per cutoff, the share of disease-only edges that is also present in HIPPIE
for top_n, pct in percent_top_edges_in_diseases_hip.items():
    print("percent of top", top_n, "disease-specific edges that are also in HIPPIE:", pct)

percent of top 1000 disease-specific edges that are also in HIPPIE: 4.80%
percent of top 2000 disease-specific edges that are also in HIPPIE: 4.25%
percent of top 3000 disease-specific edges that are also in HIPPIE: 3.57%
percent of top 4000 disease-specific edges that are also in HIPPIE: 3.17%
percent of top 5000 disease-specific edges that are also in HIPPIE: 2.66%
percent of top 6000 disease-specific edges that are also in HIPPIE: 2.57%
percent of top 7000 disease-specific edges that are also in HIPPIE: 2.20%
percent of top 8000 disease-specific edges that are also in HIPPIE: 2.19%
percent of top 9000 disease-specific edges that are also in HIPPIE: 2.08%
percent of top 10000 disease-specific edges that are also in HIPPIE: 1.88%


---
#### Interactome vs normal network

In [27]:
# Compare the interactome with an equally sized selection of normal-network edges: request
# as many edges from top_edges_subgraph as the interactome has (presumably the first, i.e.
# highest-|weight|, entries of sorted_normal_edges), then report the share of them that
# percent_edges_in_other_edges finds among the interactome edges.
n_interactome_edges = len(interactome.edges)
top_normal_edges = list(top_edges_subgraph(n_interactome_edges, sorted_normal_edges).edges())
interactome_overlap = percent_edges_in_other_edges(top_normal_edges, list(interactome.edges()))
# Single-line literal: the former backslash continuation inside the f-string embedded the
# indentation of the following source line in the printed message.
print(f"Overlap between interactome and equivalent proportion of top edges in normal network: {interactome_overlap:.2%}")

Overlap between interactome and equivalent proportion of top edges in normal network:         1.09%
