In [None]:
#######################################
### Project: Network Medicine Framework for Identifying Drug Repurposing Opportunities for COVID-19.
### Description: Calculates the Network Separation from COVID-19 Disease Module to other diseases
### Author: Italo Faria do Valle
### email: italodovalle at gmail dot com 
### date: 1st March 2021
#######################################

In [None]:
import os
import sys
from collections import defaultdict

import networkx as nx
import numpy as np
import pandas as pd
import tqdm

sys.path.append('./python/')

import separation

In [None]:
import matplotlib.pylab as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' style to the figures drawn in this notebook.
sns.set_style('whitegrid')

# Render matplotlib figures inline in the notebook, using the 'retina'
# figure format for the inline backend.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
def connected_component_subgraphs(G, copy=True):
    """Yield the connected components of ``G`` as subgraphs, largest first.

    Replacement for the helper of the same name that was removed from
    recent networkx releases.

    ``nx.connected_components`` makes no promise about the order in which
    components are produced, so they are sorted here by node count
    (descending). This makes the first yielded subgraph the largest
    connected component, which is what code indexing ``[0]`` expects.

    Parameters
    ----------
    G : graph accepted by ``nx.connected_components``
    copy : bool, default True
        If True, yield independent copies (``G.subgraph(c).copy()``);
        otherwise yield the subgraph objects returned by ``G.subgraph``.

    Yields
    ------
    One subgraph per connected component, ordered from most to fewest nodes.
    """
    components = sorted(nx.connected_components(G), key=len, reverse=True)
    for component_nodes in components:
        component = G.subgraph(component_nodes)
        yield component.copy() if copy else component

In [None]:
# Interactome edge list (CSV); read below through its proteinA_entrezid and
# proteinB_entrezid columns.
interactome_file = './data/DatasetS2.csv'
# Protein annotation table (CSV); its Symbol and GeneID columns are used to
# build the symbol -> Entrez ID lookup.
mapping_file = './data/interactome_2019_merged_protAnnots.csv'
# Tab-separated disease/gene file; parsed below with the disease name in the
# second field and the gene IDs from the third field onwards.
disease_genes_file = './data/Guney2016_GenesDisease.tsv'

In [None]:
# Interactome edge list: one protein pair per row, taken from the
# proteinA_entrezid / proteinB_entrezid columns.
interactome_df = pd.read_csv(interactome_file)
hi = nx.Graph()
hi.add_edges_from(zip(interactome_df.proteinA_entrezid,
                      interactome_df.proteinB_entrezid))

## consider only LCC
# Pick the component with the most nodes explicitly. networkx does not
# guarantee that connected components come out ordered by size, so taking
# element [0] of the list could silently select a small fragment instead of
# the largest connected component. Cheap views (copy=False) are compared and
# only the winning component is copied.
G = max(connected_component_subgraphs(hi, copy=False), key=len).copy()
print(G.number_of_nodes(), G.number_of_edges())

In [None]:
# Annotation table; the first CSV column becomes the row index.
mapping = pd.read_csv(mapping_file, index_col=0)
# Lookup from the Symbol column to the GeneID column
# (if a symbol appears more than once, the last row wins).
symbol2entrez = dict(zip(mapping['Symbol'], mapping['GeneID']))

In [None]:
# A disease is kept only if it lists at least this many genes
# (same cut-off as the original `len(genes) > 19`).
MIN_DISEASE_GENES = 20

# disease name -> list of integer gene IDs
disease2genes = {}
# The context manager closes the file handle, which was previously left open;
# iterating the handle avoids loading every line into memory first.
with open(disease_genes_file) as handle:
    for line in handle:
        fields = line.rstrip().split('\t')
        if len(fields) < 3:
            # Blank or truncated line: no disease name and/or no genes.
            # Such a line could never pass the gene-count cut-off, and a
            # blank one used to raise IndexError.
            continue
        disease = fields[1]
        genes = fields[2:]
        if len(genes) >= MIN_DISEASE_GENES:
            disease2genes[disease] = [int(gene) for gene in genes]

In [None]:
# Unique gene IDs across all retained diseases.
# A set union replaces `sum(list_of_lists, [])`, which copies the growing
# list once per disease (quadratic); sorting makes the order deterministic.
diseasegenes = sorted(set().union(*disease2genes.values()))
len(diseasegenes)

In [None]:
# Hand-entered symbol -> Entrez ID pairs merged into the lookup table
# (presumably symbols that the mapping file does not cover -- TODO confirm).
dic = {
    'CEP43': 11116,
    'MTARC1': 64757,
}
symbol2entrez.update(dic)

In [None]:
dt = pd.read_csv('./data/DatasetS1.csv')
# Translate each entry of the Symbol column through the lookup table;
# a symbol missing from the table raises KeyError.
dt['EntrezID'] = [symbol2entrez[symbol] for symbol in dt['Symbol']]
dt.head()

# Network separation $S_{AB}$ between the COVID-19 disease module and each disease

In [None]:
# Node set of the network, built once for the intersections below.
network_nodes = set(G.nodes())

# Gene set A (the EntrezID column of `dt`, restricted to the network) is the
# same for every disease, so it and its within-set distance are computed once
# instead of on every iteration.
# NOTE(review): this assumes the `separation` helpers are deterministic and do
# not modify G -- confirm against ./python/separation.py.
genes_A = set(dt.EntrezID) & network_nodes
d_A = separation.calc_single_set_distance(G, genes_A)

# disease name -> separation s_AB between gene set A and the disease genes
sab = {}
for disease, disease_genes in tqdm.tqdm(disease2genes.items()):
    # Gene set B: this disease's genes that are present in the network.
    genes_B = set(disease_genes) & network_nodes
    d_B = separation.calc_single_set_distance(G, genes_B)

    # distances BETWEEN the two gene sets:
    d_AB = separation.calc_set_pair_distances(G, genes_A, genes_B)

    # calculate separation
    s_AB = d_AB - (d_A + d_B) / 2.
    sab[disease] = s_AB

In [None]:
# One row per disease: its name and its separation value.
# Building the frame directly from the (disease, value) pairs avoids the
# from_dict / reset_index / positional column rename sequence and also works
# when `sab` is empty.
dx = pd.DataFrame(list(sab.items()), columns=['disease', 'sab'])
dx.head()

In [None]:
# Make sure the output folder exists so the notebook also runs on a fresh
# checkout; to_csv does not create missing directories.
os.makedirs('./output/separation', exist_ok=True)
# Default to_csv options are kept, so the row index is written as the first
# column of the file.
dx.to_csv('./output/separation/sab_virustargets.csv')

In [None]:
# Make sure the output folder exists before saving the figure.
os.makedirs('./output/separation', exist_ok=True)

fig, ax = plt.subplots(1, 1)
if hasattr(sns, 'histplot'):
    # seaborn >= 0.11: `distplot` is deprecated and scheduled for removal;
    # a density-normalised histogram with a KDE overlay is its replacement.
    sns.histplot(dx['sab'], kde=True, stat='density', ax=ax)
else:
    # Older seaborn without `histplot`: keep the original call.
    sns.distplot(dx['sab'], ax=ax)
ax.set_xlabel('$S_{ab}$')
ax.set_ylabel('Density')
fig.savefig('./output/separation/sab_virus_dist.png', dpi = 300)