In [None]:
import sys, os
import graph_tool.all as gt
import networkx as nx
import pandas as pd
import time
import NetworkMedicineToolbox.parse_ncbi as parse_ncbi
import NetworkMedicineToolbox.parse_mesh as parse_mesh


In [None]:
# Input locations on the scratch filesystem: the base directory, the folder
# holding the wTO results, and the file that is later loaded as the global
# network table.
scratch_data_dir = '/scratch/j.aguirreplans/Scipher/SampleSize'
wto_results_dir = os.path.join(
    scratch_data_dir,
    'networks_nonresponders_wto_N100',
)
global_network_file = os.path.join(
    wto_results_dir,
    'wto_RNAseq_NonResponders_all.net',
)


In [None]:
# Read the global network table into a DataFrame and report the elapsed
# wall-clock time.
start = time.time()  # timestamp taken before reading
co = pd.read_csv(global_network_file)
end = time.time()  # timestamp taken after reading
elapsed_seconds = end - start
print("Runtime for loading the dataframe is {}".format(elapsed_seconds))


In [None]:
# Keep only the rows whose adjusted p-value is strictly below the cutoff,
# and report how long the selection takes.
pval_adj_cutoff = 0.001
start = time.time()  # timestamp taken before filtering
co_filt = co[co["pval.adj"] < pval_adj_cutoff]
end = time.time()  # timestamp taken after filtering
elapsed_seconds = end - start
print("Runtime for filtering the dataframe is {}".format(elapsed_seconds))


In [None]:
# Display the (rows, columns) shape of the filtered table.
co_filt.shape

In [None]:
# Build an undirected graph-tool graph from the p-value-filtered table.
# Each row contributes one edge between the values of its "Node.1" and
# "Node.2" columns. With hashed=True the node labels do not have to be vertex
# indices: add_edge_list maps them to indices and returns a vertex property
# map (ids_filt) holding the original label of every vertex.
start = time.time()  # timestamp taken before building the graph
g_filt = gt.Graph(directed=False)
ids_filt = g_filt.add_edge_list(co_filt[["Node.1", "Node.2"]].values, hashed=True)
end = time.time()  # timestamp taken after building the graph
# The message used to say "filtering the dataframe" (copied from the filter
# cell); this cell times the graph construction.
print("Runtime for creating the filtered network is {}".format(end-start))


In [None]:
# Report the number of nodes and edges of the filtered network.
# num_vertices()/num_edges() return the counts directly, so the full vertex
# and edge arrays are not built just to take their length.
start = time.time()  # timestamp taken before counting
print('Global gene co-expression network: {} nodes and {} edges'.format(g_filt.num_vertices(), g_filt.num_edges()))
end = time.time()  # timestamp taken after counting
print("Runtime for printing the numbers of the network is {}".format(end-start))


In [None]:
# Build the undirected graph from every row of the table (no p-value
# filtering) and report how long the construction takes.
edge_columns = ["Node.1", "Node.2"]
start = time.time()  # timestamp taken before building the graph
g = gt.Graph(directed=False)
ids = g.add_edge_list(co[edge_columns].values, hashed=True)
end = time.time()  # timestamp taken after building the graph
elapsed_seconds = end - start
print("Runtime for creating the network is {}".format(elapsed_seconds))


In [None]:
# Report the number of nodes and edges of the unfiltered network.
# num_vertices()/num_edges() return the counts directly, so the full vertex
# and edge arrays are not built just to take their length.
start = time.time()  # timestamp taken before counting
print('Global gene co-expression network: {} nodes and {} edges'.format(g.num_vertices(), g.num_edges()))
end = time.time()  # timestamp taken after counting
print("Runtime for printing the numbers of the network is {}".format(end-start))


In [None]:
# Filter the network using graph-tool instead of pandas.
#
# The original cell referenced `network` and `network_pval_adj_prop`, which are
# not defined anywhere in this notebook, so it failed on a fresh kernel. The
# unfiltered graph `g` and the table `co` are used instead.
#
# `g` was built with one add_edge_list call over the rows of `co`, so edge i of
# `g` corresponds to row i of `co`; the check below guards that assumption.
assert g.num_edges() == len(co), "g must have exactly one edge per row of co"
# Edge property map holding the adjusted p-value of every edge (built outside
# the timed section so only the filtering itself is measured).
network_pval_adj_prop = g.new_edge_property("double", vals=co["pval.adj"].values)

start = time.time()  # timestamp taken before filtering
# GraphView is a filtered view of `g`: only edges for which efilt returns True
# are visible. No vertex filter is given, so vertices left without edges remain
# in the view (unlike the graph built from the pandas-filtered table).
filtered_network = gt.GraphView(g, efilt=lambda e: network_pval_adj_prop[e] < 0.001)
end = time.time()  # timestamp taken after filtering
print("Runtime for filtering the network is {}".format(end-start))


In [None]:
# Locations in the home filesystem: the shared databases folder with the MeSH
# tree file, and the project data directory.
databases_dir = '/home/j.aguirreplans/Databases'
mesh_file = os.path.join(
    databases_dir,
    'MeSH/mtrees2021.bin',
)
home_data_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/data'


In [None]:
# Load the MeSH tree file with the project's MeSH parser.
m = parse_mesh.MESH(mesh_file)
# Build the ontology graph from the parsed MeSH data.
# NOTE(review): this rebinds `g`, which until this cell held the graph-tool
# network built from the whole co-expression table; that graph is no longer
# reachable by name afterwards. Consider a distinct name for the ontology.
# Presumably a networkx graph (it is used via .nodes()/.neighbors() later) —
# TODO confirm against parse_mesh.
g = m.get_ontology(lower_concepts=True)


In [None]:
# Look up the concept name of every root concept id known to the MeSH parser.
# With the print below commented out this loop produces no output; its only
# lasting effect is that root_concept_id / root_concept_name stay bound to the
# last pair visited (and a KeyError is raised if an id has no name).
for root_concept_id in m.root_concept_ids:
    root_concept_name = m.concept_id_to_concept[root_concept_id]
    #print(root_concept_id, root_concept_name)


In [None]:
# NCBI gene_info file for Homo sapiens, stored in the project data directory.
ncbi_gene_info_file = os.path.join(home_data_dir, 'Homo_sapiens.gene_info')
# Load the gene ID <-> gene symbol mappings, one per direction.
# NOTE(review): the keys of geneid_to_genesymbol appear to be strings (the
# disease-gene cell tests membership with str(...)) — confirm against parse_ncbi.
geneid_to_genesymbol, genesymbol_to_geneid = parse_ncbi.get_geneid_symbol_mapping(ncbi_gene_info_file)


In [None]:
# Parse disease genes associated to diseases with 20 genes or more.
#
# Every line of the TSV is read as:
#     <first column, ignored> \t <disease name> \t <gene id> \t <gene id> ...
#
# Results:
#   disease2genes: disease name -> list of gene symbols (gene ids that are not
#                  in geneid_to_genesymbol are dropped)
#   disease2types: disease name -> set of root MeSH concepts of the disease
#
# NOTE: the gene-count threshold is applied to the raw gene ids, before the
# unmapped ids are dropped, so a disease may end up with fewer symbols than
# the threshold.
min_genes_per_disease = 20
# Same file as before, but derived from home_data_dir instead of repeating the
# absolute path.
disease_genes_file = os.path.join(home_data_dir, 'Guney2016_GenesDisease.tsv')
disease2genes = {}
disease2types = {}
# The context manager closes the file handle; iterating the handle avoids
# loading every line into memory first.
with open(disease_genes_file) as disease_genes_fd:
    for line in disease_genes_fd:
        fields = line.rstrip().split('\t')
        if len(fields) < 2:
            # Blank or malformed line: there is no disease column to read.
            continue
        disease = fields[1]
        genes = fields[2:]
        if len(genes) < min_genes_per_disease:
            continue
        #disease2genes[disease] = [int(gene_id) for gene_id in genes]
        # gene ids are already strings here (they come from str.split)
        disease2genes[disease] = [
            geneid_to_genesymbol[gene_id]
            for gene_id in genes
            if gene_id in geneid_to_genesymbol
        ]
        if disease in m.concept_to_concept_ids:
            for concept_id in m.concept_to_concept_ids[disease]:
                # The root of a dotted tree number is its first component.
                root_concept_id = concept_id.split('.')[0]
                root_concept = m.concept_id_to_concept[root_concept_id]
                disease2types.setdefault(disease, set()).add(root_concept)
                #print(concept_id, disease, root_concept_id, root_concept)


In [None]:
# Print every node of the ontology graph followed by the list of its neighbors.
for node in g.nodes():
    neighbor_list = list(g.neighbors(node))
    print(node, neighbor_list)