In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, "..")
from paths import *
from neo4j import GraphDatabase, basic_auth


In [17]:
# Values substituted, in this order, into the input filename templates below.
compound_type = "combined"
sample = "serum"
sel_sheet_index = 2

# str.format() renders the integer sheet index exactly like str() would,
# so the same argument tuple can be shared by both templates.
_input_name_fields = (compound_type, sample, sel_sheet_index)

extracted_paths_filename = "extracted_paths_for_{}_compounds_{}_sample_sheet_index_{}_dict.pickle".format(
    *_input_name_fields
)
top_nodes_filename = "top_nodes_for_each_nodetype_for_{}_compounds_{}_sample_sheet_index_{}_list.pickle".format(
    *_input_name_fields
)


In [18]:
def get_node_names(nodes_df):
    """
    Look up a name for every node listed in ``nodes_df``.

    Parameters
    ----------
    nodes_df : pandas.DataFrame
        One row per node with the columns ``node_type`` (used as the node
        label in the Cypher query) and ``node_id`` (compared against the
        node's ``identifier`` property).

    Returns
    -------
    pandas.DataFrame
        Columns ``node_id``, ``node_name``, ``node_type``. A node whose query
        returns no record is absent from the result; a node whose query
        returns several records contributes one row per record.

    Notes
    -----
    * Variant, MiRNA, Reaction and EC nodes are not queried: their id is
      reused as their name. The original condition chained the four type
      checks with ``and``, which can never be true, so this branch was
      unreachable and those nodes were sent to the database instead.
    * Gene and Organism identifiers are converted to ``int`` before the
      query; every other identifier is compared as a string.
    * The connection uses ``URI``, ``SPOKE_USER`` and ``SPOKE_PASSWORD`` from
      the enclosing namespace.
    """
    id_is_name_types = {"Variant", "MiRNA", "Reaction", "EC"}
    integer_id_types = {"Gene", "Organism"}

    # A label cannot be supplied as a Cypher parameter, so only the label is
    # interpolated; the identifier travels as a query parameter, which keeps
    # quoting correct for ids that contain quotes or other special characters.
    query = """
                MATCH(n:{})
                WHERE n.identifier = $identifier
                RETURN n.name AS n_name
    """

    node_list = []
    sdb = GraphDatabase.driver(URI, auth=basic_auth(SPOKE_USER, SPOKE_PASSWORD))
    try:
        with sdb.session() as session:
            with session.begin_transaction() as tx:
                for _, row in nodes_df.iterrows():
                    node_type = row["node_type"]
                    node_id = row["node_id"]

                    if node_type in id_is_name_types:
                        node_list.append((node_id, node_id, node_type))
                        continue

                    if node_type in integer_id_types:
                        identifier = int(node_id)
                    else:
                        identifier = str(node_id)

                    result = tx.run(query.format(node_type), identifier=identifier)
                    for record in result:
                        node_list.append((node_id, record["n_name"], node_type))
    finally:
        # Release the driver even when a query fails part-way through.
        sdb.close()

    return pd.DataFrame(node_list, columns=["node_id", "node_name", "node_type"])


def merge_paths_with_node_names(top_salient_proximal_paths, nodes_df):
    """
    Annotate every (source, target) edge with the name and type of both endpoints.

    Parameters
    ----------
    top_salient_proximal_paths : pandas.DataFrame
        Edge table with at least the columns ``source`` and ``target``.
    nodes_df : pandas.DataFrame
        Node table with at least ``composite_id``, ``node_name`` and
        ``node_type``; ``composite_id`` is matched against ``source`` and
        ``target``.

    Returns
    -------
    pandas.DataFrame
        De-duplicated table with the columns ``source_id``, ``target_id``,
        ``source_name``, ``target_name``, ``source_node_type`` and
        ``target_node_type``. All joins are inner joins, so an edge with an
        endpoint missing from ``nodes_df`` is dropped.
    """
    # One join per endpoint; the node columns end up suffixed _x (source side)
    # and _y (target side) once the two results are joined back together.
    with_source_info = top_salient_proximal_paths.merge(
        nodes_df, left_on="source", right_on="composite_id"
    )
    with_target_info = top_salient_proximal_paths.merge(
        nodes_df, left_on="target", right_on="composite_id"
    )
    both_endpoints = with_source_info.merge(with_target_info, on=["source", "target"])

    kept_columns = ["source", "target", "node_name_x", "node_name_y", "node_type_x", "node_type_y"]
    new_names = {
        "source": "source_id",
        "target": "target_id",
        "node_name_x": "source_name",
        "node_name_y": "target_name",
        "node_type_x": "source_node_type",
        "node_type_y": "target_node_type",
    }
    return both_endpoints[kept_columns].rename(columns=new_names).drop_duplicates()



In [19]:
# Both input pickles are read from the same sub-folder of OUTPUT_PATH.
pruned_input_dir = os.path.join(OUTPUT_PATH, "after_spoke_compound_pruning")

with open(os.path.join(pruned_input_dir, extracted_paths_filename), "rb") as paths_file:
    extracted_paths = pickle.load(paths_file)

with open(os.path.join(pruned_input_dir, top_nodes_filename), "rb") as nodes_file:
    top_nodes = pickle.load(nodes_file)
    

In [20]:
# Keys of the "intermediate_to_MS" mapping, materialised as a list.
top_salient_proximal_nodes_list = [*extracted_paths["intermediate_to_MS"].keys()]


In [21]:
# Number of rows taken from each ranked node table.
N = 10


def _salient_proximal_subset(ranked_nodes, salient_proximal_ids):
    """
    Return the composite ids of ``ranked_nodes`` that are in ``salient_proximal_ids``.

    ``ranked_nodes`` must expose string columns ``node_type`` and ``node_id``;
    the composite id is ``"<node_type>:<node_id>"``. The result is unique and
    in the sorted order produced by ``np.unique``, so it is reproducible
    between runs (a set intersection would order it by string hash).
    """
    composite_ids = np.unique(ranked_nodes.node_type + ":" + ranked_nodes.node_id)
    return [node for node in composite_ids if node in salient_proximal_ids]


# Built once: membership tests against a set are O(1) and the set does not
# change inside the loop.
_salient_proximal_ids = set(top_salient_proximal_nodes_list)

top_salient_proximal_negative_nodes = []
top_salient_proximal_positive_nodes = []
for item in top_nodes:
    # Best effort per entry: any failure (missing key, unexpected value, ...)
    # yields an empty selection for that entry. ``Exception`` rather than a
    # bare ``except`` so that KeyboardInterrupt / SystemExit still propagate.
    try:
        item_negative = _salient_proximal_subset(
            item["top_negative_nodes"].head(N), _salient_proximal_ids
        )
    except Exception:
        item_negative = []
    # NOTE(review): negatives use the first N rows and positives the last N;
    # presumably the tables are sorted so that both are the extreme ends -- confirm.
    try:
        item_positive = _salient_proximal_subset(
            item["top_positive_nodes"].tail(N), _salient_proximal_ids
        )
    except Exception:
        item_positive = []
    top_salient_proximal_negative_nodes.append(item_negative)
    top_salient_proximal_positive_nodes.append(item_positive)

# Flatten the per-entry lists into one array per sign.
top_salient_proximal_negative_nodes = np.concatenate(top_salient_proximal_negative_nodes)
top_salient_proximal_positive_nodes = np.concatenate(top_salient_proximal_positive_nodes)

    

In [22]:
# Fetch the path table stored under each selected node, then stack the tables
# into one frame per sign with a fresh 0..n-1 index.
paths_by_node = extracted_paths["intermediate_to_MS"]

negative_path_tables = [paths_by_node[node] for node in top_salient_proximal_negative_nodes]
positive_path_tables = [paths_by_node[node] for node in top_salient_proximal_positive_nodes]

top_salient_proximal_negative_nodes_paths = pd.concat(negative_path_tables, ignore_index=True)
top_salient_proximal_positive_nodes_paths = pd.concat(positive_path_tables, ignore_index=True)


In [23]:
def _unique_nodes_from_paths(paths):
    """
    List the distinct endpoints of ``paths`` as separate type and id columns.

    Every value of ``paths.source`` and ``paths.target`` is split at its first
    ":" -- the text before it becomes ``node_type``, everything after it
    becomes ``node_id`` (so ids that themselves contain ":" stay intact).
    """
    endpoints = pd.concat([paths.source, paths.target])
    nodes = pd.DataFrame(endpoints, columns=["node_id"]).drop_duplicates()
    nodes["node_type"] = nodes.node_id.apply(lambda composite: composite.partition(":")[0])
    nodes["node_id"] = nodes.node_id.apply(lambda composite: composite.partition(":")[2])
    return nodes


nodes_df_negative = _unique_nodes_from_paths(top_salient_proximal_negative_nodes_paths)
nodes_df_positive = _unique_nodes_from_paths(top_salient_proximal_positive_nodes_paths)

# Resolve a name for every node.
nodes_df_negative_ = get_node_names(nodes_df_negative)
nodes_df_positive_ = get_node_names(nodes_df_positive)

# Rebuild the "<node_type>:<node_id>" key so the named nodes can be joined
# back onto the path tables.
for named_nodes in (nodes_df_negative_, nodes_df_positive_):
    named_nodes["composite_id"] = named_nodes["node_type"] + ":" + named_nodes["node_id"]



In [24]:
# Attach endpoint names and node types to every edge, once per sign.
top_salient_proximal_negative_nodes_paths_with_node_names = merge_paths_with_node_names(
    top_salient_proximal_paths=top_salient_proximal_negative_nodes_paths,
    nodes_df=nodes_df_negative_,
)
top_salient_proximal_positive_nodes_paths_with_node_names = merge_paths_with_node_names(
    top_salient_proximal_paths=top_salient_proximal_positive_nodes_paths,
    nodes_df=nodes_df_positive_,
)



In [11]:
%%time
with open(os.path.join("..", "..", "spoke_35M_compound_pruned_version.gpickle"), "rb") as f:
    G = pickle.load(f)

CPU times: user 3min 15s, sys: 8min 27s, total: 11min 42s
Wall time: 15min 23s


In [25]:
# For every edge row, read the "edgetype" attribute stored in G under
# G[source_id][target_id], then append it as a new column.
negative_edgetype_list = [
    G[source_id][target_id]["edgetype"]
    for source_id, target_id in zip(
        top_salient_proximal_negative_nodes_paths_with_node_names["source_id"],
        top_salient_proximal_negative_nodes_paths_with_node_names["target_id"],
    )
]
top_salient_proximal_negative_nodes_paths_with_node_names["edgetype"] = negative_edgetype_list

positive_edgetype_list = [
    G[source_id][target_id]["edgetype"]
    for source_id, target_id in zip(
        top_salient_proximal_positive_nodes_paths_with_node_names["source_id"],
        top_salient_proximal_positive_nodes_paths_with_node_names["target_id"],
    )
]
top_salient_proximal_positive_nodes_paths_with_node_names["edgetype"] = positive_edgetype_list
    


In [26]:
# Flag each edge endpoint: "salient_proximal" when its id is one of the
# selected nodes for that sign, "connection_node" otherwise.
for edge_table, selected_nodes in (
    (top_salient_proximal_positive_nodes_paths_with_node_names, top_salient_proximal_positive_nodes),
    (top_salient_proximal_negative_nodes_paths_with_node_names, top_salient_proximal_negative_nodes),
):
    for endpoint in ("source", "target"):
        flag_column = endpoint + "_node_flag"
        edge_table.loc[edge_table[endpoint + "_id"].isin(selected_nodes), flag_column] = "salient_proximal"
        edge_table.loc[~edge_table[endpoint + "_id"].isin(selected_nodes), flag_column] = "connection_node"



In [27]:

# Output names carry the same compound / sample / sheet-index fields as the inputs.
edge_name_fields = (compound_type, sample, sel_sheet_index)
negative_edge_filename = "negative_network_edges_for_{}_compounds_{}_sample_sheet_index_{}.tsv".format(
    *edge_name_fields
)
positive_edge_filename = "positive_network_edges_for_{}_compounds_{}_sample_sheet_index_{}.tsv".format(
    *edge_name_fields
)

# Write both edge tables as tab-separated files with a header row and no index column.
edge_output_dir = os.path.join(OUTPUT_PATH, "after_spoke_compound_pruning")
top_salient_proximal_negative_nodes_paths_with_node_names.to_csv(
    os.path.join(edge_output_dir, negative_edge_filename),
    sep="\t",
    index=False,
    header=True,
)
top_salient_proximal_positive_nodes_paths_with_node_names.to_csv(
    os.path.join(edge_output_dir, positive_edge_filename),
    sep="\t",
    index=False,
    header=True,
)



In [28]:
# (rows, columns) of the annotated negative edge table.
top_salient_proximal_negative_nodes_paths_with_node_names.shape

(84, 9)

In [29]:
# Show the annotated negative edge table; the index has gaps because
# duplicate rows were dropped after the joins without resetting the index.
top_salient_proximal_negative_nodes_paths_with_node_names

Unnamed: 0,source_id,target_id,source_name,target_name,source_node_type,target_node_type,edgetype,source_node_flag,target_node_flag
0,Anatomy:UBERON:0002107,Gene:4155,liver,MBP,Anatomy,Gene,DOWNREGULATES_AdG,salient_proximal,connection_node
1,Gene:4155,Disease:DOID:2377,MBP,multiple sclerosis,Gene,Disease,ASSOCIATES_DaG,connection_node,connection_node
170,Anatomy:UBERON:0006631,Gene:4155,right atrium auricular region,MBP,Anatomy,Gene,DOWNREGULATES_AdG,salient_proximal,connection_node
171,Anatomy:UBERON:0001388,Gene:4155,gastrocnemius,MBP,Anatomy,Gene,DOWNREGULATES_AdG,salient_proximal,connection_node
172,Anatomy:UBERON:0001161,Gene:4155,body of stomach,MBP,Anatomy,Gene,DOWNREGULATES_AdG,salient_proximal,connection_node
...,...,...,...,...,...,...,...,...,...
466,MolecularFunction:GO:0008270,Gene:920,zinc ion binding,CD4,MolecularFunction,Gene,PARTICIPATES_GpMF,salient_proximal,connection_node
467,Gene:920,Disease:DOID:2377,CD4,multiple sclerosis,Gene,Disease,ASSOCIATES_DaG,connection_node,connection_node
468,MolecularFunction:GO:0001067,Gene:7124,regulatory region nucleic acid binding,TNF,MolecularFunction,Gene,PARTICIPATES_GpMF,salient_proximal,connection_node
469,Organism:9606,Protein:P08571,Homo sapiens,CD14_HUMAN,Organism,Protein,ENCODES_OeP,salient_proximal,connection_node
