In [2]:
import requests
import pandas as pd
import networkx as nx
from tqdm import tqdm


In [3]:
config_data = {
    'BASE_URI' : 'https://spoke.rbvi.ucsf.edu',
    'cutoff_Compound_max_phase' : 3,
    'cutoff_Protein_source' : ['SwissProt'],
    'cutoff_DaG_diseases_sources' : ['knowledge', 'experiments'],
    'cutoff_DaG_textmining' : 3,
    'cutoff_CtD_phase' : 3,
    'cutoff_PiP_confidence' : 0.7,
    'cutoff_ACTeG_level' : ['Low', 'Medium', 'High'],
    'cutoff_DpL_average_prevalence' : 0.001,
    'depth' : 2
}


In [4]:
def get_spoke_api_resp(base_uri, end_point, params=None):
    uri = base_uri + end_point
    if params:
        return requests.get(uri, params=params)
    else:
        return requests.get(uri)

    
def get_context_using_spoke_api(node_value):
    type_end_point = "/api/v1/types"
    result = get_spoke_api_resp(config_data['BASE_URI'], type_end_point)
    data_spoke_types = result.json()
    node_types = list(data_spoke_types["nodes"].keys())
    edge_types = list(data_spoke_types["edges"].keys())
    node_types_to_remove = ["DatabaseTimestamp", "Version"]
    filtered_node_types = [node_type for node_type in node_types if node_type not in node_types_to_remove]
    api_params = {
        'node_filters' : filtered_node_types,
        'edge_filters': edge_types,
        'cutoff_Compound_max_phase': config_data['cutoff_Compound_max_phase'],
        'cutoff_Protein_source': config_data['cutoff_Protein_source'],
        'cutoff_DaG_diseases_sources': config_data['cutoff_DaG_diseases_sources'],
        'cutoff_DaG_textmining': config_data['cutoff_DaG_textmining'],
        'cutoff_CtD_phase': config_data['cutoff_CtD_phase'],
        'cutoff_PiP_confidence': config_data['cutoff_PiP_confidence'],
        'cutoff_ACTeG_level': config_data['cutoff_ACTeG_level'],
        'cutoff_DpL_average_prevalence': config_data['cutoff_DpL_average_prevalence'],
        'depth' : config_data['depth']
    }
    node_type = "Disease"
    attribute = "name"
    nbr_end_point = "/api/v1/neighborhood/{}/{}/{}".format(node_type, attribute, node_value)
    result = get_spoke_api_resp(config_data['BASE_URI'], nbr_end_point, params=api_params)
    node_context = result.json()
    nbr_nodes = []
    nbr_edges = []
    for item in node_context:
        if "_" not in item["data"]["neo4j_type"]:
            try:
                if item["data"]["neo4j_type"] == "Protein":
                    nbr_nodes.append((item["data"]["neo4j_type"], item["data"]["id"], item["data"]["properties"]["description"]))
                else:
                    nbr_nodes.append((item["data"]["neo4j_type"], item["data"]["id"], item["data"]["properties"]["name"]))
            except:
                nbr_nodes.append((item["data"]["neo4j_type"], item["data"]["id"], item["data"]["properties"]["identifier"]))
        elif "_" in item["data"]["neo4j_type"]:
            try:
                provenance = ", ".join(item["data"]["properties"]["sources"])
            except:
                try:
                    provenance = item["data"]["properties"]["source"]
                    if isinstance(provenance, list):
                        provenance = ", ".join(provenance)                    
                except:
                    try:                    
                        preprint_list = ast.literal_eval(item["data"]["properties"]["preprint_list"])
                        if len(preprint_list) > 0:                                                    
                            provenance = ", ".join(preprint_list)
                        else:
                            pmid_list = ast.literal_eval(item["data"]["properties"]["pmid_list"])
                            pmid_list = map(lambda x:"pubmedId:"+x, pmid_list)
                            if len(pmid_list) > 0:
                                provenance = ", ".join(pmid_list)
                            else:
                                provenance = "Based on data from Institute For Systems Biology (ISB)"
                    except:                                
                        provenance = "SPOKE-KG"     
            try:
                evidence = item["data"]["properties"]
            except:
                evidence = None
            nbr_edges.append((item["data"]["source"], item["data"]["neo4j_type"], item["data"]["target"], provenance, evidence))
    nbr_nodes_df = pd.DataFrame(nbr_nodes, columns=["node_type", "node_id", "node_name"])
    nbr_edges_df = pd.DataFrame(nbr_edges, columns=["source", "edge_type", "target", "provenance", "evidence"])
    merge_1 = pd.merge(nbr_edges_df, nbr_nodes_df, left_on="source", right_on="node_id").drop("node_id", axis=1)
    merge_1.loc[:,"node_name"] = merge_1.node_type + " " + merge_1.node_name
    merge_1.drop(["source", "node_type"], axis=1, inplace=True)
    merge_1 = merge_1.rename(columns={"node_name":"source"})
    merge_2 = pd.merge(merge_1, nbr_nodes_df, left_on="target", right_on="node_id").drop("node_id", axis=1)
    merge_2.loc[:,"node_name"] = merge_2.node_type + " " + merge_2.node_name
    merge_2.drop(["target", "node_type"], axis=1, inplace=True)
    merge_2 = merge_2.rename(columns={"node_name":"target"})
    merge_2 = merge_2[["source", "edge_type", "target", "provenance", "evidence"]]
    merge_2.loc[:, "predicate"] = merge_2.edge_type.apply(lambda x:x.split("_")[0])
    merge_2.loc[:, "context"] =  merge_2.source + " " + merge_2.predicate.str.lower() + " " + merge_2.target + " and Provenance of this association is " + merge_2.provenance + "."
    context = merge_2.context.str.cat(sep=' ')
    context += node_value + " has a " + node_context[0]["data"]["properties"]["source"] + " identifier of " + node_context[0]["data"]["properties"]["identifier"] + " and Provenance of this is from " + node_context[0]["data"]["properties"]["source"] + "."
    return context, merge_2


In [5]:
%%time

node_name = 'amyloidosis'
node_context,context_table = get_context_using_spoke_api(node_name)


CPU times: user 645 ms, sys: 125 ms, total: 770 ms
Wall time: 11.1 s


In [6]:
%%time

graph = nx.Graph()

for index, row in tqdm(context_table.iterrows()):
    graph.add_edge(row['source'], row['target'], edge_type=row["predicate"])
    

51093it [00:01, 46067.40it/s]

CPU times: user 1.1 s, sys: 56.4 ms, total: 1.16 s
Wall time: 1.16 s





In [7]:
disease_node = 'Disease amyloidosis'
graph[disease_node]['Gene APOE']

{'edge_type': 'ASSOCIATES'}

In [8]:
%%time

extracted_path = []
for neighbor_1 in graph.neighbors(disease_node):
    if neighbor_1.startswith('Gene') | neighbor_1.startswith('Protein') | neighbor_1.startswith('Disease'):
        for neighbor_2 in graph.neighbors(neighbor_1):
            if neighbor_2.startswith('Compound'):
                if graph[neighbor_1][neighbor_2]['edge_type'] != 'CONTRAINDICATES':
                    extracted_path.append((disease_node, graph[disease_node][neighbor_1]['edge_type'], neighbor_1, 
                                           graph[neighbor_1][neighbor_2]['edge_type'], neighbor_2))
            
    

CPU times: user 16.7 ms, sys: 1.02 ms, total: 17.8 ms
Wall time: 17.6 ms


In [9]:
extracted_path[10:16]

[('Disease amyloidosis',
  'RESEMBLES',
  'Disease cardiomyopathy',
  'TREATS',
  'Compound Dexrazoxane'),
 ('Disease amyloidosis',
  'RESEMBLES',
  'Disease cardiomyopathy',
  'TREATS',
  'Compound Prednisone'),
 ('Disease amyloidosis',
  'ASSOCIATES',
  'Gene APOE',
  'DOWNREGULATES',
  'Compound Alizapride'),
 ('Disease amyloidosis',
  'ASSOCIATES',
  'Gene APOE',
  'DOWNREGULATES',
  'Compound Proglumide'),
 ('Disease amyloidosis',
  'ASSOCIATES',
  'Gene APOE',
  'DOWNREGULATES',
  'Compound Idelalisib'),
 ('Disease amyloidosis',
  'ASSOCIATES',
  'Gene APOE',
  'UPREGULATES',
  'Compound Lorazepam')]

In [49]:
# %%time

for path in find_connected_compounds(graph, "Disease amyloidosis"):
#     print(path)
    print(" -> ".join(path))

Current node: Disease amyloidosis, Path: ['Disease amyloidosis']


In [39]:
disease_node = 'Disease amyloidosis'
queue = deque([(disease_node, [disease_node])])

while queue:
    current_node, path = queue.popleft()

    # Check if the current node is a Gene node
    if current_node.startswith("Gene"):
        # If so, check its neighbors for Compound nodes
        for neighbor in graph.neighbors(current_node):
            if neighbor.startswith("Compound"):
                # If a Compound node is found, yield the path
                yield path + [neighbor]
            elif neighbor not in path:
                # If a non-Compound node is found, add it to the queue
                queue.append((neighbor, path + [neighbor]))


SyntaxError: 'yield' outside function (3157857207.py, line 13)

In [92]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

In [None]:
embedding_function = 

In [None]:
node_context_list = node_context.split(". ")        
node_context_embeddings = embedding_function.embed_documents(node_context_list)