In [7]:
import pandas as pd
import py2neo
import os

In [8]:
# Neo4j connection settings. Each value can be overridden through an
# environment variable of the same name so that real credentials never have to
# be committed with the notebook; the fallbacks are the local development
# defaults this notebook has always used.
NEO4J_CONN_URL = os.environ.get("NEO4J_CONN_URL", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASS = os.environ.get("NEO4J_PASS", "graph")

In [9]:
# Show up to 100 characters per cell so long entity names are not truncated
# when frames are rendered below.
pd.set_option("display.max_colwidth", 100)

In [10]:
# Single connection handle shared by every query helper in this notebook.
graph = py2neo.Graph(
    NEO4J_CONN_URL,
    auth=(NEO4J_USER, NEO4J_PASS),
)

# Find Important Nodes

For this to work, the PageRank procedure (`algo.pageRank`, called by the query below) must be installed and enabled in Neo4j; follow the installation docs for the procedure library.

In [11]:
def get_nodes_by_pagerank(graph, ent_type, rel_type="REL", iterations=20,
                          damping_factor=0.85):
    """Rank the nodes carrying one label by PageRank score.

    Parameters
    ----------
    graph
        Connection object exposing ``run(query, parameters)`` whose result has
        a ``data()`` method returning a list of dicts (a ``py2neo.Graph`` in
        this notebook).
    ent_type : str
        Node label handed to ``algo.pageRank.stream``.
    rel_type : str, optional
        Relationship type handed to ``algo.pageRank.stream``.
    iterations : int, optional
        Value of the procedure's ``iterations`` setting.
    damping_factor : float, optional
        Value of the procedure's ``dampingFactor`` setting.

    Returns
    -------
    pandas.DataFrame
        Columns ``page`` (the node's ``ename``) and ``score``, in the order
        returned by the query (``ORDER BY score DESC``). The two columns are
        present even when the query returns no rows.
    """
    # Every value is passed as a Cypher parameter rather than spliced into the
    # query text, so unusual characters in a label cannot alter the statement.
    query = """
CALL algo.pageRank.stream($ent_type, $rel_type,
     {iterations: $iterations, dampingFactor: $damping_factor})
YIELD nodeId, score
RETURN algo.asNode(nodeId).ename AS page, score
ORDER BY score DESC
    """
    parameters = {
        "ent_type": ent_type,
        "rel_type": rel_type,
        "iterations": iterations,
        "damping_factor": damping_factor,
    }
    results = graph.run(query, parameters).data()
    # Naming the columns keeps the schema stable for an empty result set.
    return pd.DataFrame(results, columns=["page", "score"])

In [13]:
# PageRank over every node labelled ENTITY; preview the ten highest-scoring.
important_entities_df = get_nodes_by_pagerank(graph, ent_type="ENTITY")
important_entities_df.head(n=10)

Unnamed: 0,page,score
0,RNA_Sequence,64.238304
1,Trans_Fatty_Acids,61.843542
2,Hold_dosing_instruction_fragment,57.251569
3,GZMB_protein_human,56.530338
4,Count,54.344053
5,Patients,47.259701
6,Final,40.012675
7,General_medical_service,38.346763
8,Exposure_Domain,37.530772
9,chlorendic_acid,36.96676


# Find interesting neighbors

In [14]:
def get_neighbors_by_type(graph, src_name, src_type, neighbor_type):
    """Count the REL links between one named node and each of its neighbours.

    Parameters
    ----------
    graph
        Connection object exposing ``run(query, parameters)`` whose result has
        a ``data()`` method returning a list of dicts (a ``py2neo.Graph`` in
        this notebook).
    src_name : str
        ``ename`` property of the source node.
    src_type : str
        Label of the source node.
    neighbor_type : str
        Label the neighbouring nodes must carry.

    Returns
    -------
    pandas.DataFrame
        Columns ``src``, ``dst`` and ``count`` (number of rows returned for
        that pair), sorted by ``count`` descending. An empty frame with the
        same columns is returned when the node has no matching neighbours.
    """
    # Labels cannot be supplied as Cypher parameters, so they are still
    # formatted into the text. The node name is passed as a parameter so that
    # quotes or other special characters in it cannot break the query.
    # NOTE(review): the pattern has an arrowhead on both ends, which Cypher
    # appears to treat as direction-agnostic -- confirm this is intended.
    query = """
MATCH (e1:%s {ename:$src_name})<-[r:REL]->(e2:%s)
RETURN e1.ename AS src, e2.ename AS dst
    """ % (src_type, neighbor_type)
    results = graph.run(query, {"src_name": src_name}).data()

    if not results:
        # A column-less frame would make the groupby below raise KeyError.
        return pd.DataFrame(columns=["src", "dst", "count"]).astype({"count": "int64"})

    results_df = (pd.DataFrame(results)
        .groupby(["src", "dst"])
        .size()
        .reset_index(name="count")
        .sort_values("count", ascending=False)
    )
    return results_df

In [15]:
# ENTITY neighbours of GZMB_protein_human, most frequently linked first.
djt_per_neighbors_df = get_neighbors_by_type(
    graph,
    src_name="GZMB_protein_human",
    src_type="ENTITY",
    neighbor_type="ENTITY",
)
djt_per_neighbors_df.head(n=5)

Unnamed: 0,src,dst,count
2174,GZMB_protein_human,Late_Infantile_Neuronal_Ceroid_Lipfuscinosis,17
2147,GZMB_protein_human,Laboratory_domain,17
2145,GZMB_protein_human,Laboratory_biosafety_level,17
4028,GZMB_protein_human,Trial_of_Labor,17
2212,GZMB_protein_human,Likelihood_of_Cancer_Cure,17


In [None]:
# FIXME: load the sentences from Postgres instead of the sentences.tsv file (cell disabled until then)
# def build_sentence_dictionary(sent_file):
#     sent_dict = {}
#     fsent = open(sent_file, "r")
#     for line in fsent:
#         pid, sid, sent_text = line.strip().split('\t')
#         sent_dict[sid] = sent_text
#     fsent.close()
#     return sent_dict

# sent_dict = build_sentence_dictionary(os.path.join(DATA_DIR, "sentences.tsv"))
# len(sent_dict)

In [None]:
# FIXME: rework how sentence text is looked up; sent_dict is not built in this notebook (cell disabled until then)
# def show_connecting_sentences(graph, src_name, src_type, dst_name, dst_type, sent_dict):
#     query = """
# MATCH (e1:%s {ename:"%s"})<-[r:REL]->(e2:%s {ename:"%s"}) 
# RETURN e1.ename AS src, e2.ename AS dst, r.sid AS sid
# ORDER BY sid
#     """ % (src_type, src_name, dst_type, dst_name)
#     result = graph.run(query).data()
#     result_df = pd.DataFrame(result)
#     result_df["sent_text"] = result_df["sid"].apply(lambda x: sent_dict[x])
#     return result_df

In [None]:
# EXAMPLE:
# djt_db_rel_df = show_connecting_sentences(graph, "Donald J. Trump", "PER", "Deutsche Bank", "ORG", sent_dict)
# djt_db_rel_df.head(10)

# Find path connecting a pair of nodes

In [16]:
# def get_path_between(graph, src_name, src_type, dst_name, dst_type):
#     query = """
# MATCH (start:%s {ename:'%s'}), (end:%s {ename:'%s'})
# CALL algo.shortestPath.stream(start, end)
# YIELD nodeId, cost
# RETURN algo.asNode(nodeId).ename AS name, cost    
#     """ % (src_type, src_name, dst_type, dst_name)
#     results = graph.run(query).data()
#     path = [x["name"] for x in results]
#     return path

In [None]:
# djt_putin_link = get_path_between(graph, "Donald J. Trump", "PER", "Vladimir Putin", "PER")
# print(djt_putin_link)