In [68]:
# Import necessary libraries
from py2neo import Graph,Node,Relationship
import pandas as pd
from py2neo.bulk import create_nodes
import time

In [71]:
# Conection Variables
url = "bolt://localhost:7687"
user = "neo4j"
pwd = "brandoncohen"

In [74]:
# Establish a connection to Neo4j 
graph = Graph("bolt://localhost:7687", auth=("neo4j", "brandoncohen"))

# Check if I connected
try:
    # Run a simple query to test the connection
    result = graph.run("RETURN 1 AS check").data()
    if len(result) > 0 and result[0]['check'] == 1:
        print("Connected to Neo4j.")
    else:
        print("Connection test failed.")
except Exception as e:
    print("Connection error:", e)

Connected to Neo4j.


In [42]:
nodes_path = "C:/Users/bwc07/Downloads/nodes_test.tsv"
data = []   # Holds each line seperated in an array
temp = []   # Used to hold line after split by '\t'

# Read the file by each line
with open(nodes_path, "r") as file:
    for line in file:
        line = line[:-1]   #Remove '\n' at end of line
        temp = line.split('\t')
        data.append(temp)
        
# Make dataframe
df_nodes = pd.DataFrame(data)
df_nodes = df_nodes.drop(df_nodes.columns[-1], axis=1)   # Drop last column
df_nodes.columns = df_nodes.iloc[0]   # Make first row the Column titles
df_nodes = df_nodes[1:]   # Remove the first column
df_nodes.head()

Unnamed: 0,id,name,kind
1,Anatomy::UBERON:0000002,uterine cervix,Anatomy
2,Anatomy::UBERON:0000004,nose,Anatomy
3,Anatomy::UBERON:0000006,islet of Langerhans,Anatomy
4,Anatomy::UBERON:0000007,pituitary gland,Anatomy
5,Anatomy::UBERON:0000010,peripheral nervous system,Anatomy


In [None]:
"""
ANOTHER METHOD OF ADDING NODES THAT WILL ALSO COUNT THE NUMBER OF DIFFERENT LABELED NODES

# Counts: Anatomies->402, Compounds->1555, Diseases->138, Genes->20947
count_a, count_c, count_d, count_g = 0, 0, 0, 0  # Initialize counts

# Create nodes and count how many nodes for each labels 
for index, row in df_nodes.iterrows():
    
    node_type = row[2]
    
    if node_type == "Anatomy":
        identification = row[0][9:]
        name = row[1]
        count_a += 1
        
        #Create Node and send ot Neo4j
        node_properties = {
            "id": identification,
            "name": name,
            "kind": "Anatomy"
        }
        new_node = Node("Anatomy", **node_properties)
        graph.create(new_node)
        
    elif node_type == "Compound":
        identification = row[0][10:]
        name = row[1]
        count_c += 1
        
        #Create Node and send ot Neo4j
        node_properties = {
            "id": identification,
            "name": name,
            "kind": "Compound"
        }
        new_node = Node("Compound", **node_properties)
        graph.create(new_node)
        
    elif node_type == "Disease":
        identification = row[0][9:]
        name = row[1]
        count_d += 1
        
        #Create Node and send ot Neo4j
        node_properties = {
            "id": identification,
            "name": name,
            "kind": "Disease"
        }
        new_node = Node("Disease", **node_properties)
        graph.create(new_node)
        
    else:
        id = row[0][6:]
        name = row[1]
        count_g += 1
        
        #Create Node and send ot Neo4j
        node_properties = {
            "id": identification,
            "name": name,
            "kind": "Gene"
        }
        new_node = Node("Gene", **node_properties)
        graph.create(new_node)
    
total_count = count_a + count_c + count_d + count_g
print(count_a, count_c, count_d, count_g)
"""

In [75]:
# Convert dataframe to dictionary
nodes_dict = df_nodes.to_dict(orient='records')

In [76]:
# Record the start time
start_time = time.time()

# Upload nodes to Neo4j
for ndata in nodes_dict:
    nLabel = ndata.get("kind")  
    node = Node(nLabel, **ndata)
    graph.create(node)
    

# Verify the number of nodes created
node_count = graph.run("MATCH () RETURN COUNT(*) AS count").evaluate()
print(f"Total nodes created: {node_count}")

# Record the end time
end_time = time.time()

# Calculate and print the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Total nodes created: 23042
Elapsed time: 140.7447121143341 seconds


In [77]:
edges_path = "C:/Users/bwc07/Downloads/edges_test.tsv"
data_e = []   # Holds each line seperated in an array
temp_e = []   # Used to hold line after split by '\t'

# Read the file by each line
with open(edges_path, "r") as file:
    for line in file:
        line = line[:-1]   #Remove '\n' at end of line
        temp_e = line.split('\t')
        data_e.append(temp_e)
        
# Make dataframe
df_edges = pd.DataFrame(data_e)
df_edges = df_edges.drop(df_edges.columns[-1], axis=1)   # Drop last column
df_edges.columns = df_edges.iloc[0]   # Make first row the Column titles
df_edges = df_edges[1:]   # Remove the first column
df_edges

Unnamed: 0,ource,metaedge,target
1,Gene::801,GiG,Gene::7428
2,Gene::5987,GiG,Gene::9412
3,Gene::5747,GiG,Gene::79738
4,Gene::3725,GiG,Gene::10514
5,Gene::10014,GiG,Gene::55844
...,...,...,...
1292206,Anatomy::UBERON:0000057,AeG,Gene::65009
1292207,Anatomy::UBERON:0000474,AeG,Gene::80279
1292208,Anatomy::UBERON:0002048,AeG,Gene::1211
1292209,Anatomy::UBERON:0002048,AeG,Gene::8843


In [78]:
# I have to do this because there was a format error when reading the data
temp_split = df_edges.iloc[409606][0].split(" ")
# Correct data
df_edges['ource'] = df_edges['ource'].replace([df_edges.iloc[409606][0]], temp_split[0])
df_edges['metaedge'] = df_edges['metaedge'].replace(["Gene::10000000"], temp_split[1])
df_edges['target'] = df_edges['target'].replace([None], "Gene::10000000")

In [79]:
relations = ['GiG', 'CrC', 'DdG', 'DlA', 'CtD', 'CbG', 'CuG', 'CdG', 'DrD', 
             'DaG', 'CpD', 'AdG', 'AuG', 'GcG', 'Gr>G', 'DuG', 'AeG']

# GiG = Gene interacts with Gene
# CrC = Compound resembles Compound
# DdG = Disease downregulates Gene
# DlA = Disease localizes Anatomy
# CtD = Compound treats Disease
# CbG = Compound binds to Gene
# CuG = Compound upregulates Gene
# CdG = Compound downregulates Gene
# DrD = Disease resembles Disease
# DaG = Disease associates Gene
# CpD = Compound palliates Disease
# AdG = Anatomy downregulates Gene
# AuG = Anatomy upregulates Gene
# Gene::10000000
# GcG = Gene covaries Gene
# Gr>G = Gene regulates Gene
# DuG = Disease upregulates Gene
# AeG = Anatomy expresses Gene

# Relationships: 'UPREGULATES', 'DOWNREGULATES', 'EXPRESSES', 'INTERACTS', 'COVARIES', 
#'REGULATES', 'ASSOCIATES', 'BINDS', 'RESEMBLES', 'TREATS', 'PALLIATES', 'LOCALIZES'  

In [80]:
# Add new relationship to current batch
def add_batch(row, batch, relationship):
    # Get labels of each node
    n1_label = row[0].split("::")[0]
    n2_label = row[2].split("::")[0]

    # Append the nodes and relationship to the current batch
    batch.append({ "source_id": row["ource"], 
                  "target_id": row["target"], 
                  "relationship_type": relationship, 
                  "source_label": n1_label, 
                  "target_label": n2_label 
                 })    

def Query(batch):
    # Create transaction
    tx = graph.begin()
    
    try:
        # Iterate over all relationships 
        for record in batch:
            # Create relationships using Cypher for faster run time
            query = (
                f"MATCH (n1:{record['source_label']}), (n2:{record['target_label']}) "
                f"WHERE n1.id = '{record['source_id']}' AND n2.id = '{record['target_id']}' "
                f"CREATE (n1)-[:{record['relationship_type']}]->(n2)"
            )
            tx.run(query)
        
        # Commit the transaction
        graph.commit(tx)

    # Exception used to reverse the error
    except Exception as e:
        # Rollback the transaction and send up
        tx.rollback()
        raise e 

In [86]:
# Record the start time
start_time = time.time()

# Initialize an empty batch
current_batch = []

# Define the batch size
MAX_BATCH_SIZE = 1

# Add relationships
for index, row in df_edges.iloc[500000:].iterrows():
    
    if row["metaedge"] == relations[0]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "INTERACTS")

        
    elif row["metaedge"] == relations[1]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "RESEMBLES")
        

    elif row["metaedge"] == relations[2]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "DOWNREGULATES")
        

    elif row["metaedge"] == relations[3]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "LOCALIZES")


    elif row["metaedge"] == relations[4]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "TREATS")


    elif row["metaedge"] == relations[5]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "BINDS")


    elif row["metaedge"] == relations[6]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "UPREGULATES")


    elif row["metaedge"] == relations[7]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "DOWNREGULATES")


    elif row["metaedge"] == relations[8]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "RESEMBLES")


    elif row["metaedge"] == relations[9]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "ASSOCIATES")


    elif row["metaedge"] == relations[10]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "PALLIATES")


    elif row["metaedge"] == relations[11]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "DOWNREGULATES")


    elif row["metaedge"] == relations[12]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "UPREGULATES")


    elif row["metaedge"] == relations[13]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "COVARIES")


    elif row["metaedge"] == relations[14]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "REGULATES")


    elif row["metaedge"] == relations[15]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "UPREGULATES")

        
    elif row["metaedge"] == relations[16]:
        # Check if the batch size is reached
        if len(current_batch) >= MAX_BATCH_SIZE:
            Query(current_batch)
            current_batch = []

        add_batch(row, current_batch, "EXPRESSES")

# Query last batch
if current_batch:
    Query(current_batch)

# Record the end time
end_time = time.time()

# Calculate and print the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 9451.440549373627 seconds


In [55]:
"""
DO NOT RUN UNLESS YOU WANT TO DELETE ALL NODES!!! 
#query = "MATCH ()-[r]->() DELETE r"
#graph.run(query)
"""

In [None]:
# Question 1 : Based on a disease, what compounds treats or palliates the disease?
# If you want to see one disease, just match d2 to the certain id#.
# For example, MATCH (d2:Disease) WHERE d2.id = 'id#'
cypher_query = """
    MATCH (c1:Compound)-[:RESEMBLES]->(c2:Compound)
    MATCH (c1)-[:TREATS|PALLIATES]->(d1:Disease)
    MATCH (c2)-[:TREATS|PALLIATES]->(d2:Disease)
    WHERE NOT (d1 = d2)
    MATCH (d2)-[:ASSOCIATES]->(gene:Gene)
    MATCH (d2)-[:LOCALIZES]->(a:Anatomy)
    RETURN c2, d2, gene, a
"""

# Execute the query
result = graph.run(cypher_query)

# Process and print the results
for record in result:
    print(record)


In [None]:
# Question 2 : What new compounds can treat or palliate a disease that were recently not know to interact with the disease?
# Define your Cypher query
cypher_query = """
    MATCH (gene:Gene)
    WITH gene
    MATCH (gene)<-[:UPREGULATES]-(anatomy:Anatomy)
    MATCH (gene)<-[:DOWNREGULATES]-(compound:Compound)
    MATCH (gene)-[:REGULATES]->(gen)
    MATCH (gen)-[:COVARIES]->(ge)
    MATCH (ge)-[:INTERACTS]->(g)
    MATCH (disease:Disease)-[:ASSOCIATES]->(g)
    WHERE NOT (compound)-[:TREATS]->(disease)
    AND NOT (compound)-[:PALLIATES]->(disease)
    RETURN compound, disease
    UNION DISTINCT
    MATCH (gene:Gene)
    WITH gene
    MATCH (gene)<-[:DOWNREGULATES]-(anatomy:Anatomy)
    MATCH (gene)<-[:UPREGULATES]-(compound:Compound)
    MATCH (gene)-[:REGULATES]->(gen)
    MATCH (gen)-[:COVARIES]->(ge)
    MATCH (ge)-[:INTERACTS]->(g)
    MATCH (disease:Disease)-[:ASSOCIATES]->(g)
    WHERE NOT (compound)-[:TREATS]->(disease)
    AND NOT (compound)-[:PALLIATES]->(disease)
    RETURN compound, disease
"""

# Execute the query
result = graph.run(cypher_query)

# Process and print the results
for record in result:
    print(record)
