In [None]:
import os

import pandas as pd
from neo4j import GraphDatabase

In [4]:
# Connection settings for the Neo4j instance. Each value can be overridden
# through an environment variable so credentials do not have to live in the
# notebook; the literals are kept only as fallbacks so existing runs behave
# exactly as before.
uri = os.environ.get("SPOKE_NEO4J_URI", "bolt://spokedev.cgl.ucsf.edu:7687")
user = os.environ.get("SPOKE_NEO4J_USER", "neo4j")
# NOTE(review): the fallback below is still a credential committed to the
# notebook - drop it once SPOKE_NEO4J_PASSWORD is set wherever this runs.
password = os.environ.get("SPOKE_NEO4J_PASSWORD", "SPOKEdev")

# Shared driver used by the export cells further down.
driver = GraphDatabase.driver(uri, auth=(user, password))

In [5]:
# One page of the distinct nodes that take part in at least one relationship.
# SKIP/LIMIT paging is only reliable over a fixed ordering: without ORDER BY,
# Cypher does not guarantee row order, so consecutive pages could overlap or
# miss nodes. Ordering by the internal id makes every page deterministic.
# NOTE(review): id() is deprecated on Neo4j 5 in favour of elementId(); switch
# if the target server supports it.
node_query = """
MATCH (n)-[r]-(m)
RETURN DISTINCT n
ORDER BY id(n)
SKIP $skip LIMIT $limit
"""

In [6]:
# One page of relationships. A directed pattern matches every relationship
# exactly once, so no DISTINCT is needed (the undirected form visited each
# relationship from both ends and then de-duplicated).
# ORDER BY gives SKIP/LIMIT a fixed ordering to page over; without it Cypher
# does not guarantee row order and pages could overlap or miss relationships.
# NOTE(review): id() is deprecated on Neo4j 5 in favour of elementId(); switch
# if the target server supports it.
edge_query = """
MATCH ()-[r]->()
RETURN r
ORDER BY id(r)
SKIP $skip LIMIT $limit
"""

In [7]:
# Export settings: rows requested per query page, and the TSV output paths.
batch_size = 100_000
node_file, edge_file = "nodes.tsv", "edges.tsv"

In [8]:
# Read one page of nodes and flatten each into a plain dict
def fetch_nodes(tx, skip, limit):
    """Run ``node_query`` for one page and return its nodes as dicts.

    Args:
        tx: Object exposing ``run(query, **params)``; its result is iterated
            for records that are indexed by the key ``"n"``.
        skip: Value bound to the query's ``$skip`` parameter.
        limit: Value bound to the query's ``$limit`` parameter.

    Returns:
        A list with one dict per record, holding the keys ``"id"``,
        ``"labels"``, ``"identifier"`` and ``"properties"``. The list is empty
        when the query yields no records.
    """
    def _as_row(graph_node):
        # dict(...) copies the node's properties into a plain mapping.
        props = dict(graph_node)
        return {
            "id": graph_node.element_id,
            "labels": list(graph_node.labels),
            # None when the node carries no "identifier" property.
            "identifier": props.get("identifier"),
            "properties": props,
        }

    page = tx.run(node_query, skip=skip, limit=limit)
    return [_as_row(rec["n"]) for rec in page]

In [9]:
# Read one page of relationships and flatten each into a plain dict
def fetch_edges(tx, skip, limit):
    """Run ``edge_query`` for one page and return its relationships as dicts.

    Args:
        tx: Object exposing ``run(query, **params)``; its result is iterated
            for records that are indexed by the key ``"r"``.
        skip: Value bound to the query's ``$skip`` parameter.
        limit: Value bound to the query's ``$limit`` parameter.

    Returns:
        A list with one dict per record, holding the keys ``"id"``,
        ``"start_node"``, ``"end_node"``, ``"edge_type"`` and
        ``"properties"``. The list is empty when the query yields no records.
    """
    def _as_row(rel):
        # dict(...) copies the relationship's properties into a plain mapping.
        props = dict(rel)
        return {
            "id": rel.element_id,
            # Endpoints are recorded by element id only.
            "start_node": rel.start_node.element_id,
            "end_node": rel.end_node.element_id,
            "edge_type": rel.type,
            "properties": props,
        }

    page = tx.run(edge_query, skip=skip, limit=limit)
    return [_as_row(rec["r"]) for rec in page]

In [10]:
# Start both TSV outputs from scratch, each holding only its header row.
# The batch export appends data rows without a header, so the column order
# written here is the one the data rows are expected to line up with.
for tsv_path, column_names in (
    (node_file, ["id", "labels", "identifier", "properties"]),
    (edge_file, ["id", "start_node", "end_node", "edge_type", "properties"]),
):
    pd.DataFrame(columns=column_names).to_csv(tsv_path, sep="\t", index=False)

NameError: name 'pd' is not defined

In [None]:
# Fetch nodes and edges in batches and append each batch to its TSV file
def _export_in_batches(fetch_fn, out_file, kind):
    """Page through one query and append every batch to ``out_file``.

    Args:
        fetch_fn: Transaction function called through ``session.execute_read``
            with ``skip`` and ``limit`` keyword arguments; it must return a
            list of row dicts, empty once the data is exhausted.
        out_file: Path of the TSV file the rows are appended to. No header is
            written here.
        kind: Word used in the progress message ("nodes" or "edges").
    """
    with driver.session() as session:
        skip = 0
        while True:
            print(f"Fetching {kind} batch starting at {skip}")
            rows = session.execute_read(fetch_fn, skip=skip, limit=batch_size)
            if not rows:
                # An empty page means the previous one was the last.
                break
            # Append mode without a header: the file already has its header row.
            pd.DataFrame(rows).to_csv(
                out_file, sep="\t", index=False, header=False, mode="a"
            )
            skip += batch_size


try:
    _export_in_batches(fetch_nodes, node_file, "nodes")
    print(f"Nodes saved to {node_file}.")

    _export_in_batches(fetch_edges, edge_file, "edges")
    print(f"Edges saved to {edge_file}.")
finally:
    # Close the driver exactly once, after both exports (or on failure), so
    # the edge export never opens a session on an already-closed driver.
    driver.close()
