In [13]:
import neo4j
from neo4j import GraphDatabase, RoutingControl
import networkx as nx
import requests

In [14]:
# Base URL of the DesignSafe publications API (v2). It is requested as-is for the
# paginated listing and with "/<project_id>" appended for a single publication.
pub_listing_url = "https://www.designsafe-ci.org/api/publications/v2"

def get_ds_pubs(limit: int = 100):
    """Return a generator of top-level publication metadata.

    Pages through the listing endpoint ``limit`` records at a time, starting at
    offset 0, and yields every item of each page's ``"result"`` list.

    Args:
        limit: Page size sent to the API and used to advance the offset.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    offset = 0
    while True:
        res = requests.get(pub_listing_url, params={"offset": offset, "limit": limit})
        # Surface HTTP failures here rather than as a JSON/KeyError further down.
        res.raise_for_status()
        page = res.json()["result"]

        yield from page
        # An empty or short page means the listing is exhausted. The empty check
        # also prevents an endless loop if a non-positive limit is passed.
        if not page or len(page) < limit:
            break
        offset += limit


def get_publication(project_id: str):
    """Retrieve published metadata using the project ID.

    Args:
        project_id: Identifier appended to the publications URL, e.g. "PRJ-5829".

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status, so an
            error payload is never mistaken for publication metadata.
    """
    res = requests.get(f"{pub_listing_url}/{project_id}")
    res.raise_for_status()
    return res.json()


def iterate_publications():
    """Lazily yield the full metadata of each listed publication.

    Listing entries whose "type" is "other" or "field_reconnaissance" are
    skipped; for every other entry the complete record is fetched by its
    "projectId".
    """
    skipped_types = ("other", "field_reconnaissance")
    wanted = (entry for entry in get_ds_pubs() if entry["type"] not in skipped_types)
    for entry in wanted:
        yield get_publication(entry["projectId"])

In [15]:
# Connection URI of the Neo4j server. Each cell that talks to the database opens
# its driver with GraphDatabase.driver(URI) and passes no credentials.
URI = "neo4j://localhost:7687"

In [16]:
def setup_db(driver: neo4j.Driver):
    """Prepare the schema: create the `entity_uuid` index on :Entity(uuid).

    The statement uses IF NOT EXISTS, so calling this repeatedly is harmless.
    """
    create_uuid_index = """

    CREATE INDEX entity_uuid IF NOT EXISTS FOR (e:Entity) ON e.uuid
    """
    driver.execute_query(create_uuid_index)


def cleanup_db(driver: neo4j.Driver):
    """Delete every node in the database together with its relationships."""
    wipe_everything = "MATCH (n) DETACH DELETE n"
    driver.execute_query(wipe_everything)

In [17]:
def ingest_entity(driver: neo4j.Driver, uuid: str, title: str, description: str, **kwargs):
    """
    Ingest an entity in neo4j, keyed by its uuid.

    The node is merged on (:Entity {uuid}) alone, so a later call with the same
    uuid always updates the existing node. Its properties are replaced by the
    provided uuid/title/description plus any other provided kwargs.

    If a non-empty 'name' kwarg is provided it is also added as an extra label:
    runs of whitespace collapse to a single underscore and every other
    character that is not a letter, digit or underscore becomes an underscore.

    Raises:
        ValueError: If a kwarg key is not a plain identifier. Property names are
            spliced into the query text, so they must not carry Cypher syntax.
    """
    # Only values can be bound as query parameters; property names end up in the
    # query text itself and therefore have to be validated first.
    for key in kwargs:
        if not key.isidentifier():
            raise ValueError(f"Invalid property name for entity {uuid!r}: {key!r}")

    set_label_clause = ""
    if kwargs.get("name"):
        collapsed = "_".join(kwargs["name"].split())
        cleaned_name = "".join(ch if ch.isalnum() or ch == "_" else "_" for ch in collapsed)
        if cleaned_name:
            # Backticks keep labels such as ones starting with a digit valid;
            # the cleaned name cannot contain a backtick itself.
            set_label_clause = f"SET e:`{cleaned_name}`"

    extra_fields_str = "".join(f", {key}: ${key}" for key in kwargs)
    add_entity_query = \
    f"""
    MERGE (e:Entity {{uuid: $uuid}})
    SET e = {{uuid: $uuid}} // Allow properties to be unset with subsequent calls
    SET e += {{ title: $title,
                description: $description
                {extra_fields_str}
             }}
    {set_label_clause}
    """
    driver.execute_query(add_entity_query, uuid=uuid, title=title, description=description, **kwargs)


def ingest_entity_rel(driver: neo4j.Driver, parent_uuid: str, child_uuid: str, order: int=0):
    """
    Create a :HAS_CHILD relationship in the graph between 2 nodes given their UUIDs

    The relationship is merged on its type only and `order` is written
    afterwards. Re-ingesting the same pair with a different order therefore
    updates the existing relationship instead of adding a second one.
    Nothing is created when either node is missing.
    """
    add_rel_query = \
    """
    MATCH (parent:Entity {uuid: $parent_uuid})
    MATCH (child:Entity {uuid: $child_uuid})
    MERGE (parent)-[rel:HAS_CHILD]->(child)
    SET rel.order = $order
    """
    driver.execute_query(add_rel_query, parent_uuid=parent_uuid, child_uuid=child_uuid, order=order)


In [18]:
def ingest_file(driver: neo4j.Driver, file_info: dict):
    """
    Create or update the :File node described by file_info.

    The node is merged on the file's "path"; name, type, length, system and
    lastModified are then overwritten from file_info.
    """
    merge_query = """
    MERGE (f:File {path: $file_path})
    SET f.name = $file_name, 
        f.type = $file_type,
        f.length = $file_length,
        f.system = $file_system,
        f.lastModified = $file_last_modified
    """
    # Query parameter -> key in file_info. Keys absent from file_info are sent as None.
    param_sources = {
        "file_path": "path",
        "file_name": "name",
        "file_type": "type",
        "file_length": "length",
        "file_system": "system",
        "file_last_modified": "lastModified",
    }
    params = {param: file_info.get(key) for param, key in param_sources.items()}
    driver.execute_query(merge_query, **params)

def ingest_entity_file_rel(driver: neo4j.Driver, entity_uuid: str, file_path: str):
    """
    Link an existing entity to an existing file with a :HAS_FILE relationship.

    The entity is looked up by uuid and the file by path.
    """
    link_query = """
    MATCH (e:Entity {uuid: $entity_uuid})
    MATCH (f:File {path: $file_path})
    MERGE (e)-[:HAS_FILE]->(f)
    """
    params = {"entity_uuid": entity_uuid, "file_path": file_path}
    driver.execute_query(link_query, **params)
        

In [19]:
def ingest_person(driver: neo4j.Driver, person_data: dict):
    """
    Create or update the :Person node for person_data.

    The person's email doubles as the node's `personId`, which is the merge
    key; inst, email, fname, lname and username are then overwritten.
    """
    merge_person_query = """
    MERGE (p:Person {personId: $unique_id})
    SET p.inst = $inst,
        p.email = $email,
        p.fname = $fname,
        p.lname = $lname,
        p.username = $username
    """
    # These properties are copied under the same name; missing keys become None.
    copied_fields = ("inst", "email", "fname", "lname", "username")
    params = {field: person_data.get(field) for field in copied_fields}
    driver.execute_query(merge_person_query, unique_id=person_data.get("email"), **params)

def ingest_entity_person_rel(driver: neo4j.Driver, entity_uuid: str, person_data: dict, source: str):
    """
    Link an existing entity to an existing person with a :CONTRIBUTED relationship.

    The person is looked up by email (stored as `personId`). The relationship
    carries the person's "role" and the `source` list the person came from
    ("users" or "authors"); both are part of the merge pattern.
    """
    contrib_query = """
    MATCH (e:Entity {uuid: $entity_uuid})
    MATCH (p:Person {personId: $unique_id})
    MERGE (e)-[r:CONTRIBUTED {role: $role, source: $source}]->(p)
    """
    params = {
        "entity_uuid": entity_uuid,
        "unique_id": person_data.get("email"),
        "role": person_data.get("role"),
        "source": source,
    }
    driver.execute_query(contrib_query, **params)

In [20]:
def ingest_facility(driver:neo4j.Driver, facility_info: dict):
    """
    Create or update the :Facility node described by facility_info.

    The node is merged on the facility's "name"; its "id" is stored as the
    `facilityId` property.
    """
    merge_query = """
    MERGE (fac:Facility {name: $facility_name})
    SET fac.facilityId = $facility_id
    """
    params = {
        "facility_name": facility_info.get("name"),
        "facility_id": facility_info.get("id"),
    }
    driver.execute_query(merge_query, **params)


def ingest_entity_facility_rel(driver: neo4j.Driver, entity_uuid:str, facility_info:dict):
    """
    Link an existing entity to an existing facility with a :HAS_FACILITY relationship.

    The entity is looked up by uuid and the facility by the "name" in facility_info.
    """
    link_query = """
    MATCH (e:Entity {uuid: $entity_uuid})
    MATCH (fac:Facility {name: $facility_name})
    MERGE (e)-[:HAS_FACILITY]->(fac)
    """
    params = {"entity_uuid": entity_uuid, "facility_name": facility_info.get("name")}
    driver.execute_query(link_query, **params)

In [21]:
pub_json = get_publication("PRJ-5829")
pub_tree = nx.tree_graph(pub_json['tree'])

# Create a node in the neo4j graph for every node in the project tree, together
# with its facilities, files and people. A single driver serves both the schema
# setup and the ingestion.
with GraphDatabase.driver(URI) as driver:
    setup_db(driver)

    for node in pub_tree.nodes:
        node_data = pub_tree.nodes[node]
        value = node_data["value"]
        uuid = node  # the tree's node ID is used as the entity uuid
        ingest_entity(
            driver,
            uuid,
            value["title"],
            value.get("description", None),
            name=node_data["name"],
            meta_uuid=node_data["uuid"],
        )

        # `... or []` also covers keys that are present but hold None, which
        # `.get(key, [])` alone would pass through to extend()/the for loop.
        facilities = []
        if value.get("facility"):
            facilities.append(value["facility"])
        facilities.extend(value.get("facilities") or [])

        for facility_info in facilities:
            ingest_facility(driver, facility_info)
            ingest_entity_facility_rel(driver, uuid, facility_info)

        for file_info in value.get("fileObjs") or []:
            ingest_file(driver, file_info)
            ingest_entity_file_rel(driver, uuid, file_info["path"])

        # "users" and "authors" are ingested the same way; the list a person
        # came from is recorded as `source` on the relationship.
        for source in ("users", "authors"):
            for person in value.get(source) or []:
                ingest_person(driver, person)
                ingest_entity_person_rel(driver, uuid, person, source=source)
        

In [22]:
# dfs_predecessors maps every node ID reachable from 'NODE_ROOT' to its parent's ID
dfs_pred = nx.dfs_predecessors(pub_tree, 'NODE_ROOT')

# Mirror each parent -> child link of the tree as a relationship in the graph
with GraphDatabase.driver(URI) as driver:
    for child_uuid, parent_uuid in dfs_pred.items():
        order = pub_tree.nodes[child_uuid].get("order", 0)
        ingest_entity_rel(driver, parent_uuid, child_uuid, order)


In [12]:
# Recovery cell: run this by hand to wipe the database if something goes wrong.
# It deletes every node, so it also discards anything ingested by earlier cells.
driver = GraphDatabase.driver(URI)
try:
    cleanup_db(driver)
finally:
    driver.close()