In [1]:
import os
import xml.etree.ElementTree as ET
from neo4j import GraphDatabase
from typing import List, Dict
from pathlib import Path

# CONFIGURATION
# Connection settings are read from the environment so that credentials are
# never stored in the notebook (notebooks are routinely shared and committed).
# Set NEO4J_PASSWORD (and optionally NEO4J_URI / NEO4J_USER) before starting
# the kernel, e.g. `export NEO4J_PASSWORD=...`.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
# No usable default on purpose: with an empty password the connection attempt
# fails with an authentication error instead of silently using a leaked secret.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")

# Directory holding the exported graph; both XML inputs live directly inside it.
INPUT_DIR = Path('graph_runs') / 'gpt5-1_gpt5-1'
NODES_XML = os.path.join(INPUT_DIR, "nodes.xml")
RELATIONS_XML = os.path.join(INPUT_DIR, "relations.xml")

class Neo4jLoader:
    """Imports a graph described by two XML files into Neo4j.

    Nodes are read from ``<node>`` elements that are direct children of the
    XML root; every XML attribute becomes a node property, ``id`` is the merge
    key and ``type`` (capitalized) becomes an extra label next to
    ``:KnowledgeNode``. Relationships are read from ``<relation>`` elements
    with ``source``, ``target`` and ``type`` attributes and an optional
    ``<context>`` child whose text is stored as ``r.context``.

    The loader can be used as a context manager; the driver is closed on exit.
    """

    # Type used when a <node> has no (or an empty) ``type`` attribute.
    DEFAULT_NODE_TYPE = "chapter"

    def __init__(self, uri, user, password):
        """Create the Neo4j driver for ``uri`` using basic auth."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` as a backtick-quoted Cypher identifier.

        Labels and relationship types cannot be passed as query parameters,
        so they are spliced into the query text. Quoting keeps names that
        contain spaces or punctuation valid and prevents a crafted name from
        terminating the identifier and injecting Cypher.
        """
        # Cypher resolves the literal text \u0060 to a backtick before parsing,
        # so normalise it first, then double every backtick to escape it.
        return "`" + name.replace("\\u0060", "`").replace("`", "``") + "`"

    def clear_database(self):
        """Wipes the entire database to start fresh."""
        print("üßπ Clearing Neo4j Database...")
        with self.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")

    def create_constraints(self):
        """Creates unique constraints to speed up lookups and prevent duplicates."""
        print("üîí Creating constraints...")
        with self.driver.session() as session:
            # Create a unique constraint on 'id' for KnowledgeNode label
            # Note: Syntax varies slightly by Neo4j version (4.x vs 5.x)
            # This is the modern 5.x syntax
            try:
                session.run("CREATE CONSTRAINT node_id_unique IF NOT EXISTS FOR (n:KnowledgeNode) REQUIRE n.id IS UNIQUE")
            except Exception as e:
                # Best effort: a failure here is reported but does not stop the import.
                print(f"   ‚ö†Ô∏è Constraint warning: {e}")

    def load_nodes(self, xml_path):
        """Merge every ``<node>`` of ``xml_path`` into the database.

        Args:
            xml_path: Path of the nodes XML file.

        Returns:
            The number of node statements issued; 0 if the file does not exist.
            ``<node>`` elements without an ``id`` are skipped and not counted.
        """
        if not os.path.exists(xml_path):
            print(f"‚ùå Nodes file not found: {xml_path}")
            return 0

        print(f"üì• Loading Nodes from {xml_path}...")
        root = ET.parse(xml_path).getroot()

        count = 0
        skipped = 0
        with self.driver.session() as session:
            for node in root.findall("node"):
                props = dict(node.attrib)
                node_id = props.get("id")
                if not node_id:
                    # MERGE on a null key fails and would abort the whole load.
                    skipped += 1
                    continue

                # e.g. "extracted" -> "Extracted"; empty/missing type -> "Chapter"
                raw_type = (props.get("type") or "").strip() or self.DEFAULT_NODE_TYPE
                label = self._quote_identifier(raw_type.capitalize())

                # Every node gets :KnowledgeNode AND its specific type label.
                query = (
                    "MERGE (n:KnowledgeNode {id: $id}) "
                    f"SET n:{label}, n += $props "
                    "RETURN n"
                )

                session.run(query, id=node_id, props=props)
                count += 1

                if count % 100 == 0:
                    print(f"   Processed {count} nodes...", end="\r")
        print(f"\n   ‚úÖ Loaded {count} nodes.")
        if skipped:
            print(f"   Skipped {skipped} <node> element(s) without an id.")
        return count

    def load_relations(self, xml_path):
        """Merge every ``<relation>`` of ``xml_path`` as a relationship.

        The relationship type is the ``type`` attribute upper-cased with ``-``
        replaced by ``_`` (``part-of`` -> ``PART_OF``). Both endpoints are
        matched by ``id``; if either is absent the statement matches nothing
        and no relationship is created, but the relation is still counted.

        Args:
            xml_path: Path of the relations XML file.

        Returns:
            The number of relationship statements issued; 0 if the file does
            not exist. Relations missing ``source``, ``target`` or ``type``
            are skipped and not counted.
        """
        if not os.path.exists(xml_path):
            print(f"‚ùå Relations file not found: {xml_path}")
            return 0

        print(f"üîó Loading Relations from {xml_path}...")
        root = ET.parse(xml_path).getroot()

        count = 0
        skipped = 0
        with self.driver.session() as session:
            for rel in root.findall("relation"):
                source_id = rel.get("source")
                target_id = rel.get("target")
                raw_type = (rel.get("type") or "").strip()
                if not (source_id and target_id and raw_type):
                    skipped += 1
                    continue

                rel_type = self._quote_identifier(raw_type.upper().replace("-", "_"))

                # An absent <context> and an empty <context/> both yield "".
                context_elem = rel.find("context")
                context = (context_elem.text or "") if context_elem is not None else ""

                # Match both endpoints by id first, then merge the relationship.
                query = (
                    "MATCH (a:KnowledgeNode {id: $source_id}) "
                    "MATCH (b:KnowledgeNode {id: $target_id}) "
                    f"MERGE (a)-[r:{rel_type}]->(b) "
                    "SET r.context = $context "
                )

                session.run(query, source_id=source_id, target_id=target_id, context=context)
                count += 1

                if count % 100 == 0:
                    print(f"   Processed {count} relations...", end="\r")
        print(f"\n   ‚úÖ Loaded {count} relationships.")
        if skipped:
            print(f"   Skipped {skipped} <relation> element(s) missing source, target or type.")
        return count

def main():
    """Rebuild the Neo4j graph from NODES_XML and RELATIONS_XML.

    Steps: verify both input files exist, wipe the database, create the
    uniqueness constraint, load nodes, then load relationships. Errors raised
    during the import are printed rather than propagated, and the driver is
    always closed.
    """
    # clear_database() is destructive, so validate the inputs BEFORE touching
    # the database: a wrong path must not leave an empty graph behind (and must
    # not end with the success message).
    missing = [str(path) for path in (NODES_XML, RELATIONS_XML) if not os.path.exists(path)]
    if missing:
        print(f"\nImport aborted, database left untouched. Input file(s) not found: {', '.join(missing)}")
        return

    loader = Neo4jLoader(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)

    try:
        loader.clear_database()
        loader.create_constraints()
        # Nodes first: relationships are attached by matching existing node ids.
        loader.load_nodes(NODES_XML)
        loader.load_relations(RELATIONS_XML)
        print("\nüéâ Graph successfully imported into Neo4j!")
    except Exception as e:
        print(f"\n‚ùå Error during import: {e}")
    finally:
        loader.close()

In [2]:
# Run the full import. NOTE: main() calls clear_database() first, which deletes
# every node and relationship in the target Neo4j database before reloading.
main()

üßπ Clearing Neo4j Database...
üîí Creating constraints...
üì• Loading Nodes from graph_runs/gpt5-1_gpt5-1/nodes.xml...
   Processed 400 nodes...
   ‚úÖ Loaded 459 nodes.
üîó Loading Relations from graph_runs/gpt5-1_gpt5-1/relations.xml...
   Processed 1700 relations...
   ‚úÖ Loaded 1726 relationships.

üéâ Graph successfully imported into Neo4j!
