#### The purpose of this notebook is to test reading a JSON file into Neo4j for eventual automated Cypher queries

In [10]:
import json
import os

from neo4j import GraphDatabase

### Loading data

In [8]:
# Load the work records previously extracted with pyalex.
# The encoding is given explicitly because JSON is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding (not UTF-8 on every OS).
with open('./data/works_24_filtered.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

### Adding global vars

In [17]:
# Neo4j connection settings (no connection is opened here).
# Each value can be overridden through an environment variable so that real
# credentials never have to be saved in the notebook; the fallbacks are the
# local-development defaults.
DB_URI = os.environ.get('NEO4J_URI', 'neo4j://127.0.0.1:7687')
TARGET_DB = os.environ.get('NEO4J_DATABASE', 'works')
AUTH = (
    os.environ.get('NEO4J_USER', 'neo4j'),
    os.environ.get('NEO4J_PASSWORD', 'password'),
)

In [31]:
# Cypher query templates; values are supplied as query parameters ($name).

# CREATE_WORK upserts a Work node keyed by `id` and (re)sets its properties.
# `publication_date` is stored as a Cypher date, or as NULL when the parameter
# is NULL or an empty string. `stub` and `partial` are set to false on every
# node written by this query.
# Parameters: $id, $doi, $title, $display_name, $publication_date, $type,
# $related_works.
CREATE_WORK = """
MERGE (w:Work {id: $id})
SET w.doi = $doi,
    w.title = $title,
    w.display_name = $display_name,
    w.publication_date =
        CASE
            WHEN $publication_date IS NULL OR $publication_date = "" THEN NULL
            ELSE date($publication_date)
        END,
    w.type = $type,
    w.related_works = $related_works,
    w.stub = false,
    w.partial = false
"""

# CREATE_RELATED links an existing Work ($source) to another Work ($target)
# with a RELATED_TO relationship. The source is looked up with MATCH, so the
# query does nothing when no Work with that id exists. The target is MERGEd:
# it is created if missing, and only in that case flagged with stub = true and
# partial = true.
# Parameters: $source, $target.
CREATE_RELATED = """
MATCH (a:Work {id: $source})
MERGE (b: Work {id: $target})
ON CREATE SET b.stub = true, b.partial = true
MERGE (a)-[:RELATED_TO]->(b)
"""

### Creating Ingestion Class for Neo4j

In [32]:

class Neo4jIngestor:
    """Thin wrapper around a Neo4j driver bound to one target database.

    Instances can be used as context managers so the driver is closed even
    when the body raises::

        with Neo4jIngestor(uri, auth, database) as ingestor:
            ingestor.run_query("RETURN 1")
    """

    def __init__(self, uri, auth, database):
        """Create the driver.

        Args:
            uri: Connection URI handed to ``GraphDatabase.driver``.
            auth: Auth value handed to ``GraphDatabase.driver``.
            database: Name of the database every session is opened against.
        """
        self.driver = GraphDatabase.driver(uri=uri, auth=auth)
        self.database = database

    def __enter__(self):
        """Return the ingestor itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never suppress errors."""
        self.close()
        return False

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def run_query(self, query, params=None):
        """Run one query in its own session and return its records.

        Args:
            query: Cypher text to execute.
            params: Optional mapping of query parameters; ``None`` is sent as
                an empty mapping.

        Returns:
            A list with every record the query produced (empty for queries
            that return nothing).
        """
        with self.driver.session(database=self.database) as session:
            # Read the records while the session is still open: the object
            # returned by session.run() is lazy, and handing it back after the
            # `with` block has closed the session leaves the caller with a
            # result that can no longer be iterated.
            return list(session.run(query, params or {}))

### Creating ingestion functions

In [33]:
def create_work_record(ingestor, record):
    """Write one work to the database by running ``CREATE_WORK``.

    Args:
        ingestor: Object exposing ``run_query(query, params)``.
        record: Mapping describing the work. ``id`` is required; ``doi``,
            ``title``, ``display_name``, ``publication_date``, ``type`` and
            ``related_works`` are optional.

    Raises:
        KeyError: If ``record`` has no ``id``.
    """
    raw_date = record.get("publication_date")
    # Any falsy value, as well as the literal strings "null" and "None",
    # means "no publication date".
    has_date = bool(raw_date) and raw_date not in ("null", "None")

    params = {"id": record["id"]}
    for key in ("doi", "title", "display_name"):
        params[key] = record.get(key)
    params["publication_date"] = raw_date if has_date else None
    params["type"] = record.get("type")
    params["related_works"] = record.get("related_works", [])

    ingestor.run_query(CREATE_WORK, params)


def create_related_links(ingestor, record):
    """Run ``CREATE_RELATED`` once for every related work of ``record``.

    Args:
        ingestor: Object exposing ``run_query(query, params)``.
        record: Mapping with a required ``id`` and an optional
            ``related_works`` iterable of target ids. A missing, ``None`` or
            empty ``related_works`` results in no queries.

    Raises:
        KeyError: If ``record`` has no ``id``.
    """
    source = record["id"]
    # `or []` also covers a key that is present but null in the JSON, which
    # `.get("related_works", [])` would return as None and fail to iterate.
    for target in record.get("related_works") or []:
        ingestor.run_query(CREATE_RELATED, {"source": source, "target": target})

### Main Job

In [34]:
def ingest_json(data, link_related=False):
    """Insert every record of ``data`` into Neo4j as a ``Work`` node.

    A record whose insertion raises is reported on stdout and skipped; the
    remaining records are still processed.

    Args:
        data: Sequence of work records, each passed to ``create_work_record``.
        link_related: When true, ``create_related_links`` is additionally run
            for every record that was inserted successfully, after all nodes
            have been written. Defaults to ``False`` (nodes only).
    """
    ingestor = Neo4jIngestor(DB_URI, AUTH, TARGET_DB)
    inserted = []

    # try/finally so the driver is released even when something that is not
    # handled per record (e.g. KeyboardInterrupt) aborts the run.
    try:
        for i, record in enumerate(data):
            try:
                print(f"[{i+1}/{len(data)}] Inserting {record['id']}")
                create_work_record(ingestor, record)
            except Exception as e:
                print(f"Error on record {i+1}: {record}")
                print("Exception:", e)
                continue
            inserted.append(record)

        if link_related:
            for record in inserted:
                try:
                    create_related_links(ingestor, record)
                except Exception as e:
                    print(f"Error linking record {record['id']}")
                    print("Exception:", e)
    finally:
        ingestor.close()

    print("✔ Ingestion completed.")


In [35]:
# Run the ingestion over the records loaded from the JSON file.
ingest_json(data)

[1/25] Inserting W4392145873
[2/25] Inserting W4393935425
[3/25] Inserting W4293584584
[4/25] Inserting W2962862931
[5/25] Inserting W4396721167
[6/25] Inserting W4390946922
[7/25] Inserting W4392777592
[8/25] Inserting W4294558607
[9/25] Inserting W4394894573
[10/25] Inserting W4402690901
[11/25] Inserting W4391649568
[12/25] Inserting W4394785938
[13/25] Inserting W4327810158
[14/25] Inserting W2222702172
[15/25] Inserting W4391169031
[16/25] Inserting W4395669330
[17/25] Inserting W4394726321
[18/25] Inserting W4393858312
[19/25] Inserting W4401164724
[20/25] Inserting W4401211033
[21/25] Inserting W4396946025
[22/25] Inserting W4396583326
[23/25] Inserting W4395010520
[24/25] Inserting W4391109864
[25/25] Inserting W4402557213
✔ Ingestion completed.
