In [2]:
%pip install openai

Collecting openai
  Downloading openai-1.54.5-py3-none-any.whl (389 kB)
Collecting anyio<5,>=3.5.0
  Downloading anyio-4.6.2.post1-py3-none-any.whl (90 kB)
Collecting jiter<1,>=0.4.0
  Downloading jiter-0.7.1-cp310-none-win_amd64.whl (201 kB)
Collecting pydantic<3,>=1.9.0
  Downloading pydantic-2.9.2-py3-none-any.whl (434 kB)
Collecting httpx<1,>=0.23.0
  Downloading httpx-0.27.2-py3-none-any.whl (76 kB)
Collecting distro<2,>=1.7.0
  Using cached distro-1.9.0-py3-none-any.whl (20 kB)
Collecting sniffio
  Downloading sniffio-1.3.1-py3-none-any.whl (10 kB)
Collecting httpcore==1.*
  Downloading httpcore-1.0.7-py3-none-any.whl (78 kB)
Collecting h11<0.15,>=0.13
  Using cached h11-0.14.0-py3-none-any.whl (58 kB)
Collecting annotated-types>=0.6.0
  Downloading annotated_types-0.7.0-py3-none-any.whl (13 kB)
Collecting pydantic-core==2.23.4
  Downloading pydantic_core-2.23.4-cp310-none-win_amd64.whl (1.9 MB)
Installing collected packages: sniffio, h11, pydantic-core, httpcore, anyio, annotate

You should consider upgrading via the 'c:\Users\chao\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [1]:
def create_prompt(question, nodes, instances):
    """Assemble the chat messages that ask the model for a Cypher query.

    Returns a two-element list of ``{"role", "content"}`` dicts: a system
    message that fixes the model's role, followed by a user message holding
    the question, the supplied nodes and instances, and the literal graph
    schema (node properties and relationship patterns).
    """
    system_text = "You are an experienced Cypher developer to convert natural language questions to Cypher queries!"

    # Only this piece is an f-string; it carries the per-call values.
    task_text = f"""
        Convert the natural language question into Cypher query language, please only output the Cypher.
        ###Question: {question}, Nodes: {nodes}, Instances: {instances}"""

    # Deliberately a plain (non-f) string: the braces in the schema are literal
    # text and must not be interpreted as replacement fields.
    schema_text = """
        ###Here is the graph schema:
        Node properties are the following:
        MolecularFunction {url: STRING, license: STRING, name: STRING, identifier: STRING, source: STRING},
        Pathway {license: STRING, source: STRING, identifier: STRING, name: STRING, url: STRING},
        Anatomy {license: STRING, mesh_id: STRING, source: STRING, url: STRING, name: STRING, identifier: STRING, bto_id: STRING},
        PharmacologicClass {class_type: STRING, license: STRING, source: STRING, url: STRING, identifier: STRING, name: STRING},
        Gene {license: STRING, chromosome: STRING, url: STRING, source: STRING, description: STRING, identifier: INTEGER, name: STRING},
        Symptom {license: STRING, name: STRING, source: STRING, identifier: STRING, url: STRING},
        BiologicalProcess {license: STRING, source: STRING, url: STRING, identifier: STRING, name: STRING},
        Disease {license: STRING, source: STRING, url: STRING, identifier: STRING, name: STRING},
        Compound {license: STRING, inchi: STRING, inchikey: STRING, source: STRING, url: STRING, identifier: STRING, name: STRING},
        CellularComponent {license: STRING, name: STRING, url: STRING, identifier: STRING, source: STRING},
        SideEffect {license: STRING, source: STRING, url: STRING, name: STRING, identifier: STRING}
        ###The relationships are the following:
        (:Anatomy)-[:EXPRESSES_AeG]->(:Gene),
        (:Anatomy)-[:DOWNREGULATES_AdG]->(:Gene),
        (:Anatomy)-[:UPREGULATES_AuG]->(:Gene),
        (:PharmacologicClass)-[:INCLUDES_PCiC]->(:Compound),
        (:Gene)-[:PARTICIPATES_GpMF]->(:MolecularFunction),
        (:Gene)-[:PARTICIPATES_GpBP]->(:BiologicalProcess),
        (:Gene)-[:COVARIES_GcG]->(:Gene),
        (:Gene)-[:REGULATES_GrG]->(:Gene),
        (:Gene)-[:INTERACTS_GiG]->(:Gene),
        (:Gene)-[:PARTICIPATES_GpPW]->(:Pathway),
        (:Gene)-[:PARTICIPATES_GpCC]->(:CellularComponent),
        (:Disease)-[:LOCALIZES_DlA]->(:Anatomy),
        (:Disease)-[:ASSOCIATES_DaG]->(:Gene),
        (:Disease)-[:PRESENTS_DpS]->(:Symptom),
        (:Disease)-[:RESEMBLES_DrD]->(:Disease),
        (:Disease)-[:DOWNREGULATES_DdG]->(:Gene),
        (:Disease)-[:UPREGULATES_DuG]->(:Gene),
        (:Compound)-[:UPREGULATES_CuG]->(:Gene),
        (:Compound)-[:DOWNREGULATES_CdG]->(:Gene),
        (:Compound)-[:BINDS_CbG]->(:Gene),
        (:Compound)-[:CAUSES_CcSE]->(:SideEffect),
        (:Compound)-[:RESEMBLES_CrC]->(:Compound),
        (:Compound)-[:TREATS_CtD]->(:Disease),
        (:Compound)-[:PALLIATES_CpD]->(:Disease)
        ###Please only output the Cypher
        """

    system_message = {"role": "system", "content": system_text}
    user_message = {"role": "user", "content": task_text + schema_text}
    return [system_message, user_message]

In [None]:
import os
import json
import time
from typing import Any, Dict, List
from openai import OpenAI

# ---- SAFETY: never hard-code the key; set OPENAI_API_KEY in your environment ----
# Windows PowerShell:  setx OPENAI_API_KEY "sk-...."
# Then restart terminal / IDE so the new variable is visible to the kernel.

client = OpenAI(
    # Read the key from the variable named above. (Looking up an empty name,
    # as this line previously did, always yields None.)
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)


def read_json(file_path: str) -> Any:
    """Load the UTF-8 JSON document at ``file_path`` and return the decoded object."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def write_json_safely(obj, path: str):
    """Write ``obj`` to ``path`` as indented UTF-8 JSON, replacing any existing file.

    The parent directory is created when ``path`` has one. A bare filename
    (empty dirname) is written into the current working directory instead of
    failing in ``os.makedirs("")``.

    The data is first written to ``<path>.tmp`` and then moved into place with
    ``os.replace``. If serialization fails or the process is interrupted
    mid-write, a previously written file at ``path`` is left untouched rather
    than truncated.

    Raises whatever ``json.dump`` or the file operations raise (for example
    ``TypeError`` for a non-serializable object, ``OSError`` for I/O failures).
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Same directory as the target so the final rename does not cross filesystems.
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a half-written temp file behind; then re-raise unchanged.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise


def prompt_model(
    message,
    *,
    model: str = "gpt-5.1",
    max_completion_tokens: int = 1000,
    temperature: float = 0.7,
):
    """Send ``message`` to the chat completions endpoint and return the reply text.

    Parameters
    ----------
    message:
        Either a plain string, which is wrapped as a single user message, or a
        ready-made list of ``{"role", "content"}`` message dicts.
    model, max_completion_tokens, temperature:
        Forwarded unchanged to ``client.chat.completions.create``. The defaults
        are the values this function previously hard-coded.

    Returns
    -------
    The ``content`` of the first choice's message in the response.
    """
    if isinstance(message, str):
        message = [{"role": "user", "content": message}]

    resp = client.chat.completions.create(
        model=model,
        max_completion_tokens=max_completion_tokens,
        temperature=temperature,
        messages=message,
    )
    return resp.choices[0].message.content


def load_checkpoint(path: str) -> Dict[str, Any]:
    """Load the resume checkpoint at ``path``, or return a fresh empty one.

    The checkpoint is a dict with two entries:
      - ``"done"``: dict of ``{id: output_text}`` for processed instances
      - ``"errors"``: list of dicts with diagnostics

    When the file does not exist, an empty checkpoint is returned. When it
    exists but lacks either entry, the missing entry is filled with its empty
    default so callers can index both keys without a ``KeyError``.

    Raises ``ValueError`` if the file's top-level JSON value is not an object.
    """
    if not os.path.exists(path):
        return {"done": {}, "errors": []}

    ckpt = read_json(path)
    if not isinstance(ckpt, dict):
        raise ValueError(
            f"Checkpoint {path!r} must contain a JSON object, got {type(ckpt).__name__}"
        )
    ckpt.setdefault("done", {})
    ckpt.setdefault("errors", [])
    return ckpt


def save_checkpoint(path: str, ckpt: Dict[str, Any]):
    """Persist the checkpoint dict ``ckpt`` to ``path`` by delegating to ``write_json_safely``."""
    write_json_safely(obj=ckpt, path=path)


def process_file_in_batches(
    input_path: str,
    output_path: str,
    checkpoint_path: str,
    batch_size: int = 30,
    sleep_s: float = 0.0,
    id_key: str = "id",
):
    """
    Processes instances in batches, saving after each batch.

    Parameters
    ----------
    input_path:
        JSON file holding a list of instance dicts. Each instance is read for
        the keys ``"rewrite_nl"``, ``"nodes"`` and ``"instance"``.
    output_path:
        Where the enriched dataset is written (rewritten after every batch).
    checkpoint_path:
        Incremental ``{done, errors}`` file used to resume an earlier run.
    batch_size:
        Number of instances processed between saves; must be >= 1.
    sleep_s:
        Seconds to sleep after each batch; no sleep when <= 0.
    id_key:
        Key holding each instance's id. Instances lacking it are assigned
        their list index (as a string).

    Each instance gains a ``"GPT-5.1-Output"`` entry. On failure that entry is
    ``None`` and ``"GPT-5.1-Error"`` holds the exception's repr. Instances
    whose id is already in the checkpoint's ``done`` map are not re-sent.

    The checkpoint and partial output are also saved when a batch is cut short
    by something that is not an ``Exception`` (for example ``KeyboardInterrupt``),
    so work completed inside the interrupted batch is kept.

    Raises
    ------
    ValueError
        If ``batch_size`` is less than 1.
    """
    # A negative step would make range() yield nothing (silently processing
    # zero instances) and zero raises a cryptic range() error; reject both.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    data: List[Dict[str, Any]] = read_json(input_path)
    ckpt = load_checkpoint(checkpoint_path)

    # Ensure each instance has a stable id for resume
    for idx, inst in enumerate(data):
        if id_key not in inst:
            inst[id_key] = f"{idx}"  # stable as long as file order unchanged

    total = len(data)
    processed_now = 0

    # Iterate in batches
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        batch = data[start:end]

        try:
            # Process each item in the batch
            for inst in batch:
                # Checkpoint keys are strings, so compare using the string form.
                inst_id = str(inst[id_key])

                # Skip if already done (resume)
                if inst_id in ckpt["done"]:
                    inst["GPT-5.1-Output"] = ckpt["done"][inst_id]
                    continue

                try:
                    msg = create_prompt(inst["rewrite_nl"], inst["nodes"], inst["instance"])
                    out = prompt_model(msg)

                    inst["GPT-5.1-Output"] = out
                    ckpt["done"][inst_id] = out
                    processed_now += 1

                except Exception as e:
                    # Store error with context; keep running
                    err = {
                        "id": inst_id,
                        "error": repr(e),
                        "rewrite_nl": inst.get("rewrite_nl", None),
                    }
                    ckpt["errors"].append(err)
                    inst["GPT-5.1-Output"] = None
                    inst["GPT-5.1-Error"] = repr(e)
        finally:
            # Save checkpoint and partial output after each batch. Running this
            # in `finally` also persists progress when the batch is interrupted.
            save_checkpoint(checkpoint_path, ckpt)
            write_json_safely(data, output_path)

        print(f"[{os.path.basename(input_path)}] saved batch {start}-{end} / {total} "
              f"(new processed this run: {processed_now}, total done: {len(ckpt['done'])}, errors: {len(ckpt['errors'])})")

        if sleep_s > 0:
            time.sleep(sleep_s)


# -------------------- RUN --------------------
