In [1]:
%pip install --quiet pandas neo4j-rust-ext

Note: you may need to restart the kernel to use updated packages.


In [8]:
GRAPHRAG_FOLDER = "output/20250912-180410/artifacts"

In [3]:
import time

import pandas as pd
from neo4j import GraphDatabase

In [4]:
NEO4J_URI = "neo4j://localhost"  # or neo4j+s://xxxx.databases.neo4j.io
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "password"  # your password
NEO4J_DATABASE = "neo4j"

# Create a Neo4j driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

In [5]:
def batched_import(statement, df, batch_size=1000):
    """
    Import a dataframe into Neo4j using a batched approach.

    Parameters: statement is the Cypher query to execute, df is the dataframe to import, and batch_size is the number of rows to import in each batch.
    """
    total = len(df)
    start_s = time.time()
    for start in range(0, total, batch_size):
        batch = df.iloc[start : min(start + batch_size, total)]
        result = driver.execute_query(
            "UNWIND $rows AS value " + statement,
            rows=batch.to_dict("records"),
            database_=NEO4J_DATABASE,
        )
        print(result.summary.counters)
    print(f"{total} rows in {time.time() - start_s} s.")
    return total

In [6]:
# create constraints, idempotent operation

statements = [
    "\ncreate constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique",
    "\ncreate constraint document_id if not exists for (d:__Document__) require d.id is unique",
    "\ncreate constraint entity_id if not exists for (c:__Community__) require c.community is unique",
    "\ncreate constraint entity_id if not exists for (e:__Entity__) require e.id is unique",
    "\ncreate constraint entity_title if not exists for (e:__Entity__) require e.name is unique",
    "\ncreate constraint entity_title if not exists for (e:__Covariate__) require e.title is unique",
    "\ncreate constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique",
    "\n",
]

for statement in statements:
    if len((statement or "").strip()) > 0:
        print(statement)
        driver.execute_query(statement)


create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique

create constraint document_id if not exists for (d:__Document__) require d.id is unique

create constraint entity_id if not exists for (c:__Community__) require c.community is unique

create constraint entity_id if not exists for (e:__Entity__) require e.id is unique

create constraint entity_title if not exists for (e:__Entity__) require e.name is unique

create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique

create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique


In [9]:
doc_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_documents.parquet", columns=["id", "title"]
)
doc_df.head(2)

Unnamed: 0,id,title
0,8feea7195c789f4a8d3ec24ffcc358c9866b3874656277...,全国社保基金理事会-了解.md
1,94d225c1c8ae2dacd121dda777e3f003cc28f7a82da4d6...,价值投资入门.md


In [10]:
# Import documents
statement = """
MERGE (d:__Document__ {id:value.id})
SET d += value {.title}
"""

batched_import(statement, doc_df)

{'_contains_updates': True, 'labels_added': 7, 'nodes_created': 7, 'properties_set': 14}
7 rows in 0.5400161743164062 s.


7

In [11]:
text_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_text_units.parquet",
    columns=["id", "text", "n_tokens", "document_ids"],
)
text_df.head(2)

Unnamed: 0,id,text,n_tokens,document_ids
0,ca8fd26596b488b08f1eb2e65582d583a75132f2042c75...,三一重工研究,9,[2b55193f0b6dd5706c7ae963d68d9865cb1872b3fa4da...
1,1d14e2e7e2dda35ec1b199fd8a13fb46730494874f232b...,平安研究,7,[7dddb724e6f9f0d0d010463fd108b1d53bc4ba2757f77...


In [12]:
statement = """
MERGE (c:__Chunk__ {id:value.id})
SET c += value {.text, .n_tokens}
WITH c, value
UNWIND value.document_ids AS document
MATCH (d:__Document__ {id:document})
MERGE (c)-[:PART_OF]->(d)
"""

batched_import(statement, text_df)

{'_contains_updates': True, 'labels_added': 14, 'relationships_created': 14, 'nodes_created': 14, 'properties_set': 42}
14 rows in 0.32142210006713867 s.


14

In [13]:
print(f"{GRAPHRAG_FOLDER}/create_final_entities.parquet")


output/20250912-180410/artifacts/create_final_entities.parquet


In [14]:
entity_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_entities.parquet",
    columns=[
        "title",
        "type",
        "description",
        "human_readable_id",
        "id",
        # "description_embedding",
        "text_unit_ids",
    ],
)
entity_df.head(40)

Unnamed: 0,title,type,description,human_readable_id,id,text_unit_ids
0,三一重工,ORGANIZATION,三一重工是一家企业，文本围绕其开展研究,0,6d639d0f-14e2-4a79-882a-411192b347d1,[ca8fd26596b488b08f1eb2e65582d583a75132f2042c7...
1,无,,,1,d565fb39-63cc-42ce-b9d0-31a281f84b83,[ca8fd26596b488b08f1eb2e65582d583a75132f2042c7...
2,三一重工研究,,,2,a59e61b0-5386-4ee2-99bb-7a91efaca5cd,[ca8fd26596b488b08f1eb2e65582d583a75132f2042c7...
3,平安研究,ORGANIZATION,"平安研究是一个开展研究工作的组织，但文本未提供更多详细信息)<|COMPLETE|>(""en...",3,000dee49-94a0-4c26-a539-5e4b2c6f2ec3,[1d14e2e7e2dda35ec1b199fd8a13fb46730494874f232...
4,招商银行,ORGANIZATION,一家开展银行业务的金融机构，进行相关研究工作,4,b7f6fc5e-aea4-4caa-b890-4e3257cb2d86,[0c4ef25969dbb2d18248c248423d6afcaaf4a2c442258...
5,研究,CONCEPT,招商银行开展的对各类事物进行探究、分析等活动）\n招商银行所进行的对金融相关或其他领域知识、...,5,50ca7b56-e627-428c-ae14-b0301cd81f2b,[0c4ef25969dbb2d18248c248423d6afcaaf4a2c442258...
6,全国社会保障基金理事会,ORGANIZATION,我国国家层面设立的、对国家社保基金进行决策和监督的最高机构，是国务院授权的事业性单位，直属财...,6,d6d47261-c0d1-451a-946e-547316124905,[0be002ba5024e23207dacf3a4c6c500821464ed3f9475...
7,国家社保基金,CONCEPT,即National Social Security Fund，是我国城镇职工基本养老保险基金...,7,d3255c65-6669-4676-94b5-d3a41f062b66,[0be002ba5024e23207dacf3a4c6c500821464ed3f9475...
8,社保基金理事会,ORGANIZATION,全国社会保障基金理事会的简称，承担着为我国基本养老保险体系提供长期稳定资金支持的重任，通过制...,8,b9003ef7-2ab6-4e5c-9992-763b5215eebc,[0be002ba5024e23207dacf3a4c6c500821464ed3f9475...
9,2000年4月,EVENT,全国社会保障基金理事会经国务院批准成立的时间,9,fe3d21c8-90da-4be9-8030-c3f794d82aa0,[0be002ba5024e23207dacf3a4c6c500821464ed3f9475...


In [15]:
# 修改后的Cypher查询语句
entity_statement = """
MERGE (e:__Entity__ {id:value.id})
SET e += value {.human_readable_id, name:replace(value.title,'"',''), .description}
WITH e, value
// 过滤 description_embedding 中的无效值
WITH e, value, [x IN coalesce(value.description_embedding, [0.1]) WHERE x IS NOT NULL AND x <> 'NaN' AND x <> 'Infinity' AND x <> '-Infinity'] AS valid_description_embedding
CALL db.create.setNodeVectorProperty(e, "description_embedding", valid_description_embedding)
CALL apoc.create.addLabels(e, case when coalesce(value.type,"") = "" then [] else [apoc.text.upperCamelCase(replace(value.type,'"',''))] end) yield node
UNWIND value.text_unit_ids AS text_unit
MATCH (c:__Chunk__ {id:text_unit})
MERGE (c)-[:HAS_ENTITY]->(e)
"""

batched_import(entity_statement, entity_df)

ClientError: {code: Neo.ClientError.Procedure.ProcedureNotFound} {message: There is no procedure with the name `apoc.create.addLabels` registered for this database instance. Please ensure you've spelled the procedure name correctly and that the procedure is properly deployed.}

In [16]:
rel_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_relationships.parquet",
    columns=[
        "source",
        "target",
        "id",
        # "rank",
        "weight",
        "human_readable_id",
        "description",
        "text_unit_ids",
    ],
)
rel_df.head(2)

Unnamed: 0,source,target,id,weight,human_readable_id,description,text_unit_ids
0,三一重工,三一重工研究,87545acf-b20a-4a98-8d2c-d1dfa19cfeea,1.0,0,三一重工研究是针对三一重工进行的,[ca8fd26596b488b08f1eb2e65582d583a75132f2042c7...
1,无,无,8cbd6b89-2c8d-471d-8fb1-a8186259398d,4.0,1,根据提供的描述，当前文本中未出现两个相关联的实体，也不存在明确提及的实体对或明显的实体关系。...,[ca8fd26596b488b08f1eb2e65582d583a75132f2042c7...


In [17]:
rel_statement = """
    MATCH (source:__Entity__ {name:replace(value.source,'"','')})
    MATCH (target:__Entity__ {name:replace(value.target,'"','')})
    // not necessary to merge on id as there is only one relationship per pair
    MERGE (source)-[rel:RELATED {id: value.id}]->(target)
    SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}
    RETURN count(*) as createdRels
"""

batched_import(rel_statement, rel_df)

{}
263 rows in 0.3031890392303467 s.


263

In [18]:
community_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_communities.parquet",
    columns=["id", "level", "title", "text_unit_ids", "relationship_ids"],
)

community_df.head(2)

Unnamed: 0,id,level,title,text_unit_ids,relationship_ids
0,9bef19f5-87e1-426a-a8eb-6fcd93ff522a,0,Community 0,[5b66e700cc02ae197424a860eb9ca55055de332ba7fc5...,"[171d0d53-4ff7-4fcc-b3e9-04219fa8e966, 17f0045..."
1,c345fc46-b2be-473b-810a-7233ee91df1e,0,Community 1,[9e469aa54fcfdbdfc7a9a6c0a34f72fee99ca77f00681...,"[1557813a-7a61-44e8-9127-1f6276a155ee, 3d80cfe..."


In [19]:
statement = """
MERGE (c:__Community__ {community:value.id})
SET c += value {.level, .title}
/*
UNWIND value.text_unit_ids as text_unit_id
MATCH (t:__Chunk__ {id:text_unit_id})
MERGE (c)-[:HAS_CHUNK]->(t)
WITH distinct c, value
*/
WITH *
UNWIND value.relationship_ids as rel_id
MATCH (start:__Entity__)-[:RELATED {id:rel_id}]->(end:__Entity__)
MERGE (start)-[:IN_COMMUNITY]->(c)
MERGE (end)-[:IN_COMMUNITY]->(c)
RETURn count(distinct c) as createdCommunities
"""

batched_import(statement, community_df)

{'_contains_updates': True, 'labels_added': 34, 'nodes_created': 34, 'properties_set': 102}
34 rows in 0.2894890308380127 s.


34

In [20]:
community_report_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_community_reports.parquet",
    columns=[
        "id",
        "community",
        "level",
        "title",
        "summary",
        "findings",
        "rank",
        "rank_explanation",
        "full_content",
    ],
)
community_report_df.head(2)

Unnamed: 0,id,community,level,title,summary,findings,rank,rank_explanation,full_content
0,52bb32121400439bbdf384c998bf89bc,11,1,公司节点变化社区,该社区围绕公司节点变化展开，节点变化包含战略决策、人才储备、政府关系和领导人动向等要素，这些...,[{'explanation': '节点变化是该社区的核心实体，涵盖了公司发展过程中的关键变...,7.0,由于节点变化涉及公司发展的关键方面，其影响严重性评分为较高。,# 公司节点变化社区\n\n该社区围绕公司节点变化展开，节点变化包含战略决策、人才储备、政府...
1,99f3c4970e22481ca34d0e7423d28fc2,12,1,价值投资社区,该社区围绕价值投资策略展开，涉及股票、企业内在价值等关键实体。价值投资以企业内在价值为决策依...,[{'explanation': '价值投资是该社区的核心策略，它基于企业内在价值进行投资决...,7.0,价值投资作为一种重要的投资策略，对金融市场和投资者决策有较大影响，因此影响严重性评分为较高。,# 价值投资社区\n\n该社区围绕价值投资策略展开，涉及股票、企业内在价值等关键实体。价值投...


In [21]:
# Import communities
community_statement = """
MERGE (c:__Community__ {community:value.community})
SET c += value {.level, .title, .rank, .rank_explanation, .full_content, .summary}
WITH c, value
UNWIND range(0, size(value.findings)-1) AS finding_idx
WITH c, value, finding_idx, value.findings[finding_idx] as finding
MERGE (c)-[:HAS_FINDING]->(f:Finding {id:finding_idx})
SET f += finding
"""
batched_import(community_statement, community_report_df)

{'_contains_updates': True, 'labels_added': 210, 'relationships_created': 179, 'nodes_created': 210, 'properties_set': 754}
31 rows in 0.24725723266601562 s.


31

In [22]:
cov_df = (pd.read_parquet(f"{GRAPHRAG_FOLDER}/create_final_covariates.parquet"),)
#                         columns=["id","text_unit_id"])
cov_df.head(2)
# Subject id do not match entity ids

FileNotFoundError: [Errno 2] No such file or directory: 'output/20250912-180410/artifacts/create_final_covariates.parquet'

In [23]:
# Import covariates
cov_statement = """
MERGE (c:__Covariate__ {id:value.id})
SET c += apoc.map.clean(value, ["text_unit_id", "document_ids", "n_tokens"], [NULL, ""])
WITH c, value
MATCH (ch:__Chunk__ {id: value.text_unit_id})
MERGE (ch)-[:HAS_COVARIATE]->(c)
"""
batched_import(cov_statement, cov_df)

NameError: name 'cov_df' is not defined