In [1]:
%pip install --quiet pandas neo4j-rust-ext

Note: you may need to restart the kernel to use updated packages.


In [2]:
GRAPHRAG_FOLDER = "output/2025912-180410/artifacts"

In [3]:
import time

import pandas as pd
from neo4j import GraphDatabase

In [4]:
NEO4J_URI = "neo4j://localhost"  # or neo4j+s://xxxx.databases.neo4j.io
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "password"  # your password
NEO4J_DATABASE = "neo4j"

# Create a Neo4j driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

In [5]:
def batched_import(statement, df, batch_size=1000):
    """
    Import a dataframe into Neo4j using a batched approach.

    Parameters: statement is the Cypher query to execute, df is the dataframe to import, and batch_size is the number of rows to import in each batch.
    """
    total = len(df)
    start_s = time.time()
    for start in range(0, total, batch_size):
        batch = df.iloc[start : min(start + batch_size, total)]
        result = driver.execute_query(
            "UNWIND $rows AS value " + statement,
            rows=batch.to_dict("records"),
            database_=NEO4J_DATABASE,
        )
        print(result.summary.counters)
    print(f"{total} rows in {time.time() - start_s} s.")
    return total

In [6]:
# create constraints, idempotent operation

statements = [
    "\ncreate constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique",
    "\ncreate constraint document_id if not exists for (d:__Document__) require d.id is unique",
    "\ncreate constraint entity_id if not exists for (c:__Community__) require c.community is unique",
    "\ncreate constraint entity_id if not exists for (e:__Entity__) require e.id is unique",
    "\ncreate constraint entity_title if not exists for (e:__Entity__) require e.name is unique",
    "\ncreate constraint entity_title if not exists for (e:__Covariate__) require e.title is unique",
    "\ncreate constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique",
    "\n",
]

for statement in statements:
    if len((statement or "").strip()) > 0:
        print(statement)
        driver.execute_query(statement)


create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique

create constraint document_id if not exists for (d:__Document__) require d.id is unique

create constraint entity_id if not exists for (c:__Community__) require c.community is unique

create constraint entity_id if not exists for (e:__Entity__) require e.id is unique

create constraint entity_title if not exists for (e:__Entity__) require e.name is unique

create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique

create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique


In [7]:
doc_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_documents.parquet", columns=["id", "title"]
)
doc_df.head(2)

Unnamed: 0,id,title
0,6c29e796e48351f3cec2ca9014d88fd3c435586234cc97...,0313抓大鹅小游戏.csv
1,fe719e9837ef5872bbc9696bbfbadde65579efec13057a...,0329苹果.csv


In [8]:
# Import documents
statement = """
MERGE (d:__Document__ {id:value.id})
SET d += value {.title}
"""

batched_import(statement, doc_df)

{'_contains_updates': True, 'labels_added': 39, 'nodes_created': 39, 'properties_set': 78}
39 rows in 0.11804890632629395 s.


39

In [9]:
text_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_text_units.parquet",
    columns=["id", "text", "n_tokens", "document_ids"],
)
text_df.head(2)

Unnamed: 0,id,text,n_tokens,document_ids
0,759c5af2aad3e1985b6893b369a9b9781f045235d05d9f...,广告主id cj8wd89bjr24\n产品名称\t我的亲家\n所属行业\t\n下单主...,265,[01584e6502dcd7e2e5743e63268e918edcf0dcc5d57bb...
1,ffb7364bae253823e37dd9c595945797e21df3bc98d217...,广告主id f8dfdh7w8nrby2o487\n产品名称\t农夫山泉\n所属行业\...,264,[0363ee6502a3691e4a6218251c5ea5c1fe8b8c4ed2703...


In [10]:
statement = """
MERGE (c:__Chunk__ {id:value.id})
SET c += value {.text, .n_tokens}
WITH c, value
UNWIND value.document_ids AS document
MATCH (d:__Document__ {id:document})
MERGE (c)-[:PART_OF]->(d)
"""

batched_import(statement, text_df)

{'_contains_updates': True, 'labels_added': 42, 'relationships_created': 42, 'nodes_created': 42, 'properties_set': 126}
42 rows in 0.1136021614074707 s.


42

In [11]:
print(f"{GRAPHRAG_FOLDER}/create_final_entities.parquet")


output/20250109-110754/artifacts/create_final_entities.parquet


In [12]:
entity_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_entities.parquet",
    columns=[
        "title",
        "type",
        "description",
        "human_readable_id",
        "id",
        # "description_embedding",
        "text_unit_ids",
    ],
)
entity_df.head(40)

Unnamed: 0,title,type,description,human_readable_id,id,text_unit_ids
0,CJ8WD89BJR24,广告主ID,广告主的唯一标识符,0,ea06f8f1-4e57-42ab-9be8-0460235f055a,[759c5af2aad3e1985b6893b369a9b9781f045235d05d9...
1,我的亲家,产品,我的亲家是一个家长帮子女相亲的平台,1,8f1aa682-6ad1-4b2f-8d21-756dbca89d37,[759c5af2aad3e1985b6893b369a9b9781f045235d05d9...
2,相亲平台,行业,"我的亲家所属的行业""\n我的亲家所属的行业",2,4791dea7-c2f7-4c27-b7e4-cedfe7e5d6dc,[759c5af2aad3e1985b6893b369a9b9781f045235d05d9...
3,真实可靠,特点,我的亲家是一个真实可靠的家长帮子女相亲平台,3,7b075ae9-4766-46b7-9088-bd57b5465761,[759c5af2aad3e1985b6893b369a9b9781f045235d05d9...
4,50岁以上老年人群,受众,我的亲家的目标人群是50岁以上老年人群,4,c526b9cf-a0cc-4e1c-b07d-80f082b7f4cb,[759c5af2aad3e1985b6893b369a9b9781f045235d05d9...
5,注册,诉求,我的亲家的推广诉求是增加注册用户,5,9b81b812-99be-47d5-9dcb-c6077239d861,[759c5af2aad3e1985b6893b369a9b9781f045235d05d9...
6,视频号,场景,视频号作为一个广告投放平台，被多个广告主选作其广告投放的场景。具体而言，《斗破苍穹·巅峰对决...,6,4b85d5c0-8574-48aa-a7d6-cb9893d7587d,[759c5af2aad3e1985b6893b369a9b9781f045235d05d9...
7,F8DFDH7W8NRBY2O487,广告主ID,广告主的唯一标识符,7,c2dd0961-e61f-49e1-a9ea-3008a3273797,[ffb7364bae253823e37dd9c595945797e21df3bc98d21...
8,农夫山泉,产品,农夫山泉是一款食品饮料产品,8,463d06f1-08ba-43f2-89a0-6633564b658a,[ffb7364bae253823e37dd9c595945797e21df3bc98d21...
9,食品饮料,行业,农夫山泉所属的行业,9,77090906-9e7c-4ca9-abe7-5cb04caa92d5,[ffb7364bae253823e37dd9c595945797e21df3bc98d21...


In [13]:
# 修改后的Cypher查询语句
entity_statement = """
MERGE (e:__Entity__ {id:value.id})
SET e += value {.human_readable_id, name:replace(value.title,'"',''), .description}
WITH e, value
// 过滤 description_embedding 中的无效值
WITH e, value, [x IN coalesce(value.description_embedding, [0.1]) WHERE x IS NOT NULL AND x <> 'NaN' AND x <> 'Infinity' AND x <> '-Infinity'] AS valid_description_embedding
CALL db.create.setNodeVectorProperty(e, "description_embedding", valid_description_embedding)
CALL apoc.create.addLabels(e, case when coalesce(value.type,"") = "" then [] else [apoc.text.upperCamelCase(replace(value.type,'"',''))] end) yield node
UNWIND value.text_unit_ids AS text_unit
MATCH (c:__Chunk__ {id:text_unit})
MERGE (c)-[:HAS_ENTITY]->(e)
"""

batched_import(entity_statement, entity_df)

{'_contains_updates': True, 'labels_added': 517, 'relationships_created': 552, 'nodes_created': 517, 'properties_set': 2068}
517 rows in 0.46076107025146484 s.


517

In [14]:
rel_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_relationships.parquet",
    columns=[
        "source",
        "target",
        "id",
        # "rank",
        "weight",
        "human_readable_id",
        "description",
        "text_unit_ids",
    ],
)
rel_df.head(2)

Unnamed: 0,source,target,id,weight,human_readable_id,description,text_unit_ids
0,CJ8WD89BJR24,我的亲家,d5c505a3-5865-4798-b141-841d0a3896f7,18.0,0,广告主id为cj8wd89bjr24的广告主的产品是“我的亲家”,[759c5af2aad3e1985b6893b369a9b9781f045235d05d9...
1,我的亲家,相亲平台,e76b87ca-9531-4ad7-ad87-644adf4d3a62,18.0,1,我的亲家是一个相亲平台,[759c5af2aad3e1985b6893b369a9b9781f045235d05d9...


In [15]:
rel_statement = """
    MATCH (source:__Entity__ {name:replace(value.source,'"','')})
    MATCH (target:__Entity__ {name:replace(value.target,'"','')})
    // not necessary to merge on id as there is only one relationship per pair
    MERGE (source)-[rel:RELATED {id: value.id}]->(target)
    SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}
    RETURN count(*) as createdRels
"""

batched_import(rel_statement, rel_df)

{'_contains_updates': True, 'relationships_created': 485, 'properties_set': 2425}
485 rows in 0.2628810405731201 s.


485

In [16]:
community_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_communities.parquet",
    columns=["id", "level", "title", "text_unit_ids", "relationship_ids"],
)

community_df.head(2)

Unnamed: 0,id,level,title,text_unit_ids,relationship_ids
0,12d86941-2903-4bdf-a6b3-cde7e24c2157,0,Community 0,[81b93d94bd0707aefca13a56a03f9453cf62b90875c45...,"[0822c360-037f-438f-b626-dd389314d650, 0860762..."
1,00e857f7-cdd8-4214-a0c1-549296baae9b,0,Community 1,[8d7d308e2ecdade189510d84d64a81470466e382e01bc...,"[086ca1a8-29a2-40cd-a20e-86350bcb5072, 1387f8a..."


In [17]:
statement = """
MERGE (c:__Community__ {community:value.id})
SET c += value {.level, .title}
/*
UNWIND value.text_unit_ids as text_unit_id
MATCH (t:__Chunk__ {id:text_unit_id})
MERGE (c)-[:HAS_CHUNK]->(t)
WITH distinct c, value
*/
WITH *
UNWIND value.relationship_ids as rel_id
MATCH (start:__Entity__)-[:RELATED {id:rel_id}]->(end:__Entity__)
MERGE (start)-[:IN_COMMUNITY]->(c)
MERGE (end)-[:IN_COMMUNITY]->(c)
RETURn count(distinct c) as createdCommunities
"""

batched_import(statement, community_df)

{'_contains_updates': True, 'labels_added': 43, 'relationships_created': 402, 'nodes_created': 43, 'properties_set': 129}
43 rows in 0.3289179801940918 s.


43

In [18]:
community_report_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_community_reports.parquet",
    columns=[
        "id",
        "community",
        "level",
        "title",
        "summary",
        "findings",
        "rank",
        "rank_explanation",
        "full_content",
    ],
)
community_report_df.head(2)

Unnamed: 0,id,community,level,title,summary,findings,rank,rank_explanation,full_content
0,156f8f918d8347e0b2fefa3e5c3a89c8,14,1,牵手APP用户分析,牵手APP作为一个生活服务-线上婚恋平台，主要针对25-29岁和18-24岁的粉丝群体，重点...,[{'explanation': '牵手APP的主要目标人群为25-29岁和18-24岁的粉...,6.0,牵手APP作为一个针对特定年龄段和地区的线上婚恋平台，具有一定的用户基础和市场影响力。其用户...,# 牵手APP用户分析\n\n牵手APP作为一个生活服务-线上婚恋平台，主要针对25-29岁...
1,2b1cdfded53d4f6eaa9c7b81adf062db,15,1,VNUW2BNVIPTRE广告主的合作偏好,VNUW2BNVIPTRE广告主展现了多元化的达人类型合作期望，并倾向于采用二创剪辑及原创定...,[{'explanation': 'VNUW2BNVIPTRE广告主期望合作的达人类型包括真...,6.0,VNUW2BNVIPTRE广告主的合作偏好广泛且具体，显示出其对不同达人类型和广告形式的多样...,# VNUW2BNVIPTRE广告主的合作偏好\n\nVNUW2BNVIPTRE广告主展现了...


In [19]:
# Import communities
community_statement = """
MERGE (c:__Community__ {community:value.community})
SET c += value {.level, .title, .rank, .rank_explanation, .full_content, .summary}
WITH c, value
UNWIND range(0, size(value.findings)-1) AS finding_idx
WITH c, value, finding_idx, value.findings[finding_idx] as finding
MERGE (c)-[:HAS_FINDING]->(f:Finding {id:finding_idx})
SET f += finding
"""
batched_import(community_statement, community_report_df)

{'_contains_updates': True, 'labels_added': 220, 'relationships_created': 177, 'nodes_created': 220, 'properties_set': 832}
43 rows in 0.09284520149230957 s.


43

In [20]:
cov_df = (pd.read_parquet(f"{GRAPHRAG_FOLDER}/create_final_covariates.parquet"),)
#                         columns=["id","text_unit_id"])
cov_df.head(2)
# Subject id do not match entity ids

FileNotFoundError: [Errno 2] No such file or directory: 'output/20250109-110754/artifacts/create_final_covariates.parquet'

In [21]:
# Import covariates
cov_statement = """
MERGE (c:__Covariate__ {id:value.id})
SET c += apoc.map.clean(value, ["text_unit_id", "document_ids", "n_tokens"], [NULL, ""])
WITH c, value
MATCH (ch:__Chunk__ {id: value.text_unit_id})
MERGE (ch)-[:HAS_COVARIATE]->(c)
"""
batched_import(cov_statement, cov_df)

NameError: name 'cov_df' is not defined