#### 导入依赖库

In [1]:
import pandas as pd
from neo4j import GraphDatabase
import time

#### 联结本地Neo4j实例

In [2]:
NEO4J_URI = "neo4j://localhost"  # or neo4j+s://xxxx.databases.neo4j.io
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "12345678" #你自己的密码
NEO4J_DATABASE = "neo4j"

# Create a Neo4j driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

#### 配置本次索引目录

In [3]:
GRAPHRAG_FOLDER = "./ragtest_test2/output/hongloumeng2/artifacts"

#### 创建Neo4j索引

在Neo4j中，索引仅用于查找图查询的起始点，例如快速查找两个节点以进行连接。约束用于避免重复，主要在实体类型的id上创建。我们使用带有两个下划线的类型作为标记，以区分它们与实际的实体类型。

In [16]:
statements = """
create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;
create constraint document_id if not exists for (d:__Document__) require d.id is unique;
create constraint entity_id if not exists for (c:__Community__) require c.community is unique;
create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;
create constraint entity_title if not exists for (e:__Entity__) require e.name is unique;
create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique;
create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;
""".split(";")

for statement in statements:
    if len((statement or "").strip()) > 0:
        print(statement)
        driver.execute_query(statement)


create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique

create constraint document_id if not exists for (d:__Document__) require d.id is unique

create constraint entity_id if not exists for (c:__Community__) require c.community is unique

create constraint entity_id if not exists for (e:__Entity__) require e.id is unique

create constraint entity_title if not exists for (e:__Entity__) require e.name is unique

create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique

create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique


#### 创建批量导入函数

In [None]:
def batched_import(statement, df, batch_size=1000):
    """
    Import a dataframe into Neo4j using a batched approach.
    Parameters: statement is the Cypher query to execute, df is the dataframe to import, and batch_size is the number of rows to import in each batch.
    """
    total = len(df)
    start_s = time.time()
    for start in range(0,total, batch_size):
        batch = df.iloc[start: min(start+batch_size,total)]
        result = driver.execute_query("UNWIND $rows AS value " + statement, 
                                      rows=batch.to_dict('records'),
                                      database_=NEO4J_DATABASE)
        print(f"from row {start} to {min(start+batch_size,total)}")
        print(result.summary.counters)
    print(f'{total} rows in { time.time() - start_s} s.')    
    return total

#### 导入文档

In [4]:
doc_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_documents.parquet', columns=["id", "title"])
doc_df.head(2)

# import documents
statement = """
MERGE (d:__Document__ {id:value.id})
SET d += value {.title}
"""

batched_import(statement, doc_df)

NameError: name 'batched_import' is not defined

In [5]:
doc_df.head()

Unnamed: 0,id,title
0,5a96165d79b335622a844c8fa4009d56,红楼梦第一回.txt
1,5a1839ae8ebdea298aa2851fa71c9412,红楼梦第一百一回.txt
2,0e165dac83502a2295a730e4f8c7ecb4,红楼梦第一百七回.txt
3,8af29a9d09170de48da537824c279504,红楼梦第一百三回.txt
4,f8d5a0cc2e0209d25be532736fa7dd40,红楼梦第一百九回.txt


#### 导入text units

In [6]:
text_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_text_units.parquet',
                          columns=["id","text","n_tokens","document_ids"])
text_df.head(2)

statement = """
MERGE (c:__Chunk__ {id:value.id})
SET c += value {.text, .n_tokens}
WITH c, value
UNWIND value.document_ids AS document
MATCH (d:__Document__ {id:document})
MERGE (c)-[:PART_OF]->(d)
"""

batched_import(statement, text_df)

NameError: name 'batched_import' is not defined

In [7]:
text_df.head()

Unnamed: 0,id,text,n_tokens,document_ids
0,08aec1aaf412eb074e8ef8f491942fc1,红楼梦第一百十一回\n我们那一个又病着，也难照应。”想了一回，回贾政道：“老爷且歇歇儿，等进...,1600,[00b74711e385e2b643f7393a95dbecbe]
1,6304673ad9ac8010d7e2cdd4f1a95fc9,红楼梦第一百十一回\n不知情之一字，喜怒哀乐未发之时，便是个‘性’；喜怒哀乐已发，便是‘情’...,1521,[00b74711e385e2b643f7393a95dbecbe]
2,7a320fa425b8705c6aa2edd8edb2c299,红楼梦第一百十一回\n内中紫鹃也想起自己终身，一无着落，恨不跟了林姑娘去，又全了主仆的恩义，...,1521,[00b74711e385e2b643f7393a95dbecbe]
3,d3f5920ffa53863206c2ba6b13368050,红楼梦第一百十一回\n只有平儿同着惜春各处走了一走，吩咐了上夜的人，也便各自归房。\n\n\...,1490,[00b74711e385e2b643f7393a95dbecbe]
4,8d431472c1f94c67e0f29d1236434b66,红楼梦第一百十一回\n看个风头，等个门路，若到了手，你我在这里也无益，不如大家下海去受用，不...,1586,[00b74711e385e2b643f7393a95dbecbe]


#### 加载实体

In [25]:
entity_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_entities.parquet',
                            columns=["name", "type", "description", "human_readable_id", "id", "description_embedding",
                                     "text_unit_ids"])
entity_df.head(2)

entity_statement = """
MERGE (e:__Entity__ {id:value.id})
SET e += value {.human_readable_id, .description, name:substring(value.name, 1, size(value.name) - 2)}
WITH e, value
CALL db.create.setNodeVectorProperty(e, "description_embedding", value.description_embedding)
CALL apoc.create.addLabels(e, case when coalesce(value.type,"") = "" then [] else [apoc.text.upperCamelCase(replace(value.type,'"',''))] end) yield node
UNWIND value.text_unit_ids AS text_unit
MATCH (c:__Chunk__ {id:text_unit})
MERGE (c)-[:HAS_ENTITY]->(e)
"""

batched_import(entity_statement, entity_df)

NameError: name 'batched_import' is not defined

In [38]:
corr_dict = {}
for i in range(len(entity_df)):
    for j in range(i, len(entity_df)):
        corr_dict[(i, j)] = sum(entity_df["description_embedding"].iloc[i] * (entity_df["description_embedding"].iloc[j]))

In [42]:
corr_dict = dict(sorted(corr_dict.items(), key=lambda item: item[1]))

In [50]:
cnt = 0
for key in list(corr_dict.keys())[::-1]:
    if corr_dict[key] < 0.5:
        break
    i, j = key
    if i == j:
        continue
    if (entity_df["name"].iloc[i] in entity_df["name"].iloc[j]) or (entity_df["name"].iloc[j] in entity_df["name"].iloc[i]):
        cnt += 1
        print(entity_df["name"].iloc[i], entity_df["name"].iloc[j], corr_dict[key])

"笔锭如意" "紫金"笔锭如意"锞" 0.8578599712292582
"福寿绵长"宫绸" "宫绸" 0.7743215019751366
"富贵长春"宫缎" "宫缎" 0.762126574367138
"宝姑娘" "宝姑娘"<|("ENTITY" 0.5697638561788726
"园子" "园子"<|("ENTITY" 0.5322362198132898
"玉" "玉"字辈的嫡派" 0.5031145503821799


In [18]:
entity_df.head()

Unnamed: 0,name,type,description,human_readable_id,id,description_embedding,text_unit_ids
0,"""凤姐""","""人物""",凤姐是贾琏的妻子，贾母的孙媳妇，王夫人的侄媳妇。她是一个聪明、机智、活泼的女性，具有强烈的控...,0,b45241d70f0e43fca764df95b2b81f77,"[-0.052942004, 0.03363313, -0.059086226, -0.02...","[003f42c65686c7327aeb6c52eda40a45, 01c3eebe542..."
1,"""平儿""","""人物""",平儿是贾府中的一位丫鬟，具有较强的主动性和管理能力。她是凤姐身边的得力助手，负责处理一些事务...,1,4119fd06010c494caa07f439b333f4c5,"[-0.055716213, 0.021328818, -0.043988734, 0.01...","[01c3eebe54224a9eded4e6265377c486, 0296e3d4a37..."
2,"""秋桐""","""人物""",秋桐是贾琏的妾，具有较低的社会地位，曾经受到贾琏的宠爱，但后来被贾琏嫌弃。她是凤姐身边的丫鬟...,2,d3835bf3dda84ead99deadbeac5d0d7d,"[-0.038844585, 0.056890063, -0.07289991, -0.02...","[067ecf242ec820ee040cda22f97d9a4b, 0f47944c4d2..."
3,"""丰儿""","""人物""",丰儿是贾府的一名丫鬟，主要服侍凤姐。凤姐让她收下贾芸送来的香料。丰儿是凤姐的贴身丫鬟，陪同凤...,3,077d2820ae1845bcbb1803379a3d1eae,"[-0.017726636, 0.029423915, -0.051893927, -0.0...","[067ecf242ec820ee040cda22f97d9a4b, 0aa7fa13b3e..."
4,"""邢夫人""","""人物""",邢夫人是贾赦的妻子，贾琏的母亲，贾蓉的祖母，邢岫烟的母亲。她是贾府的二夫人，地位较高，具有较...,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,"[-0.020466173, 0.03014151, -0.04014905, -0.008...","[01c3eebe54224a9eded4e6265377c486, 0baccfa6f9a..."


#### 导入关系

In [9]:
rel_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_relationships.parquet',
                         columns=["source", "target", "id", "rank", "weight", "human_readable_id", "description",
                                  "text_unit_ids"])
rel_df.head(2)

rel_statement = """
    MATCH (source:__Entity__ {name:replace(value.source,'"','')})
    MATCH (target:__Entity__ {name:replace(value.target,'"','')})
    // not necessary to merge on id as there is only one relationship per pair
    MERGE (source)-[rel:RELATED {id: value.id}]->(target)
    SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}
    RETURN count(*) as createdRels
"""

batched_import(rel_statement, rel_df)

NameError: name 'batched_import' is not defined

In [10]:
rel_df.head()

Unnamed: 0,source,target,id,rank,weight,human_readable_id,description,text_unit_ids
0,"""凤姐""","""贾府""",59c5ed74bc3d45c398f914d3bea1a59a,886,26.0,0,凤姐是贾府的成员，具有较高的智慧和影响力。她是贾府的二奶奶，负责管理家务，安排事务，并且是贾...,"[0322527832917c0049f736c5da92263f, 04bfac627e3..."
1,"""凤姐""","""贾芸""",1ca89f8d3253448b8a593c993365d382,406,11.0,1,凤姐和贾芸是贾府的下人，负责看家和处理事务。贾芸是凤姐的外甥，表现出对凤姐的尊重。凤姐对贾芸...,"[228b3849dc18e829d503ce467cf13cb8, 3a85a36a153..."
2,"""凤姐""","""上夜的人""",8cdf06aa0ef94c8b86fc1f4278eddad6,334,1.0,2,"""凤姐要把上夜的人交给营里去审问。""",[3a85a36a153338f4d43611302fe9135a]
3,"""凤姐""","""王夫人""",bed30b4da9ba48ab990917282480cc39,487,39.0,3,凤姐是王夫人的侄女和儿媳妇，也是宁国府的女主人。凤姐和王夫人是贾府的管家和夫人，帮助料理家务...,"[102297b4f2a48ff35219b9c7f13ae3fe, 1d597c18b6e..."
4,"""凤姐""","""贾琏""",adee8503cddb49ada471ffc9eba9f24b,599,56.0,4,凤姐和贾琏是贾府的女主人和少爷，负责管理贾府的家务。他们的关系是夫妻关系，具有复杂的权力结构...,"[067ecf242ec820ee040cda22f97d9a4b, 08633e4451a..."


#### 导入社区

In [11]:
community_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_communities.parquet',
                               columns=["id", "level", "title", "text_unit_ids", "relationship_ids"])

community_df.head(2)

statement = """
MERGE (c:__Community__ {community:value.id})
SET c += value {.level, .title}
/*
UNWIND value.text_unit_ids as text_unit_id
MATCH (t:__Chunk__ {id:text_unit_id})
MERGE (c)-[:HAS_CHUNK]->(t)
WITH distinct c, value
*/
WITH *
UNWIND value.relationship_ids as rel_id
MATCH (start:__Entity__)-[:RELATED {id:rel_id}]->(end:__Entity__)
MERGE (start)-[:IN_COMMUNITY]->(c)
MERGE (end)-[:IN_COMMUNITY]->(c)
RETURn count(distinct c) as createdCommunities
"""

batched_import(statement, community_df)

NameError: name 'batched_import' is not defined

In [12]:
community_df.head()

Unnamed: 0,id,level,title,text_unit_ids,relationship_ids
0,15,0,Community 15,"[003f42c65686c7327aeb6c52eda40a45,01c3eebe5422...","[59c5ed74bc3d45c398f914d3bea1a59a, 1ca89f8d325..."
1,4,0,Community 4,"[01c3eebe54224a9eded4e6265377c486,0baccfa6f9ad...","[4c598e67af0b413585172a3d0fad5bef, 0cd31d6520d..."
2,10,0,Community 10,"[00bce2b56e0aa1a7b57092b78e4fa382,0101f4d63123...","[cd0bfe1bd48c4a74bf771ff96ec2de63, b1f30fe968a..."
3,8,0,Community 8,"[1c2837793d3c16131b7c730c2a8b1826,23e6b2bafc7f...","[aa7684c4c1dd45ae99c383aa0eca2dfb, 6553d9a82d5..."
4,0,0,Community 0,"[42aa503905870943128defa2f3683a44,db9cbe0fc443...","[f1121000efe54bccb129a6eb8f553507, f543f627daf..."


#### 导入社区报告

In [13]:
community_report_df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_community_reports.parquet',
                                      columns=["id", "community", "level", "title", "summary", "findings", "rank",
                                               "rank_explanation", "full_content"])
community_report_df.head(2)
# import communities
community_statement = """MATCH (c:__Community__ {community: value.community})
SET c += value {.level, .title, .rank, .rank_explanation, .full_content, .summary}
WITH c, value
UNWIND range(0, size(value.findings)-1) AS finding_idx
WITH c, value, finding_idx, value.findings[finding_idx] as finding
MERGE (c)-[:HAS_FINDING]->(f:Finding {id: finding_idx})
SET f += finding"""
batched_import(community_statement, community_report_df)

NameError: name 'batched_import' is not defined

In [14]:
community_report_df.head()

Unnamed: 0,id,community,level,title,summary,findings,rank,rank_explanation,full_content
0,4662a319-5363-49fb-953d-1737e494c611,535,4,紫鹃与贾府的关系网络,该社区围绕着紫鹃，紫鹃是黛玉的贴身丫鬟和陪侍者。紫鹃与贾府的多个成员有着复杂的关系，包括宝玉...,[{'explanation': '紫鹃是黛玉的贴身丫鬟和陪侍者，负责照顾黛玉的日常生活，关...,8.0,由于紫鹃在贾府中扮演着重要的角色，且其关系网络复杂，影响严重性评分为较高。,# 紫鹃与贾府的关系网络\n\n该社区围绕着紫鹃，紫鹃是黛玉的贴身丫鬟和陪侍者。紫鹃与贾府的...
1,60f56bbe-8bea-4996-b71f-bbf6971c353a,536,4,雪雁与林黛玉的关系网络,该社区围绕着雪雁，雪雁是林黛玉的贴身丫鬟。雪雁与林黛玉、宝玉、薛姨妈、平儿、紫鹃等实体有着密...,[{'explanation': '雪雁是林黛玉的贴身丫鬟，了解林黛玉的感受，并对她的病情和...,6.0,由于雪雁在林黛玉身边的重要性及其参与的事件，影响严重性评分为中等。,# 雪雁与林黛玉的关系网络\n\n该社区围绕着雪雁，雪雁是林黛玉的贴身丫鬟。雪雁与林黛玉、宝...
2,7942adb3-50c6-46ec-b38b-353517bef17b,472,3,诗歌社区,该社区围绕着诗歌这一文学形式，主要实体包括宝玉、黛玉、湘云、妙玉和香菱等人。他们都与诗歌创作...,[{'explanation': '宝玉是诗歌社区中的一个重要人物，他创作诗歌，经常与黛玉、...,6.0,由于诗歌社区中的人物都对诗歌有着深刻的理解和创作，且他们之间的关系密切，因此影响严重性评分为中等。,# 诗歌社区\n\n该社区围绕着诗歌这一文学形式，主要实体包括宝玉、黛玉、湘云、妙玉和香菱等...
3,ae7db4ef-e0ec-49e7-8719-2123178e9e88,474,3,林姑娘与贾府的关系,该社区围绕着林姑娘，林姑娘是贾府的一位成员。林姑娘与贾宝玉、尤太太、黛玉、薛蟠等实体有关系，...,[{'explanation': '林姑娘与贾宝玉是表姐妹，住在大观园中，他们曾经一起长大。...,6.0,由于林姑娘的死亡对贾府产生了重大影响，因此影响严重性评分为中等。,# 林姑娘与贾府的关系\n\n该社区围绕着林姑娘，林姑娘是贾府的一位成员。林姑娘与贾宝玉、尤...
4,792da811-6737-4e1f-8c8a-67da27bd73a3,475,3,宝姑娘与贾府的关系网络,该社区围绕着宝姑娘，一个与贾府有着密切关系的女性。宝姑娘与贾母、宝玉、黛玉、太岁奶奶等贾府成...,[{'explanation': '宝姑娘是贾母的外孙女，贾母与太太、二奶奶商量宝玉娶宝姑娘...,6.0,由于宝姑娘在贾府中的重要性及其与多个成员的关系，影响严重性评分为中等。,# 宝姑娘与贾府的关系网络\n\n该社区围绕着宝姑娘，一个与贾府有着密切关系的女性。宝姑娘与...


In [4]:
import json

In [5]:
with open('honglou.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [6]:
data

{'categories': {'person': '人物', 'event': '事件', 'location': '地点'},
 'data': {'nodes': [{'label': '共读西厢',
    'value': 2,
    'id': 3779,
    'categories': ['event'],
    'info': '宝玉到沁芳桥边桃花底下看《西厢记》，正准备将落花送进池中，黛玉说她早已准备了一个花冢，正来葬花。黛玉发现《西厢记》，宝玉借书中词句，向黛玉表白。黛玉觉得冒犯了自己尊严，引起口角，宝玉赔礼讨饶，黛玉也借《西厢记》词句，嘲笑了宝玉。于是两人收拾落花，葬到花冢里去。'},
   {'label': '林如海捐馆扬州城',
    'value': 4,
    'id': 3780,
    'categories': ['event'],
    'info': '林如海考中探花后，迁为兰台寺大夫，钦点为扬州巡盐御史。后身染重病于九月初三日巳时而亡。'},
   {'label': '海棠诗社',
    'value': 8,
    'id': 3781,
    'categories': ['event'],
    'info': '初秋季节，贾探春提议邀集大观园中有文采的人所组的诗社。诗社成立目的 旨在“宴集诗人於风庭月榭；醉飞吟盏於帘杏溪桃，作诗吟辞以显大观园众姊妹之文采不让桃李须眉。”诗社成员 有林黛玉、薛宝钗、史湘云、贾迎春、贾探春、贾惜春、贾宝玉及李纨。稻香老农(李纨)为社长，菱洲(迎春)、藕榭（惜春）为副社长，一人出题，一人监场。'},
   {'label': '紫鹃试玉',
    'value': 2,
    'id': 3782,
    'categories': ['event'],
    'info': '紫鹃想试试宝玉对黛玉的情意，就单独骗他说自己要和黛玉回苏州，宝玉本来不信，但在紫鹃一番骗小孩的语言下，痴情迷糊的宝玉还是相信了。就这一骗弄得宝玉是发了疯一样，指着墙上画中的小船说那是来接林妹妹的船……后来弄得大家都慌了，最后在紫鹃诉说下才好起来。真是个可爱迷糊又痴情的人。'},
   {'label': '魇魔姊弟',
    'value': 6,
    'id': 3783

In [8]:
entity_statement = """
MERGE (e:__Entity__ {id: $value.id})
SET e.info = $value.info,
    e.name = $value.label
WITH e, $value AS value
CALL apoc.create.addLabels(
    e, 
    CASE 
        WHEN coalesce(value.categories[0], "") = "" 
        THEN [] 
        ELSE [apoc.text.upperCamelCase(replace(value.categories[0], '"', ''))] 
    END
) YIELD node
RETURN e
"""

In [9]:
for nodes in data["data"]["nodes"]: 
    driver.execute_query(entity_statement, 
                     value=nodes,
                     database_=NEO4J_DATABASE)

In [10]:
rel_statement = """
    WITH $value AS value
    MATCH (source:__Entity__ {id: value.from})
    MATCH (target:__Entity__ {id: value.to})
    MERGE (source)-[rel:RELATED {id: value.id}]->(target)
    SET rel += value
    RETURN count(*) AS createdRels
"""

In [11]:
for rel in data["data"]["edges"]: 
    driver.execute_query(rel_statement, 
                     value=rel,
                     database_=NEO4J_DATABASE)

In [12]:
df = pd.read_csv("./triples.csv")

In [13]:
df

Unnamed: 0,head,tail,relation,label
0,贾代善,贾源,son,子
1,娄氏,贾源,daughter_in_law_of_grandson,重孙媳妇
2,贾母,贾代善,wife,妻
3,老姨奶奶,贾代善,concubine,妾
4,贾敏,贾代善,daughter,女
...,...,...,...,...
375,尤氏,贾珍,wife,妻
376,尤二姐,尤老娘,daughter,女
377,尤三姐,尤老娘,daughter,女
378,贾蓉,贾珍,son,子


In [14]:
entity_statement = """
MATCH (e:__Entity__)
WITH COALESCE(MAX(e.id), 0) + 1 AS new_id
MERGE (newEntity:__Entity__:Person {name: $value.name})
SET newEntity.name = $value.name,
    newEntity.id = new_id
RETURN newEntity
"""

In [15]:
for i in range(len(df)):
    head = df["head"].iloc[i]
    tail = df["tail"].iloc[i]
    node_head = {"name": head}
    driver.execute_query(entity_statement, 
                     value=node_head,
                     database_=NEO4J_DATABASE)
    node_tail = {"name": tail}
    driver.execute_query(entity_statement, 
                     value=node_tail,
                     database_=NEO4J_DATABASE)

In [16]:
rel_statement = """
    WITH $value AS value
    MATCH (source:__Entity__ {id: value.from})
    MATCH (target:__Entity__ {id: value.to})
    WITH value, source, target
    // Find the maximum id of relationships between source and target
    MATCH (source)-[rel:RELATED]->(target)
    WITH value, source, target, COALESCE(MAX(toInteger(rel.id)), 0) + 1 AS new_rel_id
    MERGE (source)-[newRel:RELATED {id: COALESCE(value.id, new_rel_id)}]->(target)
    SET newRel += value
    RETURN count(*) AS createdRels
"""

In [17]:
for i in range(len(df)):
    head = df["head"].iloc[i]
    tail = df["tail"].iloc[i]
    label = df["label"].iloc[i]
    rel = {"from": head, "to": tail, "label": label}
    driver.execute_query(rel_statement, 
                     value=rel,
                     database_=NEO4J_DATABASE)