## 模型

In [1]:
#!/usr/bin/env python3
# coding=utf-8

"""
Baidu Qianfan: https://qianfan.cloud.baidu.com/
"""

import os
from typing import Union

from langchain_community.chat_models import QianfanChatEndpoint
from langchain_community.embeddings import QianfanEmbeddingsEndpoint
from langchain_community.llms import QianfanLLMEndpoint
from langchain_core.embeddings.embeddings import Embeddings
from langchain_core.language_models import BaseLanguageModel, BaseChatModel
from langchain_core.messages import BaseMessage

# https://cloud.baidu.com/doc/WENXINWORKSHOP/s/Nlks5zkzu
# INSTRUCT_MODEL = 'ERNIE-Speed-8K'
INSTRUCT_MODEL = 'ERNIE-4.0-Turbo-8K'
# INSTRUCT_MODEL = 'ERNIE-3.5-8K'
CHAT_MODEL = INSTRUCT_MODEL
EMBEDDINGS_MODEL = 'bge-large-zh'

common_options = {
    'qianfan_ak': 'evU2FdQZwaHqeaSmGDXfnMzF',
    'qianfan_sk': 'R1auZ39Z3z06ClN8xQruclNLWcMe9e8q'
}

_llm, _chat_llm, _embeddings = None, None, None


def create_llm(**kwargs) -> BaseLanguageModel[Union[str, BaseMessage]]:
    """create `QianfanLLM`, can be used to replace `OpenAI`"""
    global _llm

    if len(kwargs) == 0:
        if _llm is None:
            _llm = QianfanLLMEndpoint(model=INSTRUCT_MODEL, **common_options)
        return _llm

    options = {'model': INSTRUCT_MODEL, **common_options, **kwargs}
    return QianfanLLMEndpoint(**options)


def create_chat_llm(**kwargs) -> BaseChatModel:
    """create `QianfanChat`, can be used to replace `ChatOpenAI`"""
    global _chat_llm

    if len(kwargs) == 0:
        if _chat_llm is None:
            _chat_llm = QianfanChatEndpoint(model=CHAT_MODEL, **common_options)
        return _chat_llm

    options = {'model': CHAT_MODEL, **common_options, **kwargs}
    return QianfanChatEndpoint(**options)


def create_embeddings(**kwargs) -> Embeddings:
    """create `QianfanEmbeddings`, can be used to replace `OpenAIEmbeddings`"""
    global _embeddings

    if len(kwargs) == 0:
        if _embeddings is None:
            _embeddings = QianfanEmbeddingsEndpoint(model=EMBEDDINGS_MODEL, **common_options)
        return _embeddings

    options = {'model': EMBEDDINGS_MODEL, **common_options, **kwargs}
    return QianfanEmbeddingsEndpoint(**options)


creators = (create_llm, create_chat_llm, create_embeddings)




In [3]:
from langchain_core.runnables import  RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os
import json
from langchain_community.graphs import Neo4jGraph
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
from langchain_community.vectorstores import Neo4jVector
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars

## GraphRAG

In [4]:
graph = Neo4jGraph(url="bolt://localhost:7687", username="neo4j", password="12345678")

## 加载文档

In [91]:
loader = TextLoader(file_path="20240920_7_王志纲年度报告·2021-页面-1_1.txt")
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=90, chunk_overlap=10)
documents = text_splitter.split_documents(documents=docs)
print(documents[0:1])
print(len(documents))
print(documents)

[Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='王志纲年度报告2021\n发刊词\n实战派的战略思维 第一讲\n硬科技发力：为什么新机会反而在中西部？\n.......9 第二讲\n协作共赢：民营企业的机会究竟在哪？\n.16 第三讲')]
109
[Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='王志纲年度报告2021\n发刊词\n实战派的战略思维 第一讲\n硬科技发力：为什么新机会反而在中西部？\n.......9 第二讲\n协作共赢：民营企业的机会究竟在哪？\n.16 第三讲'), Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='.16 第三讲\n卖生活：房产企业怎么才能活下去？\n23 第四讲\n公平优先：为什么“双减”不是偶然的政策变量？\n30 第五讲\n老而不衰：为什么老龄化是个商业问题？\n36 第六讲'), Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='36 第六讲\n家风胜万贯：为什么要关注民营企业二代传承问题？\n...42 第七讲\n好人赚钱：为什么共同富裕不只是要分蛋糕？\n.49 第八讲\n大国博弈：为什么碳中和不只是环保问题？'), Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='55 第九讲\n算经济账：为什么要从成本博弈看中美关系？\n.61 第十讲\n个人定位：为什么内卷不是时代的错？\n68 发刊词|实战派的战略思维'), Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='你好，我是王志纲，欢迎来到我的《年度报告2021》。我先简单做一个自我介

In [105]:
from langchain.document_loaders import CSVLoader

loader = CSVLoader(file_path="get_relation_data.csv")
documents = loader.load()


## 建立向量数据库

faiss 检索
pip install faiss-cpu

感觉 faiss 向量化要快一些

In [112]:
from langchain_community.vectorstores import FAISS
 

# 加载本地 embedding 模型
embedding =create_embeddings()
# 创建向量数据库
db = FAISS.from_documents(documents, embedding)
# 保存
db.save_local("./faiss_index")
'''
如果已经创建好了，可以直接读取
db = FAISS.load_local("./faiss_index", embeddings)
'''
 
# 直接传入文本
query = "人到老年"
docs = db.similarity_search(query, k=3)
print(docs[0].page_content)
 


[Document(metadata={'source': 'get_relation_data.csv', 'row': 3}, page_content='output: 养老观念 - BELONGS_TO -> 中国人'), Document(metadata={'source': 'get_relation_data.csv', 'row': 4}, page_content='output: 养老方式 - BELONGS_TO -> 中国人')]


In [98]:
# 传入向量去搜索
embedding_vector = embedding.embed_query(query)
docs = db.similarity_search_by_vector(embedding_vector, k=3)
print(docs[0].page_content)

[Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='有老龄化、二代传承、内卷等社会性问题。'), Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='到关键时期的关键变量，去做正确的事和正确地做事，使自己的人生更加精彩。'), Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='的去做自己的战略规划，把握自己的人生航向。')]


带分数的top k

In [99]:
docs = db.similarity_search_with_score(query, k=3)  # 带分数的
print(docs[0])

(Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='有老龄化、二代传承、内卷等社会性问题。'), 0.49693966)


In [74]:
retriever = db.as_retriever(search_kwargs={'k': 30})  # 构建检索器
docs = retriever.get_relevant_documents(query)
print(docs)

[Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='和、养老产业的机遇等。'), Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='作为一个50后，我有可能是得到上年龄最大'), Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='年纪大，不过，多少与我的人生阅历有关。'), Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='-得到App出品-'), Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='，把握自己的人生航向。'), Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='可能是得到上年龄最大的老师。为什么要由我'), Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='么老龄化是个商业问题？'), Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='有老龄化、二代传承、内卷等社会性问题。'), Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='，和一个半辈子都看不透的人，人生是天差地'), Document(metadata={'source': '20240920_7_王志纲年度报告·2021-页面-1_1.txt'}, page_content='地做事

相似数阈值
相似度大于 0.5 的拿出来

In [78]:
retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.5})  # 构建检索器
docs = retriever.get_relevant_documents(query)
print(docs[0].page_content)

和、养老产业的机遇等。


## 模型合成知识图谱元数据

In [30]:
llm = create_llm()
llm_transformer = LLMGraphTransformer(llm=llm)

graph_documents = llm_transformer.convert_to_graph_documents(documents)

print(graph_documents[0])


nodes=[] relationships=[] source=Document(metadata={'source': '1.txt'}, page_content='我和万维钢未曾谋面，但在网上神交已久，互为读者和粉丝。在模糊的记忆中，最早似乎是通过刘夙给的链接发现了同人于野的博客，读过一篇之后便停不下来了。恰逢需要出差，灵机一动，利用一台贪便宜买来又没啥用的MP4的电子书功能，把他博客里的几十篇文章都下载到MP4里面，在动车上大饱眼福。其实旅程的无聊不算什么痛苦，毕竟有窗外的景色可看，知道有好文章却要在数日后方能有时间')


转换数据格式

In [55]:
len(graph_documents),graph_documents

(6,
 [GraphDocument(nodes=[], relationships=[], source=Document(metadata={'source': '1.txt', 'id': 'ac1d519a514f00ab646e6a82f59ada86'}, page_content='我和万维钢未曾谋面，但在网上神交已久，互为读者和粉丝。在模糊的记忆中，最早似乎是通过刘夙给的链接发现了同人于野的博客，读过一篇之后便停不下来了。恰逢需要出差，灵机一动，利用一台贪便宜买来又没啥用的MP4的电子书功能，把他博客里的几十篇文章都下载到MP4里面，在动车上大饱眼福。其实旅程的无聊不算什么痛苦，毕竟有窗外的景色可看，知道有好文章却要在数日后方能有时间')),
  GraphDocument(nodes=[], relationships=[], source=Document(metadata={'source': '1.txt', 'id': '473e7dc532d1209251bca6a6218258d7'}, page_content='可看，知道有好文章却要在数日后方能有时间看，才足以令人牵肠挂肚，很不舒服。')),
  GraphDocument(nodes=[Node(id='长辈', type='Person', properties={}), Node(id='人到老年，看书看皮儿，看报看题儿', type='Statement', properties={}), Node(id='年轻人', type='Person', properties={}), Node(id='看东西多了就会发现，真正有价值，值得花功夫和精力认真去看的作品，少之又少', type='Opinion', properties={}), Node(id='这两个原因固然都有，但不是最主要的。最主要的是，看东西多了就会发现，真正有价值，值得花功夫和精力认真去看的作品，少之又少', type='Opinion', properties={}), Node(id='大概是因为老花眼看不清小字之故，或者是来日无多，学习的性价比下降，学习欲望也就相应减退了', type='Thought', properties={}), Node(id='我', type='Person', p

In [5]:
# from typing import List

# # 假设 Node 和 Relationship 类已经被定义，并且有如下属性
# # Node: id, type, properties
# # Relationship: source, target, type, properties
# # GraphDocument: nodes, relationships, source

# def clean_node(node):
#     # 清理空标签或包含非法字符的标签
#     if not node.type or any(ord(c) == 0 for c in node.type):
#         # 如果标签为空或包含非法字符，可以设置一个默认标签
#         node.type = "DefaultLabel"  # 设置一个默认标签
#         # 或者可以选择抛出异常或记录日志
#         raise ValueError(f"Invalid label found: {node.type}")
#     return node

# def clean_graph_document(graph_doc):
#     # 遍历文档中的所有节点并清理标签
#     graph_doc.nodes = [clean_node(node) for node in graph_doc.nodes]
#     return graph_doc

# # 假设 graph_documents 是你的文档列表
# cleaned_graph_documents: List[graph_documents] = [clean_graph_document(doc) for doc in graph_documents]

# 现在使用清理过的文档来添加图文档
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)

NameError: name 'graph_documents' is not defined

## 制作知识图谱

In [6]:
os.environ["NEO4J_URI"] = "bolt://localhost:7687"
os.environ["NEO4J_USERNAME"] = "neo4j"
os.environ["NEO4J_PASSWORD"] = "12345678"

def showGraph():
    driver = GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"],
                os.environ["NEO4J_PASSWORD"]))
    session = driver.session()
    widget = GraphWidget(graph = session.run("MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t").graph())
    widget.node_label_mapping = 'id'
    return widget

showGraph()

GraphWidget(layout=Layout(height='500px', width='100%'))

## 导出为CSV

#### 按节点保存

In [43]:
import os
from neo4j import GraphDatabase
import pandas as pd

os.environ["NEO4J_URI"] = "bolt://localhost:7687"
os.environ["NEO4J_USERNAME"] = "neo4j"
os.environ["NEO4J_PASSWORD"] = "12345678"
def export_to_csv():
    driver = GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )
    
    with driver.session() as session:
        result = session.run("MATCH (s)-[r:!MENTIONS]->(t) RETURN s, r, t")
        
        # 提取节点和关系的信息到列表
        nodes = []
        relationships = []
        
        for record in result:
            # source_node = {
            #     "id": record['s'].id,
            #     # "labels": list(record['s'].labels),
            #     "properties": dict(record['s'])
            # }
            # target_node = {
            #     "id": record['t'].id,
            #     # "labels": list(record['t'].labels),
            #     # "properties": dict(record['t'])
            # }
            # relationship = {
            #     # "start_node_id": record['s'].id,
            #     # "end_node_id": record['t'].id,
            #     "type": record['r'].type,
            #     # "properties": dict(record['r'])
            # }
            source_node = {
                "id": record['s'].id,
                "source_properties": dict(record['s']),
                "type": record['r'].type,
                "target_properties": dict(record['t']),
                "relationship_properties": dict(record['r']),
            }


            # 确保节点没有重复
            # if not any(node['id'] == source_node['id'] for node in nodes):
            #     nodes.append(source_node)
            nodes.append(source_node)
            # if not any(node['id'] == target_node['id'] for node in nodes):
            #     nodes.append(target_node)
            
            # 添加关系
            # relationships.append(relationship)
        
        # 创建DataFrame
        nodes_df = pd.DataFrame(nodes)
        # relationships_df = pd.DataFrame(relationships)
        
        # 导出为CSV
        nodes_df.to_csv('nodes.csv', index=False)
        # relationships_df.to_csv('relationships.csv', index=False)

# 调用函数导出数据
export_to_csv()

  "id": record['s'].id,


#### 按查询保存

In [54]:
import os
from neo4j import GraphDatabase
import pandas as pd

# 假设 remove_lucene_chars 和 prompt_query 是已经定义的函数
def remove_lucene_chars(input: str) -> str:
    # 这里可以实现去除 Lucene 特殊字符的逻辑
    return input  # 示例中直接返回输入

def prompt_query(question: str) -> list:
    # 这里可以实现从问题中提取特征实体的逻辑
    return [question]  # 示例中直接返回问题本身作为实体

def generate_full_text_query(input: str) -> str:
    words = [el for el in remove_lucene_chars(input).split() if el]
    if not words:
        return ""
    full_text_query = " AND ".join([f"{word}~2" for word in words])
    print(f"Generated Query: {full_text_query}")
    return full_text_query.strip()

def create_fulltext_index():
    driver = GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )
    
    with driver.session() as session:
        # 创建全文索引
        session.run(
            """
            CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]
            """
        )

def export_to_csv(question: str):
    driver = GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )
    
    with driver.session() as session:
        # 提取特征实体
        entities = prompt_query(question)
        
        data = []
        for entity in entities:
            query = generate_full_text_query(entity)
            if not query:
                continue
            
            result = session.run(
                """
                CALL db.index.fulltext.queryNodes('entity', $query, {limit: 2})
                YIELD node, score
                CALL {
                  WITH node
                  MATCH (node)-[r:!MENTIONS]->(neighbor)
                  RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
                  UNION ALL
                  WITH node
                  MATCH (node)<-[r:!MENTIONS]-(neighbor)
                  RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
                }
                RETURN output LIMIT 50
                """,
                {"query": query}
            )
            
            # 提取查询结果
            data.extend([record['output'] for record in result])
        
        # 创建DataFrame
        df = pd.DataFrame(data, columns=['output'])
        
        # 导出为CSV
        df.to_csv('output.csv', index=False)

# 调用函数创建全文索引
create_fulltext_index()

# 调用函数导出数据
export_to_csv("我")

Generated Query: 我~2


#### 按知识图谱保存

In [9]:
import os
from neo4j import GraphDatabase
import pandas as pd

def create_fulltext_index():
    driver = GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )
    
    with driver.session() as session:
        # 创建全文索引
        session.run(
            """
            CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]
            """
        )

def export_all_relationships_to_csv():
    driver = GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )
    
    with driver.session() as session:
        # 查询所有关系数据
        result = session.run(
            """
            MATCH (s)-[r:!MENTIONS]->(t)
            RETURN s.id + ' - ' + type(r) + ' -> ' + t.id AS output
            UNION ALL
            MATCH (s)<-[r:!MENTIONS]-(t)
            RETURN t.id + ' - ' + type(r) + ' -> ' + s.id AS output
            """
        )
        
        # 提取查询结果
        data = [record['output'] for record in result]
        
        # 创建DataFrame
        df = pd.DataFrame(data, columns=['output'])
        
        # 导出为CSV
        df.to_csv('graph_relationship_data.csv', index=False)

# 调用函数创建全文索引
create_fulltext_index()

# 调用函数导出所有关系数据
export_all_relationships_to_csv()

#### 按csv格式保存

In [10]:
import os
from neo4j import GraphDatabase
import pandas as pd 

def create_fulltext_index():
    driver = GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )
    
    with driver.session() as session:
        # 创建全文索引
        session.run(
            """
            CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]
            """
        )

def export_all_relationships_to_csv():
    driver = GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )
    
    with driver.session() as session:
        # 查询所有关系数据（排除 MENTIONS 关系）
        result = session.run(
            """
            MATCH (n)-[r]->(neighbor)
            WHERE NOT type(r) = 'MENTIONS'
            RETURN n.id AS source, type(r) AS relationship, neighbor.id AS target
            UNION
            MATCH (n)<-[r]-(neighbor)
            WHERE NOT type(r) = 'MENTIONS'
            RETURN neighbor.id AS source, type(r) AS relationship, n.id AS target
            """
        )
        
        # 提取查询结果
        data = [(record['source'], record['relationship'], record['target']) for record in result]
        
        # 创建DataFrame
        df = pd.DataFrame(data, columns=['source_id', 'relationship_type', 'target_id'])
        
        # 导出为CSV
        df.to_csv('all_relationships.csv', index=False)

# 调用函数创建全文索引
# create_fulltext_index()

# 调用函数导出所有关系数据
export_all_relationships_to_csv()

### 加载CSV

In [None]:
import csv
from neo4j import GraphDatabase
import os

os.environ["NEO4J_URI"] = "bolt://localhost:7687"
os.environ["NEO4J_USERNAME"] = "neo4j"
os.environ["NEO4J_PASSWORD"] = "12345678"
# 连接到 Neo4j 数据库
driver = GraphDatabase.driver(
    uri=os.environ["NEO4J_URI"],
    auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
)

def load_nodes(session, nodes_file):
    with open(nodes_file, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            # 假设每个节点都有一个唯一的 ID 和标签
            node_id = row['id']
            labels = row['labels'].split(':')  # 如果有多个标签，用:分隔
            properties = {k: v for k, v in row.items() if k not in ['id', 'labels']}
            
            # 创建节点
            session.run(
                f"MERGE (n:{':'.join(labels)} {{id: $node_id}}) ON CREATE SET n += $properties",
                node_id=node_id,
                properties=properties
            )

def load_relationships(session, relationships_file):
    with open(relationships_file, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            # 假设每条关系都有起始节点ID、结束节点ID、类型和属性
            start_node_id = row['start_node']
            end_node_id = row['end_node']
            rel_type = row['type']
            properties = {k: v for k, v in row.items() if k not in ['start_node', 'end_node', 'type']}
            
            # 创建关系
            session.run(
                "MATCH (a {id: $start_node_id}), (b {id: $end_node_id}) "
                "MERGE (a)-[r:%s]->(b) ON CREATE SET r += $properties" % rel_type,
                start_node_id=start_node_id,
                end_node_id=end_node_id,
                properties=properties
            )

with driver.session() as session:
    # 加载节点
    load_nodes(session, 'nodes.csv')
    # 加载关系
    load_relationships(session, 'relationships.csv')

# 关闭连接
driver.close()

### 导出json

In [17]:
import json

def export_to_json():
    driver = GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )
    with driver.session() as session:
        result = session.run("MATCH (s)-[r:!MENTIONS]->(t) RETURN s, r, t")
        # 将结果转换为字典列表
        data = []
        for record in result:
            data.append({
                "source": {"id": record['s'].id, "labels": list(record['s'].labels), "properties": dict(record['s'])},
                "relationship": {"type": record['r'].type, "properties": dict(record['r'])},
                "target": {"id": record['t'].id, "labels": list(record['t'].labels), "properties": dict(record['t'])}
            })
        # 写入JSON文件
        with open('graph_data.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2,ensure_ascii=False)

export_to_json()

  "source": {"id": record['s'].id, "labels": list(record['s'].labels), "properties": dict(record['s'])},
  "target": {"id": record['t'].id, "labels": list(record['t'].labels), "properties": dict(record['t'])}


In [16]:
import json
from collections import defaultdict
from neo4j import GraphDatabase

def export_to_json():
    driver = GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )
    
    with driver.session() as session:
        # 查询所有节点和关系
        result = session.run("MATCH (s)-[r:!MENTIONS]->(t) RETURN s, r, t")
        
        # 用于存储节点信息
        nodes = {}
        # 用于存储关系信息
        relationships = []
        
        # 遍历结果
        for record in result:
            # 获取源节点信息
            source_id = record['s'].id
            source_labels = list(record['s'].labels)
            source_properties = dict(record['s'])
            
            # 获取目标节点信息
            target_id = record['t'].id
            target_labels = list(record['t'].labels)
            target_properties = dict(record['t'])
            
            # 获取关系信息
            relationship_type = record['r'].type
            relationship_properties = dict(record['r'])
            
            # 确保源节点和目标节点没有重复
            if source_id not in nodes:
                nodes[source_id] = {
                    "id": source_id,
                    "labels": source_labels,
                    "properties": source_properties
                }
            if target_id not in nodes:
                nodes[target_id] = {
                    "id": target_id,
                    "labels": target_labels,
                    "properties": target_properties
                }
            
            # 添加关系
            relationships.append({
                "source_id": source_id,
                "target_id": target_id,
                "relationship": {
                    "type": relationship_type,
                    "properties": relationship_properties
                }
            })
        
        # 将节点和关系转换为最终的导出格式
        data = []
        for rel in relationships:
            data.append({
                "source": nodes[rel['source_id']],
                "relationship": rel['relationship'],
                "target": nodes[rel['target_id']]
            })
        
        # 写入JSON文件
        with open('graph_data.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

# 调用函数导出数据
export_to_json()

  source_id = record['s'].id
  target_id = record['t'].id


### 加载json数据

In [13]:
import os
import json
from neo4j import GraphDatabase

def load_data_from_json(json_file):
    driver = GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )
    
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    with driver.session() as session:
        for item in data:
            # 创建或匹配源节点
            source_id = item['source']['id']
            source_labels = item['source']['labels']
            source_properties = item['source']['properties']
            session.run(
                "MERGE (s:%s {id: $id}) ON CREATE SET s += $properties ON MATCH SET s += $properties" % (":".join(source_labels)),
                id=source_id,
                properties=source_properties
            )
            
            # 创建或匹配目标节点
            target_id = item['target']['id']
            target_labels = item['target']['labels']
            target_properties = item['target']['properties']
            session.run(
                "MERGE (t:%s {id: $id}) ON CREATE SET t += $properties ON MATCH SET t += $properties" % (":".join(target_labels)),
                id=target_id,
                properties=target_properties
            )
            
            # 创建或匹配关系
            relationship_type = item['relationship']['type']
            relationship_properties = item['relationship']['properties']
            session.run(
                """
                MATCH (s {id: $source_id}), (t {id: $target_id})
                MERGE (s)-[r:%s]->(t)
                ON CREATE SET r += $properties
                ON MATCH SET r += $properties
                """ % relationship_type,
                source_id=source_id,
                target_id=target_id,
                properties=relationship_properties
            )

# 调用函数加载数据
load_data_from_json('graph_data.json')

ConstraintError: {code: Neo.ClientError.Schema.ConstraintValidationFailed} {message: Node(8) already exists with label `__Entity__` and property `id` = '我'}

## 向量化

In [None]:
vector_index = Neo4jVector.from_existing_graph(
    create_embeddings(),
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)
vector_retriever = vector_index.as_retriever()

## 提取问题的特征关键词

In [11]:
def prompt_query(query):
    llm = create_llm()
    prompt = (
        "任务：query特征提取,只要输出词语，不要输出任何其他信息。\n"
        "例子：\n"
        "输入：小红和小丽都是谁呀？\n"
        "输出：小红，小丽\n"
        "输入：我不管我要吃巧克力和香蕉\n"
        "输出：巧克力，香蕉\n"
        "输入：巴黎奥运金牌第一名\n"
        "输出：巴黎奥运会，金牌\n"
        "按照上述事实回答问题:\n"
        f"输入：{query}\n"
        "输出："
    )
    response = llm(prompt)
    # response_list = [response.strip()] if response else []
    return ','.join(response) if isinstance(response, list) else response.strip()
prompt_query("麻辣烫的配送速度与包装做得好吗？")

  response = llm(prompt)
[INFO][2024-10-05 17:03:45.459] oauth.py:277 [t:12340]: trying to refresh token for ak `evU2Fd***`
[INFO][2024-10-05 17:03:45.548] oauth.py:304 [t:12340]: successfully refresh token


'麻辣烫，配送速度，包装'

In [16]:
def generate_full_text_query(input: str) -> str:
    full_text_query = ""
    words = [el for el in remove_lucene_chars(input).split() if el]
    for word in words[:-1]:
        full_text_query += f" {word}~2 AND"
    full_text_query += f" {words[-1]}~2"
    return full_text_query.strip()

entities = prompt_query("就是大批的青年人“躺平”或者“啃老”")
print(generate_full_text_query(entities))

躺平，啃老~2


## 用知识图谱检索领近的节点

In [22]:
def generate_full_text_query(input: str) -> str:
    words = [el for el in remove_lucene_chars(input).split() if el]
    if not words:
        return ""
    full_text_query = " AND ".join([f"{word}~2" for word in words])
    print(f"Generated Query: {full_text_query}")
    return full_text_query.strip()

graph.query(
    "CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")


def graph_retriever(question: str) -> str:
    """
    Collects the neighborhood of entities mentioned
    in the question
    """
    result = ""
    #prompt_query提取特征实体
    entities = prompt_query(question)
    for entity in entities:
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": generate_full_text_query(entity)},
        )
        result += "\n".join([el['output'] for el in response])
    return result



In [23]:
print(graph_retriever("年轻人"))

Generated Query: 年~2
Generated Query: 轻~2
Generated Query: 人~2
老年人 - IS_NOT -> 一穷二白，守着两亩薄田过活的人
大批的青年人 - BEHAVIOR -> 躺平
大批的青年人 - BEHAVIOR -> 啃老养老观念 - BELONGS_TO -> 中国人
老龄化 - ASSOCIATED_WITH -> 个人发展机会减少
老龄化 - ASSOCIATED_WITH -> 家庭负担增加
老龄化 - ASSOCIATED_WITH -> 经济减速富人 - HAS_ASSET -> 1000万以上
老年人 - IS_NOT -> 一穷二白，守着两亩薄田过活的人


In [24]:
def full_retriever(question: str):
    graph_data = graph_retriever(question)
    vector_data = [el.page_content for el in vector_retriever.invoke(question)]
    final_data = f"""Graph data:
{graph_data}
vector data:
{"#Document ". join(vector_data)}
    """
    print(final_data)
    return final_data

## RAG实现

In [25]:
template = """
仅根据下列上下文回答问题:
{context}

Question: {question}
使用自然语言，简洁明了.
"""
prompt = ChatPromptTemplate.from_template(template)

chain = ({"context": full_retriever,"question": RunnablePassthrough(),}
    | prompt
    | llm
    | StrOutputParser()
)

NameError: name 'llm' is not defined

In [121]:
chain.invoke(input="麻辣烫的汤怎么样?")

In [6]:
os.environ["NEO4J_URI"] = "bolt://localhost:7687"
os.environ["NEO4J_USERNAME"] = "neo4j"
os.environ["NEO4J_PASSWORD"] = "12345678"
driver = GraphDatabase.driver(uri = os.environ["NEO4J_URI"],auth = (os.environ["NEO4J_USERNAME"],os.environ["NEO4J_PASSWORD"]))

def clear_database(tx):
    # 删除所有关系
    tx.run("MATCH ()-[r]->() DELETE r")
    # 删除所有节点
    tx.run("MATCH (n) DELETE n")

with driver.session() as session:
    session.write_transaction(clear_database)

# 关闭驱动
driver.close()

  session.write_transaction(clear_database)
