In [1]:
import nest_asyncio

# Patch asyncio so event loops can be nested. NOTE(review): presumably needed
# because the notebook kernel already runs a loop and later cells hand an
# async Qdrant client to llama_index — confirm before removing.
nest_asyncio.apply()

In [2]:
from langchain_community.document_loaders import (
    UnstructuredMarkdownLoader,
)
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core.schema import BaseNode, Document, IndexNode, NodeWithScore

from readai.components.retrievers import BM25Retriever

In [3]:
import os
from pathlib import Path

from dotenv import load_dotenv

load_dotenv()

# Resolve the project root from the environment (load_dotenv() above gives a
# .env file the chance to provide it). Fail with an actionable message rather
# than the opaque TypeError that Path(None) raises when the variable is missing.
project_root_env = os.getenv("PROJECT_ROOT")
if project_root_env is None:
    raise RuntimeError(
        "PROJECT_ROOT is not set; define it in the environment or in the .env file"
    )
project_root = Path(project_root_env)
test_data_path = project_root / "readai/tests/data"

In [29]:
# Load the sample book with unstructured's Markdown loader in "elements" mode.
book_name = "comunication_cleaned.md"
book_path = test_data_path / book_name
loader = UnstructuredMarkdownLoader(book_path, mode="elements")
documents = loader.load()
# Report how many elements were loaded.
print(f"节点数量: {len(documents)}")

# Rebuild each document's metadata so that only `category` and `filename` remain.
kept_metadata_keys = ("category", "filename")
for doc in documents:
    doc.metadata = {key: doc.metadata[key] for key in kept_metadata_keys}

节点数量: 41


In [30]:
# Convert the loaded LangChain documents into llama_index Documents. Every
# document shares the same metadata/text rendering options.
document_format = {
    "metadata_seperator": "::",
    "metadata_template": "{key}=>{value}",
    "text_template": "Metadata: {metadata_str}\n-----\nContent: {content}",
}
llamindex_docs = [
    Document(text=doc.page_content, metadata=doc.metadata, **document_format)
    for doc in documents
]

In [31]:
# Sanity check: number of llama_index Documents built from the loaded elements.
print(len(llamindex_docs))

41


In [32]:
# Split the documents into nodes: 1024-size chunks with an overlap of 50,
# using a secondary regex that also recognises Chinese sentence punctuation.
node_parser = SimpleNodeParser.from_defaults(
    separator="\n\n",
    secondary_chunking_regex="[^,.;。？！]+[,.;。？！]?",
    chunk_overlap=50,
    chunk_size=1024,
)
hybrid_nodes = node_parser.get_nodes_from_documents(
    llamindex_docs,
    show_progress=True,
)

# Show the first node's text, then the total number of nodes produced.
first_node = hybrid_nodes[0]
print(first_node.text)
print(len(hybrid_nodes))

Parsing nodes:   0%|          | 0/41 [00:00<?, ?it/s]

马歇尔·卢森堡博士发现了一种沟通方式，依照它来谈话和聆听，能使人们情意相通，和谐相处，这就是“非暴力沟通”。 做为一个遵纪守法的好人，也许我们从来没有把谈话和“暴力”扯上关系。不过如果稍微留意一下现实生活中的谈话方式，并且用心体会各种谈话方式给我们的不同感受，我们一定会发现，有些话确实伤人！言语上的指责、嘲讽、否定、说教以及任意打断、拒不回应、随意出口的评价和结论给我们带来的情感和精神上的创伤甚至比肉体的伤害更加令人痛苦。这些无心或有意的语言暴力让人与人变得冷漠、隔膜、敌视。 非暴力沟通能够： ● 疗愈内心深处的隐秘伤痛； ● 超越个人心智和情感的局限性； ● 突破那些引发愤怒、沮丧、焦虑等负面情绪的思维方式； ● 用不带伤害的方式化解人际间的冲突； ● 学会建立和谐的生命体验。 图书在版编目(CIP)数据 非暴力沟通／（美）马歇尔·卢森堡（Marshall B.Rosenberg）著；刘轶译.-2版（修订本）-北京：华夏出版社有限公司，2021.5 书名原文：Nonviolent Communication ISBN 978-7-5222-0051-4 Ⅰ.①非⋯Ⅱ.①马⋯②刘⋯Ⅲ.①心理交往-通俗读物Ⅳ.①C912.11-49 中国版本图书馆CIP数据核字（2021）第006542号 Translated from the book Nonviolent Communication:A Language of Life 3rd Edition,ISBN 13/10:9781892005281/189200528X by Marshall B.Rosenberg. Copyright ? Fall 2015 Puddle Dancer Press,published by Puddle Dancer Press. All rights reserved. Used with permission. For further information about Nonviolent Communication(TM) please visit the Center for Nonviolent Communication on the Web at:www.cnvc.org.
170


In [4]:
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.deepseek import DeepSeek

# Embedding model served by the Ollama endpoint on localhost.
embed_model = OllamaEmbedding(
    base_url="http://localhost:11434",
    model_name="quentinz/bge-large-zh-v1.5",
)

# Set up the LLM; model name and API key are read from the environment.
model_name = os.getenv("DEEPSEEK_MODEL")
api_key = os.getenv("DEEPSEEK_API_KEY")
llm = DeepSeek(api_key=api_key, model=model_name)

In [5]:
from llama_index.core.indices import load_index_from_storage
from llama_index.core.indices.vector_store.base import VectorStoreIndex
from llama_index.core.storage.storage_context import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import AsyncQdrantClient, QdrantClient

# Sync and async clients pointing at the same Qdrant endpoint.
qdrant_url = "http://localhost:6333"
client = QdrantClient(url=qdrant_url)
aclient = AsyncQdrantClient(url=qdrant_url)


In [6]:
# Name of the Qdrant collection used by the following cells; the bare
# expression below displays whether that collection already exists.
collection_name = "qdrant_nodes_test"
client.collection_exists(collection_name)

True

In [7]:
# 1. Build the index and persist it
# Create the Qdrant vector store (the collection name can be chosen freely);
# both the sync and the async client are passed in.
qdrant_vs = QdrantVectorStore(
    collection_name=collection_name, client=client, aclient=aclient
)

In [39]:
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage.index_store import SimpleIndexStore

# Index store, plus a document store seeded with the parsed nodes.
index_store = SimpleIndexStore()
docstore = SimpleDocumentStore()
docstore.add_documents(hybrid_nodes)

In [40]:
# Build a VectorStoreIndex over the parsed nodes, using the Qdrant vector
# store together with the document and index stores created above.
storage_context = StorageContext.from_defaults(
    docstore=docstore,
    index_store=index_store,
    vector_store=qdrant_vs,
)
index = VectorStoreIndex(
    hybrid_nodes,
    storage_context=storage_context,
    llm=llm,
    embed_model=embed_model,
    show_progress=True,
)


Generating embeddings:   0%|          | 0/170 [00:00<?, ?it/s]

In [41]:
# Persist the storage context under the test-data directory, in a folder
# named after the collection.
persist_dir = test_data_path / collection_name
storage_context.persist(persist_dir=persist_dir)

In [42]:
# Number of entries currently held by the storage context's document store.
print(len(storage_context.docstore.docs))

170


### 测试加载

In [16]:
# Load the DocStore and IndexStore from the persist directory and reattach
# the Qdrant vector store, then rebuild the index object from that context.
persist_dir = test_data_path / collection_name
storage_context = StorageContext.from_defaults(
    vector_store=qdrant_vs,
    persist_dir=persist_dir,
)
loaded_index = load_index_from_storage(
    storage_context,
    embed_model=embed_model,
    llm=llm,
)


In [14]:
# Retriever over the reloaded index, configured with similarity_top_k=5.
retriever = loaded_index.as_retriever(similarity_top_k=5)

In [15]:
# Retrieve nodes for a sample question and render each hit in the notebook.
question = "在日常交流中，我们如何区分客观的观察和主观的评价？"
nodes = retriever.retrieve(question)
for scored_node in nodes:
    display_source_node(scored_node)

**Node ID:** 82cd620b-d0f1-46a7-8690-57bbbe8da0e7<br>**Similarity:** 0.60743076<br>**Text:** 去观察，就像信仰一样重要。 ——弗雷德里克·布希纳（Frederick Buechner） 我欣然接受你告诉我， 我做了什么或者我未做什么。 我也欣然接受你的评论， 但请不要将两者混淆。 如果你...<br>

**Node ID:** b98f7cc1-d6cf-44bc-9284-4d8ee291a908<br>**Similarity:** 0.59909445<br>**Text:** 当我指出这一点后，另一位老师响应道：“我知道他的意思了。校长的话太多！”这仍不是一个清晰的观察，而是对校长说多少话的评论。随后，第三位老师说：“他认为只有他想说的话是重要的。”我进而向他们解释，...<br>

**Node ID:** 0af9b440-d87f-4f0d-9940-f7a6e343bd93<br>**Similarity:** 0.5780674<br>**Text:** 但他不是个懒汉。 请在说我胡言乱语之前， 想一想，他真的是个懒汉，还是 他的行为被我们贴上了“懒惰”的标签？ 我从未见过什么傻孩子； 这个孩子有时做的事， 我不理解或始料不及， 这个孩子的看法与...<br>

In [None]:
# Test query: build a query engine on the reloaded index (same LLM and
# embedding model as above), ask the sample question and print the response.
query_engine = loaded_index.as_query_engine(llm=llm, embed_model=embed_model)
response = query_engine.query("在日常交流中，我们如何区分客观的观察和主观的评价？")
print(response)

## Index Store

In [None]:
# %pip install llama-index-storage-docstore-redis
# %pip install llama-index-storage-index-store-redis

In [None]:
# Imported here so this cell runs on its own; in the original ordering the
# Redis store classes were only imported by a later cell.
from llama_index.storage.docstore.redis import RedisDocumentStore
from llama_index.storage.index_store.redis import RedisIndexStore

# Redis connection settings, overridable through the environment / .env file.
REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))

# Storage context whose document store and index store both live in Redis,
# under the "llama_index" namespace.
storage_context = StorageContext.from_defaults(
    docstore=RedisDocumentStore.from_host_and_port(
        host=REDIS_HOST, port=REDIS_PORT, namespace="llama_index"
    ),
    index_store=RedisIndexStore.from_host_and_port(
        host=REDIS_HOST, port=REDIS_PORT, namespace="llama_index"
    ),
)

# Store the parsed nodes. `nodes` at this point holds the retriever output
# (scored wrappers around a handful of hits), not the parsed corpus, so the
# parser's `hybrid_nodes` is what belongs in the document store.
storage_context.docstore.add_documents(hybrid_nodes)
len(storage_context.docstore.docs)

In [None]:
from llama_index.core import VectorStoreIndex, load_index_from_storage
from llama_index.storage.docstore.redis import RedisDocumentStore
from llama_index.storage.index_store.redis import RedisIndexStore

# Create (or connect to) the Redis-backed index store. The port is passed as
# an int, matching the documented signature of from_host_and_port.
index_store = RedisIndexStore.from_host_and_port(
    host="127.0.0.1", port=6379, namespace="llama_index"
)

# Create the storage context around that index store.
storage_context = StorageContext.from_defaults(index_store=index_store)

# Build the index from the parsed nodes (`nodes` is the retriever output at
# this point, not parsed nodes). The embedding model is passed explicitly so
# the index uses the same Ollama model as the rest of the notebook rather
# than llama_index's global default.
index = VectorStoreIndex(
    hybrid_nodes, storage_context=storage_context, embed_model=embed_model
)

# Or alternatively, load the index back from storage. The index_id is given
# explicitly because the Redis index store persists across runs: once it holds
# more than one index, loading without an id is rejected as ambiguous.
index = load_index_from_storage(
    storage_context, index_id=index.index_id, embed_model=embed_model
)