In [2]:
import os
from pathlib import Path

import mlflow
from langchain_community.document_loaders import (
    UnstructuredEPubLoader,
    UnstructuredMarkdownLoader,
)
from llama_index.core import Document
from llama_index.core.indices.vector_store.base import VectorStoreIndex
from llama_index.core.node_parser import (
    HierarchicalNodeParser,
    get_leaf_nodes,
    get_root_nodes,
)
from llama_index.core.settings import Settings
from llama_index.core.storage import StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.deepseek import DeepSeek
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

from readai.components.custom.text_splitters.chinese_text_splitter import (
    ChineseRecursiveTextSplitter,
)

使用epubloader，效果不太理想

In [None]:
# 第一步：使用 UnstructuredEPubLoader 加载 EPUB 文档
test_data_path = Path("/Users/pegasus/media/library")
book_name = "跃迁：成为高手的技术.epub"
book_path = test_data_path / book_name
loader = UnstructuredEPubLoader(book_path, mode="elements", strategy="fast")
documents = loader.load()
# 查看节点数量
print(f"节点数量: {len(documents)}")

改成markdown文件后，相对来说可操作空间更大了

In [3]:
test_data_path = Path(
    "/Users/pegasus/workplace/mygits/readest-ai/readai-backend/readai/tests/data"
)
book_name = "comunication_cleaned.md"
book_path = test_data_path / book_name
loader = UnstructuredMarkdownLoader(book_path, mode="elements")
documents = loader.load()
# 查看节点数量
print(f"节点数量: {len(documents)}")
print(documents[0].metadata)


节点数量: 41
{'source': '/Users/pegasus/workplace/mygits/readest-ai/readai-backend/readai/tests/data/comunication_cleaned.md', 'languages': ['zho'], 'file_directory': '/Users/pegasus/workplace/mygits/readest-ai/readai-backend/readai/tests/data', 'filename': 'comunication_cleaned.md', 'filetype': 'text/markdown', 'last_modified': '2025-04-01T20:55:25', 'category': 'NarrativeText', 'element_id': '6ce09934c1462a9ccfa47a5a073b3d06'}


In [19]:
import json

nodes_list = []
node_category = set()
# 检查所有节点的metadata中一共出现了哪几种category
# 记录过程中最长的内容长度
max_text_len = 0
for doc in documents:
    node_category.add(doc.metadata["category"])
    node_data = {
        "text": doc.page_content,
        "metadata": doc.metadata,
        "text_len": len(doc.page_content),
    }
    nodes_list.append(node_data)
    if len(doc.page_content) > max_text_len:
        max_text_len = len(doc.page_content)
print(node_category)
print(f"最长内容长度: {max_text_len}")

# 保存到 json 文件
# with open("非暴力沟通md.json", "w", encoding="utf-8") as f:
#     json.dump(nodes_list, f, ensure_ascii=False, indent=2)

{'NarrativeText', 'UncategorizedText', 'Title'}
最长内容长度: 13639


可以发现，加载后的段落更完整，更符合直觉的按照章节内容分割成多个节点了，这样可以进一步操作，构建多层级节点，同时也可以去生成一些更好的总结性的文本作为节点内容


In [16]:
from llama_index.core.ingestion import IngestionPipeline
from readai.components.custom.text_splitters.sentence import SentenceSplitter
from llama_index.core.node_parser.text.sentence import SentenceSplitter
from llama_index.core.node_parser import SentenceWindowNodeParser

ingest_docs = [
    Document(text=doc.page_content, metadata=doc.metadata) for doc in documents
]
# 创建SentenceSplitter实例
sentence_splitter = SentenceSplitter(
    chunk_size=512,  # 每个块的最大token数
    chunk_overlap=50,  # 块之间的重叠token数
    include_metadata=True,  # 包含元数据
    include_prev_next_rel=True,  # 维护块之间的前后关系
)

# 直接调用spltter的方式
processed_nodes = sentence_splitter.get_nodes_from_documents(
    ingest_docs, show_progress=True
)

# pipeline = IngestionPipeline(
#     transformations=[
#         sentence_splitter,
#     ]
# )

# processed_nodes = pipeline.run(
#     documents=ingest_docs,
#     show_progress=True,
# )


Parsing nodes:   0%|          | 0/41 [00:00<?, ?it/s]

In [17]:
# 手动调用SentenceSplitter来查看其行为
ingest_docs = [
    Document(text=doc.page_content, metadata=doc.metadata) for doc in documents
]
# 对一个doc进行分割后查看细节
test_text = ingest_docs[0].text
test_splits = sentence_splitter.split_text(test_text)
print(f"原文长度: {len(test_text)}, 分割后片段数: {len(test_splits)}")
# 检查原文以及分割后的结果
print(test_text)
print("-" * 100)
for text in test_splits:
    print(text)
    print("-" * 100)


原文长度: 1526, 分割后片段数: 3
马歇尔·卢森堡博士发现了一种沟通方式，依照它来谈话和聆听，能使人们情意相通，和谐相处，这就是“非暴力沟通”。 做为一个遵纪守法的好人，也许我们从来没有把谈话和“暴力”扯上关系。不过如果稍微留意一下现实生活中的谈话方式，并且用心体会各种谈话方式给我们的不同感受，我们一定会发现，有些话确实伤人！言语上的指责、嘲讽、否定、说教以及任意打断、拒不回应、随意出口的评价和结论给我们带来的情感和精神上的创伤甚至比肉体的伤害更加令人痛苦。这些无心或有意的语言暴力让人与人变得冷漠、隔膜、敌视。 非暴力沟通能够： ● 疗愈内心深处的隐秘伤痛； ● 超越个人心智和情感的局限性； ● 突破那些引发愤怒、沮丧、焦虑等负面情绪的思维方式； ● 用不带伤害的方式化解人际间的冲突； ● 学会建立和谐的生命体验。 图书在版编目(CIP)数据 非暴力沟通／（美）马歇尔·卢森堡（Marshall B.Rosenberg）著；刘轶译.-2版（修订本）-北京：华夏出版社有限公司，2021.5 书名原文：Nonviolent Communication ISBN 978-7-5222-0051-4 Ⅰ.①非⋯Ⅱ.①马⋯②刘⋯Ⅲ.①心理交往-通俗读物Ⅳ.①C912.11-49 中国版本图书馆CIP数据核字（2021）第006542号 Translated from the book Nonviolent Communication:A Language of Life 3rd Edition,ISBN 13/10:9781892005281/189200528X by Marshall B.Rosenberg. Copyright ? Fall 2015 Puddle Dancer Press,published by Puddle Dancer Press. All rights reserved. Used with permission. For further information about Nonviolent Communication(TM) please visit the Center for Nonviolent Communication on the Web at:www.cnvc.org. 版权所有翻印必究 北京市版权局著作权合同登记号：

In [13]:
# 查看node有哪些属性,按照json格式输出
print(len(processed_nodes))
# processed_nodes节点输出全部看看,保存内容到json文件中
with open("processed_nodes.json", "w", encoding="utf-8") as f:
    for node in processed_nodes:
        # 数据转json
        json_data = json.dumps(node.to_dict(), ensure_ascii=False, indent=2)
        f.write(json_data)

455


采用分级摘要节点

In [32]:
# 1. 预处理阶段
from llama_index.core.ingestion import IngestionPipeline
from readai.components.custom.text_splitters.hierarchical import HierarchicalNodeParser
from llama_index.core.extractors import SummaryExtractor

ingest_docs = [
    Document(text=doc.page_content, metadata=doc.metadata) for doc in documents
]
# 创建层次解析器
hierarchical_parser = HierarchicalNodeParser.from_defaults(
    include_metadata=False, include_prev_next_rel=True, chunk_sizes=[2048, 512, 128]
)
# 为章节级别节点创建摘要
# summary_extractor = SummaryExtractor(
#     summaries=["chapter_summary"],  # 摘要类型
#     node_types=["chapter"]  # 只为章节节点创建摘要
# )

# 创建处理管道
book_nodes = hierarchical_parser.get_nodes_from_documents(
    ingest_docs, show_progress=True
)
leaf_nodes = get_leaf_nodes(book_nodes)
root_nodes = get_root_nodes(book_nodes)


Parsing documents into nodes:   0%|          | 0/41 [00:00<?, ?it/s]

Metadata length (123) is close to chunk size (128). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
Metadata length (123) is close to chunk size (128). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
Metadata length (123) is close to chunk size (128). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.
Metadata length (123) is close to chunk size (128). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.


ValueError: Metadata length (128) is longer than chunk size (128). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.

In [30]:
print(len(book_nodes))
print(len(leaf_nodes))
print(len(root_nodes))

1859
1436
104


In [31]:
# 查看root_nodes
print(root_nodes[0].to_json)
with open("root_nodes.json", "w", encoding="utf-8") as f:
    for node in root_nodes:
        # 数据转json
        json_data = json.dumps(node.to_dict(), ensure_ascii=False, indent=2)
        f.write(json_data)


<bound method BaseComponent.to_json of TextNode(id_='160caeec-c20d-4118-847f-c7b265d36f1b', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='d69b0408-c955-4655-9b84-877d567b5a44', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'source': '/Users/pegasus/workplace/mygits/readest-ai/readai-backend/readai/tests/data/comunication_cleaned.md', 'languages': ['zho'], 'file_directory': '/Users/pegasus/workplace/mygits/readest-ai/readai-backend/readai/tests/data', 'filename': 'comunication_cleaned.md', 'filetype': 'text/markdown', 'last_modified': '2025-04-01T20:55:25', 'category': 'NarrativeText', 'element_id': '6ce09934c1462a9ccfa47a5a073b3d06'}, hash='e9c1d6555633db3366004ce21db95f26fa7df8c1297658edb5e255f3e1662fc7'), <NodeRelationship.CHILD: '5'>: [RelatedNodeInfo(node_id='124a2580-b3d2-462f-839a-9a0fbec1c1c6', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='6aa7743b3b7f8

 探索TextNode

In [None]:
from llama_index.core.schema import (
    Document,
    MetadataMode,
    NodeRelationship,
    TextNode,
    NodeWithScore,
)


In [None]:
# 2. 检索阶段
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

# 基础向量检索器
vector_retriever = vector_index.as_retriever(similarity_top_k=3)

# 自动合并检索器
auto_retriever = AutoMergingRetriever(
    base_retriever=vector_retriever, storage_context=storage_context
)

# 查询引擎
query_engine = RetrieverQueryEngine.from_args(retriever=auto_retriever)

- 为什么要转换成markdown
因为转换成markdown后，原本epub文件是存在一些结构性内容的，比如标题的层级，相比直接按照epub内容（转html）被读取，转换成markdown后，这种层级内容更加清晰，而且我更好处理
- 转换后为什么还要去除一些无效符号？（比如空行）
因为目前很多中文分块策略，在区分段落时会根据两个换行符区分，但转markdown后发现有很多多余的空行,清除掉多余的空行其实是可以更方便使用text_spliiter的

- 要使用哪一种chunk策略以及node extractor？
递归分割？
标题提取？
句子分割