In [34]:
import os
from pathlib import Path

import mlflow
from langchain_community.document_loaders import (
    UnstructuredEPubLoader,
    UnstructuredMarkdownLoader,
)
from llama_index.core import Document
from llama_index.core.indices.vector_store.base import VectorStoreIndex
from llama_index.core.node_parser import (
    HierarchicalNodeParser,
    get_leaf_nodes,
    get_root_nodes,
)
from llama_index.core.settings import Settings
from llama_index.core.storage import StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.deepseek import DeepSeek
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

from readai.components.custom.text_splitters.chinese_text_splitter import (
    ChineseRecursiveTextSplitter,
)

In [None]:
# Configure MLflow tracking. This is best-effort: any failure is printed and
# the rest of the cell still runs.
try:
    mlflow.set_tracking_uri("http://localhost:5001")
    mlflow.get_tracking_uri()
    mlflow.set_experiment("LlamaIndex-RAG")
    mlflow.llama_index.autolog()
except Exception as e:
    print(e)

# Configure the LLM and the embedding model used by LlamaIndex.
# NOTE(review): os.getenv returns None when a variable is unset, so both values
# below may be None — confirm DeepSeek(...) copes with that.
api_key = os.getenv("DEEPSEEK_API_KEY")
model_name = os.getenv("DEEPSEEK_MODEL")
Settings.llm = DeepSeek(model=model_name, api_key=api_key)
Settings.embed_model = OllamaEmbedding(
    model_name="quentinz/bge-large-zh-v1.5", base_url="http://localhost:11434"
)

# Step 1: load the EPUB document with UnstructuredEPubLoader.
# NOTE(review): absolute, machine-specific path — this cell only runs on a
# machine that has this directory.
test_data_path = Path(
    "/Users/pegasus/workplace/mygits/readest-ai/readai-backend/readai/tests/data"
)
book_name = "非暴力沟通.epub"
book_path = test_data_path / book_name
loader = UnstructuredEPubLoader(book_path, mode="elements", strategy="hi_res")
documents = loader.load()

# Split the loaded documents with the project's Chinese text splitter.
text_splitter = ChineseRecursiveTextSplitter(chunk_size=1000, chunk_overlap=100)
split_documents = text_splitter.split_documents(documents)

# Convert the split LangChain documents into LlamaIndex Document objects.
llama_documents = [
    Document(text=doc.page_content, metadata=doc.metadata) for doc in split_documents
]

# Build the hierarchical node parser and derive the leaf / root node lists.
# NOTE(review): the input was already split with chunk_size=1000 above; confirm
# the 2048/1024 levels still produce distinct parent and child nodes.
parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[2048, 1024])
book_nodes = parser.get_nodes_from_documents(llama_documents)
leaf_nodes = get_leaf_nodes(book_nodes)
root_nodes = get_root_nodes(book_nodes)

# The first line reports the number of leaf nodes, the second the root nodes.
print(f"切分后节点总数量: {len(leaf_nodes)}")
print(f"根节点数量: {len(root_nodes)}")

# Initialise the Qdrant client and the vector store backed by it.
qdrant_client = QdrantClient(
    host="localhost", port=6333
)  # host/port form: targets a Qdrant server on localhost:6333, not in-memory storage
vector_store = QdrantVectorStore(client=qdrant_client, collection_name="book_test0331")


In [27]:
# Step 1: load an EPUB document with UnstructuredEPubLoader.
# NOTE(review): this rebinds test_data_path, book_name, book_path, loader and
# documents, which the setup cell also assigns — later cells see these values.
test_data_path = Path("/Users/pegasus/media/library")
book_name = "跃迁：成为高手的技术.epub"
book_path = test_data_path / book_name
loader = UnstructuredEPubLoader(book_path, mode="elements", strategy="fast")
documents = loader.load()

# Report how many documents the loader returned.
print(f"节点数量: {len(documents)}")

节点数量: 3494


## 切换为markdown reader

In [44]:
# Load a markdown file with UnstructuredMarkdownLoader (no mode argument given).
# Rebinds book_name, book_path, loader and documents for the cells that follow.
test_data_path = Path("/Users/pegasus/media/library")
book_name = "test_doc2.md"
book_path = test_data_path / book_name
loader = UnstructuredMarkdownLoader(book_path)
documents = loader.load()

# Report how many documents were returned and show the first one's metadata.
print(f"节点数量: {len(documents)}")
print(documents[0].metadata)


节点数量: 1
{'source': '/Users/pegasus/media/library/test_doc2.md'}


In [64]:
import re


def clean_empty_headings(markdown_text):
    """Remove lines that contain nothing but ``#`` characters.

    A line is dropped when it matches ``^\\s*#+\\s*$``: optional leading
    whitespace, one or more ``#``, optional trailing whitespace. Headings that
    carry a title (e.g. ``# Preface``) and all other lines are kept.

    The text is split with ``str.splitlines`` and re-joined with ``"\\n"``, so
    a trailing newline in the input is not preserved.

    Args:
        markdown_text: Markdown source as a single string.

    Returns:
        The text with title-less heading lines removed.
    """
    empty_heading = re.compile(r"^\s*#+\s*$")
    surviving_rows = filter(
        lambda row: empty_heading.match(row) is None,
        markdown_text.splitlines(),
    )
    return "\n".join(surviving_rows)


# Usage example: the three marker-only lines (#, ##, ###) are removed, while
# the titled heading and the body text are kept.
markdown_text = """
马歇尔·卢森堡博士发现了一种沟通方式，依照它来谈话和聆听，能使人们情意相通，和谐相处，这就是"非暴力沟通"。

#
## 
###

# 中文版序
我们每一个人都渴望幸福，渴望生活在一个和平、稳定而又繁荣的社会当中。
"""

cleaned_text = clean_empty_headings(markdown_text)
print(cleaned_text)


马歇尔·卢森堡博士发现了一种沟通方式，依照它来谈话和聆听，能使人们情意相通，和谐相处，这就是"非暴力沟通"。


# 中文版序
我们每一个人都渴望幸福，渴望生活在一个和平、稳定而又繁荣的社会当中。


In [41]:
import json

nodes_list = []
node_category = set()
# Collect every distinct "category" value found in the documents' metadata and
# build a JSON-serialisable record (text + metadata) for each document.
for doc in documents:
    # Not every loader attaches a "category" key: the markdown-loading cell in
    # this notebook prints metadata holding only "source". Hard indexing would
    # raise KeyError for such documents, so missing values are skipped.
    category = doc.metadata.get("category")
    if category is not None:
        node_category.add(category)
    node_data = {"text": doc.page_content, "metadata": doc.metadata}
    nodes_list.append(node_data)
print(node_category)


# Save all records to a JSON file. ensure_ascii=False keeps the Chinese text
# readable in the file instead of \u escapes.
with open("非暴力沟通2md.json", "w", encoding="utf-8") as f:
    json.dump(nodes_list, f, ensure_ascii=False, indent=2)


{'Image', 'UncategorizedText', 'ListItem', 'NarrativeText', 'Title'}


In [54]:
# Split the markdown with LangChain's MarkdownTextSplitter.
from langchain_text_splitters.markdown import (
    # MarkdownHeaderTextSplitter,
    MarkdownTextSplitter,
)

# Reload the markdown file; book_path comes from the markdown-loading cell.
loader = UnstructuredMarkdownLoader(book_path)
documents = loader.load()
print(len(documents))
splitter = MarkdownTextSplitter()

# NOTE(review): this rebinds split_documents, which the setup cell assigned
# from split_documents(...); here it holds the result of split_text on one
# page_content string — presumably plain strings, confirm before reusing it.
split_documents = splitter.split_text(documents[0].page_content)
print(len(split_documents))

1
32


- 为什么要转换成markdown
因为原本的epub文件存在一些结构性内容，比如标题的层级；相比直接用读取器读取，转换成markdown后，这种层级结构更加清晰，也更便于处理。
- 转换后为什么还要去除一些无效符号？（比如空行）
因为目前很多中文分块策略在划分段落时以两个连续的换行符为界，但转换成markdown后会出现很多多余的空行，清除这些多余的空行可以更方便地使用text_splitter。

- 要使用哪一种chunk策略以及node extractor？
递归分割？
标题提取？

## 处理markdown文件的目的是什么

In [67]:
import re

# Markdown links and images: [text](target) and ![alt](target).
LINK_PATTERN = re.compile(r"!?\[.*?\]\(.*?\)")
# Lines holding only '#' characters (a heading marker without a title).
EMPTY_HEADING_PATTERN = re.compile(r"^\s*#+\s*$")
# "**Key:** value" metadata lines. Only spaces/tabs may separate the key from
# the value: with \s* an empty field ("**Key:**" followed by a newline) would
# capture the NEXT line as its value and hide that line's own field.
METADATA_PATTERN = re.compile(r"\*\*(\w+):\*\*[ \t]*(.+)")


def extract_metadata_from_markdown(markdown_text: str) -> dict:
    """Collect ``**Key:** value`` pairs found anywhere in the markdown text.

    Args:
        markdown_text: Raw markdown source.

    Returns:
        A dict mapping each lower-cased key to its whitespace-stripped value.
        If a key occurs more than once, the last occurrence wins. Fields with
        nothing after the key on the same line are omitted.
    """
    metadata = {}
    for match in METADATA_PATTERN.finditer(markdown_text):
        key = match.group(1).lower()  # e.g., title, authors
        metadata[key] = match.group(2).strip()
    return metadata


def process_markdown_file(
    input_path: "str | Path", output_path: "str | Path", header_lines: int = 6
) -> dict:
    """Clean a markdown file, write the result, and return its metadata.

    Steps, in order:
      1. Extract ``**Key:** value`` metadata from the raw text.
      2. Remove every markdown link / image (the whole ``[text](target)``
         span, including the link text).
      3. Drop the first ``header_lines`` lines.
      4. Drop blank lines and title-less heading lines (only ``#`` characters).
      5. Write the remaining lines, joined by ``"\\n"``, to ``output_path``.

    Args:
        input_path: Markdown file to read (UTF-8).
        output_path: Destination file; overwritten if it exists.
        header_lines: Number of leading lines to discard. Defaults to 6, the
            value that used to be hard-coded.

    Returns:
        The metadata dict extracted from the original, uncleaned text.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        ValueError: If ``header_lines`` is negative.
    """
    if header_lines < 0:
        # A negative value would silently keep only the tail of the file.
        raise ValueError(f"header_lines must be >= 0, got {header_lines}")

    input_file = Path(input_path)
    if not input_file.exists():
        raise FileNotFoundError(f"文件不存在: {input_path}")

    raw_md = input_file.read_text(encoding="utf-8")

    # Metadata is read before any cleaning so link removal cannot alter it.
    metadata = extract_metadata_from_markdown(raw_md)

    # The link pattern never spans a newline, so the line count is unchanged
    # and the header slice below still refers to the original leading lines.
    without_links = LINK_PATTERN.sub("", raw_md)
    body_lines = without_links.splitlines()[header_lines:]

    # Keep only lines that have content and are not marker-only headings.
    kept_lines = [
        line
        for line in body_lines
        if line.strip() != "" and not EMPTY_HEADING_PATTERN.match(line)
    ]

    Path(output_path).write_text("\n".join(kept_lines), encoding="utf-8")

    print(f"✅ 已清洗并保存新文件: {output_path}")
    return metadata


# Try it out: clean the markdown file at book_path, write the result next to
# the notebook, and keep the extracted metadata.
metadata = process_markdown_file(
    book_path,
    "./test_doc2_cleaned.md",
)


✅ 已清洗并保存新文件: ./test_doc2_cleaned.md


In [None]:
# Create the document store and register every node from the hierarchy
# (book_nodes, leaf_nodes and vector_store come from the setup cell).
docstore = SimpleDocumentStore()
docstore.add_documents(book_nodes)

# Create the storage context combining the docstore and the Qdrant vector store.
storage_context = StorageContext.from_defaults(
    docstore=docstore, vector_store=vector_store
)

# Build the base index from the leaf nodes only.
base_index = VectorStoreIndex(nodes=leaf_nodes, storage_context=storage_context)

# Create the base retriever (top 6 by similarity).
base_retriever = base_index.as_retriever(similarity_top_k=6)

# Wrap the base retriever in an AutoMergingRetriever.
# NOTE(review): imported here rather than in the notebook's import cell.
from llama_index.core.retrievers import AutoMergingRetriever

retriever = AutoMergingRetriever(
    base_retriever, storage_context, verbose=True, simple_ratio_thresh=0.6
)

# Create the query engine on top of the retriever.
from llama_index.core.query_engine import RetrieverQueryEngine

query_engine = RetrieverQueryEngine.from_args(retriever)

# Run a query and print the response.
query = "非暴力沟通的第一个要素是什么？"
response = query_engine.query(query)
print(response)