In [1]:
import os
import chromadb
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
# 1. 读取文档
with open(os.path.join('../data', 'Elon.txt'), 'r') as f:
    text = f.read()

In [3]:
# 2. 初始化解析器
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=64, chunk_overlap=2, add_start_index=True
)

In [4]:
# 3. 将文档解析为文本节点
doc=text_splitter.create_documents([text])
all_splits = text_splitter.split_documents(doc)
print(len(all_splits))
print(all_splits[0])

53
page_content='Elon Reeve Musk (/ˈiːlɒn/ EE-lon; born June 28, 1971) is a' metadata={'start_index': 0}


In [5]:
# 4. 初始化ChromaDB客户端
client = chromadb.PersistentClient(path="./chromaDemo")
collection = client.get_collection(name="ExampleText")

In [6]:
# 5. 准备文本块和元数据
texts = [split.page_content for split in all_splits]
metadatas = [split.metadata for split in all_splits]
ids = [f"node_{i}" for i in range(len(all_splits))]

In [None]:
# 6. 添加文本块到ChromaDB
collection.add(
    documents=texts,
    metadatas=metadatas,
    ids=ids,
)

In [14]:
# 7. 查询文本块
results = collection.query(
    query_texts=["When did Musk establish xAI"],
    n_results=2,
    # where={"metadata_field": "is_equal_to_this"}, # 可选过滤器
    # where_document={"$contains":"search_string"}  # 可选过滤器
)
print(results)
for document in results['documents']:
    for document_ in document:
        print(document_)  

{'ids': [['node_43', 'node_20']], 'distances': [[0.3531293511338359, 0.7765314052605274]], 'metadatas': [[{'start_index': 0}, {'start_index': 0}]], 'embeddings': None, 'documents': [['March 2023, Musk founded xAI, an artificial intelligence', 'year, Musk co-founded X.com, a direct bank. X.com merged with']], 'uris': None, 'data': None}
March 2023, Musk founded xAI, an artificial intelligence
year, Musk co-founded X.com, a direct bank. X.com merged with
