In [5]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader  # type: ignore
from typing import List
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.retrievers.multi_query import MultiQueryRetriever

In [10]:
def ollama_query(context, question, model_name="qwen2.5:7b", strip_think=False):
    """Answer ``question`` from ``context`` using a local Ollama model.

    Builds a ``prompt | model | StrOutputParser()`` chain and invokes it once.

    Args:
        context: Retrieved text substituted into the prompt's ``{context}`` slot.
        question: User question substituted into the ``{question}`` slot.
        model_name: Ollama model tag passed to ``OllamaLLM``. Defaults to
            ``"qwen2.5:7b"``, the model this function previously hard-coded.
        strip_think: When True, remove every ``<think>...</think>`` span (and
            the whitespace following it) from the reply before returning.
            Defaults to False, i.e. the reply is returned untouched.

    Returns:
        The chain's output after ``StrOutputParser``.
    """
    template = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    Question: {question} 
    Context: {context} 
    Answer:
    """
    prompt = ChatPromptTemplate.from_template(template)

    model = OllamaLLM(model=model_name)
    chain = prompt | model | StrOutputParser()
    response = chain.invoke({"question": question, "context": context})

    if strip_think:
        # Local import: only this optional path needs the regex module.
        import re

        # DOTALL lets a <think> block span several lines; the non-greedy
        # match removes each block separately instead of everything between
        # the first opening tag and the last closing tag.
        response = re.sub(r"<think>.*?</think>\s*", "", response, flags=re.DOTALL)
    return response


def load_chroma_db(
    db_path="./rag_chroma",
    embedding=None,
    collection_name="navie_rag",
):
    """Open the Chroma vector store persisted at ``db_path``.

    Args:
        db_path: Directory passed to ``Chroma`` as ``persist_directory``.
        embedding: Embedding function handed to ``Chroma``. When None (the
            default), a ``HuggingFaceEmbeddings(model_name="BAAI/bge-base-en")``
            instance is built on first use and reused by later calls.
        collection_name: Name of the collection to open.

    Returns:
        The ``Chroma`` instance.
    """
    if embedding is None:
        # Python evaluates default argument values once, when the ``def``
        # statement runs. Constructing the embedding model in the signature
        # therefore loaded it as soon as this cell executed, even for callers
        # that pass their own embedding. Build it lazily instead, and cache it
        # on the function so the default instance is still shared across calls.
        embedding = getattr(load_chroma_db, "_default_embedding", None)
        if embedding is None:
            embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en")
            load_chroma_db._default_embedding = embedding

    return Chroma(
        embedding_function=embedding,
        persist_directory=db_path,
        collection_name=collection_name,
    )


def multi_query_search(query, db, model_name="qwen2.5:7b"):
    """Retrieve documents for ``query`` through a ``MultiQueryRetriever``.

    Args:
        query: The user question to search for.
        db: Vector store exposing ``as_retriever()``; it supplies the base
            retriever.
        model_name: Ollama model tag for the LLM handed to
            ``MultiQueryRetriever.from_llm``. Defaults to ``"qwen2.5:7b"``,
            the model this function previously hard-coded.

    Returns:
        Whatever ``retriever.invoke(query)`` returns.
    """
    retriever = MultiQueryRetriever.from_llm(
        retriever=db.as_retriever(),
        llm=OllamaLLM(model=model_name),
        # Passed through unchanged; presumably makes the retriever search with
        # the caller's own wording as well — confirm against the LangChain docs.
        include_original=True,
    )
    print("retrieveing")
    return retriever.invoke(query)

In [11]:
db = load_chroma_db()

# Inspect the store: how many documents does the collection hold?
doc_count = db._collection.count()
print("数据库中的文档数量:", doc_count)

# Collect every document ID. `include=[]` requests the IDs only, so the
# documents and metadata are not loaded just to list identifiers.
all_ids = db._collection.get(include=[])["ids"]
print("文档ID列表:", all_ids[:10])  # show only the first 10 IDs

# Print a few example documents. NOTE: this is a similarity search on an empty
# query string, so the hits are presumably whatever lies closest to that
# query's embedding — not a random sample.
if doc_count > 0:
    sample_results = db.similarity_search("", k=3)
    print("\n示例文档内容:")
    for i, doc in enumerate(sample_results):
        text = doc.page_content
        # Truncate long content first, then add the label. The original put
        # the conditional around the whole f-string, which dropped the label
        # for documents of 200 characters or fewer.
        preview = f"{text[:200]}..." if len(text) > 200 else text
        print(f"文档 {i+1}:")
        print(f"内容: {preview}")
        print(f"元数据: {doc.metadata}")
        print()

数据库中的文档数量: 208
文档ID列表: ['09ad5e4a-82e4-4a3c-870d-7957413150bc', '19badb6a-d660-49f0-bbb7-f7fdef9b7f88', 'efb7b018-165e-4e53-a2df-081a17ff4753', '63eb46ca-c6f0-43d0-add9-4ff1e5b87549', '211fdf87-c0f9-40cd-a1e4-3ae2ac881303', '0b3bbb2f-5e34-4340-9862-18205e8cd3cf', '55325bd3-348e-4ccb-9b3c-6611666ffebc', '1cf9b3f2-f470-47a9-88a2-53e3e283668d', '841e6e3c-881c-4f6a-b96c-df4bfb008aa4', 'fa8bc868-d3d4-42f4-ae96-56bf4da13e0e']

示例文档内容:
文档 1:
内容: some also coverage except for the cruciforms. In contrast, Sequencer’s ERFs are limited to the
cruciform and its neighborhood.
It is interesting to note that Sequencer, with its characteristic ERFs, a...
元数据: {'author': '', 'creationdate': '2023-01-13T02:09:12+00:00', 'creator': 'LaTeX with hyperref', 'keywords': '', 'moddate': '2023-01-13T02:09:12+00:00', 'page': 20, 'page_label': '21', 'producer': 'pdfTeX-1.40.21', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'source': 'lstm.pdf', 'subje

In [12]:
# Run the multi-query retrieval for a sample question against the loaded store;
# as the cell's last expression, the returned documents are displayed below.
multi_query_search("what is LSTM?", db)

retrieveing


[Document(id='a6abc39e-15cd-4c89-9bcb-02f35f1a95c4', metadata={'author': '', 'creationdate': '2023-01-13T02:09:12+00:00', 'creator': 'LaTeX with hyperref', 'keywords': '', 'moddate': '2023-01-13T02:09:12+00:00', 'page': 2, 'page_label': '3', 'producer': 'pdfTeX-1.40.21', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'source': 'lstm.pdf', 'subject': '', 'title': '', 'total_pages': 26, 'trapped': '/False'}, page_content='of the proposed architectures.\n3.1 Preliminaries: Long short-term memory\nLSTM [27] is a specialized recurrent neural network (RNN) for modeling long-term dependencies of\nsequences. Plain LSTM has an input gate it that controls the storage of inputs, a forget gate ft that\ncontrols the forgetting of the former cell state ct−1 and an output gate ot that controls the cell output\nht from the current cell state ct. Plain LSTM is formulated as follows:\nit = σ(Wxixt + Whiht−1 + bi) , ft = σ(Wxf xt + Whf ht−1 + b