In [1]:
import os
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_milvus import Milvus, BM25BuiltInFunction
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_ollama import ChatOllama

In [2]:
# Dense embedding model; reused by the semantic splitter and the Milvus store
# so that chunking and indexing share one vector space. Runs on CPU.
embedding = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
    model_kwargs=dict(device="cpu"),
)

In [3]:
def load_pdf_files(directory):
    """Load every PDF directly inside ``directory`` and split it into semantic chunks.

    Each PDF is read page by page with ``PyMuPDFLoader`` (tables extracted as
    markdown). The page texts are then split by a ``SemanticChunker`` that uses
    the notebook-level ``embedding`` model with percentile breakpoints.

    Args:
        directory: Path of the folder to scan. Sub-directories are not
            descended into.

    Returns:
        A flat list with the chunk documents of all PDFs, in sorted file-name
        order. Only the page text is handed to the splitter, so the loader's
        per-page metadata is not passed along.
    """
    result = []
    semantic_splitter = SemanticChunker(
        embedding,
        breakpoint_threshold_type = "percentile"
    )
    # os.listdir yields entries in arbitrary order; sort for reproducible runs.
    for file in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file)
        # Skip sub-directories (e.g. .ipynb_checkpoints) and non-PDF files
        # (e.g. .DS_Store) instead of handing them to the PDF loader.
        if not os.path.isfile(file_path) or not file.lower().endswith(".pdf"):
            continue
        loader = PyMuPDFLoader(
            file_path = file_path,
            mode = "page",
            extract_tables = "markdown",
        )
        documents = loader.load()
        semantic_chunks = semantic_splitter.create_documents([d.page_content for d in documents])
        result.extend(semantic_chunks)
    return result

In [4]:
# https://milvus.io/docs/zh/milvus_hybrid_search_retriever.md
# Index the semantic chunks in Milvus with two vector fields: "dense" (filled
# from `embedding`) and "sparse" (filled by the built-in BM25 function), so the
# store can serve hybrid dense + full-text search.
# NOTE: drop_old = True asks Milvus to replace any existing collection, so
# re-running this cell rebuilds the index from scratch.
vectorstore = Milvus.from_documents(
    documents = load_pdf_files('./docs'),
    embedding = embedding,
    builtin_function = BM25BuiltInFunction(),
    vector_field = ["dense", "sparse"],
    connection_args={
        "uri":  "http://localhost:19530",
    },
    consistency_level = "Strong",
    drop_old = True,
)

rerank_model = HuggingFaceCrossEncoder(model_name = "BAAI/bge-reranker-v2-m3")
# The base retriever must return more candidates than the reranker keeps.
# With the default k (4) the cross-encoder could only reorder 4 documents and
# top_n = 5 was unreachable, so fetch a wider pool and let the reranker pick.
retriever = ContextualCompressionRetriever(
    base_compressor = CrossEncoderReranker(model = rerank_model, top_n = 5),
    base_retriever = vectorstore.as_retriever(search_kwargs = {"k": 20})
)

In [5]:
def format_context(context_docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    passages = [document.page_content for document in context_docs]
    separator = "\n\n"
    return separator.join(passages)

In [6]:
# Local chat model served through Ollama.
llm = ChatOllama(model="deepseek-r1:32b", temperature=0.5)

# Single user-message template; {context} and {question} are filled in by the chain.
formatted_prompt = """
Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n\n
"""
prompt = ChatPromptTemplate.from_messages([("user", formatted_prompt)])

# The incoming query is fanned out: it is sent through the retriever (and the
# retrieved documents are flattened to text) for "context", and passed through
# unchanged for "question". The result feeds the prompt, then the model, and
# the model's message is finally reduced to a plain string.
rag_chain = (
    dict(
        context=retriever | format_context,
        question=RunnablePassthrough(),
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [7]:
# Smoke-test the pipeline: send one question through the RAG chain.
query = "What is RAG"
res = rag_chain.invoke(query)
# Bare last expression so the notebook displays the answer string.
res

'<think>\nOkay, I need to figure out what RAG stands for based on the provided context. Let me start by reading through the given information carefully. \n\nThe context mentions an "OVERVIEW OF RAG" and describes it as a typical application where a user asks ChatGPT a question about recent news. Since ChatGPT relies on pre-training data, it can\'t provide up-to-date info. So, RAG bridges this gap by using external databases to get relevant information, which is then used along with the original question to help LLMs generate better answers.\n\nLooking further, there\'s a section labeled A. Naive RAG, which talks about an early methodology that became popular after some initial work. It also mentions figures and tables related to RAG models generating more specific and accurate responses compared to BART models in certain tasks like Jeopardy question generation.\n\nThe context doesn\'t explicitly define RAG, but based on the usage and surrounding information, it\'s likely referring to a