# Quackling — Hybrid Pipeline

Note: this example requires a Milvus instance with hybrid retrieval support.

In [1]:
# Requirements for this example. `%pip` (rather than `!pip`) targets the
# environment of the running kernel; `-qq` keeps the installer output quiet.
# NOTE(review): no versions are pinned here, so a fresh install picks up
# whatever releases are current — consider pinning for reproducibility.
%pip install -qq \
    quackling \
    python-dotenv \
    llama-index-embeddings-huggingface \
    llama-index-llms-huggingface-api \
    flagembedding peft \
    llama-index-vector-stores-milvus \
    llama-index-postprocessor-flag-embedding-reranker

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

from dotenv import load_dotenv
from pydantic import TypeAdapter
from rich.pretty import pprint

# Load settings via python-dotenv before any os.environ lookups below.
load_dotenv()

# Documents to ingest: each entry is either a local file path or a URL.
FILE_PATHS = [
    # "/path/to/local/pdf",  # file path
    "https://arxiv.org/pdf/2206.01062",  # URL (DocLayNet paper)
]

# Prompt for answer synthesis; it carries the {context_str} and {query_str}
# placeholders. Written as adjacent literals, one per output line.
TEXT_QA_TEMPLATE_STR = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, answer the query.\n"
    "Query: {query_str}\n"
    "Answer:\n"
)

# Question asked in the RAG and retrieval sections, and how many results to keep.
QUERY = "What is this paper about?"
TOP_K = 5

# INGEST is read from the environment as text (default "True") and validated
# into a bool by pydantic; it selects between ingesting and loading the index.
_ingest_raw = os.environ.get("INGEST", "True")
INGEST = TypeAdapter(bool).validate_python(_ingest_raw)

In [3]:
import warnings

warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(
    action="ignore", category=FutureWarning, module="easyocr|FlagEmbedding"
)

## Setup

### Reader and node parser

In [4]:
from quackling.llama_index.node_parsers import HierarchicalJSONNodeParser
from quackling.llama_index.readers import DoclingPDFReader

# Reader configured for the JSON parse type; the node parser below is the
# JSON-oriented one (presumably consuming that JSON output — verify in quackling docs).
reader = DoclingPDFReader(
    parse_type=DoclingPDFReader.ParseType.JSON,
)
node_parser = HierarchicalJSONNodeParser()

### Dense embedding model

In [5]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Hugging Face model id of the dense embedding model.
HF_EMBED_MODEL_ID = "BAAI/bge-m3"

# This same object is later probed for its vector size and used for indexing.
embed_model = HuggingFaceEmbedding(
    model_name=HF_EMBED_MODEL_ID,
)

### LLM

In [6]:
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# Model served through the Hugging Face Inference API.
HF_LLM_MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# Token comes from the environment; this is None when HF_API_KEY is not set.
HF_API_KEY = os.environ.get("HF_API_KEY")

llm = HuggingFaceInferenceAPI(model_name=HF_LLM_MODEL_ID, token=HF_API_KEY)

### Vector store

In [7]:
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.vector_stores.milvus.utils import BGEM3SparseEmbeddingFunction

def _env_json_dict(var_name: str, fallback_json: str) -> dict:
    """Parse environment variable `var_name` as a JSON object.

    `fallback_json` is the JSON text used when the variable is not set.
    """
    return TypeAdapter(dict).validate_json(os.environ.get(var_name, fallback_json))


# Connection settings. MILVUS_URL is mandatory: a missing variable raises KeyError.
MILVUS_URL = os.environ["MILVUS_URL"]
MILVUS_COLL_NAME = os.environ.get("MILVUS_COLL_NAME", "quackling_hybrid_pipeline")
MILVUS_KWARGS = _env_json_dict("MILVUS_KWARGS", "{}")

# Hybrid-search ranker and its parameters, both overridable via the environment.
MILVUS_HYBRID_RANKER = os.environ.get("MILVUS_HYBRID_RANKER", "RRFRanker")
MILVUS_HYBRID_RNKR_PARAMS = _env_json_dict("MILVUS_HYBRID_RNKR_PARAMS", '{"k": 60}')

sparse_embedding = BGEM3SparseEmbeddingFunction()

# Embed a throwaway string once to learn the dense vector size.
dense_dim = len(embed_model.get_text_embedding("hi"))

vector_store = MilvusVectorStore(
    uri=MILVUS_URL,
    dim=dense_dim,
    collection_name=MILVUS_COLL_NAME,
    overwrite=INGEST,  # not showing follow-up/incremental ingestions in this example
    enable_sparse=True,
    sparse_embedding_function=sparse_embedding,
    hybrid_ranker=MILVUS_HYBRID_RANKER,
    hybrid_ranker_params=MILVUS_HYBRID_RNKR_PARAMS,
    **MILVUS_KWARGS,
)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [8]:
from llama_index.core.vector_stores.types import VectorStoreQueryMode

# Query mode passed as `vector_store_query_mode` to both the query engine and
# the retriever later in this notebook. HYBRID is chosen to match the vector
# store, which is created with `enable_sparse=True` and a hybrid ranker.
vector_store_query_mode = VectorStoreQueryMode.HYBRID

### Reranker

In [9]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

# Reranker used both as a query-engine postprocessor and standalone on
# retrieval results; `top_n` is tied to TOP_K.
reranker = FlagEmbeddingReranker(
    top_n=TOP_K,
    model="BAAI/bge-reranker-v2-m3",
)

## Preparing vector store index

In [10]:
from llama_index.core import StorageContext, VectorStoreIndex

# Build the index. Both branches attach the same `embed_model`, so query-time
# embeddings come from the model that produced the stored vectors.
if INGEST:
    # Ingest: read the documents, split them into nodes with `node_parser`,
    # embed them, and write them into the vector store.
    docs = reader.load_data(file_path=FILE_PATHS)
    pprint(docs, max_length=1, max_string=50, max_depth=4)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(
        documents=docs,
        embed_model=embed_model,
        storage_context=storage_context,
        transformations=[
            node_parser,
        ],
    )
else:
    # Load: wrap the already-populated vector store. `embed_model` must be
    # passed here too; otherwise LlamaIndex resolves the embedding model from
    # its global settings instead of the one used at ingestion time.
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
    )

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

## RAG

To use the reranker in RAG, we include it in the `node_postprocessors`:

In [11]:
from llama_index.core import PromptTemplate
from llama_index.core.response_synthesizers.type import ResponseMode

# Wrap the raw template string so it can be handed to the query engine.
qa_prompt = PromptTemplate(TEXT_QA_TEMPLATE_STR)

# Hybrid retrieval of TOP_K nodes, reranked, then answered by the LLM in REFINE mode.
query_engine = index.as_query_engine(
    llm=llm,
    similarity_top_k=TOP_K,
    vector_store_query_mode=vector_store_query_mode,
    node_postprocessors=[reranker],  # <==
    text_qa_template=qa_prompt,
    response_mode=ResponseMode.REFINE,
)

query_res = query_engine.query(QUERY)
pprint(query_res, max_length=1, max_string=250, max_depth=4)

## Retrieval

In [12]:
# Plain retrieval with no LLM or reranker: same top-k and query mode as the
# query engine configured earlier.
retriever = index.as_retriever(
    vector_store_query_mode=vector_store_query_mode,
    similarity_top_k=TOP_K,
)

retr_res = retriever.retrieve(QUERY)
pprint(retr_res, max_length=1, max_string=250, max_depth=4)

Here are the retrieval results (simplified) prior to reranking:

In [13]:
# Reduce each retrieved node to the three fields worth comparing.
before_rrk = [
    {
        "path": hit.metadata["path"],
        "text": hit.text,
        "score": hit.score,
    }
    for hit in retr_res
]
pprint(before_rrk, max_string=250)

Here we apply the reranker to the retrieved results — as you can see, the reranker has indeed changed the order:

In [14]:
# Run the reranker directly on the retrieval results for the same query.
reranked = reranker.postprocess_nodes(query_str=QUERY, nodes=retr_res)

# Same simplified view as before reranking, for comparison.
after_rrk = [
    {
        "path": hit.metadata["path"],
        "text": hit.text,
        "score": hit.score,
    }
    for hit in reranked
]
pprint(after_rrk, max_string=250)