# Quackling — Hybrid Pipeline

Note: this example requires a Milvus instance with hybrid retrieval support; its URI must be provided through the `MILVUS_URL` environment variable (e.g. in a `.env` file).

In [1]:
# requirements for this example:
%pip install -q \
    quackling \
    python-dotenv \
    llama-index-embeddings-huggingface \
    llama-index-llms-huggingface-api \
    flagembedding \
    llama-index-vector-stores-milvus

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

from dotenv import load_dotenv
from pydantic import TypeAdapter
from rich.pretty import pprint

# load variables from a local .env file (if present) into the environment
load_dotenv()

# documents to ingest; each entry is either a local file path or a URL
FILE_PATHS = [
    # "/path/to/local/pdf",  # file path
    "https://arxiv.org/pdf/2206.01062",  # URL (DocLayNet paper)
]

# QA prompt, written one template line per literal; adjacent literals are
# concatenated by the parser into a single string
TEXT_QA_TEMPLATE_STR = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, answer the query.\n"
    "Query: {query_str}\n"
    "Answer:\n"
)
QUERY = "How many pages were human annotated?"
TOP_K = 3  # passed as similarity_top_k to the query engine and the retriever

# ingestion toggle, read from the INGEST environment variable (defaults to "True")
# and validated as a bool
INGEST = TypeAdapter(bool).validate_python(os.getenv("INGEST", "True"))

In [3]:
import warnings

warnings.filterwarnings(
    action="ignore", category=UserWarning, module="torchvision|torch"
)

## Setup

### Reader

In [4]:
from quackling.llama_index.readers.docling_reader import DoclingReader

# reader configured with the JSON parse type
reader = DoclingReader(parse_type=DoclingReader.ParseType.JSON)

# keyword arguments forwarded to reader.load_data() at ingestion time
load_kwargs = {"file_path": FILE_PATHS}

### Node parser

In [5]:
from quackling.llama_index.node_parsers.hier_node_parser import HierarchicalNodeParser

# used as the only transformation when the index is built from documents
node_parser = HierarchicalNodeParser()

### Dense embedding model

In [6]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Hugging Face model ID of the dense embedding model
HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"

# dense embedder; the length of its output is also used as the vector store `dim`
embed_model = HuggingFaceEmbedding(model_name=HF_EMBED_MODEL_ID)

### LLM

In [7]:
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# API token read from the environment; None when HF_API_KEY is not set
HF_API_KEY = os.getenv("HF_API_KEY")
HF_LLM_MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# LLM client used by the query engine in the RAG section
llm = HuggingFaceInferenceAPI(model_name=HF_LLM_MODEL_ID, token=HF_API_KEY)

### Vector store

In [8]:
from FlagEmbedding import BGEM3FlagModel
from llama_index.vector_stores.milvus.utils import BaseSparseEmbeddingFunction


class BGEM3SparseEmbedding(BaseSparseEmbeddingFunction):
    """Sparse embedding function backed by the BAAI/bge-m3 model.

    Queries and documents are encoded the same way: only the sparse output
    of the model (its "lexical_weights") is requested, and every per-text
    result is returned as a plain dict with integer keys.
    """

    def __init__(self):
        # full precision (use_fp16=False)
        self.model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=False)

    def encode_queries(self, queries: list[str]):
        """Return one sparse vector dict per query string."""
        return self._encode_sparse(queries)

    def encode_documents(self, documents: list[str]):
        """Return one sparse vector dict per document string."""
        return self._encode_sparse(documents)

    def _encode_sparse(self, texts):
        """Encode `texts` requesting only the sparse output of the model."""
        encoded = self.model.encode(
            texts,
            return_dense=False,
            return_sparse=True,
            return_colbert_vecs=False,
        )
        return [
            self._to_standard_dict(weights) for weights in encoded["lexical_weights"]
        ]

    def _to_standard_dict(self, raw_output):
        """Copy `raw_output` into a dict, casting keys to int; values are kept as-is."""
        return {int(key): raw_output[key] for key in raw_output}

In [9]:
from llama_index.vector_stores.milvus import MilvusVectorStore

# Milvus settings, all read from environment variables
MILVUS_URL = os.environ["MILVUS_URL"]  # required: raises KeyError when unset
MILVUS_COLL_NAME = os.environ.get("MILVUS_COLL_NAME", "quackling_hybrid_pipeline")
# optional extra MilvusVectorStore keyword arguments, given as a JSON object
MILVUS_KWARGS = TypeAdapter(dict).validate_json(os.environ.get("MILVUS_KWARGS", "{}"))
# hybrid ranker name and its parameters (JSON object), passed to the store below
MILVUS_HYBRID_RANKER = os.environ.get("MILVUS_HYBRID_RANKER", "RRFRanker")
MILVUS_HYBRID_RNKR_PARAMS = TypeAdapter(dict).validate_json(
    os.environ.get("MILVUS_HYBRID_RNKR_PARAMS", '{"k": 60}')
)

# sparse counterpart of the dense `embed_model`
sparse_embedding = BGEM3SparseEmbedding()

vector_store = MilvusVectorStore(
    uri=MILVUS_URL,
    # dense dimension is measured by embedding a throwaway string
    dim=len(embed_model.get_text_embedding("hi")),
    collection_name=MILVUS_COLL_NAME,
    overwrite=INGEST,  # not showing follow-up/incremental ingestions in this example
    enable_sparse=True,
    sparse_embedding_function=sparse_embedding,
    hybrid_ranker=MILVUS_HYBRID_RANKER,
    hybrid_ranker_params=MILVUS_HYBRID_RNKR_PARAMS,
    # NOTE(review): a key in MILVUS_KWARGS that repeats one of the arguments
    # above would raise TypeError (duplicate keyword argument)
    **MILVUS_KWARGS,
)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [10]:
from llama_index.core.vector_stores.types import VectorStoreQueryMode

# query mode shared by the query engine and the retriever created further down
vector_store_query_mode = VectorStoreQueryMode.HYBRID

## Preparing vector store index

In [11]:
from llama_index.core import StorageContext, VectorStoreIndex

if INGEST:
    # ingest: load the source documents, then parse, embed and store them
    docs = reader.load_data(**load_kwargs)
    pprint(docs, max_length=1, max_string=250, max_depth=4)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(
        documents=docs,
        embed_model=embed_model,
        storage_context=storage_context,
        transformations=[
            node_parser,
        ],
    )
else:
    # no ingestion: wrap the existing collection. The embedding model is passed
    # explicitly so that queries are embedded with the same model that was used
    # at ingestion time, instead of whatever default is configured globally.
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=embed_model,
    )

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

## RAG

In [12]:
from llama_index.core import PromptTemplate

# query engine over the index: hybrid retrieval of TOP_K nodes, answered by `llm`
# using the custom QA prompt
query_engine = index.as_query_engine(
    vector_store_query_mode=vector_store_query_mode,
    similarity_top_k=TOP_K,
    text_qa_template=PromptTemplate(TEXT_QA_TEMPLATE_STR),
    llm=llm,
)

# run the sample query and show a truncated view of the response
query_res = query_engine.query(QUERY)
pprint(query_res, max_depth=4, max_string=250, max_length=1)

## Retrieval

In [13]:
# retrieval only (no LLM involved): same query mode and top-k as the query engine
retriever = index.as_retriever(
    vector_store_query_mode=vector_store_query_mode,
    similarity_top_k=TOP_K,
)

# fetch the nodes for the sample query and show a truncated view of them
retr_res = retriever.retrieve(QUERY)
pprint(retr_res, max_depth=4, max_string=250, max_length=1)