## Hybrid Search RAG

* Data - Chunk - Embeddings (Dense and Sparse) - Vector Database
* Instead of using Langchain directly, we will use Qdrant or vector database.

In [None]:
!pip install langchain langchain-community
!pip install pypdfium2
!pip install fastembed
!pip install qdrant-client

In [2]:
from langchain_community.document_loaders import WebBaseLoader,PyPDFium2Loader
from langchain_text_splitters import RecursiveCharacterTextSplitter



In [3]:
urls = [
    "https://atyantik.com/",
    "https://atyantik.com/about-us/",
    "https://atyantik.com/software-development-company/",
    "https://atyantik.com/saas-development/",
]

In [4]:
loader = WebBaseLoader(urls)
# loader = PyPDF(/content/drive/MyDrive/Resume/Tarun_Resume.pdf)
documents = loader.load()

In [6]:
splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=0)
chunks = splitter.split_documents(documents)

In [7]:
len(chunks)

45

## Vector Database

In [8]:
from qdrant_client import QdrantClient,models
from qdrant_client.http.models import Distance, VectorParams, SparseVectorParams
from qdrant_client.models import PointStruct
from fastembed import TextEmbedding, SparseTextEmbedding
# search logic (Points)
# vectorParams - Dense
# Distance - Cosine
# SparseParams: TFIDF, BM25 and BM42
import os
from google.colab import userdata

1. Define the client
2. Create collection: Configure parameters
3. Add documents. Loop it over the entire chunks with the embeddings.
4. Search (Points)
5. Node: LangGraph node for the retrieval.

In [9]:
url = userdata.get("QDRANT_URL")
api_key = userdata.get("QDRANT_API_KEY")

In [10]:
client = QdrantClient(
    url = url,
    api_key=api_key,
)

In [11]:
collection_name = "hybrid"

## Load the embedding model

In [None]:
dense_embedding_model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-base-en")

In [None]:
sparse_embedding_model = SparseTextEmbedding("Qdrant/BM25")

In [14]:
client.create_collection(
    collection_name = collection_name,
    vectors_config = {
        "dense": VectorParams(
            size = 768,
            distance = Distance.COSINE,
            on_disk=True
        )
    },
    sparse_vectors_config = {
        "sparse": SparseVectorParams(
            modifier=models.Modifier.IDF
        )
    },
    quantization_config=models.BinaryQuantization(
        binary = models.BinaryQuantizationConfig(always_ram=False)
    )
)

True

## Points - Add your data. Upsert: Updating it into the collection

In [18]:
dummy_eles = ['Naruto','OP','DBZ','Bleach']

In [22]:
for idx, ele in enumerate(dummy_eles):
  print(ele)

Naruto
OP
DBZ
Bleach


enumerate - looping but with an index

client
  - payload
  - vector (embeddings)
  - id - unique

In [23]:
dense_embeddings = list(dense_embedding_model.embed(doc.page_content for doc in chunks))
sparse_embeddings = list(sparse_embedding_model.embed(doc.page_content for doc in chunks))

In [40]:
points = []

for idx in range(len(chunks)):
    point = PointStruct(
        id = idx,
        vector = {
            "dense": dense_embeddings[idx],
            "sparse": sparse_embeddings[idx].as_object()
        },
        payload = {"document": chunks[idx].page_content,
                   "source": chunks[idx].metadata['source']}
    )
    points.append(point)

In [41]:
add = client.upsert(
    points = points,
    collection_name = collection_name
)

- Enumerate: If you need index along with sequence/elements
- Zip: When you have multiple lists of same length, if you want to loop through all the list, then you use zip
- next: use only when you have a return object as Iterable

## Query

In [42]:
query = "mail to contact Atyantik"

In [43]:
dense_vectors = next(dense_embedding_model.query_embed(query))
sparse_vectors = next(sparse_embedding_model.query_embed(query))

In [46]:
prefetch = [
    models.Prefetch(
        query = dense_vectors,
        using = "dense", # vector search
        limit = 10
    ),
    models.Prefetch(
        query = models.SparseVector(**sparse_vectors.as_object()),
        using = "sparse", # keyword search
        limit = 10
    )
]

In [47]:
top_k = 3

In [61]:
context = client.query_points(
    collection_name = collection_name,
    prefetch = prefetch, # hybrid search
    query = dense_vectors,
    using = "dense",
    with_payload=True,
    limit = top_k
)

In [62]:
context.points[0]

ScoredPoint(id=44, version=0, score=0.8297515, payload={'document': 'Atyantik is a team of techno enthusiasts with over a decade of experience delivering IT solutions for clients across diverse industries. At Atyantik, we believe in providing our customers a value-driven, highly professional expertise and offering much more up-to-date technology, time-efficient, and cost-effective solutions.\t\t\t\t\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSite map\nAbout us\nServices\nOur Work\nCareers\n \n\nPartnership\nContact\n \n\nContact\n\n\t\t\t\t\t\t\t501, Privilege Avenue, Dr. Vikram Sarabhai Campus, Atlantis Lane, Beside Tricolor Hospital, Above HDFC Bank, Vadodara, Gujarat 390022, India\t\t\t\t\t\t\n\n\n\n\n\n\t\t\t\t\t\t\t\t+91-8347435435\t\t\t\t\t\t\t\n\n\n\n\n\n\n\n\t\t\t\t\t\t\t\tcontact@atyantik.com\t\t\t\t\t\t\t\n\n\n\n\n\nCopyright Â© 2025 Atyantik Technologies Private Limited\n\n\nPrivacy policy\nTerms and Conditions\nCookie policy', 'source': 'ht

## LangGraph

- context:List[str]
- query: str
- answer: str

In [None]:
def search(state: RAGState) -> RAGState:
  dense_vectors = next(dense_embedding_model.query_embed(state['query']))
  sparse_vectors = next(sparse_embedding_model.query_embed(state['query']))
  prefetch = [
    models.Prefetch(
        query = dense_vectors,
        using = "dense", # vector search
        limit = 10
    ),
    models.Prefetch(
        query = models.SparseVector(**sparse_vectors.as_object()),
        using = "sparse", # keyword search
        limit = 10
    )
]
relevant_docs = client.query_points(
    collection_name = collection_name,
    prefetch = prefetch, # hybrid search
    query = dense_vectors,
    using = "dense",
    with_payload=True,
    limit = top_k
)
context = []
for info in relevant_docs.points:
  context.append(info.payload['document'])

state['context'] = context
return state