In [1]:
from qdrant_client import QdrantClient
from qdrant_client import models
import json
import uuid

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Client for the Qdrant server addressed at http://localhost:6333.
qdClient = QdrantClient(url="http://localhost:6333")

In [3]:
# Read the table and text records from their JSON files and tag every record
# with a "content_type" so the two kinds can be told apart later on.
tables, texts = [], []

with open("summarized_tables.json", encoding="utf-8") as tables_file:
    tables = json.load(tables_file)

for record in tables:
    record.update(content_type="table")

with open("summarized_texts.json", encoding="utf-8") as texts_file:
    texts = json.load(texts_file)

for record in texts:
    record.update(content_type="text")

In [6]:
# Name of the Qdrant collection used throughout the notebook.
collection_name = "stardew-sparse-and-dense"

# Dense embedding model handle and the vector size configured for it.
vector_model_handle = "jinaai/jina-embeddings-v2-small-en"
EMBEDDING_DIMENSIONALITY = 512

# Sparse model handle. The identifier is misspelled ("spasrse") but is kept
# unchanged because other cells refer to it by this exact name.
spasrse_model_handle = "Qdrant/bm25"

In [13]:
# Look up the collection; the returned info is shown as the cell output.
qdClient.get_collection(collection_name=collection_name)

ResponseHandlingException: timed out

In [7]:
# Create the collection only when it is missing, so that re-running this cell
# (or the whole notebook) does not fail on an already existing collection.
if not qdClient.collection_exists(collection_name):
    qdClient.create_collection(
        collection_name=collection_name,
        vectors_config={
            # Named dense vector for jinaai/jina-embeddings-v2-small-en
            "jina-small": models.VectorParams(
                size=EMBEDDING_DIMENSIONALITY,
                distance=models.Distance.COSINE,
            ),
        },
        sparse_vectors_config={
            # Named sparse vector; the IDF modifier is requested for it.
            "bm25": models.SparseVectorParams(
                modifier=models.Modifier.IDF,
            )
        },
    )

ResponseHandlingException: timed out

In [None]:
# Build one point per text record. The same composed string is used as input
# for both named vectors, and the record itself is stored as the payload.
points = []
for text in texts:
    embed_input = (
        f"Page title: {text['page_title']}. "
        f"Section title: {text['section_title']}. "
        f"text: {text['text']}"
    )
    named_vectors = {
        "jina-small": models.Document(text=embed_input, model=vector_model_handle),
        "bm25": models.Document(text=embed_input, model=spasrse_model_handle),
    }
    points.append(
        models.PointStruct(
            id=uuid.uuid4().hex,  # fresh random id for every point
            vector=named_vectors,
            payload=text,
        )
    )

In [None]:
# Append one point per table record to the existing `points` list. The table
# summary (not the raw table) is what gets embedded; the record is the payload.
for table in tables:
    embed_input = (
        f"Page title: {table['page_title']}. "
        f"Section title: {table['section_title']}. "
        f"Table summary: {table['summary']}"
    )
    named_vectors = {
        "jina-small": models.Document(text=embed_input, model=vector_model_handle),
        "bm25": models.Document(text=embed_input, model=spasrse_model_handle),
    }
    points.append(
        models.PointStruct(
            id=uuid.uuid4().hex,  # fresh random id for every point
            vector=named_vectors,
            payload=table,
        )
    )


In [None]:
def batch_upsert(qdClient, collection_name, points, batch_size=500):
    """Upsert ``points`` into ``collection_name`` in consecutive batches.

    Args:
        qdClient: Object exposing ``upsert(collection_name=..., points=...)``.
        collection_name: Name of the target collection.
        points: Sequence of points; it must support ``len`` and slicing.
        batch_size: Maximum number of points sent per ``upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative value
            would otherwise produce an empty range and silently upload
            nothing.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total = len(points)
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        qdClient.upsert(collection_name=collection_name, points=points[start:end])
        print(f"✅ Upserted {end}/{total}")

In [None]:
# Upload every prepared point, 1000 per request.
batch_upsert(
    qdClient=qdClient,
    collection_name=collection_name,
    points=points,
    batch_size=1000,
)

In [None]:
def multi_stage_search(client, collection_name, query: str, limit: int = 5) -> list[models.ScoredPoint]:
    """Run a two-stage query: dense prefetch, then a query on the "bm25" vector.

    Args:
        client: Client object exposing ``query_points``.
        collection_name: Collection to search.
        query: Text used for both the prefetch and the final query.
        limit: Number of points requested from the final stage.

    Returns:
        The ``points`` attribute of the ``query_points`` response.
    """
    # Stage 1: fetch three times as many candidates as will finally be
    # returned, so the second stage has a real pool to re-rank.
    dense_candidates = models.Prefetch(
        query=models.Document(text=query, model=vector_model_handle),
        using="jina-small",
        limit=3 * limit,
    )

    # Stage 2: query the prefetched candidates using the "bm25" vector.
    rerank_query = models.Document(text=query, model=spasrse_model_handle)

    response = client.query_points(
        collection_name=collection_name,
        prefetch=[dense_candidates],
        query=rerank_query,
        using="bm25",
        limit=limit,
        with_payload=True,
    )
    return response.points

In [None]:
# Sample question reused by the search cells in this notebook.
question = "How to become friends with Abigail?"

results = multi_stage_search(
    client=qdClient,
    collection_name=collection_name,
    query=question,
)
# Display the first hit as the cell output.
results[0]

In [None]:
# Display the full list currently bound to `results` as the cell output.
results

In [None]:
def rrf_search(client, collection_name, query: str, limit: int = 5) -> list[models.ScoredPoint]:
    """Hybrid search that fuses dense and sparse candidates with RRF.

    Args:
        client: Client object exposing ``query_points``.
        collection_name: Collection to search.
        query: Text embedded for both the "jina-small" and "bm25" prefetches.
        limit: Maximum number of fused points to return.

    Returns:
        The ``points`` attribute of the ``query_points`` response, capped at
        ``limit`` entries.
    """
    results = client.query_points(
        collection_name=collection_name,
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model=vector_model_handle,
                ),
                using="jina-small",
                # Over-fetch so the fusion step has enough candidates.
                limit=(5 * limit),
            ),
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model=spasrse_model_handle,
                ),
                using="bm25",
                limit=(5 * limit),
            ),
        ],
        # Fusion query enables fusion on the prefetched results
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        # Apply the caller's limit to the fused result; without it the
        # `limit` argument only scaled the prefetches and the final size
        # fell back to the query_points default.
        limit=limit,
        with_payload=True,
    )

    return results.points

In [None]:
# Separate client object for the same address, used by the RRF search cell.
client = QdrantClient(url="http://localhost:6333")

In [None]:
# Run the RRF hybrid search for the sample question.
results = rrf_search(
    client=client,
    collection_name=collection_name,
    query=question,
)