In [3]:
# Import the Qdrant client and its `models` namespace; later cells use `models`
# for vector parameters, filters and point structs.
from qdrant_client import QdrantClient, models

# ":memory:" - per the qdrant-client docs this selects the local in-memory mode;
# presumably nothing is persisted once the kernel stops (confirm if persistence matters).
client = QdrantClient(":memory:")

## 데이터

In [4]:
import json

# Load the sample reviews into `data`.
# The context manager closes the file deterministically, and UTF-8 is requested
# explicitly because the review text is Korean and the platform default
# encoding is not guaranteed to be UTF-8.
# Later cells read each record's "content" field and filter on "food_category_name".
with open("data/test_data_sample.json", "r", encoding="utf-8") as sample_file:
    data = json.load(sample_file)

# Quick inspection snippets used while exploring (left disabled):
# for i in data:
#     if i["restaurant_name"] == "Test Restaurant 1":
#         print(i["content"] + "\n")
# print(data[1]["restaurant_name"])

## 임베딩 생성

In [5]:
from sentence_transformers import SentenceTransformer

# Checkpoint used for the single-vector ("reviews") part of the notebook.
# NOTE(review): the reviews and queries in this notebook are Korean; confirm this
# checkpoint handles Korean adequately before trusting the similarity scores.
ENCODER_CHECKPOINT = "sentence-transformers/all-MiniLM-L6-v2"
encoder = SentenceTransformer(ENCODER_CHECKPOINT)


  import pynvml  # type: ignore[import]


## 컬렉션 생성

In [6]:
# (Re)create the "reviews" collection so this cell can be re-run on a live kernel:
# creating a collection whose name already exists is an error, so any earlier
# copy is dropped first. Points must be uploaded again after re-running this cell.
if client.collection_exists(collection_name="reviews"):
    client.delete_collection(collection_name="reviews")

client.create_collection(
    collection_name="reviews",
    vectors_config=models.VectorParams(
        # The vector size must match the dimension the encoder produces.
        size=encoder.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    ),
)

True

## 포인트 업로드

In [7]:
# Encode all review texts in one batched call instead of one model call per
# review, then pair each resulting vector with its source record.
review_vectors = encoder.encode([rev["content"] for rev in data])

client.upload_points(
    collection_name="reviews",
    points=[
        # The point id is the record's position in `data`; the whole record is the payload.
        models.PointStruct(id=idx, vector=vector.tolist(), payload=rev)
        for idx, (rev, vector) in enumerate(zip(data, review_vectors))
    ],
)

## 쿼리를 임베딩한 뒤 "reviews" 컬렉션에 질의

In [8]:
# Embed the search text with the same encoder used for the stored reviews,
# then fetch the three nearest points from "reviews".
search_vector = encoder.encode("맛있는 음식").tolist()
response = client.query_points(
    collection_name="reviews",
    query=search_vector,
    limit=3,
)
hits = response.points

# Print each hit's payload next to its score.
for match in hits:
    print(match.payload, "score:", match.score)

{'id': 12, 'restaurant_id': 1, 'restaurant_name': 'Test Restaurant 1', 'food_category_id': 1, 'food_category_name': '한식', 'member_id': 13, 'group_id': None, 'subgroup_id': None, 'content': '빠른 회전율과 담백한 국물이 예술인 곳', 'is_recommended': True, 'created_at': '2025-03-14T17:02:45.369534', 'updated_at': None, 'deleted_at': None, 'images': []} score: 0.7873294498093656
{'id': 5, 'restaurant_id': 1, 'restaurant_name': 'Test Restaurant 1', 'food_category_id': 1, 'food_category_name': '한식', 'member_id': 6, 'group_id': None, 'subgroup_id': None, 'content': '굿 좋은 가격 좋은 맛', 'is_recommended': True, 'created_at': '2025-09-07T17:02:45.368073', 'updated_at': None, 'deleted_at': None, 'images': []} score: 0.7870660908308467
{'id': 15, 'restaurant_id': 1, 'restaurant_name': 'Test Restaurant 1', 'food_category_id': 1, 'food_category_name': '한식', 'member_id': 16, 'group_id': None, 'subgroup_id': None, 'content': '아들 따라서 가봤는데 맛있었습니다. 가족모임 하기 아주 좋은 장소였습니다.', 'is_recommended': True, 'created_at': '2025-12-26T17:

## 필터링 검색 (food category, 한식, 일식 ...)

In [9]:
# Restrict the search to points whose payload field "food_category_name" is "일식".
category_filter = models.Filter(
    must=[
        models.FieldCondition(
            key="food_category_name",
            match=models.MatchValue(value="일식"),
        )
    ]
)

# Same search text as the unfiltered query, but only the single best match is requested.
hits = client.query_points(
    collection_name="reviews",
    query=encoder.encode("맛있는 음식").tolist(),
    query_filter=category_filter,
    limit=1,
).points

for match in hits:
    print(match.payload, "score:", match.score)

{'id': 79, 'restaurant_id': 4, 'restaurant_name': 'Test Restaurant 4', 'food_category_id': 4, 'food_category_name': '일식', 'member_id': 80, 'group_id': None, 'subgroup_id': None, 'content': '아이도 참 좋아했어요 반찬이 맛있는 게 많이 나오네요', 'is_recommended': True, 'created_at': '2025-09-25T17:02:45.384269', 'updated_at': None, 'deleted_at': None, 'images': []} score: 0.6509563059150395


## Sparse + dense hybrid search with late-interaction (ColBERT) reranking

In [10]:
# Rebind `client` to a new QdrantClient(":memory:") instance for the hybrid-search section.
# NOTE(review): presumably this fresh in-memory instance does not contain the
# "reviews" collection created earlier - confirm before re-running the earlier query cells.
client = QdrantClient(":memory:")

In [11]:
from fastembed import TextEmbedding, LateInteractionTextEmbedding, SparseTextEmbedding 

In [12]:
# Model identifiers, kept together so they are easy to swap.
DENSE_MODEL_NAME = "intfloat/multilingual-e5-large"
SPARSE_MODEL_NAME = "Qdrant/bm25"
LATE_INTERACTION_MODEL_NAME = "colbert-ir/colbertv2.0"

# One fastembed model per retrieval signal: dense, sparse (BM25) and late interaction.
# NOTE(review): confirm the late-interaction checkpoint is suitable for Korean text.
dense_embedding_model = TextEmbedding(DENSE_MODEL_NAME)
bm25_embedding_model = SparseTextEmbedding(SPARSE_MODEL_NAME)
late_interaction_embedding_model = LateInteractionTextEmbedding(LATE_INTERACTION_MODEL_NAME)

  dense_embedding_model = TextEmbedding("intfloat/multilingual-e5-large")


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

model.onnx_data:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/546k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/716 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

In [13]:
# Candidate alternative dense embedding models (not used below):
#   sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
#   jinaai/jina-embeddings-v3

In [None]:
# Embed every review three times: dense, sparse (BM25) and late interaction.
review_texts = [rev["content"] for rev in data]

# Only the dense model's inputs carry the "passage: " prefix.
dense_embeddings = list(
    dense_embedding_model.embed(["passage: " + text for text in review_texts])
)
bm25_embeddings = list(bm25_embedding_model.embed(review_texts))
late_interaction_embeddings = list(late_interaction_embedding_model.embed(review_texts))

In [15]:
from qdrant_client.models import Distance, VectorParams, models

# Drop any earlier "hybrid-search" collection so re-running this cell does not
# fail because the collection already exists. Points must be upserted again
# after re-running this cell.
if client.collection_exists("hybrid-search"):
    client.delete_collection("hybrid-search")

client.create_collection(
    "hybrid-search",
    vectors_config={
        # NOTE(review): this vector is named "all-MiniLM-L6-v2" but is sized from
        # `dense_embeddings`. The same name is used by the upsert and prefetch
        # cells, so rename all three together if the name is misleading.
        "all-MiniLM-L6-v2": models.VectorParams(
            size=len(dense_embeddings[0]),
            distance=models.Distance.COSINE,
        ),
        # Multivector: one vector per token, sized from the first token vector of
        # the first document and compared with MAX_SIM.
        "colbertv2.0": models.VectorParams(
            size=len(late_interaction_embeddings[0][0]),
            distance=models.Distance.COSINE,
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM,
            ),
            hnsw_config=models.HnswConfigDiff(m=0),  # Disable HNSW: used for reranking only
        ),
    },
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF),
    },
)

True

In [16]:
from qdrant_client.models import PointStruct

# One point per review: the three embeddings are stored under the vector names
# declared on the collection, and the source record goes under payload["document"].
embedded_reviews = zip(dense_embeddings, bm25_embeddings, late_interaction_embeddings, data)
points = [
    PointStruct(
        id=position,
        vector={
            "all-MiniLM-L6-v2": dense_vec,
            "bm25": sparse_vec.as_object(),
            "colbertv2.0": multi_vec,
        },
        payload={"document": record},
    )
    for position, (dense_vec, sparse_vec, multi_vec, record) in enumerate(embedded_reviews)
]

operation_info = client.upsert(collection_name="hybrid-search", points=points)

In [24]:
# Search text for the hybrid query (Korean, roughly "fried dumplings").
query = "만두 튀김"

In [None]:
# Embed the search text once per retrieval signal. All three embeddings are
# derived from `query`, so editing the search text in one place keeps them in
# sync (the dense input used to be a separately hard-coded copy of the text).
# The dense input gets a "query: " prefix, mirroring the "passage: " prefix
# applied to the documents.
dense_vectors = next(dense_embedding_model.query_embed("query: " + query))
sparse_vectors = next(bm25_embedding_model.query_embed(query))
late_vectors = next(late_interaction_embedding_model.query_embed(query))

In [26]:
# First-stage candidates: up to 20 hits each from the dense and the BM25 vectors.
dense_prefetch = models.Prefetch(
    query=dense_vectors,
    using="all-MiniLM-L6-v2",
    limit=20,
)
sparse_prefetch = models.Prefetch(
    query=models.SparseVector(**sparse_vectors.as_object()),
    using="bm25",
    limit=20,
)
prefetch = [dense_prefetch, sparse_prefetch]

In [27]:
# Query "hybrid-search" with the late-interaction vectors over the prefetched
# candidates, returning payloads for the top 3 points.
results = client.query_points(
    collection_name="hybrid-search",
    prefetch=prefetch,
    query=late_vectors,
    using="colbertv2.0",
    with_payload=True,
    limit=3,
)

In [28]:
# Display the full query response (scored points with their payloads).
results

QueryResponse(points=[ScoredPoint(id=91, version=0, score=26.14021952030386, payload={'document': {'id': 92, 'restaurant_id': 5, 'restaurant_name': 'Test Restaurant 5', 'food_category_id': 5, 'food_category_name': '분식', 'member_id': 93, 'group_id': None, 'subgroup_id': None, 'content': '마카롱 찐 맛 집 여기 마카롱 말고 구움 과자류도 다 맛있어요 .. 종종 사 먹는 곳', 'is_recommended': True, 'created_at': '2025-05-19T17:02:45.387118', 'updated_at': None, 'deleted_at': None, 'images': []}}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=61, version=0, score=25.84705709732299, payload={'document': {'id': 62, 'restaurant_id': 4, 'restaurant_name': 'Test Restaurant 4', 'food_category_id': 4, 'food_category_name': '일식', 'member_id': 63, 'group_id': None, 'subgroup_id': None, 'content': '메밀 맛있음 양도 많음 돈가스도 부드럽고 양도 많아서 또 갈 듯', 'is_recommended': True, 'created_at': '2025-01-15T17:02:45.380478', 'updated_at': None, 'deleted_at': None, 'images': []}}, vector=None, shard_key=None, order_value=None), ScoredPoint(id

In [29]:
# Review text of the first returned point.
results.points[0].payload["document"]["content"]


'마카롱 찐 맛 집 여기 마카롱 말고 구움 과자류도 다 맛있어요 .. 종종 사 먹는 곳'