In [18]:
from qdrant_client import QdrantClient

# Address of the Qdrant server that `client` connects to.
QDRANT_URL = "http://localhost:6333"
client = QdrantClient(url=QDRANT_URL)

In [19]:
# Names of the two Qdrant collections used in this notebook.
SPARSE_COLLECTION_NAME = "course_faq_sparse"
HYBRID_COLLECTION_NAME = "course_faq_hybrid"

# Model identifier passed to models.Document for the dense "bge_small" vector.
EMBEDDING_MODEL = "BAAI/bge-small-en"

In [20]:
from qdrant_client import models

# Make the cell re-runnable: create_collection fails when a collection with
# this name already exists on the server, so drop any previous copy first.
# NOTE: this discards every point previously stored in the collection.
if client.collection_exists(SPARSE_COLLECTION_NAME):
    client.delete_collection(SPARSE_COLLECTION_NAME)

# Sparse-only collection with a single named sparse vector, "bm25".
# Modifier.IDF asks Qdrant to apply IDF weighting to that vector.
client.create_collection(
    collection_name=SPARSE_COLLECTION_NAME,
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    },
)

True

In [21]:
import json

# Read the FAQ dump as UTF-8 explicitly: the text contains non-ASCII
# characters (e.g. typographic apostrophes), and relying on the platform's
# default encoding can garble them or raise UnicodeDecodeError.
with open('documents.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

In [22]:
import uuid

# Point IDs are derived from each document's position in `documents` rather
# than from uuid4(): random IDs insert a second copy of every point each time
# this cell is re-run, while stable IDs make the upsert overwrite in place.
# (IDs are positional, so they shift if the order of documents.json changes.)
client.upsert(
    collection_name=SPARSE_COLLECTION_NAME,
    points=[
        models.PointStruct(
            id=uuid.uuid5(
                uuid.NAMESPACE_URL, f"course-faq/{course_idx}/{doc_idx}"
            ).hex,
            vector={
                # Raw text is sent as a Document tagged with the "Qdrant/bm25"
                # model, which produces the sparse vector stored under "bm25".
                "bm25": models.Document(
                    text=doc["text"],
                    model="Qdrant/bm25",
                ),
            },
            payload={
                "text": doc["text"],
                "section": doc["section"],
                "course": course["course"],
            },
        )
        for course_idx, course in enumerate(documents)
        for doc_idx, doc in enumerate(course["documents"])
    ]
)


UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [23]:
# Search text and result count; later query cells reuse both names.
query = "How to submit assignments?"
limit = 5

# Keyword-only search against the "bm25" sparse vector.
bm25_query = models.Document(text=query, model="Qdrant/bm25")
results = client.query_points(
    collection_name=SPARSE_COLLECTION_NAME,
    query=bm25_query,
    using="bm25",
    limit=limit,
    with_payload=True,
)

# One record per hit, each field on its own line, followed by a separator.
for point in results.points:
    print(
        f"Score: {point.score:.4f}",
        f"Section: {point.payload['section']}",
        f"Course: {point.payload['course']}",
        f"Text: {point.payload['text']}",
        "-" * 30,
        sep="\n",
    )



Score: 13.2044
Section: Project
Course: data-engineering-zoomcamp
Text: Each submitted project will be evaluated by 3 (three) randomly assigned students that have also submitted the project.
You will also be responsible for grading the projects from 3 fellow students yourself. Please be aware that: not complying to this rule also implies you failing to achieve the Certificate at the end of the course.
The final grade you get will be the median score of the grades you get from the peer reviewers.
And of course, the peer review criteria for evaluating or being evaluated must follow the guidelines defined here.
------------------------------
Score: 12.1077
Section: Projects (Midterm and Capstone)
Course: machine-learning-zoomcamp
Text: I am not sure how the project evaluate assignment works? Where do I find this? I have access to all the capstone 2 project, perhaps, I can randomly pick any to review.
Answer:
The link provided for example (2023/Capstone link ): https://docs.google.com/form

In [24]:
from qdrant_client import models

# Output dimensionality of the BAAI/bge-small-en embedding model.
BGE_SMALL_DIM = 384

# Make the cell re-runnable: create_collection fails when a collection with
# this name already exists on the server, so drop any previous copy first.
# NOTE: this discards every point previously stored in the collection.
if client.collection_exists(HYBRID_COLLECTION_NAME):
    client.delete_collection(HYBRID_COLLECTION_NAME)

# Hybrid collection: every point carries a sparse "bm25" vector and a dense
# "bge_small" vector compared with cosine distance.
client.create_collection(
    collection_name=HYBRID_COLLECTION_NAME,
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF)
    },
    vectors_config={
        "bge_small": models.VectorParams(
            size=BGE_SMALL_DIM,
            distance=models.Distance.COSINE,
        )
    },
)

True

In [26]:
# Point IDs are derived from each document's position in `documents` rather
# than from uuid4(): random IDs insert a second copy of every point each time
# this cell is re-run, while stable IDs make the upsert overwrite in place.
# (IDs are positional, so they shift if the order of documents.json changes.)
client.upsert(
    collection_name=HYBRID_COLLECTION_NAME,
    points=[
        models.PointStruct(
            id=uuid.uuid5(
                uuid.NAMESPACE_URL, f"course-faq/{course_idx}/{doc_idx}"
            ).hex,
            vector={
                # The same text feeds both representations: a dense embedding
                # from EMBEDDING_MODEL and a sparse one from "Qdrant/bm25".
                "bge_small": models.Document(
                    text=doc["text"],
                    model=EMBEDDING_MODEL,
                ),
                "bm25": models.Document(
                    text=doc["text"],
                    model="Qdrant/bm25",
                ),
            },
            payload={
                "text": doc["text"],
                "section": doc["section"],
                "course": course["course"],
            },
        )
        for course_idx, course in enumerate(documents)
        for doc_idx, doc in enumerate(course["documents"])
    ]
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [27]:
# Two-stage retrieval: the dense "bge_small" vector prefetches 10 * limit
# candidates, then the BM25 query is run over those candidates and the top
# `limit` hits are returned.
dense_prefetch = models.Prefetch(
    query=models.Document(text=query, model=EMBEDDING_MODEL),
    using="bge_small",
    limit=(10 * limit),
)
results = client.query_points(
    collection_name=HYBRID_COLLECTION_NAME,
    prefetch=[dense_prefetch],
    query=models.Document(text=query, model="Qdrant/bm25"),
    using="bm25",
    limit=limit,
    with_payload=True,
)

# One record per hit, each field on its own line, followed by a separator.
for point in results.points:
    print(
        f"Score: {point.score:.4f}",
        f"Section: {point.payload['section']}",
        f"Course: {point.payload['course']}",
        f"Text: {point.payload['text']}",
        "-" * 30,
        sep="\n",
    )

Score: 13.2044
Section: Project
Course: data-engineering-zoomcamp
Text: Each submitted project will be evaluated by 3 (three) randomly assigned students that have also submitted the project.
You will also be responsible for grading the projects from 3 fellow students yourself. Please be aware that: not complying to this rule also implies you failing to achieve the Certificate at the end of the course.
The final grade you get will be the median score of the grades you get from the peer reviewers.
And of course, the peer review criteria for evaluating or being evaluated must follow the guidelines defined here.
------------------------------
Score: 12.1077
Section: Projects (Midterm and Capstone)
Course: machine-learning-zoomcamp
Text: I am not sure how the project evaluate assignment works? Where do I find this? I have access to all the capstone 2 project, perhaps, I can randomly pick any to review.
Answer:
The link provided for example (2023/Capstone link ): https://docs.google.com/form

In [28]:
# Hybrid search with Reciprocal Rank Fusion: the dense and the BM25 branch
# each prefetch 5 * limit candidates, and the fusion query merges the two
# ranked lists into one.
results = client.query_points(
    collection_name=HYBRID_COLLECTION_NAME,
    prefetch=[
        models.Prefetch(
            query=models.Document(
                text=query,
                model=EMBEDDING_MODEL,
            ),
            using="bge_small",
            limit=(5 * limit),
        ),
        models.Prefetch(
            query=models.Document(
                text=query,
                model="Qdrant/bm25",
            ),
            using="bm25",
            limit=(5 * limit),
        ),
    ],
    query=models.FusionQuery(fusion=models.Fusion.RRF),
    # Cap the fused list explicitly; without this the server's default page
    # size is used instead of the `limit` the other queries honour.
    limit=limit,
    with_payload=True,
)

# Scores here are RRF scores, not BM25 or cosine values, so they are not
# comparable with the scores printed by the other query cells.
for point in results.points:
    print(f"Score: {point.score:.4f}")
    print(f"Section: {point.payload['section']}")
    print(f"Course: {point.payload['course']}")
    print(f"Text: {point.payload['text']}")

    print("-" * 30)

Score: 0.5455
Section: Project
Course: data-engineering-zoomcamp
Text: Each submitted project will be evaluated by 3 (three) randomly assigned students that have also submitted the project.
You will also be responsible for grading the projects from 3 fellow students yourself. Please be aware that: not complying to this rule also implies you failing to achieve the Certificate at the end of the course.
The final grade you get will be the median score of the grades you get from the peer reviewers.
And of course, the peer review criteria for evaluating or being evaluated must follow the guidelines defined here.
------------------------------
Score: 0.5000
Section: General course-related questions
Course: machine-learning-zoomcamp
Text: No, it’s not possible. The form is closed after the due date. But don’t worry, homework is not mandatory for finishing the course.
------------------------------
Score: 0.3733
Section: Projects (Midterm and Capstone)
Course: machine-learning-zoomcamp
Text: I