In [1]:
!python -m pip install -q "qdrant-client[fastembed]>=1.14.2"

In [2]:
!pip show qdrant-client


Name: qdrant-client
Version: 1.15.0
Summary: Client library for the Qdrant vector search engine
Home-page: https://github.com/qdrant/qdrant-client
Author: Andrey Vasnetsov
Author-email: andrey@qdrant.tech
License: Apache-2.0
Location: /usr/local/python/3.12.1/lib/python3.12/site-packages
Requires: grpcio, httpx, numpy, portalocker, protobuf, pydantic, urllib3
Required-by: 


In [3]:
from qdrant_client import QdrantClient, models

In [4]:
# Connect to the local Qdrant instance over HTTP.
client = QdrantClient(url="http://localhost:6333")

In [5]:
import requests

docs_url = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/refs/heads/main/01-intro/documents.json'

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()



In [None]:
# Preview only the first course and its first two FAQ entries; displaying all of
# documents_raw would flood the notebook with the entire dataset.
[
    {"course": entry["course"], "documents": entry["documents"][:2]}
    for entry in documents_raw[:1]
]

In [None]:
from fastembed import TextEmbedding
# Display the metadata of every text-embedding model FastEmbed supports
# (each entry is a dict that includes, among other fields, "model" and "dim").
TextEmbedding.list_supported_models()

In [8]:
import json

# Target vector size; also used later as the collection's vector size.
EMBEDDING_DIMENSIONALITY = 512

# Keep only the supported models whose output dimension matches, then
# pretty-print each model's metadata.
matching_models = [
    candidate
    for candidate in TextEmbedding.list_supported_models()
    if candidate["dim"] == EMBEDDING_DIMENSIONALITY
]
for model in matching_models:
    print(json.dumps(model, indent=2))

{
  "model": "BAAI/bge-small-zh-v1.5",
  "sources": {
    "hf": "Qdrant/bge-small-zh-v1.5",
    "url": "https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-zh-v1.5.tar.gz",
    "_deprecated_tar_struct": true
  },
  "model_file": "model_optimized.onnx",
  "description": "Text embeddings, Unimodal (text), Chinese, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.",
  "license": "mit",
  "size_in_GB": 0.09,
  "additional_files": [],
  "dim": 512,
  "tasks": {}
}
{
  "model": "Qdrant/clip-ViT-B-32-text",
  "sources": {
    "hf": "Qdrant/clip-ViT-B-32-text",
    "url": null,
    "_deprecated_tar_struct": false
  },
  "model_file": "model.onnx",
  "description": "Text embeddings, Multimodal (text&image), English, 77 input tokens truncation, Prefixes for queries/documents: not necessary, 2021 year",
  "license": "mit",
  "size_in_GB": 0.25,
  "additional_files": [],
  "dim": 512,
  "tasks": {}
}
{
  "model": "jinaai/jina-embeddings-v2-small-e

In [9]:
# FastEmbed model name; passed as `model=` to models.Document when embedding
# both the stored documents and the search queries.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [11]:
# Define the collection name
collection_name = "zoomcamp-rag"

# Create the collection only when it is missing, so that re-running this cell
# against a Qdrant instance that already holds the collection does not fail.
if not client.collection_exists(collection_name=collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
            distance=models.Distance.COSINE  # Distance metric for similarity search
        )
    )


True

In [12]:
# Build one point per FAQ entry across all courses.
points = []
# Sequential point id; named `point_id` so the builtin `id()` is not shadowed
# for the remainder of the kernel session.
point_id = 0

for course in documents_raw:
    for doc in course['documents']:

        point = models.PointStruct(
            id=point_id,
            # embed text locally with "jinaai/jina-embeddings-v2-small-en" from FastEmbed
            vector=models.Document(text=doc['text'], model=model_handle),
            # save all needed metadata fields
            payload={
                "text": doc['text'],
                "section": doc['section'],
                "course": course['course']
            }
        )
        points.append(point)

        point_id += 1

In [None]:
# Write every prepared point into the collection.
client.upsert(collection_name=collection_name, points=points)

In [17]:
def search(query, limit=1):
    """Return the stored points closest to ``query``.

    The query text is wrapped in ``models.Document`` with ``model_handle`` so
    that it is embedded with the same model as the stored documents.

    Args:
        query: Text to search for.
        limit: Maximum number of closest matches to return.

    Returns:
        The response of ``client.query_points``, with payloads included.
    """
    query_document = models.Document(text=query, model=model_handle)
    return client.query_points(
        collection_name=collection_name,
        query=query_document,
        limit=limit,
        with_payload=True,  # include the stored metadata in each hit
    )

In [18]:
import random

# Pick a random course, then a random FAQ entry from that course, and
# pretty-print the chosen entry. No seed is set, so the pick differs per run.
course = random.choice(documents_raw)
course_piece = random.choice(course['documents'])
print(json.dumps(course_piece, indent=2))

{
  "text": "I have a problem with my terminal. Command\nconda create -n ml-zoomcamp python=3.9\ndoesn\u2019t work. Any of 3.8/ 3.9 / 3.10 should be all fine\nIf you\u2019re on Windows and just installed Anaconda, you can use Anaconda\u2019s own terminal called \u201cAnaconda Prompt\u201d.\nIf you don\u2019t have Anaconda or Miniconda, you should install it first\n(Tatyana Mardvilko)",
  "section": "1. Introduction to Machine Learning",
  "question": "Conda is not an internal command"
}


In [19]:
# Query with the sampled entry's question; search() defaults to limit=1, so
# only the single closest match is returned.
result = search(course_piece['question'])

In [20]:
# Display the raw query response (scored points together with their payloads).
result

QueryResponse(points=[ScoredPoint(id=851, version=0, score=0.8316113, payload={'text': 'Problem: For me, Installing anaconda didn’t modify the .bashrc profile. That means Anaconda env was not activated even after exiting and relaunching the unix shell.\nSolution:\nFor bash : Initiate conda again, which will add entries for anaconda in .bashrc file.\n$ cd YOUR_PATH_ANACONDA/bin $ ./conda init bash\nThat will automatically edit your .bashrc.\nReload:\n$ source ~/.bashrc\nAhamed Irshad (daisyfuentesahamed@gmail.com)', 'section': 'Module 1: Introduction', 'course': 'mlops-zoomcamp'}, vector=None, shard_key=None, order_value=None)])

In [21]:
# Show the question next to the top retrieved answer and the entry's own answer,
# using f-strings throughout instead of mixing them with str.format.
top_answer = result.points[0].payload['text']

print(f"Question:\n{course_piece['question']}\n")
print(f"Top Retrieved Answer:\n{top_answer}\n")
print(f"Original Answer:\n{course_piece['text']}")

Question:
Conda is not an internal command

Top Retrieved Answer:
Problem: For me, Installing anaconda didn’t modify the .bashrc profile. That means Anaconda env was not activated even after exiting and relaunching the unix shell.
Solution:
For bash : Initiate conda again, which will add entries for anaconda in .bashrc file.
$ cd YOUR_PATH_ANACONDA/bin $ ./conda init bash
That will automatically edit your .bashrc.
Reload:
$ source ~/.bashrc
Ahamed Irshad (daisyfuentesahamed@gmail.com)

Original Answer:
I have a problem with my terminal. Command
conda create -n ml-zoomcamp python=3.9
doesn’t work. Any of 3.8/ 3.9 / 3.10 should be all fine
If you’re on Windows and just installed Anaconda, you can use Anaconda’s own terminal called “Anaconda Prompt”.
If you don’t have Anaconda or Miniconda, you should install it first
(Tatyana Mardvilko)


In [22]:
# Unfiltered search across all courses: print the text of the single best match.
print(search("What if I submit homeworks late?").points[0].payload['text'])

No, late submissions are not allowed. But if the form is still not closed and it’s after the due date, you can still submit the homework. confirm your submission by the date-timestamp on the Course page.y
Older news:[source1] [source2]


In [23]:
# Index the "course" payload field as a keyword (exact matching on string
# metadata) so it can be used for filtering.
client.create_payload_index(collection_name=collection_name, field_name="course", field_schema="keyword")

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [24]:
def search_in_course(query, course="mlops-zoomcamp", limit=1):
    """Return the stored points closest to ``query`` within a single course.

    Args:
        query: Text to search for; embedded via ``models.Document`` using
            ``model_handle``.
        course: Value the "course" payload field must match.
        limit: Maximum number of closest matches to return.

    Returns:
        The response of ``client.query_points``, with payloads included.
    """
    # Only points whose "course" payload equals the requested course qualify.
    course_filter = models.Filter(
        must=[
            models.FieldCondition(key="course", match=models.MatchValue(value=course)),
        ]
    )
    query_document = models.Document(text=query, model=model_handle)

    return client.query_points(
        collection_name=collection_name,
        query=query_document,
        query_filter=course_filter,
        limit=limit,
        with_payload=True,  # include the stored metadata in each hit
    )

In [None]:
# Same question, restricted to the "llm-zoomcamp" course: print the best match's text.
print(search_in_course("What if I submit homeworks late?", "llm-zoomcamp").points[0].payload['text'])