In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch.py

--2025-07-11 07:42:49--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4273 (4.2K) [text/plain]
Saving to: ‘minsearch.py’


2025-07-11 07:42:49 (44.9 MB/s) - ‘minsearch.py’ saved [4273/4273]



In [2]:
import minsearch



In [3]:
import json

In [4]:
import openai
from openai import OpenAI

In [5]:
import os
from dotenv import load_dotenv

In [6]:
# Load environment variables via python-dotenv, then hand the key to the openai module.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [20]:
openai_client = OpenAI()

In [8]:
# The FAQ data contains non-ASCII characters (e.g. curly quotes), so name the
# encoding explicitly instead of relying on the platform's default locale.
with open('../documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [9]:
# Flatten the per-course groups into one list of FAQ entries, tagging each
# entry with the course it belongs to. Every entry is copied, so the parsed
# `docs_raw` structure is left untouched and this cell can be re-run safely.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [10]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [11]:
# `text`, `section` and `question` are registered as text fields;
# `course` is a keyword field, which `search` uses as a filter.
index = minsearch.Index(
    text_fields=['text', 'section', 'question'],
    keyword_fields=['course'],
)

In [12]:
index.fit(documents)

<minsearch.Index at 0x7f7baae84590>

In [13]:
q = 'the course has already started, can I still enroll?'

In [14]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Run a keyword search over the FAQ index.

    Args:
        query: Free-text user question.
        course: Value the `course` keyword field must match. Defaults to
            'data-engineering-zoomcamp', the value that used to be hard-coded.
        num_results: Maximum number of hits to request from the index.

    Returns:
        Whatever `index.search` returns for these arguments.
    """
    # Per-field boost factors handed to the index.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [15]:
def build_prompt(query, search_results):
    """Build the LLM prompt from the user question and the retrieved FAQ entries.

    Args:
        query: User question; inserted verbatim after "QUESTION:".
        search_results: Iterable of dicts with 'section', 'question' and
            'text' keys.

    Returns:
        The prompt string, without leading or trailing whitespace.
    """
    # Assembled from flush-left lines so the indentation of this function
    # does not end up inside the text sent to the model.
    prompt_template = (
        "You are a course teaching assistant. "
        "Answer the QUESTION based on CONTEXT from the FAQ database.\n"
        "Use only facts from the CONTEXT when answering the QUESTION.\n"
        "If the CONTEXT doesn't contain the answer, output NONE\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: {context}"
    )

    # One block per entry, separated by a blank line.
    context = "\n\n".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [19]:
def llm(prompt, model='gpt-4o-mini'):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text for the chat model.
        model: Chat-completion model name. Defaults to 'gpt-4o-mini', the
            value that used to be hard-coded.

    Returns:
        The `content` of the first choice's message in the response.
    """
    response = openai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [17]:
def rag(query):
    """Answer `query` using keyword-search results as context for the LLM."""
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [18]:
query = "can i enroll the course late?"
rag(query)

"Yes, you can still join the course after the start date. Be aware, however, that there will be deadlines for turning in the final projects, so it's important not to leave everything for the last minute."

## RAG with Vector Search

In [22]:
!python -m pip install -q "qdrant-client[fastembed]>=1.14.2"

[0m

In [25]:
from qdrant_client import QdrantClient, models

In [26]:
qd_client = QdrantClient("http://localhost:6333")

In [27]:
EMBEDDING_DIMENTIONALITY = 512

In [28]:
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [29]:
collection_name = "zoomcamp-faq"

In [30]:
qd_client.delete_collection(collection_name=collection_name)

False

In [32]:
# Vector configuration for the collection: size and cosine distance.
vectors_config = models.VectorParams(
    size=EMBEDDING_DIMENTIONALITY,
    distance=models.Distance.COSINE,
)

qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=vectors_config,
)

True

In [45]:
# Index the `course` payload field as a keyword; `vector_search` filters on
# this field.
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [33]:
# One point per FAQ entry: question and answer text are joined with a space
# and wrapped in a Document for `model_handle`; the position in `documents`
# is the point id and the whole entry is stored as payload.
points = [
    models.PointStruct(
        id=idx,
        vector=models.Document(
            text=entry['question'] + ' ' + entry['text'],
            model=model_handle,
        ),
        payload=entry,
    )
    for idx, entry in enumerate(documents)
]

In [36]:
# Send all prepared points to the collection in a single call.
# NOTE(review): the recorded output shows files being fetched during this
# call, presumably the embedding model on first use. Confirm.
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

Fetching 5 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  2.89it/s]


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [37]:
question = "I just discover the course, Can I still join it?"

In [46]:
def vector_search(question, course='data-engineering-zoomcamp', limit=5):
    """Query the Qdrant collection for FAQ entries related to `question`.

    Args:
        question: User question; wrapped in a Document for `model_handle`.
        course: Value the `course` payload field must match. Defaults to
            'data-engineering-zoomcamp', the value that used to be hard-coded.
        limit: Maximum number of points to request.

    Returns:
        A list with the payload of every returned point, in the order the
        client returned them.
    """
    # Restrict the query to points whose `course` payload equals `course`.
    course_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="course",
                match=models.MatchValue(value=course),
            )
        ]
    )

    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(text=question, model=model_handle),
        query_filter=course_filter,
        limit=limit,
        with_payload=True,
    )

    return [point.payload for point in query_points.points]

In [48]:
def rag(query, search_fn=None):
    """Answer `query` with retrieval-augmented generation.

    Note: this definition replaces the earlier `rag` that was wired to the
    keyword-based `search`.

    Args:
        query: User question.
        search_fn: Callable that takes the query and returns the documents to
            use as context. Defaults to `vector_search`; pass `search` to run
            the keyword-search pipeline instead.

    Returns:
        The answer text produced by `llm`.
    """
    # Resolved at call time so the default follows the current `vector_search`.
    if search_fn is None:
        search_fn = vector_search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [49]:
rag("how do I run kafka")

'To run Kafka, in the project directory, execute the following command in the terminal:\n\n```\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```'