In [None]:
!pip install minsearch

--2025-06-09 21:05:15--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4014 (3.9K) [text/plain]
Saving to: ‘minsearch.py.2’


2025-06-09 21:05:16 (71.6 KB/s) - ‘minsearch.py.2’ saved [4014/4014]



## TF-IDF

In [1]:
import minsearch



In [2]:
import json

In [3]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [4]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [5]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

SELECT * WHERE course = 'data-engineering-zoomcamp';

In [7]:
q = 'the course has already started, can I still enroll?'

In [8]:
index.fit(documents)

<minsearch.Index at 0x7f537650be50>

In [9]:
boost = {'question': 3.0, 'section': 0.5}

results = index.search(
    query=q,
    filter_dict={'course': 'data-engineering-zoomcamp'},
    boost_dict=boost,
    num_results=5,    
)

In [10]:
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

## RAG

In [11]:
from openai import OpenAI

In [12]:
client = OpenAI(base_url="https://api.deepseek.com")

In [15]:
response = client.chat.completions.create(
    model='deepseek-chat',
    messages=[{"role": "user", "content": q}]
)

response.choices[0].message.content

'Whether you can still enroll in a course after it has started depends on several factors. Here’s what you should consider:\n\n### 1. **Institution/Platform Policies**  \n   - **Online platforms (Coursera, Udemy, etc.)**: Most allow enrollment at any time for self-paced courses, but instructor-led or cohort-based courses may have deadlines.  \n   - **Universities/Colleges**: Some permit late enrollment (with instructor approval or a grace period), while others enforce strict deadlines. Check the registrar’s office or course syllabus.  \n\n### 2. **Course Flexibility**  \n   - **Self-paced courses**: Typically allow late enrollment since you access materials on your own schedule.  \n   - **Live/Interactive courses**: Joining late might mean missing lectures, assignments, or group work. Contact the instructor to see if catching up is feasible.  \n\n### 3. **Instructor Approval**  \n   - For structured courses (especially academic ones), email the instructor to explain your situation. The

In [13]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [14]:
def llm(prompt):
    response = client.chat.completions.create(
        model='deepseek-chat',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [15]:
def rag(query):
    search_results = index.search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [21]:
rag(q)

"Yes, you can still enroll in the course even after it has started. However, you may not be able to submit some of the homeworks. To be eligible for a certificate, you need to submit 2 out of 3 course projects and review 3 peers' projects by the deadline. \n\nFor example, if you join at the end of November and manage to complete two projects, you can still qualify for the certificate. Be mindful of the deadlines for the final projects to ensure you don't miss them. \n\nAll course materials will remain available after the course ends, allowing you to follow the course at your own pace if needed. You can also seek support via the Slack channel, where you can ask questions or use the @ZoomcampQABot for assistance. \n\nWelcome to the course! You can start by accessing the materials on the course page (http://mlzoomcamp.com/) and joining the relevant communication channels for updates and support."

In [22]:
query = 'how do I run kafka?'

rag(query)

'To run Kafka, follow the appropriate steps based on your programming language and environment:\n\n### **Python Kafka Setup:**\n1. **Create a virtual environment** (run only once):\n   ```bash\n   python -m venv env\n   source env/bin/activate  # On Windows: `env\\Scripts\\activate`\n   pip install -r requirements.txt\n   ```\n2. **Install Kafka dependencies** (if needed):\n   ```bash\n   pip install confluent-kafka fastavro\n   # For Avro support:\n   pip install confluent-kafka[avro]\n   ```\n3. **Run Python Kafka scripts** (e.g., `producer.py` or `stream.py`) in the activated virtual environment.\n\n### **Java Kafka Setup:**\n1. **Build the JAR file** (if using Gradle):\n   ```bash\n   gradle shadowjar\n   ```\n2. **Run the Java Kafka application**:\n   ```bash\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n   ```\n\n### **Docker & Broker Setup:**\n- If you encounter `NoBrokersAvailable`, ensure Kafka broker containers are runnin

In [23]:
rag('the course has already started, can I still enroll?')

"Yes, you can still enroll in the course even if it has already started. However, you won't be able to submit some of the homeworks, but you can still participate in the course. To be eligible for a certificate, you need to submit 2 out of 3 course projects and review 3 peers' projects by the deadline. \n\nFor example, if you join at the end of November and manage to complete two projects, you can still qualify for the certificate. Just be mindful of the deadlines for the final projects. \n\nAdditionally, all course materials will remain available after the course finishes, so you can also follow the course at your own pace later if needed."

In [24]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [16]:
from elasticsearch import Elasticsearch

In [17]:
es_client = Elasticsearch('http://localhost:9200')

In [18]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

  es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [19]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [20]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████████████████████████████████████████████████████████████████| 948/948 [00:05<00:00, 186.99it/s]


In [22]:
query = 'I just disсovered the course. Can I still join it?'

In [23]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

^3 = three times more important

In [30]:
elastic_search(query)

  response = es_client.search(index=index_name, body=search_query)


[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (insta

In [31]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [32]:
rag(query)

  response = es_client.search(index=index_name, body=search_query)


'Yes, you can still join the course even after the start date. You are eligible to submit homeworks without registering, but be mindful of the deadlines for final projects. Additionally, all course materials will remain available after the course finishes, allowing you to follow the course at your own pace. \n\nYou can also prepare by setting up the necessary dependencies (Google Cloud account, Google Cloud SDK, Python 3, Terraform, Git) and reviewing the prerequisites and syllabus. If you have questions, you can seek support in the Slack channel or refer to the FAQ document. \n\nRegistration is not required to start learning or submitting homework—it’s only used to gauge interest before the course begins.'