In [8]:
import os
import json
import minsearch
from google import genai
from dotenv import load_dotenv

In [9]:
# Load variables from a local .env file into the process environment so the
# os.getenv('GEMINI_API_KEY') lookup in the next cell can find the key.
load_dotenv()

True

In [10]:
# Gemini API client. The key is read from the GEMINI_API_KEY environment
# variable; os.getenv yields None when that variable is not set.
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

In [11]:
# Load the raw FAQ dump. The encoding is pinned to UTF-8 so the read does not
# depend on the platform's default locale encoding (which is not UTF-8 on
# every system and can fail or garble non-ASCII text).
with open('documents.json', 'r', encoding='utf-8') as file_input:
    docs_raw = json.load(file_input)

In [12]:
# Flatten the per-course groups into one list of documents, tagging each
# document with the course it belongs to. A new dict is built for every
# document so the entries inside docs_raw are left untouched and re-running
# this cell never changes the loaded raw data.
documents = [
    {**doc, 'course': course_dicts['course']}
    for course_dicts in docs_raw
    for doc in course_dicts['documents']
]

In [13]:
# In-memory minsearch index: "question", "text" and "section" are registered
# as text fields, "course" as a keyword field (search() filters on it).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"],
)

In [14]:
# Fit the index on the flattened documents. fit() returns the Index object
# itself, which is why its repr appears as the cell output.
index.fit(documents)

<minsearch.minsearch.Index at 0x74937bdea6c0>

In [15]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up the documents that best match ``query`` in the minsearch index.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field of a document must equal.
            Defaults to the course this notebook was originally written for.
        num_results: Maximum number of documents to return.

    Returns:
        The result of ``index.search`` for these arguments.
    """
    # Matches in the question field are weighted up and matches in the section
    # name are weighted down; fields not listed keep the library's default.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [59]:
def build_prompt(query, search_answers):
    """Assemble the LLM prompt from the user question and the retrieved documents.

    Args:
        query: The user's question; substituted for the QUESTION placeholder.
        search_answers: Iterable of dicts providing the keys ``section``,
            ``question`` and ``text``.

    Returns:
        The filled-in prompt with leading and trailing whitespace removed.
    """
    # Same text as an indented triple-quoted template after stripping; the
    # four-space indentation of the continuation lines is written out
    # explicitly so it is visible in the source.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "    Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "    If the CONTEXT doesn't contain the answer, output NONE\n"
        "    \n"
        "    QUESTION: {question}\n"
        "    \n"
        "    CONTEXT: {context}"
    )

    # One entry per retrieved document, each followed by a blank line.
    entries = (
        f"section: {item['section']} \nquestion: {item['question']}\nanswer: {item['text']}\n\n"
        for item in search_answers
    )

    return template.format(question=query, context="".join(entries)).strip()

In [60]:
def request_llm(prompt, model='gemini-2.0-flash'):
    """Send ``prompt`` to a Gemini model and return the generated text.

    Args:
        prompt: Complete prompt string; sent as the only item in ``contents``.
        model: Name of the model to call. Defaults to the model this notebook
            originally hard-coded.

    Returns:
        The ``text`` attribute of the response object.
    """
    response = client.models.generate_content(
        model=model,
        contents=[prompt],
    )
    return response.text

In [61]:
# Sample question used to try out the minsearch-backed rag() pipeline.
q = "how do I run Kafka?"

def rag(query, search_fn=None):
    """Answer ``query`` by retrieving documents and passing them to the LLM.

    Args:
        query: The user's question.
        search_fn: Optional callable that takes the query and returns the
            documents to use as context. When omitted, the minsearch-backed
            ``search`` function is used.

    Returns:
        The text produced by ``request_llm`` for the assembled prompt.
    """
    # Resolved at call time rather than as a default argument value, so the
    # definition does not depend on `search` existing when this cell runs.
    if search_fn is None:
        search_fn = search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return request_llm(prompt)

In [62]:
# Run the pipeline on the sample question; print() renders the line breaks in
# the answer instead of showing the string's repr.
print(rag(q)) 

To run Kafka in terminal, use the command: java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java.
If you're using Python, create a virtual environment, activate it, and then use `pip install -r ../requirements.txt`. Remember to activate the virtual environment each time you need it using `source env/bin/activate` (or `env/Scripts/activate` on Windows).
Also, if you encounter a "ModuleNotFoundError: No module named 'kafka.vendor.six.moves'" error, use `pip install kafka-python-ng` instead.



# Switching the search engine to Elasticsearch

In [63]:
from elasticsearch import Elasticsearch  

In [64]:
# Client for the Elasticsearch node at localhost:9200.
es_client = Elasticsearch("http://localhost:9200")

In [None]:
# Index definition: one shard and no replicas; the three free-text fields are
# mapped as "text", while "course" is mapped as "keyword" so the term filter
# used when searching can match it exactly.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Creating an index that already exists raises an error, which would make this
# cell fail on a re-run (or on Restart & Run All) against the same
# Elasticsearch instance. Only create the index when it is missing.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_settings)

In [None]:
from tqdm.auto import tqdm
# Send every flattened document to the Elasticsearch index, one request per
# document, with tqdm showing progress.
# NOTE(review): no explicit id is passed, so re-running this cell presumably
# adds a second copy of every document to the index — confirm before re-running.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

In [None]:
# Sample question used to try out the Elasticsearch-backed search and RAG cells.
query = 'I just discovered the course, can I still join?'

In [None]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Retrieve the best-matching documents for ``query`` from Elasticsearch.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field of a hit must equal (term filter).
            Defaults to the course this notebook was originally written for.
        size: Maximum number of hits to request.

    Returns:
        A list holding the ``_source`` of every hit in the response, in the
        order they appear in the response.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                # Text match over three fields; "question^3" boosts matches
                # in the question field by a factor of 3.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict hits to documents of the requested course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
def rag(query, search_fn=None):
    """Answer ``query`` by retrieving documents and passing them to the LLM.

    Args:
        query: The user's question.
        search_fn: Optional callable that takes the query and returns the
            documents to use as context. When omitted, the
            Elasticsearch-backed ``elastic_search`` function is used.

    Returns:
        The text produced by ``request_llm`` for the assembled prompt.
    """
    # Resolved at call time rather than as a default argument value, so the
    # definition does not depend on `elastic_search` existing when this cell runs.
    if search_fn is None:
        search_fn = elastic_search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return request_llm(prompt)

In [71]:
# Run the Elasticsearch-backed pipeline on the sample question and show the answer.
response = rag(query)
print(response)

Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects, so don't leave everything for the last minute.

