In [1]:
import minsearch
import json
import os

In [2]:
# Read the raw FAQ export. The encoding is pinned to UTF-8 because the FAQ text
# contains non-ASCII punctuation (curly quotes and apostrophes), and the
# platform default encoding used by open() is not UTF-8 everywhere.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [3]:
# Flatten the per-course groups into a single list of FAQ records, tagging
# every record with the name of the course group it was found in.
documents = [
    dict(faq, course=group['course'])
    for group in docs_raw
    for faq in group['documents']
]
# Show the first flattened record as a quick sanity check.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
# Create the minsearch index: "question", "text" and "section" are registered
# as text fields and "course" as a keyword field. Documents are added later
# through index.fit.
index = minsearch.Index(
    keyword_fields=['course'],
    text_fields=['question', 'text', 'section'],
)

In [5]:
# Sample user question reused by the search and chat cells further down.
q = 'the course has already started, can I still enroll?'
# Index the flattened FAQ records. This call is the cell's last expression,
# so its return value is what the notebook displays.
index.fit(documents)

<minsearch.Index at 0x757d96356da0>

In [6]:
# Per-field weights handed to the search call as boost_dict.
boost = dict(question=3.0, section=0.4)

# Ask the index for the two best matches to the sample question; no course
# filter is supplied here.
search_results = index.search(query=q, num_results=2, boost_dict=boost)

search_results

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'}]

In [7]:
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage


In [10]:
# The API key is read from the environment so it is never written into the
# notebook; a missing MISTRAL_API_KEY variable raises KeyError here.
api_key = os.environ["MISTRAL_API_KEY"]
# Model name passed to every client.chat call in this notebook.
model = "mistral-tiny"
client = MistralClient(api_key=api_key)

In [12]:

# Baseline: send the bare question to the model without any FAQ context.
chat_response = client.chat(
    messages=[ChatMessage(content=q, role="user")],
    model=model,
)

# Print only the text of the first returned choice.
print(chat_response.choices[0].message.content)

It depends on the specific course and the policies of the institution or platform offering the course. Some courses allow late enrollment, while others do not. It's best to contact the course provider to inquire about their enrollment policies and see if there's still an opportunity for you to join.


In [17]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """
    Search the FAQ index for the entries most relevant to a query.

    Parameters:
    query (str): The question to look up.
    course (str or None): Course whose FAQ entries are searched. Defaults to
        'data-engineering-zoomcamp'; pass None to search without a course filter.
    num_results (int): Maximum number of entries requested from the index. Defaults to 5.

    Returns:
    list: The matching FAQ records as returned by index.search.
    """
    # Per-field weights applied by the index through boost_dict.
    boost = {'question': 3.0, 'section': 0.5}

    # An empty filter dict applies no keyword restriction to the search.
    filter_dict = {} if course is None else {'course': course}

    return index.search(
        query=query,
        filter_dict=filter_dict,
        boost_dict=boost,
        num_results=num_results,
    )

In [13]:
def build_prompt(query, search_results):
    """
    Assemble the language-model prompt from a question and retrieved FAQ entries.

    Parameters:
    query (str): The user's question; it fills the QUESTION slot of the template.
    search_results (list): Dictionaries that each provide 'section', 'question', and 'text' keys.

    Returns:
    str: The instruction text followed by the question and the formatted FAQ entries as CONTEXT.
    """
    # The template is spelled out line by line so its exact whitespace (the
    # four-space indent, whitespace-only lines and the trailing space after
    # "CONTEXT:") is visible and cannot be lost to editor whitespace trimming.
    template_lines = [
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.",
        "    Use only the facts from the CONTEXT when answering the QUESTION.",
        "    ",
        "    QUESTION: {question}",
        "    ",
        "    CONTEXT: ",
        "    {context}",
    ]
    prompt_template = "\n".join(template_lines)

    # One three-line entry per search hit; because each entry ends with a
    # newline, joining on "\n" leaves a blank line between entries.
    entries = []
    for doc in search_results:
        entries.append(
            f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n"
        )
    context = "\n".join(entries).strip()

    return prompt_template.format(question=query, context=context)


In [14]:
def llm(prompt):
    """
    Send a single user message to the chat model and return the reply text.

    Parameters:
    prompt (str): The text sent to the chat model as the user message.

    Returns:
    The content of the first choice's message in the chat response.
    """
    user_message = ChatMessage(role="user", content=prompt)
    completion = client.chat(model=model, messages=[user_message])
    # Only the first choice is used; any further choices are ignored.
    first_choice = completion.choices[0]
    return first_choice.message.content


In [20]:
# Sample question used to exercise the rag() pipeline defined in this cell.
query = 'how do I run Kafka?'

def rag(query):
    """
    Answer a question by chaining search, prompt construction and the model call.

    Parameters:
    query (str): The question to answer.

    Returns:
    answer: Whatever llm returns for the prompt built from the query and its search results.
    """
    # search -> build_prompt -> llm, evaluated inside-out.
    return llm(build_prompt(query, search(query)))

# Run the whole pipeline (search, prompt building, model call) for the sample question.
answer = rag(query)


In [21]:
# Display the generated answer as the cell output.
answer


'To run Kafka in the context of the provided FAQ, you have different options based on the programming language you\'re using:\n\n1. Java:\nIn the project directory, run the following command in the terminal:\n```\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\n2. Python:\nFirst, create a virtual environment and install the necessary packages:\n\n- To create a virtual environment and install packages (run only once):\n```\npython -m venv env\nsource env/bin/activate\npip install -r ../requirements.txt\n```\n\n- To activate it (you\'ll need to run it every time you need the virtual env):\n```\nsource env/bin/activate\n```\n\n- To deactivate it:\n```\ndeactivate\n```\n\nMake sure you have the \'dlt[duckdb]\' package installed for the provided Python code:\n```\n!pip install dlt[duckdb]\n```\n\nFor Windows, you might need to activate the virtual environment using `env/Scripts/activate`.\n\nIf you encounter a permission denied error w