In [1]:
# Remove any previously downloaded copy, then fetch minsearch.py into the
# working directory so the `import minsearch` in the next cell can find it.
# NOTE(review): the URL tracks the `main` branch, so the downloaded code can
# change between runs of this notebook.
!rm -f minsearch.py
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-07-03 05:57:28--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-07-03 05:57:28 (64.9 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [2]:
import os
import requests
import typing

from groq import Groq
from IPython.display import Markdown, display
import minsearch

In [3]:
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

# Bound the wait with a timeout and surface HTTP errors here, instead of
# letting an error page fail later inside `.json()` with a confusing message.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list of FAQ entries, tagging each
# entry with the course it belongs to. New dicts are built so that
# `documents_raw` is left untouched and re-running the cell is idempotent.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

# "question", "text" and "section" are searched as text; "course" is only
# used for exact-match filtering.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.Index at 0x7f014c234760>

In [4]:

def search(
    index: minsearch.Index,
    query: str,
    num_results: int = 10,
    course: str = "data-engineering-zoomcamp",
    boost: typing.Optional[dict[str, float]] = None,
) -> list[dict[str, str]]:
    """Query `index` for the FAQ entries that best match `query`.

    Args:
        index: The index to search; its `search` method is called with
            `query`, `filter_dict`, `boost_dict` and `num_results`.
        query: Free-text user question.
        num_results: Maximum number of entries to request from the index.
        course: Value the "course" field must equal. Defaults to the
            data engineering course, which was previously hard-coded.
        boost: Per-field weights passed as `boost_dict`. When omitted, the
            original weights are used: "question" 3.0 and "section" 0.5.

    Returns:
        Whatever `index.search` returns for these arguments.
    """
    if boost is None:
        # Built per call so a caller can never mutate a shared default.
        boost = {"question": 3.0, "section": 0.5}

    return index.search(
        query=query,
        filter_dict={"course": course},
        boost_dict=boost,
        num_results=num_results,
    )

In [5]:
def build_prompt(query: str, search_results: list[dict[str, str]]) -> str:
    """Fill the teaching-assistant prompt with the question and FAQ context.

    Each entry of `search_results` must provide the keys "section",
    "question" and "text"; entries are rendered in order, each followed by a
    blank line. Leading and trailing whitespace is stripped from the final
    prompt.
    """
    rendered_entries = [
        f'section: {entry["section"]}\nquestion: {entry["question"]}\nanswer: {entry["text"]}\n\n'
        for entry in search_results
    ]

    template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    filled = template.format(question=query, context="".join(rendered_entries))
    return filled.strip()

def llm(client: Groq, prompt: str, model: str = "llama3-8b-8192") -> typing.Optional[str]:
    """Send `prompt` as a single user message and return the model's reply.

    Args:
        client: Client whose `chat.completions.create` is called.
        prompt: Full prompt text, sent as the content of one "user" message.
        model: Model identifier passed to the API. Defaults to the model that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        The `content` of the first choice's message; per the return
        annotation this may be None.
    """
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
    )
    return chat_completion.choices[0].message.content

In [6]:
def rag_minsearch(client: Groq, index: minsearch.Index, question: str) -> typing.Optional[str]:
    """Answer `question` via retrieve-then-generate.

    Fetches up to five matching entries from `index` with `search`, turns
    them into a prompt with `build_prompt`, and returns what `llm` produces
    for that prompt using `client`.
    """
    top_hits = search(index, question, num_results=5)
    return llm(client, build_prompt(question, top_hits))

In [7]:
# Question to run through the RAG pipeline.
question = 'the course has already started, can I still enroll?'

# The API key is read from the environment rather than written in the notebook.
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

answer = rag_minsearch(groq_client, index, question)

In [8]:
# Render the answer text as Markdown in the cell output.
display(Markdown(answer))

Based on the CONTEXT, I can see that the course has already started (mentioned in the "When will the course start?" section). Therefore, according to the "Can I still join the course after the start date?" section, the answer is YES, you can still enroll in the course. However, be aware that there will be deadlines for turning in the final projects, so don't leave everything for the last minute.