# RAG Intro

In [1]:
import minsearch

import json

## Loading Knowledge Base

In [2]:
with open('documents.json', 'rt') as file:
    docs_raw = json.load(file)

In [3]:
documents = []

for courses in docs_raw:
    for doc in courses['documents']:
        doc['course'] = courses['course']
        documents.append(doc)

In [4]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

## Indexing Documents

In [5]:
index = minsearch.Index(
    text_fields = ['question','text','section'],
    keyword_fields = ['course']
)

In [6]:
q = "The course has already started. Can I still enroll?"

In [7]:
index.fit(documents)

<minsearch.minsearch.Index at 0x1be648c84d0>

### Querying KB

In [8]:
def search(query):
    
    boost = {'question':3.0, 'section':0.5} #boosts a text field that we consider more important

    search_results = index.search(
        query = query,
        filter_dict = {'course':'data-engineering-zoomcamp'},
        boost_dict = boost,
        num_results = 5
    )

    return search_results

In [9]:
search_results = search(q)

## Inference Client

In [10]:
from huggingface_hub import InferenceClient

In [11]:
client = InferenceClient()

In [12]:
response = client.chat_completion(
    model= "meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages = [{"role": "user", "content": q}]
)

In [13]:
response.choices[0].message.content

"It depends on the specific course and its enrollment policies. \n\nTypically, once a course has started, enrollment might be closed, especially if it's an in-person or hybrid course. However, some online courses may still allow late enrollment, especially if they're self-paced or have flexible start dates.\n\nTo check your options, you can reach out to the course instructor, teaching assistant, or the course administrator. They can provide you with more information about the course's enrollment status and any remaining opportunities to join.\n\nIt's worth noting that even if you can enroll late, you might miss out on some initial course materials, assignments, or discussions. You should be prepared to catch up on any missed content and potentially work with the instructor to create a plan for completing the course."

## Adding Prompt to Connect LLM to KB

In [14]:
def build_prompt(query, search_results):

    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION. 
    If the CONTEXT doesn't contain the answer, output NONE 

    QUESTION: {question}

    CONTEXT: {context}
    """.strip()

    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()

    return prompt

In [16]:
prompt = build_prompt(q, search_results)

In [17]:
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION. 
    If the CONTEXT doesn't contain the answer, output NONE 

    QUESTION: The course has already started. Can I still enroll?

    CONTEXT: section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start w

In [18]:
def call_llm(prompt):

    response = client.chat_completion(
        model= "meta-llama/Meta-Llama-3.1-8B-Instruct",
        messages = [{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [19]:
call_llm(prompt)

"Yes, even if you don't register, you're still eligible to submit the homeworks."

In [20]:
query = "How do I enroll to the course?"
search_results = search(query)
prompt = build_prompt(query, search_results)
answer = call_llm(prompt)

In [21]:
answer

'NONE \n\nThe provided context does not mention how to enroll in the course. However, it does mention that you should register before the course starts using this link.'

## Using ElasticSearch to Index Documents

In [1]:
from elasticsearch import Elasticsearch

In [2]:
es_client = Elasticsearch('http://localhost:9200')

In [3]:
es_client.info()

BadRequestError: BadRequestError(400, 'media_type_header_exception', 'Invalid media-type value on headers [Accept, Content-Type]', Accept version must be either version 8 or 7, but found 9. Accept=application/vnd.elasticsearch+json; compatible-with=9)