# RAG Intro

In [8]:
import minsearch

import json

## Loading Knowledge Base

In [9]:
with open('documents.json', 'rt') as file:
    docs_raw = json.load(file)

In [10]:
documents = []

for courses in docs_raw:
    for doc in courses['documents']:
        doc['course'] = courses['course']
        documents.append(doc)

In [11]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

## Indexing Documents

In [12]:
index = minsearch.Index(
    text_fields = ['question','text','section'],
    keyword_fields = ['course']
)

In [13]:
q = "The course has already started. Can I still enroll?"

In [14]:
index.fit(documents)

<minsearch.minsearch.Index at 0x1844ddff8f0>

### Querying KB

In [15]:
def search(query):
    
    boost = {'question':3.0, 'section':0.5} #boosts a text field that we consider more important

    search_results = index.search(
        query = query,
        filter_dict = {'course':'data-engineering-zoomcamp'},
        boost_dict = boost,
        num_results = 5
    )

    return search_results

In [16]:
search_results = search(q)

## Inference Client

In [17]:
from huggingface_hub import InferenceClient

In [18]:
client = InferenceClient()

In [19]:
response = client.chat_completion(
    model= "meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages = [{"role": "user", "content": q}]
)

In [20]:
response.choices[0].message.content

"It depends on the specific course and its enrollment policies. Some courses may allow late enrollment, while others may not. Here are a few possibilities:\n\n1. **Check with the instructor or course administrator**: Reach out to the instructor or course administrator to ask about late enrollment options. They may be able to provide you with more information or direct you to someone who can help.\n2. **Check the course website or syllabus**: Look for information on the course website or syllabus about late enrollment or add/drop deadlines.\n3. **Contact the institution's registrar or student services**: If you're unable to get in touch with the instructor or course administrator, try contacting the institution's registrar or student services office to ask about late enrollment options.\n\nIf the course has already started and you're interested in enrolling, be prepared to:\n\n* Explain your situation and why you're interested in enrolling late\n* Provide any required documentation, suc

## Adding Prompt to Connect LLM to KB

In [21]:
def build_prompt(query, search_results):

    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION. 
    If the CONTEXT doesn't contain the answer, output NONE 

    QUESTION: {question}

    CONTEXT: {context}
    """.strip()

    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()

    return prompt

In [22]:
prompt = build_prompt(q, search_results)

In [23]:
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION. 
    If the CONTEXT doesn't contain the answer, output NONE 

    QUESTION: The course has already started. Can I still enroll?

    CONTEXT: section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start w

In [24]:
def call_llm(prompt):

    response = client.chat_completion(
        model= "meta-llama/Meta-Llama-3.1-8B-Instruct",
        messages = [{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [25]:
call_llm(prompt)

"Yes, even if you don't register, you're still eligible to submit the homeworks."

In [26]:
query = "How do I enroll to the course?"
search_results = search(query)
prompt = build_prompt(query, search_results)
answer = call_llm(prompt)

In [27]:
answer

'NONE \n\nThe provided context does not contain any information about the enrollment process for the course. However, it does mention a registration link for the course that will be available before the course starts.'

## Using ElasticSearch to Index Documents

In [28]:
from elasticsearch import Elasticsearch

In [29]:
es_client = Elasticsearch('http://localhost:9200')

In [30]:
es_client.info()

ObjectApiResponse({'name': '38f5d0390b23', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'IVtPf2z9RbGCYMP2UjS9jg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

### Creating Index for ES

In [31]:
index_settings = {
    "settings":{
        "number_of_shards":1,
        "number_of_replicas":0
    },
    "mappings":{
        "properties":{
            "text": {"type":"text"},
            "section": {"type":"text"},
            "question": {"type":"text"},
            "course": {"type":"keyword"}
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions/as99EidBShqieTPV_LK1pw] already exists')

In [32]:
from tqdm.auto import tqdm

In [34]:
for doc in tqdm(documents):    
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

### Querying with ES

In [35]:
search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": { 
                    "multi_match": { #Text matching
                        "query": query,
                        "fields": ["question^3", "text", "section"],#^3 means x3 more important
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

In [36]:
response = es_client.search(index=index_name, body=search_query)

In [41]:
result_docs = []

for hit in response['hits']['hits']:
    result_docs.append(hit['_source'])

In [42]:
result_docs

[{'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'After you create a GitHub account, you should clone the course repo to your local machine using the process outline