## A. IMPLEMENTING THE MINSEARCH ENGINE

In [1]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py


In [2]:
# minsearch repository https://github.com/alexeygrigorev/minsearch/tree/main

In [3]:
#implementing the search engine 
import minsearch

In [4]:
# Getting the indexed FAQ documents

import json

In [5]:
# Load the raw FAQ dump. The FAQ text contains non-ASCII characters (emoji),
# so the encoding is pinned rather than left to the platform default.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [6]:
# Flatten the per-course JSON into one record per FAQ entry, tagging each
# record with the course it came from. New dicts are built so that docs_raw
# is left untouched and this cell can be re-run without side effects.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [7]:
# Create the search index: `question`, `text` and `section` are registered as
# text fields, `course` as a keyword field (used for filtering further down).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"],
)

In [8]:
# Fit the index on the flattened FAQ records.
index.fit(documents)

<minsearch.Index at 0x73665994b610>

In [9]:
# Testing the index.
# Per-field boosts: `question` gets 3.0 and `section` 0.5, i.e. the question
# field is treated as three times more important than the text field.
boost = {'question': 3.0, 'section': 0.5}

results = index.search(
    query='the courese has already started, can I still enroll?',
    filter_dict={'course': 'data-engineering-zoomcamp'},
    boost_dict=boost,
    num_results=5,
)

In [10]:
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, for simplicity (of troubleshooting against the recorded videos) and stability. [source]\nBut Python 3.10 and 3.11 should work fine.',
  'section': 'General course-related questions',
  'question': 'Environment - Is Python 3.9 still the recommended version to use in 2024?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'course': 'd

## B. PASSING THE CONTEXT TO AN LLM 

In [11]:
import os 
#os.environ['OPENAI_API_KEY']

In [12]:
from openai import OpenAI

In [14]:
# Baseline request: the bare question is sent to the model without any FAQ
# context attached.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
response = client.chat.completions.create(
    model='gpt-4o-mini',
    messages=[
        {"role": 'user', "content": 'the courese has already started, can I still enroll?'},
    ],
)

In [15]:
response.choices[0].message.content

"Whether you can still enroll in a course that has already started depends on the institution or organization offering the course. Many institutions have specific policies regarding enrollment deadlines, while some may allow late enrollment under certain circumstances. \n\nTo get accurate information, it's best to:\n\n1. Check the course's official website or syllabus for enrollment details.\n2. Contact the course instructor or administrative office directly to inquire about late enrollment options.\n3. Look for any add/drop deadlines specific to the semester or term you are in.\n\nThey will provide you with the most relevant information regarding your situation."

In [16]:
# Prompt template with {question} and {context} placeholders (str.format
# style). NOTE(review): this module-level copy is not formatted anywhere in the
# visible cells; build_prompt further down defines its own local template.

prompt_template = """
You are course teaching assistant. Answer the QUESTION based on context from the FAQ databse.
Use only the facts from the CONTEXT when answering the QUESTION 
If the CONTEXT doesn't contain the answer , output NONE


QUESTION: {question}

CONTEXT: {context}

"""

In [17]:
# Render every search hit as a "section / question / answer" entry and
# accumulate the entries, each followed by a blank line, into one string.
context = ""
for doc in results:
    context += (
        f"section: {doc['section']}\n"
        f"question: {doc['question']}\n"
        f"answer: {doc['text']}\n\n"
    )

In [18]:
print(context)

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Environment - Is Python 3.9 still the recommended version to use in 2024?
answer: Yes, for simplicity (of troubleshooting against the recorded videos) and stability. [source]
But Python 3.10 and 3.11 should work fine.

section: General course-related questions
question: How can we contribute to the course?
answer: Star the repo! Share it with friends if you find it useful ❣️
Create a PR if you see you can improve the text or the structure of the repository.

section: General course-related questions
question: Are we still using the NYC Trip data for January 2021? Or are we using the 2022 data?
answer: We will u

In [19]:
# NOTE(review): `response` has not been reassigned since the request that sent
# only the bare question, so this displays that same answer again; the
# `context` built above is not sent to the model in the visible cells.
response.choices[0].message.content

"Whether you can still enroll in a course that has already started depends on the institution or organization offering the course. Many institutions have specific policies regarding enrollment deadlines, while some may allow late enrollment under certain circumstances. \n\nTo get accurate information, it's best to:\n\n1. Check the course's official website or syllabus for enrollment details.\n2. Contact the course instructor or administrative office directly to inquire about late enrollment options.\n3. Look for any add/drop deadlines specific to the semester or term you are in.\n\nThey will provide you with the most relevant information regarding your situation."

### Modularizing the code 

In [20]:
def search(query, course='data-engineering-zoomcamp', num_results=5, boost=None):
    """Query the notebook-level ``index`` for FAQ entries matching ``query``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field is filtered on. Defaults to
            the course that was previously hard-coded.
        num_results: Maximum number of hits requested from the index.
        boost: Optional per-field weight mapping passed as ``boost_dict``.
            When omitted, ``question`` is boosted by 3.0 and ``section`` by 0.5.

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    if boost is None:
        # Default weighting: the question field counts more than the others.
        boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        boost_dict=boost,
        filter_dict={'course': course},
        num_results=num_results,
    )
    

In [30]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts, each providing the ``section``,
            ``question`` and ``text`` keys.

    Returns:
        The formatted prompt with surrounding whitespace stripped.

    Raises:
        KeyError: If a result lacks one of the three keys read here.
    """
    # Written as adjacent literals instead of an indented triple-quoted string
    # so that this function's own indentation does not end up in the prompt.
    prompt_template = (
        "You are course teaching assistant. Answer the QUESTION based on context from the FAQ databse.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION \n"
        "If the CONTEXT doesn't contain the answer , output NONE\n"
        "\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: {context}"
    )

    # One "section / question / answer" entry per hit, each followed by a blank line.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    # The final strip removes the blank lines left after the last entry.
    return prompt_template.format(question=query, context=context).strip()



In [34]:
# modularizing the logic for invoking the gpt 

def llm(prompt, model='gpt-4o-mini', client=None):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text to send.
        model: Name of the chat model to call. Defaults to the model that was
            previously hard-coded.
        client: Optional pre-built client exposing ``chat.completions.create``.
            When omitted, a new ``OpenAI`` client is created from the
            ``OPENAI_API_KEY`` environment variable, as before.

    Returns:
        The ``content`` of the first choice's message in the response.

    Raises:
        KeyError: If no client is given and ``OPENAI_API_KEY`` is not set.
    """
    if client is None:
        client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": 'user', "content": prompt}],
    )

    return response.choices[0].message.content

In [42]:
# modularized calls 
query = "how do I run kafka?"

def rag(query):
    """Answer ``query`` by chaining search, prompt construction and the LLM call.

    The hits returned by ``search`` are passed with the question to
    ``build_prompt``, and the resulting prompt is handed to ``llm``, whose
    return value is returned unchanged.
    """
    hits = search(query)
    llm_input = build_prompt(query, hits)
    return llm(llm_input)

In [43]:
rag('the course has already started , can I still enroll ?')

'Yes, even if the course has already started, you can still enroll and are eligible to submit the homework. However, keep in mind that there will be deadlines for turning in the final projects.'