## A. IMPLEMENTING THE MINSEARCH ENGINE

In [1]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py


In [2]:
# minsearch repository https://github.com/alexeygrigorev/minsearch/tree/main

In [1]:
#implementing the search engine 
import minsearch

In [2]:
# Getting the indexed FAQ documents

import json

In [3]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [4]:
# restructuring the json to contain document records for each course
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [5]:
#indexing the document so we can search through it 

index = minsearch.Index(
    text_fields = ['question','text','section'],
    keyword_fields=['course']
)

In [6]:
index.fit(documents)

<minsearch.Index at 0x700b53fc0890>

In [7]:
# texting the index 
boost = {'question': 3.0, 'section':0.5} #this means the question field is 3 times more important than the text field


results = index.search(
    query='the courese has already started, can I still enroll?',
    boost_dict =boost,
    filter_dict = {'course':'data-engineering-zoomcamp'},
    num_results=5
)

In [8]:
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, for simplicity (of troubleshooting against the recorded videos) and stability. [source]\nBut Python 3.10 and 3.11 should work fine.',
  'section': 'General course-related questions',
  'question': 'Environment - Is Python 3.9 still the recommended version to use in 2024?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'course': 'd

## B. PASSING THE CONTEXT TO AN LLM 

In [9]:
import os 
#os.environ['OPENAI_API_KEY']

In [33]:
from openai import OpenAI

In [34]:
client = OpenAI(api_key = os.environ['OPENAI_API_KEY'])
response = client.chat.completions.create(
    model='gpt-4o-mini',
    messages=[{"role":'user', "content":'the courese has already started, can I still enroll?'}])

In [35]:
response.choices[0].message.content

"Whether or not you can still enroll in a course that has already started depends on the policies of the institution or program offering the course. Many institutions have specific deadlines for enrollment, while others may allow late enrollment on a case-by-case basis. \n\nHere are some steps you can take:\n\n1. **Check the Course Policies**: Review the course syllabus or the institution's website for enrollment deadlines and late registration policies.\n\n2. **Contact the Instructor or Program Coordinator**: Reach out to the instructor or the administrative office of the program to inquire about the possibility of enrolling late.\n\n3. **Explain Your Situation**: If you have a valid reason for wanting to enroll after the start date, be prepared to explain it when you contact them.\n\n4. **Consider Online Options**: If the course is offered online, some institutions may be more flexible with late enrollments.\n\n5. **Look for Similar Courses**: If enrolling in this course isn't possib

In [36]:
#Setting the context

prompt_template = """
You are course teaching assistant. Answer the QUESTION based on context from the FAQ databse.
Use only the facts from the CONTEXT when answering the QUESTION 
If the CONTEXT doesn't contain the answer , output NONE


QUESTION: {question}

CONTEXT: {context}

"""

In [37]:
context = ""

for doc in results:
    context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

In [38]:
print(context)

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Environment - Is Python 3.9 still the recommended version to use in 2024?
answer: Yes, for simplicity (of troubleshooting against the recorded videos) and stability. [source]
But Python 3.10 and 3.11 should work fine.

section: General course-related questions
question: How can we contribute to the course?
answer: Star the repo! Share it with friends if you find it useful ❣️
Create a PR if you see you can improve the text or the structure of the repository.

section: General course-related questions
question: Are we still using the NYC Trip data for January 2021? Or are we using the 2022 data?
answer: We will u

In [39]:
response.choices[0].message.content

"Whether or not you can still enroll in a course that has already started depends on the policies of the institution or program offering the course. Many institutions have specific deadlines for enrollment, while others may allow late enrollment on a case-by-case basis. \n\nHere are some steps you can take:\n\n1. **Check the Course Policies**: Review the course syllabus or the institution's website for enrollment deadlines and late registration policies.\n\n2. **Contact the Instructor or Program Coordinator**: Reach out to the instructor or the administrative office of the program to inquire about the possibility of enrolling late.\n\n3. **Explain Your Situation**: If you have a valid reason for wanting to enroll after the start date, be prepared to explain it when you contact them.\n\n4. **Consider Online Options**: If the course is offered online, some institutions may be more flexible with late enrollments.\n\n5. **Look for Similar Courses**: If enrolling in this course isn't possib

### Modularizing the code 

In [40]:
def search(query):

        # texting the index 
    boost = {'question': 3.0, 'section':0.5} #this means the question field is 3 times more important than the text field
    
    
    results = index.search(
        query=query,
        boost_dict =boost,
        filter_dict = {'course':'data-engineering-zoomcamp'},
        num_results=5)

    return results
    

In [41]:
def build_prompt(query, search_results):
    prompt_template = """
    You are course teaching assistant. Answer the QUESTION based on context from the FAQ databse.
    Use only the facts from the CONTEXT when answering the QUESTION 
    If the CONTEXT doesn't contain the answer , output NONE
    
    
    QUESTION: {question}
    
    CONTEXT: {context}
    
    """.strip()

    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"


    prompt = prompt_template.format(question=query, context=context).strip()

    return prompt



In [42]:
# modularizing the logic for invoking the gpt 

def llm(prompt):
    client = OpenAI(api_key = os.environ['OPENAI_API_KEY'])
    response = client.chat.completions.create(
    model='gpt-4o-mini',
    messages=[{"role":'user', "content":prompt}])

    return response.choices[0].message.content

In [42]:
# modularized calls 
query = "how do I run kafka?"

def rag(query):
 
    search_results = search(query)
    
    prompt = build_prompt(query, search_results)
    
    answer = llm(prompt)

    return answer

In [43]:
rag('the course has already started , can I still enroll ?')

'Yes, even if the course has already started, you can still enroll and are eligible to submit the homework. However, keep in mind that there will be deadlines for turning in the final projects.'

## D. PERSISTING DOCUMENT INDEXES USING ELASTICSEARCH FROM A DOCKER CONTAINER

In [13]:
# sample document index from minsearch
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [18]:
#connecting to the Elastic Search client
#Elastic search saves all the index data on disk - may need to do volume mapping for more 
#advanced persisting.
from elasticsearch import Elasticsearch

In [15]:
es_client  = Elasticsearch('http://localhost:9200')

In [16]:
es_client.info()

ObjectApiResponse({'name': 'acd1e8272a57', 'cluster_name': 'docker-cluster', 'cluster_uuid': '0Ijp0t7TT-u0EENOYZ2h9g', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [19]:
#Creating an index in the database
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name , body = index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [20]:
from tqdm.auto import tqdm

In [21]:
#indexing the FAQ documents
for doc in tqdm(documents):
    es_client.index(index=index_name, document = doc)

In [28]:
#querying the indexed documents 
# "question^3" - means that question is 3 times more important than the text and section fields
# size means we get five results back 

def elastic_search(query) : 
    
    query = "I just discovered the course , can I still join ?"
    
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    
    response = es_client.search(index=index_name, body=search_query)
    
    
    #collecting the several documents into one List (Constnat time complexity)
    result_docs = [] 
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs




In [46]:
#elastic_search(query)

In [47]:
#Adjusting the RAG workflow

# modularized calls 
query = "'the course has already started , can I still enroll ?"

def rag(query):
 
    search_results = elastic_search(query)
    
    prompt = build_prompt(query, search_results)
    
    answer = llm(prompt)

    return answer



In [48]:
rag(query)

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

In [50]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [55]:
## E. Calaculating prompting costs 

In [52]:
import tiktoken

In [53]:
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

In [54]:
search_results = elastic_search(query)

prompt = build_prompt(query, search_results)

len(encoding.encode(prompt))

467

In [77]:
#gpt mini costs for sychronous API
input_cost = (0.150/1000000) * len(encoding.encode(prompt))
output_cost = (0.6/1000000 ) * len(encoding.encode(rag(query)))

In [79]:
total_cost = "{:.8f}".format(input_cost + output_cost)

In [80]:
tokens = encoding.encode(prompt)[:10]


tokens , print(f"The cost of this prompt is ${total_cost}".format('a'))

The cost of this prompt is $0.00009105


([3575, 553, 4165, 14029, 29186, 13, 30985, 290, 150339, 4122], None)