# Build Your own RAG


# Retrieval of Docs

In [3]:
!wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json


--2024-06-21 06:38:39--  https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json [following]
--2024-06-21 06:38:39--  https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 658332 (643K) [text/plain]
Saving to: ‘documents.json.4’


2024-06-21 06:38:39 (40.5 MB/s) - ‘documents.json.4’ saved [658332/658332]



In [4]:
# Importing data to include sext,question and course..
import json 
with open ('./documents.json','rt') as f_in:
    documents_file = json.load(f_in)
# Empty list to store the retrieved docs
documents = []

for course in documents_file:
    course_name = course['course']
    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [5]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
# Use Elastic search to index documents
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
es.info()

ObjectApiResponse({'name': '3258b0acb186', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'QeuogQ_VT4K76xN52oelBg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [8]:
# Create an index (in elastic search its a table)
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"
# Delete index if it already exists
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
response = es.indices.create(index=index_name, body=index_settings)

response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [9]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es.index(index=index_name, document=doc)

  from .autonotebook import tqdm as notebook_tqdm
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:33<00:00, 28.57it/s]


In [10]:
user_question = "How do I join the course after it has started?"

search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": user_question,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [11]:
response = es.search(index=index_name, body=search_query)
response

for hit in response['hits']['hits']:
    doc = hit['_source']
    print(f"Section: {doc['section']}")
    print(f"Question: {doc['question']}")
    print(f"Answer: {doc['text'][:100]}...\n")

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, t...

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your ...

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud accou...

Section: General course-related questions
Question: How do I use Git / GitHub for this course?
Answer: After you create a GitHub account, you should clone the course repo to your local machine using the ...

Section: Workshop 1 - dlthub
Question: How do I install the necessary dependencies to run the code?
Answer: Answer: To run the provided c

In [38]:
# Lets try to put it in a function. We only need the user query, the index/table_name and the maximum results to be produced

def retrieve_document(query, index_name = "course-questions",max_result=5):
    
    user_question = query

    search_query = {
        "size": max_result,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": user_question,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    response = es.search(index=index_name, body=search_query)
    documents = [hit['_source'] for hit in response['hits']['hits']]
    return documents
    
    

In [39]:
# retrieve_document("Can I still join the course after the start date")


## Generation - Answering questions

#### We now do the G part in RAG based on the R output

In [40]:
# Lets communicate with GPT 3.5
from openai import OpenAI

In [41]:
# Create a client object
client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4o",
    messages = [{
        "role":"user",
        "content":"The course already started. Can I still join?"
    }])
print(response.choices[0].message.content)

It depends on the specific course policies and the institution offering the course. Here are some steps you can take:

1. **Check the Course Website or Catalog**: Sometimes, information about late registration or joining after the start date is provided online.

2. **Contact the Instructor**: Reach out to the course instructor directly to inquire if it's possible to join late. Be sure to explain your situation.

3. **Speak to an Academic Advisor**: An academic advisor can provide guidance on the policies and may assist in facilitating your late enrollment.

4. **Review the Institution’s Policies**: Many institutions have specific policies regarding late enrollment, including cut-off dates and potential penalties.

5. **Catch Up on Course Material**: If you are allowed to join, be prepared to quickly catch up on any missed coursework to stay current with the class.

Do you have the contact information for the instructor or the academic advising office? That would be a good place to star

In [42]:
# Set Up our prompt structure.
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()
context_docs = retrieve_document(user_question)

context_result = ""

for doc in context_docs:
    doc_str = context_template.format(**doc)
    context_result += ("\n\n" + doc_str)

context = context_result.strip()
print(context)

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Terrafo

In [43]:
prompt = f"""You're a course teaching assistant. Answer the user QUESSTION based on CONTEXT- the documents retireved from our FAQ database.
Only use the facts from the CONTEXT.If the CONTEXT doesn't contain the answer, return "NONE" 
QUESTION : {user_question}
CONTEXT:
{context}
""".strip()

In [44]:
# Lets now prompt GPT
# Create a client object
client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4o",
    messages = [{
        "role":"user",
        "content":prompt
    }])
print(response.choices[0].message.content)

Yes, you can still join the course after it has started. Even if you don't officially register, you're eligible to submit the homework assignments. However, be sure to adhere to the deadlines for turning in the final projects to avoid last-minute rushes.


#### There are system and user prompts. Lets combine everything together to produce outputs in one function

In [45]:
# Contatenate my questions
context_template  = """ Section:{section}
Question:{question}
Answer:{text}""".strip()
# Set up the prompt
prompt_template = f"""You're a course teaching assistant. Answer the user QUESTION based on CONTEXT- the documents retireved from our FAQ database.
Only use the facts from the CONTEXT.If the CONTEXT doesn't contain the answer, return "NONE" 

QUESTION : {user_question}

CONTEXT:

{context}

""".strip()
# We want to build context properly (well formated)
def build_context(documets):
    context_result = ""
    for doc in documents:
        doc_str = context_template.format(**doc)
        context_result += ("\n\n"+doc_str)
    return context_result.strip()
# Structure it as a prompt
def build_prompt(user_question,documents):
    context = build_context(documents)
    prompt = prompt_template.format(user_question = user_question,
                                    context = context)
    return prompt
# Lets ask AI
def ask_openai(prompt,model="gpt-4o"):
    response = client.chat.completions.create(
    model=model,
    messages = [{
        "role":"user",
        "content":prompt}])
    answers = response.choices[0].message.content
    return answers
# Lets put everything together in one bot
def qa_bot(user_question):
    context_test = build_context(retrieve_document(user_question))
    prompt_structure = build_prompt(user_question,context_test)
    gpt_ans = ask_openai(prompt_structure)
    return gpt_ans


In [49]:
qa_bot("How can i resister for my course")

"Yes, even if you don't register by the course start date, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

In [47]:
qa_bot("I can't connect to postgres port 5432, my password doesn't work")

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."