# Build Your own RAG


# Retrieval of Docs

In [1]:
!wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json


--2024-06-20 07:43:45--  https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json [following]
--2024-06-20 07:43:45--  https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 658332 (643K) [text/plain]
Saving to: ‘documents.json.2’


2024-06-20 07:43:45 (75.7 MB/s) - ‘documents.json.2’ saved [658332/658332]



In [2]:
# Importing data to include sext,question and course..
import json 
with open ('./documents.json','rt') as f_in:
    documents_file = json.load(f_in)
# Empty list to store the retrieved docs
documents = []

for course in documents_file:
    course_name = course['course']
    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
# Use Elastic search to index documents
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
es.info()

ObjectApiResponse({'name': '4d8344cbf166', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'Q5awz6cJSq265zjv1pFCdA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [9]:
# Create an index (in elastic search its a table)
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"
# Delete index if it already exists
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
response = es.indices.create(index=index_name, body=index_settings)

response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [10]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es.index(index=index_name, document=doc)

  from .autonotebook import tqdm as notebook_tqdm
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:30<00:00, 30.81it/s]


In [18]:
user_question = "How do I join the course after it has started?"

search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": user_question,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [19]:
response = es.search(index=index_name, body=search_query)
response

for hit in response['hits']['hits']:
    doc = hit['_source']
    print(f"Section: {doc['section']}")
    print(f"Question: {doc['question']}")
    print(f"Answer: {doc['text'][:100]}...\n")

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, t...

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your ...

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud accou...

Section: General course-related questions
Question: How do I use Git / GitHub for this course?
Answer: After you create a GitHub account, you should clone the course repo to your local machine using the ...

Section: Workshop 1 - dlthub
Question: How do I install the necessary dependencies to run the code?
Answer: Answer: To run the provided c

In [28]:
# Lets try to put it in a function. We only need the user query, the index/table_name and the maximum results to be produced

def retrieve_document(query, index_name = "course-questions",max_result=5):
    
    user_question = query

    search_query = {
        "size": max_result,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": user_question,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    response = es.search(index=index_name, body=search_query)
    documents = [hit['_source'] for hit in response['hits']['hits']]
    return documents
    
    

In [26]:
retrieve_document("Can I still join the course after the start date")


[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

## Generation - Answering questions

#### We now do the G part in RAG based on the R output

In [22]:
# Lets communicate with GPT 3.5
from openai import OpenAI

In [23]:
# Create a client object
client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4o",
    messages = [{
        "role":"user",
        "content":"The course already started. Can I still join?"
    }])
print(response.choices[0].message.content)

Whether you can still join a course after it has already started generally depends on several factors, including the specific policies of the institution or organization offering the course, the particular nature and structure of the course, and how much time has elapsed since the course began.

Here are some steps you can take to find out if it's possible:

1. **Contact the Instructor or Institution**: Reach out directly to the course instructor or the institution's admissions office. They can provide you with the most accurate information regarding late enrollment policies.

2. **Review Course Policies**: Check the course syllabus or the institution's website for any information regarding late enrollment and cut-off dates.

3. **Consider Catching Up**: If late enrollment is allowed, ask about what you will need to do to catch up on missed material. Some courses may have resources available, like recorded lectures or online materials, to help latecomers.

4. **Assess Impact**: Think a

In [29]:
# Set Up our prompt structure.
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()
context_docs = retrieve_document(user_question)

context_result = ""

for doc in context_docs:
    doc_str = context_template.format(**doc)
    context_result += ("\n\n" + doc_str)

context = context_result.strip()
print(context)

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Terrafo

In [31]:
prompt = f"""You're a course teaching assistant. Answer the user QUESSTION based on CONTEXT- the documents retireved from our FAQ database.
Only use the facts from the CONTEXT.If the CONTEXT doesn't contain the answer, return "NONE" 
QUESTION : {user_question}
CONTEXT:
{context}
""".strip()

In [32]:
# Lets now prompt GPT
# Create a client object
client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4o",
    messages = [{
        "role":"user",
        "content":prompt
    }])
print(response.choices[0].message.content)

Yes, even if you don't register, you're still eligible to join the course and submit the homeworks after it has started. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


#### There are system and user prompts. Lets combine everything together to produce outputs in one function