In [1]:
import openai
import elasticsearch 

In [2]:
import json

# Load the course documents. The encoding is pinned to UTF-8 so that decoding
# does not depend on the platform's locale default (which is not UTF-8 on
# every system and can corrupt or reject non-ASCII characters).
with open('./documents.json', 'rt', encoding='utf-8') as f_in:
    documents_file = json.load(f_in)

# Flatten the per-course structure into a single list, tagging every document
# with the name of the course it came from.
documents = []

for course in documents_file:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
from elasticsearch import Elasticsearch

# Client bound to the Elasticsearch node at localhost:9200.
es = Elasticsearch(hosts="http://localhost:9200")
# Last expression of the cell: displays the info() response as the cell output.
es.info()

ObjectApiResponse({'name': '3914637ded07', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'ZDna6r9TSWiWjhhs9O8E5w', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [4]:
# Index definition: a single shard with no replicas, free-text fields for the
# searchable content and a keyword field for exact filtering by course.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Make the cell re-runnable: creating an index that already exists raises an
# error, so drop any index left over from a previous run first.
# ignore_unavailable=True turns the delete into a no-op on the very first run.
# NOTE: this discards whatever was previously stored under `index_name`.
es.indices.delete(index=index_name, ignore_unavailable=True)
response = es.indices.create(index=index_name, body=index_settings)

response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [5]:
from tqdm.auto import tqdm

# Index the documents one request at a time; tqdm wraps the list so the loop
# reports its progress.
progress = tqdm(documents)
for doc in progress:
    es.index(index=index_name, document=doc)

  from .autonotebook import tqdm as notebook_tqdm
  0%|                                                                                                                    | 0/948 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:22<00:00, 41.40it/s]


In [6]:
user_question = "How do I join the course after it has started?"

# Full-text part of the query: match the question against three fields,
# with matches on "question" boosted by a factor of 3.
text_match = {
    "multi_match": {
        "query": user_question,
        "fields": ["question^3", "text", "section"],
        "type": "best_fields",
    }
}

# Exact-match restriction to a single course.
course_filter = {"term": {"course": "data-engineering-zoomcamp"}}

# Complete request body: at most 5 hits that satisfy both clauses.
search_query = {
    "size": 5,
    "query": {"bool": {"must": text_match, "filter": course_filter}},
}

In [7]:
response = es.search(index=index_name, body=search_query)

# Print a short summary of each hit; the answer is cut to its first 60
# characters and entries are separated by a blank line.
for doc in (hit['_source'] for hit in response['hits']['hits']):
    print(
        f"Section: {doc['section']}",
        f"Question: {doc['question']}",
        f"Answer: {doc['text'][:60]}...\n",
        sep="\n",
    )

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishe...

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependenc...

Section: General course-related questions
Question: How do I use Git / GitHub for this course?
Answer: After you create a GitHub account, you should clone the cour...

Section: Workshop 1 - dlthub
Question: How do I install the necessary dependencies to run the code?
Answer: Answer: To run the provided code, ensure that the 'dlt[duckd...



In [8]:
# Display the element at index 1 of `documents` to inspect its fields.
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [9]:
# Display the raw 'hits' entry of the current `response` object.
response['hits']

{'total': {'value': 407, 'relation': 'eq'},
 'max_score': 53.121944,
 'hits': [{'_index': 'course-questions',
   '_id': 'espAT5ABq0Q4fZD0HJxT',
   '_score': 53.121944,
   '_source': {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
    'section': 'General course-related questions',
    'question': 'Course - Can I still join the course after the start date?',
    'course': 'data-engineering-zoomcamp'}},
  {'_index': 'course-questions',
   '_id': 'f8pAT5ABq0Q4fZD0HJzH',
   '_score': 46.915794,
   '_source': {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
    'section': 'G

In [10]:
def retrieve_documents(query, index_name="course-questions", max_results=5,
                       course="data-engineering-zoomcamp", es_client=None):
    """Return the stored documents that best match `query` within one course.

    Args:
        query: Free-text question to search for.
        index_name: Name of the Elasticsearch index to query.
        max_results: Maximum number of documents to return.
        course: Value the "course" field must equal (exact term filter).
        es_client: Optional object exposing ``search(index=..., body=...)``.
            When omitted, a client for http://localhost:9200 is opened for
            this call and closed again before returning.

    Returns:
        A list with the ``_source`` dict of every hit, in the order returned
        by Elasticsearch.
    """
    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # Matches on "question" are boosted by a factor of 3.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    if es_client is not None:
        response = es_client.search(index=index_name, body=search_query)
    else:
        # No client supplied: use a short-lived one and close it afterwards so
        # repeated calls do not accumulate open connection pools.
        with Elasticsearch("http://localhost:9200") as client:
            response = client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [11]:
user_question = "How do I join the course after it has started?"

response = retrieve_documents(user_question)

# Same summary as before, now driven by the helper's return value: one entry
# per document, with the answer cut to its first 60 characters.
for doc in response:
    summary_lines = (
        f"Section: {doc['section']}",
        f"Question: {doc['question']}",
        f"Answer: {doc['text'][:60]}...\n",
    )
    print(*summary_lines, sep="\n")

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishe...

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependenc...

Section: General course-related questions
Question: How do I use Git / GitHub for this course?
Answer: After you create a GitHub account, you should clone the cour...

Section: Workshop 1 - dlthub
Question: How do I install the necessary dependencies to run the code?
Answer: Answer: To run the provided code, ensure that the 'dlt[duckd...



In [12]:
from openai import OpenAI
import os

# The API key is taken from the OPENAI_API_KEY environment variable
# (None is passed when the variable is unset).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [13]:
# Chat model identifier; the alternative below is kept for quick switching.
MODEL = "gpt-4o"
# MODEL="gpt-3.5-turbo"

# Disabled sample request that asks the model the question without any context:
# response = client.chat.completions.create(
#     model=MODEL,
#     messages=[{"role": "user", "content": "The course already started. Can I still join?"}]
# )
# print(response.choices[0].message.content)

In [14]:
# Text layout used to render one retrieved document.
context_template = "\n".join([
    "Section: {section}",
    "Question: {question}",
    "Answer: {text}",
])

context_docs = retrieve_documents(user_question)

# Render every retrieved document and separate the entries with a blank line.
context = "\n\n".join(
    context_template.format(**doc) for doc in context_docs
).strip()
print(context)

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Terrafo

In [15]:
# Single-message prompt: instructions, the user's question and the retrieved
# context are interpolated into one string. (Fixes the "contan" -> "contain"
# typo in the instruction text.)
prompt = f"""
You're a course teaching assistant. Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. 
Only use the facts from the CONTEXT. If the CONTEXT doesn't contain the answer, return "NONE"

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

In [16]:
# Disabled request that would send `prompt` to the chat model and display the
# reply. NOTE(review): presumably commented out to avoid an API call on every
# run - confirm before re-enabling or deleting.
# response = client.chat.completions.create(
#     model=MODEL,
#     messages=[{"role": "user", "content": prompt}]
# )
# answer = response.choices[0].message.content
# answer

In [17]:
# Layout of a single retrieved document inside the prompt context.
context_template = "\n".join([
    "Section: {section}",
    "Question: {question}",
    "Answer: {text}",
])

# Full prompt layout; {user_question} and {context} are filled in by
# str.format. Written line by line so trailing whitespace stays explicit.
prompt_template = (
    "You're a course teaching assistant.\n"
    "Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.\n"
    "Don't use other information outside of the provided CONTEXT.  \n"
    "\n"
    "QUESTION: {user_question}\n"
    "\n"
    "CONTEXT:\n"
    "\n"
    "{context}"
)


def build_context(documents):
    """Render `documents` into one context string.

    Each document is formatted with the module-level ``context_template``
    (its keys are passed as format fields); the rendered entries are joined
    with a blank line and surrounding whitespace is stripped.
    Returns an empty string when `documents` is empty.
    """
    rendered = [context_template.format(**entry) for entry in documents]
    return "\n\n".join(rendered).strip()


def build_prompt(user_question, documents):
    """Fill the module-level ``prompt_template`` for one question.

    The context field is produced by ``build_context(documents)``; the
    question is inserted as given. Returns the finished prompt string.
    """
    return prompt_template.format(
        context=build_context(documents),
        user_question=user_question,
    )

def ask_openai(prompt, model="gpt-4o"):
    """Send `prompt` as a single user message and return the reply text.

    Uses the module-level ``client``; `model` selects the chat model.
    Returns the message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    return completion.choices[0].message.content

def qa_bot(user_question):
    """Answer `user_question` end to end.

    Retrieves matching documents, builds the prompt from them and returns
    whatever ``ask_openai`` yields for that prompt.
    """
    retrieved = retrieve_documents(user_question)
    return ask_openai(build_prompt(user_question, retrieved))

In [18]:
qa_bot("I'm getting invalid reference format: repository name must be lowercase")

'The error "invalid reference format: repository name must be lowercase" typically indicates an issue with how Docker is interpreting the repository or volume names. Here are some troubleshooting steps you can follow:\n\n1. **Move Your Data**:\n   - Ensure your data is in a folder path without spaces. For example, instead of `C:/Users/Alexey Grigorev/git/...`, use `C:/git/...`.\n\n2. **Adjust the Volume Mapping**:\n   - Try different ways to specify the volume mapping. Here are several options you can try:\n     ```sh\n     -v /c:/some/path/ny_taxi_postgres_data:/var/lib/postgresql/data\n     -v //c:/some/path/ny_taxi_postgres_data:/var/lib/postgresql/data\n     -v /c/some/path/ny_taxi_postgres_data:/var/lib/postgresql/data\n     -v //c/some/path/ny_taxi_postgres_data:/var/lib/postgresql/data\n     --volume //driveletter/path/ny_taxi_postgres_data/:/var/lib/postgresql/data\n     ```\n\n3. **Add `winpty` (for Git Bash)**:\n   - If you\'re using Git Bash, prepend your Docker command with

In [19]:
qa_bot("I am having issues with setting up a GitHub account. What should i do?")

"If you are having issues setting up a GitHub account, follow these steps to simplify the process and get started:\n\n1. **Creating an Account:**\n   - Go to [GitHub's website](https://github.com/) and sign up for a new account.\n   - Follow the prompts to enter your email, create a password, and choose a username.\n\n2. **Clone the Course Repository:**\n   - Once your account is created, you'll need to clone the course repository to your local machine.\n   - Follow the process outlined in the video [Git for Everybody: How to Clone a Repository from GitHub](https://www.youtube.com/watch?v=QT2N4t2hxjw).\n\n3. **Set Up Your Repository:**\n   - It's useful to create your own repositories to host your notes and versions of your files.\n   - A great tutorial to get you started is [Atlassian's guide on setting up a repository](https://www.atlassian.com/git/tutorials/setting-up-a-repository).\n\n4. **Ignore Unnecessary Files:**\n   - Use a `.gitignore` file to avoid committing large databases