In [1]:
import openai
import elasticsearch 

In [2]:
import json

# Load the FAQ dump. The encoding is pinned to UTF-8 so the result does not
# depend on the platform's locale default (e.g. cp1252 on Windows).
with open('./documents.json', 'rt', encoding='utf-8') as f_in:
    documents_file = json.load(f_in)

# Flatten the per-course structure into a single list of records, tagging each
# record with the name of the course it belongs to.
documents = []

for course in documents_file:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name  # note: mutates the loaded record in place
        documents.append(doc)

In [3]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node at localhost:9200; later cells reuse `es`.
es = Elasticsearch("http://localhost:9200")
# Last expression of the cell: displays the node/cluster info returned by the server.
es.info()

ObjectApiResponse({'name': 'e663d9a2e466', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'XEzF9wpbRqSCNN9QIR0mDQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [4]:
# Index definition: one shard, no replicas. The free-text fields use the
# "text" type so they can be searched with full-text queries; "course" is a
# "keyword" so it can be matched exactly by a term filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# The index lives in Elasticsearch, not in the kernel, so it survives a kernel
# restart. Drop any copy left over from a previous run; otherwise create()
# fails because the index already exists, and re-running the indexing loop
# would pile duplicate documents into the old index.
# WARNING: this discards whatever is currently stored under `index_name`.
es.indices.delete(index=index_name, ignore_unavailable=True)
response = es.indices.create(index=index_name, body=index_settings)

response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [5]:
from tqdm.auto import tqdm

# Index the flattened records one request per document; tqdm shows progress.
for doc in tqdm(documents):
    es.index(index=index_name, document=doc)

  from .autonotebook import tqdm as notebook_tqdm
100%|████████████████████████████████████████████████████████████████████████████████| 948/948 [00:26<00:00, 35.85it/s]


In [6]:
# Sample question used to exercise the search query below.
user_question = "How do I join the course after it has started?"

# Request body for es.search(): return at most 5 hits.
search_query = {
    "size": 5,
    "query": {
        "bool": {
            # Full-text match of the question against three fields; the "^3"
            # suffix boosts matches in "question" relative to the other fields.
            "must": {
                "multi_match": {
                    "query": user_question,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            # Restrict results to a single course via the keyword field.
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [7]:
# Run the query built above and print a three-line preview of every hit.
response = es.search(index=index_name, body=search_query)

for hit in response['hits']['hits']:
    doc = hit['_source']
    # One print call per hit; sep="\n" puts each field on its own line and the
    # trailing "\n" in the last field leaves a blank line between hits.
    print(
        f"Section: {doc['section']}",
        f"Question: {doc['question']}",
        f"Answer: {doc['text'][:60]}...\n",
        sep="\n",
    )

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishe...

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependenc...

Section: General course-related questions
Question: How do I use Git / GitHub for this course?
Answer: After you create a GitHub account, you should clone the cour...

Section: Workshop 1 - dlthub
Question: How do I install the necessary dependencies to run the code?
Answer: Answer: To run the provided code, ensure that the 'dlt[duckd...



In [8]:
# Peek at one flattened record: the original fields plus the added 'course' key.
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [9]:
# Inspect the raw 'hits' section of the search response (total, scores, sources).
response['hits']

{'total': {'value': 407, 'relation': 'eq'},
 'max_score': 53.23979,
 'hits': [{'_index': 'course-questions',
   '_id': 'j5bATpABXWQJEVvQU7SG',
   '_score': 53.23979,
   '_source': {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
    'section': 'General course-related questions',
    'question': 'Course - Can I still join the course after the start date?',
    'course': 'data-engineering-zoomcamp'}},
  {'_index': 'course-questions',
   '_id': 'lJbATpABXWQJEVvQVLQq',
   '_score': 47.0588,
   '_source': {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
    'section': 'Gener

In [10]:
def retrieve_documents(
    query,
    index_name="course-questions",
    max_results=5,
    course="data-engineering-zoomcamp",
    es_client=None,
):
    """Return the FAQ records that best match `query`.

    Args:
        query: Free-text question to search for.
        index_name: Name of the Elasticsearch index to search.
        max_results: Maximum number of records to return.
        course: Value the "course" field must equal; only records of this
            course are returned.
        es_client: Object exposing ``search(index=..., body=...)``. When
            omitted, a client for http://localhost:9200 is created for this
            call, which is what the function always did before this
            parameter existed.

    Returns:
        A list with the ``_source`` dict of every hit, in the order the
        search returned them. Empty when nothing matched.
    """
    if es_client is None:
        es_client = Elasticsearch("http://localhost:9200")

    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                # Full-text match; "^3" boosts matches in the question field.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Exact-match restriction to the requested course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

In [11]:
# Same question as before, this time routed through retrieve_documents().
user_question = "How do I join the course after it has started?"

response = retrieve_documents(user_question)

for doc in response:
    # One print call per record; sep="\n" keeps each field on its own line and
    # the trailing "\n" in the last field leaves a blank line between records.
    print(
        f"Section: {doc['section']}",
        f"Question: {doc['question']}",
        f"Answer: {doc['text'][:60]}...\n",
        sep="\n",
    )

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishe...

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependenc...

Section: General course-related questions
Question: How do I use Git / GitHub for this course?
Answer: After you create a GitHub account, you should clone the cour...

Section: Workshop 1 - dlthub
Question: How do I install the necessary dependencies to run the code?
Answer: Answer: To run the provided code, ensure that the 'dlt[duckd...



In [33]:
from openai import OpenAI
import os

# The API key is taken from the OPENAI_API_KEY environment variable, not hard-coded.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [24]:
# Model name for the direct API calls in this notebook; the alternative is kept
# commented out for quick switching.
# NOTE(review): ask_openai() carries its own "gpt-4o" default and does not read MODEL.
MODEL="gpt-4o"
# MODEL="gpt-3.5-turbo"

# Baseline call without any retrieved context.
# NOTE(review): the call is commented out, so the text shown under this cell
# comes from an earlier run - presumably disabled to avoid repeated API calls.
# response = client.chat.completions.create(
#     model=MODEL,
#     messages=[{"role": "user", "content": "The course already started. Can I still join?"}]
# )
# print(response.choices[0].message.content)

It depends on the specific course and its policies regarding late enrollment. Here are a few steps you can take to find out if joining late is possible:

1. **Check the Course Website or Syllabus:** Sometimes information about late enrollment is provided here.
   
2. **Contact the Instructor:** Send an email or attend office hours to ask if it's possible to join the course after it has started. Be prepared to explain your situation and demonstrate your willingness to catch up on missed content.

3. **Speak with the Registrar or Academic Advisor:** They often have information about add/drop deadlines and can provide guidance on whether late enrollment is feasible.

4. **Assess the Impact:** Consider how much material you’ve missed and whether you can realistically catch up without overly burdening yourself.

Good luck!


In [25]:
# Layout of a single retrieved record inside the context.
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()

context_docs = retrieve_documents(user_question)

# Every rendered record is prefixed with a blank-line separator; the strip()
# below removes the separator in front of the first record.
context_result = "".join(
    "\n\n" + context_template.format(**doc) for doc in context_docs
)

context = context_result.strip()
print(context)

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Terrafo

In [26]:
# One-off prompt built from the current `user_question` and `context`. The
# model is told to answer only from the retrieved FAQ records and to return
# "NONE" when they do not contain the answer.
prompt = f"""
You're a course teaching assistant. Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. 
Only use the facts from the CONTEXT. If the CONTEXT doesn't contain the answer, return "NONE"

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

In [27]:
# response = client.chat.completions.create(
#     model=MODEL,
#     messages=[{"role": "user", "content": prompt}]
# )
# answer = response.choices[0].message.content
# answer

"Yes, you can still join the course after it has started. Even if you don't formally register, you are still eligible to submit the homework assignments. However, keep in mind that there will be deadlines for turning in the final projects, so it's not advisable to leave everything until the last minute."

In [28]:
# Layout of a single retrieved record; filled by build_context() via
# str.format, so each record must provide 'section', 'question' and 'text'.
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()

# Full prompt sent to the model; build_prompt() fills in {user_question} and
# {context}.
prompt_template = """
You're a course teaching assistant.
Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.
Don't use other information outside of the provided CONTEXT.  

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()


def build_context(documents):
    """Render every record with ``context_template`` and combine the results.

    Args:
        documents: Iterable of dicts providing the keys the template refers
            to ('section', 'question', 'text'); extra keys are ignored.

    Returns:
        The rendered records separated by one blank line, with surrounding
        whitespace stripped. An empty string when `documents` is empty.
    """
    rendered = [context_template.format(**doc) for doc in documents]
    return "\n\n".join(rendered).strip()


def build_prompt(user_question, documents):
    """Fill ``prompt_template`` with the question and the rendered records.

    Args:
        user_question: Text substituted for {user_question} in the template.
        documents: Records passed to build_context() to produce {context}.

    Returns:
        The completed prompt string.
    """
    return prompt_template.format(
        context=build_context(documents),
        user_question=user_question,
    )

def ask_openai(prompt, model="gpt-4o"):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Content of the one user message in the request.
        model: Model name passed to the chat completions endpoint.

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    return completion.choices[0].message.content

def qa_bot(user_question):
    """Answer `user_question` from the FAQ: retrieve, build the prompt, ask the model.

    Args:
        user_question: The question to search for and to answer.

    Returns:
        The answer text returned by ask_openai().
    """
    retrieved = retrieve_documents(user_question)
    return ask_openai(build_prompt(user_question, retrieved))

In [29]:
# Try the full pipeline on a question that quotes an error message.
qa_bot("I'm getting invalid reference format: repository name must be lowercase")

'The error "invalid reference format: repository name must be lowercase" can often occur when mapping volumes with Docker on Windows because the filesystem paths sometimes include uppercase letters. Here\'s a detailed solution:\n\n1. **Move Data to a Folder Without Spaces**: Ensure that your data is not in a directory with spaces in its name. For example:\n   - **Correct**: `C:/git/...`\n   - **Incorrect**: `C:/Users/Alexey Grigorev/git/...`\n\n2. **Use Lowercase Paths**: Ensure that your paths and repository names are entirely in lowercase.\n\n3. **Volume Mapping Options**:\n   - **Option 1**:\n     ```shell\n     -v /c:/some/path/ny_taxi_postgres_data:/var/lib/postgresql/data\n     ```\n   - **Option 2**:\n     ```shell\n     -v //c:/some/path/ny_taxi_postgres_data:/var/lib/postgresql/data\n     ```\n   - **Option 3**:\n     ```shell\n     -v /c/some/path/ny_taxi_postgres_data:/var/lib/postgresql/data\n     ```\n   - **Option 4**:\n     ```shell\n     -v //c/some/path/ny_taxi_postgre

In [30]:
# Try the full pipeline on a general how-to question.
qa_bot("I am having issues with setting up a GitHub account. What should i do?")

"If you are having issues with setting up a GitHub account, here are some steps and resources that can help you get started:\n\n1. **Create a GitHub Account**:\n   - Go to [GitHub](https://github.com) and sign up for a new account.\n\n2. **Clone the Course Repository**:\n   - After creating an account, clone the course repository to your local machine.\n   - Here is a helpful video on how to clone a repository from GitHub: [Git for Everybody: How to Clone a Repository from GitHub](https://www.youtube.com/watch?v=XdB7eI5zCBY).\n\n3. **Set Up Your Repository**:\n   - You can create your own repositories to host your notes, project files, etc.\n   - Follow this tutorial to set up a new repository: [Atlassian Git Tutorials: Setting Up a Repository](https://www.atlassian.com/git/tutorials/setting-up-a-repository).\n\n4. **Ignore Unnecessary Files**:\n   - Use a `.gitignore` file to exclude large files like databases, .csv, .gz files, and any other files that shouldn't be stored in the repos