In [33]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [34]:
import os
from dotenv import load_dotenv

# Let python-dotenv populate the process environment (from a .env file, if one
# is present), then read the OpenAI key from the environment so the secret is
# never written into the notebook itself. api_key is None when the variable
# is not set.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")


In [35]:
from openai import OpenAI

In [36]:
# Shared OpenAI client, authenticated with the key read from the environment.
client = OpenAI(api_key=api_key)

In [37]:
def build_prompt(query, search_results):
    """Build the LLM prompt for a question from retrieved FAQ entries.

    Args:
        query: The user's question; inserted verbatim after ``QUESTION :``.
        search_results: Iterable of dicts, each providing the keys
            ``section``, ``question`` and ``text``.

    Returns:
        The filled-in prompt with leading/trailing whitespace stripped.

    Raises:
        KeyError: If a result is missing one of the three keys above.
    """
    prompt_template = """
    You are a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
    Use only the facts from the context when answering the questions. 
    
    QUESTION : {question}
    
    CONTEXT : {context}
    """.strip()

    # One formatted entry per retrieved document, joined in a single pass
    # instead of growing a string inside a loop.
    context = "".join(
        f"section : {doc['section']}\nquestion: {doc['question']}\nanswer : {doc['text']}\n\n"
        for doc in search_results
    )

    # The final strip() drops the blank lines left after the last entry.
    return prompt_template.format(question=query, context=context).strip()
        

In [38]:
def llm_response(prompt, model="gpt-4o"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the ``content`` of the one user message.
        model: Chat model name passed to the API. Defaults to ``"gpt-4o"``,
            the value that was previously hard-coded, so existing callers
            are unaffected.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [39]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves matching FAQ entries, builds a prompt from them with
    ``build_prompt`` and returns the reply from ``llm_response``.

    Retrieval goes through ``elastic_search``, which is defined in a later
    cell and must exist before ``rag`` is called. The ``search`` helper this
    function previously called is not defined anywhere in this notebook, so
    that version raised ``NameError`` on a fresh kernel.

    Note: ``rag`` is defined again further down in this notebook; the later
    definition is the one in effect once that cell has run.
    """
    results = elastic_search(query)
    prompt = build_prompt(query, results)
    return llm_response(prompt)

In [40]:
pip install elasticsearch

Note: you may need to restart the kernel to use updated packages.


In [41]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node listening on localhost:9200.
es_client = Elasticsearch("http://localhost:9200")

In [42]:
# Connectivity check: ask the node for its cluster/version info.
es_client.info()

ObjectApiResponse({'name': '4e0b038c145a', 'cluster_name': 'docker-cluster', 'cluster_uuid': '7BO0u5QcRBizxOxYIvCqew', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [43]:
# Start from a clean slate: drop any index left over from a previous run.
# ignore_unavailable=True keeps this cell from raising NotFoundError when the
# index does not exist yet (e.g. the first run against a fresh cluster), so
# the notebook survives Restart & Run All.
resp = es_client.indices.delete(
    index="course_questions",
    ignore_unavailable=True,
)
print(resp)

{'acknowledged': True}


In [44]:
index_name = "course_questions"

# Index definition: a single shard with no replicas. The free-text fields are
# mapped as "text"; "course" is a "keyword" field.
index_settings = dict(
    settings=dict(number_of_shards=1, number_of_replicas=0),
    mappings=dict(
        properties=dict(
            text=dict(type="text"),
            section=dict(type="text"),
            question=dict(type="text"),
            course=dict(type="keyword"),
        )
    ),
)

In [45]:
# Create the index using the settings and mappings defined in index_settings.
es_client.indices.create(
    index=index_name,
    body=index_settings,
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course_questions'})

In [22]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [24]:
from tqdm.auto import tqdm

In [46]:
import requests 

# Download the course FAQ dump and flatten it into one list of documents.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# Bound the wait and fail loudly on an HTTP error instead of trying to parse
# an error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Tag every entry with the course it belongs to.
        doc['course'] = course_name
        documents.append(doc)

In [47]:
# Index the documents one at a time. Each iteration is a separate call to
# Elasticsearch, so wrap the loop in tqdm (imported above) to show progress.
for doc in tqdm(documents, desc="Indexing"):
    es_client.index(index=index_name, document=doc)
    

In [48]:
# Example question passed to rag() below.
query = "How can I run Kafka?"

In [49]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Search the FAQ index for documents matching ``query``.

    Args:
        query: Free-text question matched against the ``question``, ``text``
            and ``section`` fields.
        course: Value the ``course`` field must equal (term filter). Defaults
            to ``"data-engineering-zoomcamp"``, the value that was previously
            hard-coded.
        size: Maximum number of hits requested. Defaults to 5, the previous
            hard-coded value.

    Returns:
        A list with the ``_source`` dict of every hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "question^3" boosts matches in the question field 3x.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                "filter": {"term": {"course": course}},
            }
        },
    }
    search_result = es_client.search(index=index_name, body=search_query)

    return [hit["_source"] for hit in search_result["hits"]["hits"]]

In [50]:
def rag(query):
    """Run retrieve -> build prompt -> generate for ``query``.

    Looks up matching documents with ``elastic_search``, turns them into a
    prompt via ``build_prompt`` and returns whatever ``llm_response`` gives
    back for that prompt.
    """
    hits = elastic_search(query)
    return llm_response(build_prompt(query, hits))

In [51]:
# Run the pipeline (search -> prompt -> LLM) on the example question.
response = rag(query)

In [52]:
# Display the generated answer.
response

'To run Kafka, the context information provided does not directly include explicit instructions on how to start or run Kafka itself. However, it does contain related information about running Java-based Kafka producers, consumers, and KStreams, as well as setting up a Python environment for Kafka-related operations.\n\nFor running Java-based Kafka applications, you can navigate to your project directory and use the following command:\n```shell\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\nFor running Python-based Kafka producers or consumers, it is recommended to create a virtual environment, install the necessary packages from `requirements.txt`, and ensure Docker images are running if needed. You can create and activate a virtual environment using:\n```shell\n# Create virtual environment and install packages (run only once)\npython -m venv env\nsource env/bin/activate\npip install -r ../requirements.txt\n\n# Activate virtual 