In [1]:
import minsearch
import json

with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.Index at 0x76c2a5d7b940>

In [2]:
from openai import OpenAI
client = OpenAI()

In [3]:
def search(query):
    boost = {'question':3.0, 'section':0.5}

    results = index.search(
        query=query,
        boost_dict=boost,
        filter_dict={'course':'data-engineering-zoomcamp'},
        num_results=5 
    )
    return results

In [67]:
def build_prompt(query, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
    Use only the facts from the CONTEXT when answering the QUESTION. 
    
    QUESTION: {question}
    
    CONTEXT: {context}
     """.strip()

    context=""

    for doc in search_results:
        context = context + f"question : {doc['question']}\nanswer : {doc['text']}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()

    return prompt

In [68]:
# invoking GPT - llm takes the prompt and gives back some answer
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{'role':'user', 'content':prompt}]
    )
    
    return response.choices[0].message.content 

In [69]:
#query = 'How do I run Kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [70]:
rag('the course has already started. Can we still enroll?')

'Yes, you can still enroll in the course even after it has started. You are eligible to submit the homeworks, but please be aware of the deadlines for turning in the final projects. Make sure to manage your time well and not leave everything until the last minute.'

In [71]:
from elasticsearch import Elasticsearch

In [72]:
es_client = Elasticsearch('http://localhost:9200')

In [73]:
es_client.info()

ObjectApiResponse({'name': '2d556f1dd469', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'zx-ShEnYTnyXzGqvRJxVWg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [74]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

In [75]:
index_name = "course-questions"

es_client.indices.create(index= index_name, body=index_settings)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions/meCSayoHS5mxrNDJPe3Dsg] already exists')

In [76]:
from tqdm.auto import tqdm

In [77]:
for doc in tqdm(documents):
    es_client.index(index = index_name, document=doc)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:21<00:00, 43.64it/s]


In [78]:
query = 'How do I execute a command in a running docker container?'

In [79]:
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^4", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "machine-learning-zoomcamp"
                }
            }
        }
    }
}

In [80]:
response = es_client.search(index= index_name, body=search_query)

In [81]:
response

ObjectApiResponse({'took': 912, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 690, 'relation': 'eq'}, 'max_score': 84.17781, 'hits': [{'_index': 'course-questions', '_id': 'y5qPb5ABcHZIFqCHsmyQ', '_score': 84.17781, '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)', 'section': '5. Deploying Machine Learning Models', 'question': 'How do I debug a docker container?', 'course': 'machine-learning-zoomcamp'}}, {'_index': 'course-questions', '_id': 'f5q4b5ABcHZIFqCH73AE', '_score': 84.17781, '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker ru

In [82]:
result_docs =  []

for hit in response['hits']['hits']:
    result_docs.append(hit['_source'])
    result_docs.append('\n\n')

result_docs

[{'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 '\n\n',
 {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 '\n\n',
 {'text': "You c

In [83]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index= index_name, body=search_query)

    result_docs =  []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [84]:
elastic_search(query)

[{'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 {'text': "You can copy files from

In [85]:
def rag_es(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [86]:
rag_es(query)

'To execute a command in a running docker container, you need to use the `docker exec` command. Here are the steps:\n\n1. First, find the container ID of the running container by using the following command:\n   ```\n   docker ps\n   ```\n\n2. Once you have the container ID, execute a command inside that specific container by using the following command:\n   ```\n   docker exec -it <container-id> <command>\n   ```\n\nFor example, to start a bash session inside the running container, you can use:\n   ```\n   docker exec -it <container-id> bash\n   ```\n\nReplace `<container-id>` with the actual ID of your container obtained from the `docker ps` command.'

In [87]:
search_results = elastic_search(query)
prompt = build_prompt(query, search_results)

In [88]:
len(prompt)

2351

In [89]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m439.0 kB/s[0m eta [36m0:00:00[0mMB/s[0m eta [36m0:00:01[0m
Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (775 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m775.1/775.1 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
[?25hInstalling collected packages: regex

In [91]:
import tiktoken

In [92]:
encoding = tiktoken.encoding_for_model("gpt-4o")

In [103]:
list = encoding.encode(prompt)

In [122]:
len(set(list))


163