In [1]:
from elasticsearch import Elasticsearch
from elasticsearch import NotFoundError
import boto3
import json
from pprint import pprint
from typing import List

In [2]:
INDEX_NAME = "course-questions"
SEARCH_FILTER_TERM = "machine-learning-zoomcamp"

In [3]:
es = Elasticsearch("http://elasticsearch:9200")

session = boto3.Session(profile_name="private")
bedrock_runtime = session.client("bedrock-runtime", region_name="us-east-1")
comprehend = session.client('comprehend', region_name='us-east-1')

In [1]:
import requests

docs_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1"
docs_response = requests.get(docs_url)
documents_row = docs_response.json()

documents_row[0]["course"]

'data-engineering-zoomcamp'

In [5]:
import hashlib

documents = []

for course in documents_row:

    course_name = course["course"]

    for doc in course["documents"]:
        doc["course"] = course_name
        question_hash = hashlib.md5(doc["question"].encode()).hexdigest()
        doc["id"] = question_hash
        documents.append(doc)
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': '2d669d12c0511996b393bff34bfbcf55'}

In [6]:
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
        }
    },
}

try:
    es.indices.get(index=INDEX_NAME)
    print(f"{INDEX_NAME} already exists")
    # es.indices.delete(index=INDEX_NAME, ignore=[400, 404])
except NotFoundError:
    response = es.indices.create(index=INDEX_NAME, body=index_settings)
    print(response)

course-questions already exists


In [7]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es.index(index=INDEX_NAME, id=doc["id"], document=doc)

count_response = es.count(index=INDEX_NAME)
print(count_response)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 948/948 [00:03<00:00, 245.29it/s]


{'count': 943, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}


In [8]:
def create_search_query(question: str, size: int = 5, search_filter_term: str = None) -> str:
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": question,
                        "fields": ["question^4", "text"],
                        "type": "best_fields",
                    }
                }
            }
        },
    }

    if search_filter_term is not None:
        search_query["query"]["bool"]["filter"] = {
            "term": {
                "course": search_filter_term
            }
        }

    return search_query

In [9]:
question = "How do I execute a command in a running docker container?"

search_query = create_search_query(question)

response = es.search(index=INDEX_NAME, body=search_query)
most_relevant = response["hits"]["hits"][0]
pprint(most_relevant)
pprint(f"Answer to Q3 is: {most_relevant['_score']}")


{'_id': '410d5719a0f86af7e51fe0338c99088e',
 '_index': 'course-questions',
 '_score': 84.050095,
 '_source': {'course': 'machine-learning-zoomcamp',
             'id': '410d5719a0f86af7e51fe0338c99088e',
             'question': 'How do I debug a docker container?',
             'section': '5. Deploying Machine Learning Models',
             'text': 'Launch the container image in interactive mode and '
                     'overriding the entrypoint, so that it starts a bash '
                     'command.\n'
                     'docker run -it --entrypoint bash <image>\n'
                     'If the container is already running, execute a command '
                     'in the specific container:\n'
                     'docker ps (find the container-id)\n'
                     'docker exec -it <container-id> bash\n'
                     '(Marcos MJD)'}}
'Answer to Q3 is: 84.050095'


In [10]:
search_query = create_search_query(question, size=3, search_filter_term=SEARCH_FILTER_TERM)

response = es.search(index=INDEX_NAME, body=search_query)
response_filtered_by_term = response["hits"]["hits"]
the_third_result = response_filtered_by_term[2]
pprint(f"Answer to Q4 is: {the_third_result['_source']['question']}")
pprint(response_filtered_by_term)


('Answer to Q4 is: How do I copy files from a different folder into docker '
 'container’s working directory?')
[{'_id': '410d5719a0f86af7e51fe0338c99088e',
  '_index': 'course-questions',
  '_score': 84.050095,
  '_source': {'course': 'machine-learning-zoomcamp',
              'id': '410d5719a0f86af7e51fe0338c99088e',
              'question': 'How do I debug a docker container?',
              'section': '5. Deploying Machine Learning Models',
              'text': 'Launch the container image in interactive mode and '
                      'overriding the entrypoint, so that it starts a bash '
                      'command.\n'
                      'docker run -it --entrypoint bash <image>\n'
                      'If the container is already running, execute a command '
                      'in the specific container:\n'
                      'docker ps (find the container-id)\n'
                      'docker exec -it <container-id> bash\n'
                      '(Marcos MJD)'}},


In [11]:
def create_element_context(question: str, answer: str) -> str:
    element_context = (
        f"Q: {question}\n"
        f"A: {answer}"
    ).strip()

    return element_context


def create_context(elasticserach_hits: List) -> str:
    result = ""
    for elem in elasticserach_hits:
        element_context = create_element_context(elem["_source"]["question"], elem["_source"]["text"])
        result += f"{element_context}\n\n"

    stripped = result.strip()
    return stripped

In [12]:
context = create_context(response_filtered_by_term)
pprint(context)

('Q: How do I debug a docker container?\n'
 'A: Launch the container image in interactive mode and overriding the '
 'entrypoint, so that it starts a bash command.\n'
 'docker run -it --entrypoint bash <image>\n'
 'If the container is already running, execute a command in the specific '
 'container:\n'
 'docker ps (find the container-id)\n'
 'docker exec -it <container-id> bash\n'
 '(Marcos MJD)\n'
 '\n'
 'Q: How do I copy files from my local machine to docker container?\n'
 'A: You can copy files from your local machine into a Docker container using '
 "the docker cp command. Here's how to do it:\n"
 'To copy a file or directory from your local machine into a running Docker '
 'container, you can use the `docker cp command`. The basic syntax is as '
 'follows:\n'
 'docker cp /path/to/local/file_or_directory container_id:/path/in/container\n'
 'Hrithik Kumar Advani\n'
 '\n'
 'Q: How do I copy files from a different folder into docker container’s '
 'working directory?\n'
 'A: You can c

In [13]:
def create_prompt(question: str, context: str) -> str:
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        f"QUESTION: {question}\n"

        "CONTEXT:\n"
        f"{context}"
    ).strip()

    return prompt_template

In [14]:
prompt = create_prompt(question, context)
pprint(f"Answer to Q5 is: {len(prompt)}")

number_of_words = len(prompt.replace("\n", " ").split())
print(f"Estimated token numbers based on number of words: {number_of_words * 1.25}")

number_of_symbols = len(prompt)
print(f"Estimated token numbers based on number of symbols: {number_of_symbols / 4.5}")


pprint(f"Prompt: {prompt}")

'Answer to Q5 is: 1460'
Estimated token numbers based on number of words: 295.0
Estimated token numbers based on number of symbols: 324.44444444444446
("Prompt: You're a course teaching assistant. Answer the QUESTION based on the "
 'CONTEXT from the FAQ database.\n'
 'Use only the facts from the CONTEXT when answering the QUESTION.\n'
 'QUESTION: How do I execute a command in a running docker container?\n'
 'CONTEXT:\n'
 'Q: How do I debug a docker container?\n'
 'A: Launch the container image in interactive mode and overriding the '
 'entrypoint, so that it starts a bash command.\n'
 'docker run -it --entrypoint bash <image>\n'
 'If the container is already running, execute a command in the specific '
 'container:\n'
 'docker ps (find the container-id)\n'
 'docker exec -it <container-id> bash\n'
 '(Marcos MJD)\n'
 '\n'
 'Q: How do I copy files from my local machine to docker container?\n'
 'A: You can copy files from your local machine into a Docker container using '
 "the docker cp 

In [15]:
def get_llm_answer(prompt: str, bedrock_runtime) -> str:
        # model = "amazon.titan-text-express-v1"
        model = "amazon.titan-text-premier-v1:0"
        # model = "amazon.titan-text-lite-v1"

        
        kwargs = {
            "modelId": model,
            "contentType": "application/json",
            "accept": "*/*",
            "body": json.dumps(
                {
                    "inputText": prompt,
                    "textGenerationConfig": {
                        "maxTokenCount": 500,
                        "stopSequences": [],
                        "temperature": 0.9,
                        "topP": 0.9,
                    },
                }
            ),
        }

        response = bedrock_runtime.invoke_model(**kwargs)
        body_as_plain_text = response.get('body').read()
        response_body = json.loads(body_as_plain_text)

        result = response_body["results"][0]["outputText"]

        return (result.strip(), response_body)

In [16]:
response = comprehend.detect_syntax(Text=prompt, LanguageCode='en')
tokens = [token['Text'] for token in response['SyntaxTokens']]

pprint(f"Answer to Q5 is: {len(tokens)}")
print(tokens)

'Answer to Q5 is: 301'
['You', "'re", 'a', 'course', 'teaching', 'assistant', '.', 'Answer', 'the', 'QUESTION', 'based', 'on', 'the', 'CONTEXT', 'from', 'the', 'FAQ', 'database', '.', 'Use', 'only', 'the', 'facts', 'from', 'the', 'CONTEXT', 'when', 'answering', 'the', 'QUESTION', '.', 'QUESTION', ':', 'How', 'do', 'I', 'execute', 'a', 'command', 'in', 'a', 'running', 'docker', 'container', '?', 'CONTEXT', ':', 'Q', ':', 'How', 'do', 'I', 'debug', 'a', 'docker', 'container', '?', 'A', ':', 'Launch', 'the', 'container', 'image', 'in', 'interactive', 'mode', 'and', 'overriding', 'the', 'entrypoint', ',', 'so', 'that', 'it', 'starts', 'a', 'bash', 'command', '.', 'docker', 'run', '-', 'it', '--', 'entrypoint', 'bash', '<', 'image', '>', 'If', 'the', 'container', 'is', 'already', 'running', ',', 'execute', 'a', 'command', 'in', 'the', 'specific', 'container', ':', 'docker', 'ps', '(', 'find', 'the', 'container', '-', 'id', ')', 'docker', 'exec', '-', 'it', '<', 'container', '-', 'id', '>', 

In [17]:
answer_from_llm, response_body = get_llm_answer(prompt, bedrock_runtime=bedrock_runtime)
pprint(answer_from_llm)
pprint(response_body)

('To execute a command in a running docker container, use the docker exec '
 'command. You will need the container-id, which you can find by running '
 'docker ps. Then, run the following command: docker exec -it <container-id> '
 'bash.')
{'inputTextTokenCount': 395,
 'results': [{'completionReason': 'FINISH',
              'outputText': 'To execute a command in a running docker '
                            'container, use the docker exec command. You will '
                            'need the container-id, which you can find by '
                            'running docker ps. Then, run the following '
                            'command: docker exec -it <container-id> bash.',
              'tokenCount': 57}]}
