In [76]:
import openai
import tiktoken
import requests 
from openai import OpenAI
from tqdm.auto import tqdm
from elasticsearch import Elasticsearch

In [2]:
# Shared OpenAI API client; rag() further down sends its chat-completion request through it.
# NOTE(review): built with no arguments, so credentials presumably come from the
# environment (e.g. OPENAI_API_KEY) — confirm before running on a fresh machine.
client = OpenAI()

## Q1. Running Elastic

In [5]:
# Sanity check: ask the local Elasticsearch node on port 9200 for its cluster/version info.
!curl localhost:9200

{
  "name" : "077f06bbb743",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "8EIuvg_9QnqURUPhpaJVXA",
  "version" : {
    "number" : "8.4.3",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "42f05b9372a9a4a470db3b52817899b99a76ee73",
    "build_date" : "2022-10-04T07:17:24.662462378Z",
    "build_snapshot" : false,
    "lucene_version" : "9.3.0",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}


- What's the version.build_hash value? **42f05b9372a9a4a470db3b52817899b99a76ee73**

In [3]:
# Download the FAQ corpus: a list of courses, each carrying its own list of documents.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to one record per FAQ entry, tagging each with the course it belongs to.
# Copies are built so documents_raw stays untouched and the cell can be re-run safely.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

## Q2. Indexing the data

In [6]:
# Peek at the first flattened record to confirm its fields (text, section, question, course).
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
# Client for the local Elasticsearch node (same address that was checked with curl in Q1).
es_client = Elasticsearch("http://localhost:9200")

In [None]:
# Index layout: a single shard with no replicas.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Fields searched as free text.
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # Stored as a keyword so the exact-match `term` filter on `course` can be applied.
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course_querry"

# Drop any index left over from a previous run before creating it again. Without this,
# re-running the cell fails because the index already exists, and re-running the indexing
# cell would pile duplicate documents into the old index and shift the search scores.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

In [19]:
# Add every FAQ record to the index with one index() call per document; tqdm shows progress.
# NOTE(review): no explicit document id is passed, so re-running this cell presumably
# stores a second copy of every record — recreate the index first if in doubt.
for doc in tqdm(documents):
    es_client.index(index=index_name, body=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

- Which function do you use for adding your data to elastic? **index**

## Q3. Searching

In [38]:
# Q3: full-text search across all courses (no filter).
query = "How do I execute a command in a running docker container?"

# Match the query against `question` and `text`; "question^4" boosts the question field.
multi_match_clause = {
    "query": query,
    "fields": ["question^4", "text"],
    "type": "best_fields",
}

# Ask for the five best hits.
search_query = {
    "size": 5,
    "query": {"bool": {"must": {"multi_match": multi_match_clause}}},
}

In [39]:
# Run the Q3 query against the index; `response` holds the raw search result.
response = es_client.search(index=index_name, body=search_query)

In [40]:
# Relevance score of the first hit in the response (the Q3 answer).
response['hits']['hits'][0]['_score']

84.050095

- What's the score for the top ranking result? **84.050095**

## Q4. Filtering

In [41]:
# Q4: same full-text search as Q3, but limited to one course and to three hits.
query = "How do I execute a command in a running docker container?"

# Match the query against `question` and `text`; "question^4" boosts the question field.
multi_match_clause = {
    "query": query,
    "fields": ["question^4", "text"],
    "type": "best_fields",
}

# Exact-match restriction on the `course` field.
course_filter = {"term": {"course": "machine-learning-zoomcamp"}}

search_query = {
    "size": 3,
    "query": {
        "bool": {
            "must": {"multi_match": multi_match_clause},
            "filter": course_filter,
        }
    },
}

In [42]:
# Run the filtered Q4 query; this overwrites the `response` from Q3.
response = es_client.search(index=index_name, body=search_query)

In [47]:
# Question text of the third hit (index 2) — the Q4 answer.
response['hits']['hits'][2]['_source']['question']

'How do I copy files from a different folder into docker container’s working directory?'

- What's the 3rd question returned by the search engine?

  **How do I copy files from a different folder into docker container’s working directory?**

## Q5. Building a prompt

In [3]:
def elastic_search(query, course="machine-learning-zoomcamp", size=3):
    """Search the FAQ index and return the matching documents.

    Runs a `multi_match` query over the `question` field (boosted via
    "question^4") and the `text` field, restricted to one course through a
    `term` filter on `course`.

    Relies on the notebook-level `es_client` and `index_name`.

    Args:
        query: The user's question as free text.
        course: Exact course identifier to filter on. Defaults to
            "machine-learning-zoomcamp".
        size: Maximum number of hits to request. Defaults to 3.

    Returns:
        A list holding the `_source` dict of every hit, in the order the
        search response lists them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields",
                    }
                },
                "filter": {"term": {"course": course}},
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)
    return [hit["_source"] for hit in response["hits"]["hits"]]

In [71]:
def build_prompt(query, context):
    """Fill the teaching-assistant prompt with a question and its FAQ context.

    Args:
        query: Text substituted for the {question} placeholder.
        context: Text substituted for the {context} placeholder.

    Returns:
        The complete prompt string.
    """
    template_lines = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.",
        "Use only the facts from the CONTEXT when answering the QUESTION.",
        "",
        "QUESTION: {question}",
        "",
        "CONTEXT:",
        "{context}",
    )
    return "\n".join(template_lines).format(context=context, question=query)

In [72]:
# Q5: retrieve the FAQ entries that will serve as context for the prompt.
query = "How do I execute a command in a running docker container?"
respond_doc = elastic_search(query)

In [73]:
# Each retrieved FAQ entry is rendered as a two-line "Q: ... / A: ..." block.
context_template = "Q: {question}\nA: {text}"

context_entries = [
    context_template.format(question=doc['question'], text=doc['text'])
    for doc in respond_doc
]

# Blocks are separated by one blank line; surrounding whitespace is trimmed.
context = "\n\n".join(context_entries).strip()

In [74]:
# Assemble the final prompt and report its length in characters (the Q5 answer).
prompt = build_prompt(query, context)
len(prompt)

1462

- What's the length of the resulting prompt? (use the len function) **1462**

## Q6. Tokens

In [77]:
# Tokenizer that tiktoken associates with the "gpt-4o" model name.
encoding = tiktoken.encoding_for_model("gpt-4o")

In [83]:
# Number of tokens the prompt encodes to (the Q6 answer).
len(encoding.encode(text=prompt))

322

- How many tokens does our prompt have? **322**

In [84]:
# Look up the raw bytes that token id 63842 stands for.
encoding.decode_single_token_bytes(63842)

b"You're"

In [86]:
def rag(prompt, model="gpt-4o"):
    """Send an already-built prompt to a chat model and return its reply.

    No retrieval happens here: the caller is expected to have put the FAQ
    context into `prompt` beforehand. Relies on the notebook-level `client`.

    Args:
        prompt: Full prompt text, sent as a single user message.
        model: Name of the chat model to call. Defaults to "gpt-4o".

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [88]:
# Ask the model to answer the question using the prompt built in Q5, then show the reply.
answer = rag(prompt)
print(answer)

To execute a command in a running Docker container, you can use the `docker exec` command. Here are the steps to do it:

1. List the running containers to find the container ID:
   ```
   docker ps
   ```

2. Execute the command in the specific container using the container ID. For example, to start a bash shell:
   ```
   docker exec -it <container-id> bash
   ```

Replace `<container-id>` with the actual ID of your running container. This will open an interactive bash shell in the running container.


In [91]:
# Price of a single token, derived from per-1K-token rates:
# 0.005 per 1K input tokens and 0.015 per 1K output tokens.
in_token = 0.005 / 1000
out_token = 0.015 / 1000

# Average token counts per request, named after the rate each one is billed at
# (and chosen so the builtin input() is not shadowed).
# NOTE(review): assumes 150 tokens are sent and 250 are received per request —
# confirm against the assignment text.
input_tokens = 150
output_tokens = 250

n_requests = 1000
cost = n_requests * (input_tokens * in_token + output_tokens * out_token)
f"It will cost ${cost} to run 1000 requests."

'It will cost $4.5 to run 1000 requests.'