In [11]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Load the sentence-embedding model used by every `model.encode(...)` call in this notebook.
model = SentenceTransformer("multi-qa-distilbert-cos-v1")

In [13]:
# Sample query; its embedding is converted to a plain Python list of floats
# and reused by the similarity searches further down.
user_question = "I just discovered the course. Can I still join it?"
user_question_vector = model.encode(user_question).tolist()

In [1]:
import requests 

# Download the FAQ documents (with ids) from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents = docs_response.json()

In [5]:
import json
from pathlib import Path

# Persist the downloaded documents as pretty-printed JSON.
docs_path = Path("data/documents-with-ids.json")
# Create data/ if needed so the cell also works on a fresh checkout.
docs_path.parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding keeps the written file independent of the platform default.
with docs_path.open("w", encoding="utf-8") as f:
    json.dump(documents, f, indent=2)

In [10]:
# Embed "question + answer text" for every document.
qa_texts = [doc["question"] + " " + doc["text"] for doc in documents]

# Passing the whole list lets the model batch the inputs instead of running
# one forward pass per document; the built-in progress bar means this cell
# does not rely on a `tqdm` name being in scope.
embeddings = model.encode(qa_texts, show_progress_bar=True).tolist()

100%|██████████| 948/948 [01:05<00:00, 14.46it/s]


In [11]:
import numpy as np

# Stack the per-document embeddings into a single array (one row per document)
# and display its shape.
X = np.array(embeddings)
X.shape

(948, 768)

In [49]:
# Boolean mask marking which documents belong to the ML Zoomcamp course.
if_ml_zoomcamp_docs = [
    doc["course"] == "machine-learning-zoomcamp" for doc in documents
]
# Apply the same mask to the document list and to the embedding matrix so
# the two stay row-aligned.
docs_mlz = [doc for doc, keep in zip(documents, if_ml_zoomcamp_docs) if keep]
X_mlz = X[if_ml_zoomcamp_docs].copy()

# Attach each embedding row to its document under the "qt_vector" key.
# Note: this mutates the dict objects that `documents` also holds.
for row_idx, doc in enumerate(docs_mlz):
    doc["qt_vector"] = X_mlz[row_idx].tolist()

In [26]:
# Highest dot-product score between the sample question and any ML Zoomcamp document embedding.
X_mlz.dot(user_question_vector).max()

np.float64(0.6506574375598062)

In [30]:
class VectorSearchEngine():
    """Brute-force dot-product search over an in-memory embedding matrix.

    Args:
        documents: Sequence of documents; ``documents[i]`` must correspond to
            row ``i`` of ``embeddings``.
        embeddings: 2-D array-like with one embedding per row. Nested lists
            are accepted and converted to a NumPy array.

    Raises:
        ValueError: If the number of documents and embedding rows differ.
    """

    def __init__(self, documents, embeddings):
        self.documents = documents
        # asarray leaves an existing ndarray untouched and converts plain
        # lists, which have no `.dot` method.
        self.embeddings = np.asarray(embeddings)
        if len(self.documents) != len(self.embeddings):
            raise ValueError(
                f"documents ({len(self.documents)}) and embeddings "
                f"({len(self.embeddings)}) must have the same length"
            )

    def search(self, v_query, num_results=10):
        """Return up to ``num_results`` documents ranked by dot-product score.

        Args:
            v_query: Query embedding (array-like of the same dimension as the
                stored embeddings).
            num_results: Maximum number of documents to return. ``None``
                returns all documents; zero or negative values return none.

        Returns:
            A list of documents, highest score first.
        """
        if len(self.documents) == 0:
            return []
        scores = self.embeddings.dot(np.asarray(v_query))
        # A negative slice bound would drop items from the tail instead of
        # limiting the result count, so clamp it to zero.
        limit = None if num_results is None else max(num_results, 0)
        idx = np.argsort(-scores)[:limit]
        return [self.documents[i] for i in idx]

# Build the in-memory engine over the ML Zoomcamp subset and fetch the
# top 5 matches for the sample question.
search_engine = VectorSearchEngine(documents=docs_mlz, embeddings=X_mlz)
search_engine.search(user_question_vector, num_results=5)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693'},
 {'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\nOr you c

In [31]:
import pandas as pd

# Location of the ground-truth CSV in the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

# Load the CSV, keep only the ML Zoomcamp rows, and convert them to a list
# of per-row dicts.
df_ground_truth = pd.read_csv(ground_truth_url)
is_mlz_row = df_ground_truth["course"] == 'machine-learning-zoomcamp'
df_ground_truth = df_ground_truth[is_mlz_row]
ground_truth = df_ground_truth.to_dict(orient='records')

In [34]:
# Peek at one ground-truth record to see its keys.
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [37]:
def hit_rate_search_engine(
    ground_truth: list[dict],
    search_engine: VectorSearchEngine,
    num_results: int = 5,
) -> float:
    """Fraction of ground-truth questions whose expected document is retrieved.

    Each item's ``'question'`` is embedded with the module-level ``model`` and
    searched with ``search_engine``; the item counts as a hit when any returned
    document's ``'id'`` equals the item's ``'document'``.

    Args:
        ground_truth: Records with ``'question'`` and ``'document'`` keys.
        search_engine: Engine whose ``search`` returns documents with an
            ``'id'`` key.
        num_results: How many results to request per question (default 5).

    Returns:
        Hit rate in ``[0, 1]``; ``0.0`` when ``ground_truth`` is empty.
    """
    # Avoid a ZeroDivisionError when there is nothing to evaluate.
    if len(ground_truth) == 0:
        return 0.0

    hits = 0
    for item in tqdm(ground_truth):
        v_question = model.encode(item['question']).tolist()
        results = search_engine.search(v_question, num_results=num_results)
        # bool adds as 0/1, so this counts questions with at least one match.
        hits += any(item['document'] == result['id'] for result in results)
    return hits / len(ground_truth)

In [38]:
# Evaluate the in-memory engine; uses the function defined in this notebook
# (`hit_rate` itself is not defined here).
hit_rate_search_engine(ground_truth, search_engine)

100%|██████████| 1830/1830 [00:47<00:00, 38.45it/s]


0.9398907103825137

In [41]:
# Sanity check: the document list and the embedding matrix should have the same number of rows.
len(docs_mlz), X_mlz.shape

(375, (375, 768))

In [7]:
from elasticsearch import Elasticsearch

In [8]:
# Elasticsearch client pointed at http://localhost:9200.
# NOTE(review): assumes a local instance is reachable at that address.
es_client = Elasticsearch("http://localhost:9200")

In [15]:
# NOTE(review): identical to the earlier cell that defines `user_question`
# and `user_question_vector`; it re-encodes the same question.
user_question = "I just discovered the course. Can I still join it?"
user_question_vector = model.encode(user_question).tolist()

In [26]:
# kNN parameters: compare the query vector against the "qa_vector" field and
# ask for the 5 nearest neighbours.
# NOTE(review): other cells in this notebook use the field name "qt_vector";
# confirm "qa_vector" matches the mapping of the index queried with this.
knn_query = {
    "field": "qa_vector",
    "query_vector": user_question_vector,
    "k": 5,
    "num_candidates": 10000, 
}

In [27]:
# Run the kNN query against the "course-qa" index, returning only the listed
# source fields.
# NOTE(review): size=12 is larger than k=5 in `knn_query` — confirm which
# limit is intended.
response = es_client.search(
    index="course-qa",
    knn=knn_query,
    size=12,
    source=["question", "answer", "id"],
)

In [28]:
# Inspect the raw hits returned by the kNN search.
response["hits"]["hits"]

[{'_index': 'course-qa',
  '_id': 'zPvGHZEBaOxKYkmM4uBx',
  '_score': 0.7873192,
  '_source': {'question': 'Course - Can I still join the course after the start date?',
   'id': '7842b56a',
   'answer': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."}},
 {'_index': 'course-qa',
  '_id': '0fvGHZEBaOxKYkmM4uCM',
  '_score': 0.7583208,
  '_source': {'question': 'Course - Can I follow the course after it finishes?',
   'id': 'a482086d',
   'answer': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.'}},
 {'_index': 'course-qa',
  '_id': 'yvvGHZEBaOxKYkmM4uBY',
  '_score': 0.7561499,
  '_source':

In [66]:
def hit_rate_es(
    ground_truth: list[dict],
    es_client: Elasticsearch,
    index: "str | None" = None,
    num_results: int = 5,
) -> float:
    """Fraction of ground-truth questions whose expected document is retrieved
    by an Elasticsearch kNN search on the ``qt_vector`` field.

    Each item's ``'question'`` is embedded with the module-level ``model``; the
    item counts as a hit when any returned hit's ``_source['id']`` equals the
    item's ``'document'``.

    Args:
        ground_truth: Records with ``'question'`` and ``'document'`` keys.
        es_client: Elasticsearch client used to run the searches.
        index: Index to query. When ``None`` (default), the module-level
            ``index_name`` is used, as before.
        num_results: Value used for both kNN ``k`` and the response ``size``
            (default 5).

    Returns:
        Hit rate in ``[0, 1]``; ``0.0`` when ``ground_truth`` is empty.
    """
    # Avoid a ZeroDivisionError when there is nothing to evaluate.
    if len(ground_truth) == 0:
        return 0.0

    # Resolve the global only when no explicit index is given, so callers can
    # avoid depending on notebook cell execution order.
    target_index = index_name if index is None else index

    hits = 0
    for item in tqdm(ground_truth):
        v_question = model.encode(item['question']).tolist()
        knn_query = {
            "field": "qt_vector",
            "query_vector": v_question,
            "k": num_results,
            "num_candidates": 10000,
        }
        response = es_client.search(
            index=target_index,
            knn=knn_query,
            size=num_results,
            source=["id"],
        )
        results = response["hits"]["hits"]
        # bool adds as 0/1, so this counts questions with at least one match.
        hits += any(item['document'] == result['_source']['id'] for result in results)
    return hits / len(ground_truth)

In [67]:
# Hit rate of the Elasticsearch kNN retrieval over the ground-truth questions.
hit_rate_es(ground_truth, es_client)

100%|██████████| 1830/1830 [01:15<00:00, 24.15it/s]


0.9398907103825137

In [25]:
index_name = "questions"

# "text", "section" and "question" are mapped as `text`; "course" is mapped
# as `keyword` (it is used in a term filter later in the notebook).
text_fields = ("text", "section", "question")
field_mappings = {field: {"type": "text"} for field in text_fields}
field_mappings["course"] = {"type": "keyword"}

# One shard and zero replicas for the index.
index_settings = {
    "settings": dict(number_of_shards=1, number_of_replicas=0),
    "mappings": {"properties": field_mappings},
}

In [26]:
# Create the index with the settings and mappings from `index_settings`.
# NOTE(review): presumably fails if the index already exists — confirm before re-running this cell.
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'questions'})

In [27]:
# NOTE(review): `prepare_docs` is not defined in the visible part of this
# notebook — confirm where it comes from. This assignment also rebinds the
# `documents` name used by earlier cells.
documents = prepare_docs()
# Index the documents one request at a time into `index_name`.
for doc in tqdm(documents):
    es_client.index(index=index_name, body=doc)

100%|██████████| 948/948 [00:02<00:00, 470.29it/s]


In [28]:
query = "windows or mac"

# Full-text clause: match the query against "question" (boosted by ^4) and "text".
text_match = {
    "multi_match": {
        "query": query,
        "fields": ["question^4", "text"],
        "type": "best_fields",
    }
}
# Restrict results to a single course.
course_filter = {"term": {"course": "machine-learning-zoomcamp"}}

# Return the top 3 hits that satisfy both clauses.
es_search_query = {
    "size": 3,
    "query": {"bool": {"must": text_match, "filter": course_filter}},
}

In [29]:
# Run the keyword query defined in `es_search_query` against `index_name`.
results = es_client.search(index=index_name, body=es_search_query)

In [30]:
# Inspect the returned hits.
results["hits"]["hits"]

[{'_index': 'questions',
  '_id': 'QxAM-ZAB4ih9aEr8s23X',
  '_score': 23.824303,
  '_source': {'text': "Problem: I download kind from the next command:\ncurl.exe -Lo kind-windows-amd64.exe https://kind.sigs.k8s.io/dl/v0.17.0/kind-windows-amd64\nWhen I try\nkind --version\nI get: 'kind' is not recognized as an internal or external command, operable program or batch file\nSolution: The default name of executable is kind-windows-amd64.exe, so that you have to rename this file to  kind.exe. Put this file in specific folder, and add it to PATH\nAlejandro Aponte",
   'section': '10. Kubernetes and TensorFlow Serving',
   'question': "'kind' is not recognized as an internal or external command, operable program or batch file. (In Windows)",
   'course': 'machine-learning-zoomcamp'}},
 {'_index': 'questions',
  '_id': 'IxAM-ZAB4ih9aEr8s22d',
  '_score': 21.696842,
  '_source': {'text': 'Problem:\nWhile trying to build docker image in Section 9.5 with the command:\ndocker build -t clothing-mode

In [74]:
# One retrieved FAQ entry rendered as a question/answer pair.
context_template = "Q: {question}\nA: {text}"

# Prompt layout: instructions, then the user question, then the retrieved
# context. Placeholders: {question} and {context}.
prompt_template = "\n".join([
    "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.",
    "Use only the facts from the CONTEXT when answering the QUESTION.",
    "",
    "QUESTION: {question}",
    "",
    "CONTEXT:",
    "{context}",
])

In [79]:
# Render every retrieved hit as a "Q: ... / A: ..." snippet.
contexts = []
for hit in results["hits"]["hits"]:
    # The hit's _source dict supplies the template's placeholders.
    context = context_template.format_map(hit["_source"])
    contexts += [context]

In [80]:
# Fill the prompt with the user query and the retrieved snippets, separated by blank lines.
prompt = prompt_template.format(question=query, context="\n\n".join(contexts))

In [83]:
# NOTE(review): the saved output below shows a different QUESTION than the
# `query` defined in this notebook — the output looks stale; re-run to refresh.
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: How do I execute a command in a running docker container?

CONTEXT:
Q: How do I debug a docker container?
A: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.
docker run -it --entrypoint bash <image>
If the container is already running, execute a command in the specific container:
docker ps (find the container-id)
docker exec -it <container-id> bash
(Marcos MJD)

Q: How do I copy files from my local machine to docker container?
A: You can copy files from your local machine into a Docker container using the docker cp command. Here's how to do it:
To copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:
docker cp /path/to/local/file_or_directory container_id:

In [85]:
import tiktoken

In [86]:
# Tokenizer that tiktoken associates with the "gpt-4o" model name.
encoding = tiktoken.encoding_for_model("gpt-4o")

In [89]:
# Token ids for the prompt.
# NOTE(review): this displays the full id list; `len(encoding.encode(prompt))`
# would show just the token count.
encoding.encode(prompt)

[63842,
 261,
 4165,
 14029,
 29186,
 13,
 30985,
 290,
 150339,
 4122,
 402,
 290,
 31810,
 8099,
 591,
 290,
 40251,
 7862,
 558,
 8470,
 1606,
 290,
 19719,
 591,
 290,
 31810,
 8099,
 1261,
 55959,
 290,
 150339,
 364,
 107036,
 25,
 3253,
 621,
 357,
 15792,
 261,
 6348,
 306,
 261,
 6788,
 62275,
 9282,
 1715,
 10637,
 50738,
 734,
 48,
 25,
 3253,
 621,
 357,
 15199,
 261,
 62275,
 9282,
 3901,
 32,
 25,
 41281,
 290,
 9282,
 3621,
 306,
 25383,
 6766,
 326,
 151187,
 290,
 7251,
 4859,
 11,
 813,
 484,
 480,
 13217,
 261,
 38615,
 6348,
 558,
 68923,
 2461,
 533,
 278,
 2230,
 7962,
 4859,
 38615,
 464,
 3365,
 523,
 3335,
 290,
 9282,
 382,
 4279,
 6788,
 11,
 15792,
 261,
 6348,
 306,
 290,
 4857,
 9282,
 734,
 68923,
 10942,
 350,
 6555,
 290,
 9282,
 26240,
 446,
 68923,
 25398,
 533,
 278,
 464,
 6896,
 26240,
 29,
 38615,
 198,
 6103,
 277,
 10732,
 391,
 79771,
 1029,
 48,
 25,
 3253,
 621,
 357,
 5150,
 6291,
 591,
 922,
 2698,
 7342,
 316,
 62275,
 9282,
 3901,
 32,
 2

In [92]:
# Show the raw bytes behind a single token id.
encoding.decode_single_token_bytes(261)

b' a'