# Homework 03 Vector Search

## Q1. Getting the embeddings model

In [1]:
from sentence_transformers import SentenceTransformer, util
import requests
import numpy as np
from tqdm.notebook import tqdm

  from tqdm.autonotebook import tqdm, trange


In [2]:
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)

In [3]:
user_question = "I just discovered the course. Can I still join it?"
embedding_vector = embedding_model.encode([user_question])[0]
embedding_vector[0]

0.07822262

Sense check [passed]

In [4]:
token_ids = embedding_model.tokenize([user_question])["input_ids"]
token_ids

tensor([[ 101, 1045, 2074, 3603, 1996, 2607, 1012, 2064, 1045, 2145, 3693, 2009,
         1029,  102]])

In [5]:
token_ids

tensor([[ 101, 1045, 2074, 3603, 1996, 2607, 1012, 2064, 1045, 2145, 3693, 2009,
         1029,  102]])

In [6]:
embedding_model.tokenizer.convert_ids_to_tokens(token_ids[0].numpy())

['[CLS]',
 'i',
 'just',
 'discovered',
 'the',
 'course',
 '.',
 'can',
 'i',
 'still',
 'join',
 'it',
 '?',
 '[SEP]']

## Q2. Creating the embeddings

In [7]:
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [8]:
docs = [
    f"{doc['question']} {doc['text']}"
    for doc in documents
]
embeddings = embedding_model.encode(docs, show_progress_bar=True)

Batches:   0%|          | 0/30 [00:00<?, ?it/s]

The 1st dimension match with the number of documents, the second with the model output dimensionality

In [9]:
assert len(documents) == embeddings.shape[0]
assert embeddings.shape[1] == embedding_model.get_sentence_embedding_dimension()

In [10]:
embeddings.shape

(948, 768)

## Q3. Search

We're including a <question> to the doc in the

```python
qa_text = f'{question} {text}'
```

Such trick is useful for Elastic Search (when you expect the new question intersects with the original one by some words)
But it could hurt these cos-similarity models due to how they are trained/fine-tuned.

When you use sentence embeddings you expect the system to be smart and it doesn't rely on the same words, but "gets the meaning" instead.
When the question in the db and query are very similar, it can improve the similarity, but I don't expect this.

In [11]:
query = "How many people live in London?"
db = embedding_model.encode([
    "Around 9 Million people live in London",
    "London is known for its financial district"
])


util.dot_score(embedding_model.encode(query), db)[0].cpu().tolist()

[0.8703423142433167, 0.4522414803504944]

Including a re-formulated question to the answer decreases the score.

In [12]:
query = "How many people live in London?"
db = embedding_model.encode([
    "What is the population of London? Around 9 Million people live in London",
    "What is London known for? London is known for its financial district"
])


util.dot_score(embedding_model.encode(query), db)[0].cpu().tolist()

[0.8537825345993042, 0.4601653814315796]

Anyway, the reference solutions include the questions in the answers, so let's proceed.

In [13]:
scores = util.dot_score(embedding_model.encode(user_question), embeddings)[0].cpu().numpy()
scores.max()

0.65065736

## Q4. Hit-rate for our search engine

In [39]:
class VectorSearchEngine:
    def __init__(self, 
                 documents: list[dict],
                 model: SentenceTransformer) -> None:
        self.__document_ids = [doc["id"] for doc in documents]
        self.__model = model
        texts = [
            f"{doc['question']} {doc['text']}"
            for doc in documents
        ]
        self.__embeddings = model.encode(texts, show_progress_bar=True)

    @property
    def embeddings(self) -> np.ndarray:
        return self.__embeddings.cpu().numpy()

    def search(self, query, num_results: int=10):
        scores = util.dot_score(
            self.__model.encode(query), 
            self.__embeddings
        )[0].cpu().numpy()
        candidate_idx = np.argpartition(-scores, num_results)[:num_results]
        idx = candidate_idx[np.argsort(-scores[candidate_idx])]
        return [self.__document_ids[i] for i in idx]

In [40]:
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [41]:
def compute_hit_rate(ground_truth: list[dict],
                     search_engine: VectorSearchEngine,
                     num_results: int) -> float:
    questions = (doc["question"] for doc in ground_truth)
    y_pred = (
        search_engine.search(query=q, num_results=num_results)
        for q in questions
    )
    y_true = (doc["document"] for doc in ground_truth)
    hit_generator = tqdm(zip(y_pred, y_true), total=len(ground_truth))
    hits = [
        1.0 if correct in pred else 0.0
        for (pred, correct) in hit_generator
    ]
    return np.mean(hits)

In [42]:
search_engine = VectorSearchEngine(documents=documents, model=embedding_model)
hit_rate = compute_hit_rate(ground_truth=ground_truth, search_engine=search_engine, num_results=5)
hit_rate

Batches:   0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/1830 [00:00<?, ?it/s]

0.9218579234972678

## Q5. Indexing with Elasticsearch

In [34]:
!docker run -it \
    --rm \
    --name elasticsearch \
    -d \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


8f8dba2fb3fb084e737f01c37d32f4ca65bd5883ebb0e5ecaad7b1ed41dcb176


In [37]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

es_client.info()

ObjectApiResponse({'name': '8f8dba2fb3fb', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'sxcBZ5S0QH-kdf9MRUBWKw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [38]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} ,
            "text_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
        }
    }
}

In [84]:
index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [85]:
for doc in tqdm(documents):
    try:
        doc["text_vector"] = (
            embedding_model
            .encode(f"{doc['question']} {doc['text']}")
            .tolist()
        )
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

  0%|          | 0/948 [00:00<?, ?it/s]

In [90]:
assert (
    es_client
    .indices
    .stats(index=index_name)
    .body["_all"]["primaries"]["docs"]["count"]
    ) == len(documents)

In [114]:
class ESSearchEngine:
    def __init__(self,
                 es_client: Elasticsearch,
                 index_name: str) -> None:
        if not es_client.indices.exists(index=index_name).body:
            raise KeyError("Index must be filled up externally")
        self.__es_client = es_client
        self.__index_name = index_name
        # hardcoded
        self.__num_candidates = 10_000

    def search(self, query_vector: np.ndarray, num_results: int=10):
        knn_query = {
            "field": "text_vector",
            "query_vector": query_vector,
            "k": num_results,
            "num_candidates": self.__num_candidates, 
        }
        response = self.__es_client.search(
            index=index_name,
            # Additional filters, i.e. section
            query=None,
            knn=knn_query,
            size=num_results,
        )
        return [hit["_source"]["id"] for hit in response["hits"]["hits"]]

In [115]:
es_search_engine = ESSearchEngine(
    es_client=es_client,
    index_name=index_name,
)

In [116]:
es_search_engine.search(embedding_model.encode(user_question), num_results=1)

['ee58a693']

In [117]:
[doc for doc in documents if doc["id"] == "ee58a693"]

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693',
  'text_vector': [0.08062857389450073,
   -0.06663881987333298,
   0.02527313306927681,
   -0.013004389591515064,
   0.07587283104658127,
   -0.059462644159793854,
   -0.02188391610980034,
   0.0029000581707805395,
   0.0007929443963803351,
   -0.005222355481237173,
   -0.03365180268883705,
   -0.027913551777601242,
   0.058116547763347626,
   0.03974829614162445,
   0.05441856011748314,
   -0.0382516048848629,
   0.0630574300885

## Q6. Hit-rate for Elasticsearch

We using the same dataset with the same embedding model.

I expect almost the same result.
The difference could be explained by the NN/ANN algorithm used inside the ES.

Since I used the fair brute-force, the ES hit rate can't be better than custom solution,
but it could be faster on large indexes.

In [122]:
def compute_hit_rate(ground_truth: list[dict],
                     search_engine: VectorSearchEngine,
                     embedding_model: SentenceTransformer,
                     num_results: int) -> float:
    questions = (doc["question"] for doc in ground_truth)
    y_pred = (
        search_engine.search(
            query_vector=embedding_model.encode(q), 
            num_results=num_results
        )
        for q in questions
    )
    y_true = (doc["document"] for doc in ground_truth)
    hit_generator = tqdm(zip(y_pred, y_true), total=len(ground_truth))
    hits = [
        1.0 if correct in pred else 0.0
        for (pred, correct) in hit_generator
    ]
    return np.mean(hits)

In [123]:
hit_rate = compute_hit_rate(
    ground_truth=ground_truth,
    search_engine=es_search_engine,
    embedding_model=embedding_model,
    num_results=5
)
hit_rate

  0%|          | 0/1830 [00:00<?, ?it/s]

0.9218579234972678