# 1

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
model_name = "multi-qa-distilbert-cos-v1"

In [None]:
embedding_model = SentenceTransformer(model_name)

In [None]:
user_question = "I just discovered the course. Can I still join it?"

In [None]:
uq_emb = embedding_model.encode(user_question)

In [None]:
uq_emb[0]

# Preparation

In [None]:
import requests

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'

# Download the documents file. The timeout keeps the cell from hanging
# forever if the host does not answer.
docs_response = requests.get(docs_url, timeout=30)
# Fail with the HTTP status (404, 5xx, ...) instead of a confusing JSON
# decoding error when the server answers with an error page.
docs_response.raise_for_status()
documents = docs_response.json()

# 2

In [None]:
import numpy as np

In [None]:
documents[0]

In [None]:
# Embed "<question> <text>" for every document.
# The per-document vectors are collected in a separate list (`embs`) so that
# `doc_embs` is only ever bound to the final array; the original code appended
# to `doc_embs` and then referenced an undefined `embs`, raising NameError.
embs = []
for doc in documents:
    question = doc['question']
    text = doc['text']
    qa_text = f'{question} {text}'
    emb = embedding_model.encode(qa_text)
    embs.append(emb)

# Stack the embeddings into a single array, in the same order as `documents`.
doc_embs = np.array(embs)

In [None]:
doc_embs.shape

# 3

In [None]:
# Dot product of each document embedding with the user-question embedding.
# NOTE(review): this is a cosine similarity only if the model returns
# unit-length vectors — confirm for the chosen model.
scores = doc_embs.dot(uq_emb)

In [None]:
np.max(scores)

# Vector search

In [None]:
class VectorSearchEngine:
    """Exhaustive vector search over precomputed document embeddings.

    Documents are ranked by the dot product between their embedding and the
    query vector; the highest-scoring documents come first.
    """

    def __init__(self, documents, embeddings):
        """Keep references to the corpus and its embeddings.

        Args:
            documents: Indexable collection of documents; ``documents[i]`` is
                returned for the i-th score.
            embeddings: Object exposing ``.dot(v_query)`` that yields one
                score per document (e.g. a NumPy array).
        """
        self.embeddings = embeddings
        self.documents = documents

    def search(self, v_query, num_results=10):
        """Return at most ``num_results`` documents, best match first.

        Args:
            v_query: Query vector passed to ``embeddings.dot``.
            num_results: Maximum number of documents to return.

        Returns:
            List of entries from ``documents`` ordered by descending score.
        """
        similarity = self.embeddings.dot(v_query)
        # argsort is ascending, so sort the negated scores to get the
        # largest similarity first.
        ranking = np.argsort(-similarity)
        top_hits = []
        for position in ranking[:num_results]:
            top_hits.append(self.documents[position])
        return top_hits

In [None]:
search_engine = VectorSearchEngine(documents=documents, embeddings=doc_embs)

In [None]:
search_engine.search(uq_emb, num_results=5)

# 4

In [None]:
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

# Read the ground-truth CSV and keep only the rows whose `course` column is
# 'machine-learning-zoomcamp'.
df_ground_truth = (
    pd.read_csv(ground_truth_url)
    .loc[lambda frame: frame['course'] == 'machine-learning-zoomcamp']
)
# One dict per remaining row, keyed by column name.
ground_truth = df_ground_truth.to_dict(orient='records')

In [None]:
ground_truth[:3]

In [None]:
def hit_rate(y_true, y_preds):
    """Return 1 if ``y_true`` is contained in ``y_preds``, otherwise 0."""
    if y_true in y_preds:
        return 1
    return 0

In [None]:
# Hit rate of the in-memory search engine with 5 results per question:
# a query scores 1 when the expected document id is among the returned ids.
hrs = []
for doc in ground_truth:
    q_emb = embedding_model.encode(doc['question'])
    res = search_engine.search(q_emb, num_results=5)
    retrieved_ids = [d['id'] for d in res]
    hr = hit_rate(doc['document'], retrieved_ids)
    hrs.append(hr)

In [None]:
np.mean(hrs)

In [None]:
# Same evaluation as before, but only the single best result is kept:
# a query scores 1 only when the top hit is the expected document.
hrs = []
for doc in ground_truth:
    q_emb = embedding_model.encode(doc['question'])
    res = search_engine.search(q_emb, num_results=1)
    retrieved_ids = [d['id'] for d in res]
    hr = hit_rate(doc['document'], retrieved_ids)
    hrs.append(hr)

In [None]:
np.mean(hrs)

# 5

In [None]:
from elasticsearch import Elasticsearch
# Client for the Elasticsearch node expected at http://localhost:9200;
# the cells below need a node reachable at that address.
es_client = Elasticsearch('http://localhost:9200')

In [None]:
# The three embedding fields share the same dense_vector mapping.
# NOTE(review): `dims` has to equal the length of the vectors that get
# indexed — confirm it matches the embedding model's output size.
dense_vector_mapping = {
    "type": "dense_vector",
    "dims": 768,
    "index": True,
    "similarity": "cosine",
}

# Index definition: a single shard without replicas, plus the field mappings
# for the document attributes and their embeddings.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            # Each vector field gets its own copy so the dicts are not shared.
            "text_vector": dict(dense_vector_mapping),
            "section": {"type": "keyword"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "text"},
            "question_vector": dict(dense_vector_mapping),
            "question_text_vector": dict(dense_vector_mapping),
        },
    },
}

In [None]:
index_name = "course-questions"

In [None]:
# Remove any existing index with this name so the notebook can be re-run;
# ignore_unavailable=True is passed so that a missing index is tolerated.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

In [None]:
# Create the index from the settings and mappings held in index_settings.
es_client.indices.create(index=index_name, body=index_settings)

In [None]:
# Attach three embeddings to every document using the pre-trained model:
# one for the question, one for the answer text, and one for both combined.
# Note: the vectors are written into the dicts of `documents` in place, so
# `operations` holds the very same (now enriched) objects.
operations = []
for doc in documents:
    question, text = doc["question"], doc["text"]
    doc["question_vector"] = embedding_model.encode(question).tolist()
    doc["text_vector"] = embedding_model.encode(text).tolist()
    doc["question_text_vector"] = embedding_model.encode(f"{question} {text}").tolist()
    operations.append(doc)

In [None]:
# Index the documents one by one. Indexing stays best-effort (a failing
# document does not abort the loop), but every failure is recorded with the
# document id so that missing documents can be traced afterwards.
failed_ids = []
for doc in operations:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        failed_ids.append(doc.get("id"))
        print(f"Failed to index document {doc.get('id')!r}: {e}")

# Newly indexed documents only become searchable after a refresh; force one
# so the search cells that follow see the whole corpus on "Run All".
es_client.indices.refresh(index=index_name)

if failed_ids:
    print(f"{len(failed_ids)} of {len(operations)} documents were not indexed")

In [None]:
# kNN request against the combined question+text embedding field, using the
# embedding of the user question as the query vector.
query = dict(
    field="question_text_vector",
    query_vector=uq_emb,
    k=5,
    num_candidates=5,
)

In [None]:
# Run the kNN query against the index; `source` lists the document fields
# requested for each hit.
res = es_client.search(
    index=index_name,
    knn=query,
    source=["text", "section", "question", "course", "id"]
)

In [None]:
res['hits']['hits'][0]['_source']["id"]

In [None]:
# Hit rate of Elasticsearch kNN search on the `question_vector` field:
# a query scores 1 when the expected document id is among the returned hits.
hrs = []
for doc in ground_truth:
    q_emb = embedding_model.encode(doc['question'])

    query = dict(
        field="question_vector",
        query_vector=q_emb,
        k=5,
        num_candidates=5,
    )
    res = es_client.search(index=index_name, knn=query, source=["id"])

    hits = res['hits']['hits']
    hr = hit_rate(doc['document'], [d['_source']['id'] for d in hits])
    hrs.append(hr)

np.mean(hrs)

In [None]:
# Hit rate of Elasticsearch kNN search on the `text_vector` field:
# a query scores 1 when the expected document id is among the returned hits.
hrs = []
for doc in ground_truth:
    q_emb = embedding_model.encode(doc['question'])

    query = dict(
        field="text_vector",
        query_vector=q_emb,
        k=5,
        num_candidates=5,
    )
    res = es_client.search(index=index_name, knn=query, source=["id"])

    hits = res['hits']['hits']
    hr = hit_rate(doc['document'], [d['_source']['id'] for d in hits])
    hrs.append(hr)

np.mean(hrs)

In [None]:
# Hit rate of Elasticsearch kNN search on the `question_text_vector` field,
# with num_candidates equal to k (5).
hrs = []
for doc in ground_truth:
    q_emb = embedding_model.encode(doc['question'])

    query = dict(
        field="question_text_vector",
        query_vector=q_emb,
        k=5,
        num_candidates=5,
    )
    res = es_client.search(index=index_name, knn=query, source=["id"])

    hits = res['hits']['hits']
    hr = hit_rate(doc['document'], [d['_source']['id'] for d in hits])
    hrs.append(hr)

np.mean(hrs)

In [None]:
# Hit rate of Elasticsearch kNN search on the `question_text_vector` field,
# this time with num_candidates raised to 1000 while k stays at 5.
hrs = []
for doc in ground_truth:
    q_emb = embedding_model.encode(doc['question'])

    query = dict(
        field="question_text_vector",
        query_vector=q_emb,
        k=5,
        num_candidates=1000,
    )
    res = es_client.search(index=index_name, knn=query, source=["id"])

    hits = res['hits']['hits']
    hr = hit_rate(doc['document'], [d['_source']['id'] for d in hits])
    hrs.append(hr)

In [None]:
np.mean(hrs)