In [None]:
from sentence_transformers import SentenceTransformer
import requests
import numpy as np
from tqdm.auto import tqdm
from elasticsearch import Elasticsearch
import pandas as pd

# Q1: Getting the embeddings model
model_name = 'multi-qa-distilbert-cos-v1'
embedding_model = SentenceTransformer(model_name)

# Encode the sample user question once; the vector is reused as the query
# in the search steps below.
user_question = "I just discovered the course. Can I still join it?"
user_question_embedding = embedding_model.encode(user_question)
print("Q1: First value of the resulting vector:", user_question_embedding[0])

# Q2: Prepare the documents
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
# Surface a failed download as an explicit HTTP error rather than letting
# .json() choke on an error page.
docs_response.raise_for_status()
documents = docs_response.json()

# Filter for "machine-learning-zoomcamp"
filtered_documents = [doc for doc in documents if doc['course'] == 'machine-learning-zoomcamp']
print("Number of filtered documents:", len(filtered_documents))
print("filtered_documents[0] = ", filtered_documents[0])

# Q2: Creating the embeddings
# Each document is embedded as "<question> <text>"; embeddings[i] belongs to
# filtered_documents[i].
embeddings = []
for doc in filtered_documents:
    qa_text = f"{doc['question']} {doc['text']}"
    qa_embedding = embedding_model.encode(qa_text)
    embeddings.append(qa_embedding)

# Stack the per-document vectors into one matrix (one row per document).
X = np.array(embeddings)
print("Q2: Shape of X:", X.shape)

# Q3: Search
# Dot product of every document row with the query vector gives one score
# per document.
v = user_question_embedding
scores = X.dot(v)
print("Q3: Highest score in the results:", scores.max())

# Q4. Hit-rate for our search engine
class VectorSearchEngine:
    """Exhaustive dot-product search over precomputed document embeddings.

    ``embeddings`` must support ``.dot(query)`` and yield one score per
    document; the score at position ``i`` is attributed to ``documents[i]``.
    """

    def __init__(self, documents, embeddings):
        self.embeddings = embeddings
        self.documents = documents

    def search(self, v_query, num_results=10):
        """Return up to ``num_results`` documents, highest score first."""
        similarity = self.embeddings.dot(v_query)
        # argsort is ascending, so sort the negated scores to rank the
        # largest similarity first.
        ranking = np.argsort(-similarity)
        top_positions = ranking[:num_results]

        hits = []
        for position in top_positions:
            hits.append(self.documents[position])
        return hits

# In-memory engine over the filtered documents and their embedding matrix.
search_engine = VectorSearchEngine(filtered_documents, X)

# Top 5 matches for the sample user question vector.
results = search_engine.search(v, num_results=5)

# Ground-truth data: generated questions paired with the id of the document
# that should answer them.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
# Keep only the rows for the course whose documents were embedded.
df_ground_truth = df_ground_truth.loc[
    df_ground_truth['course'] == 'machine-learning-zoomcamp'
]
ground_truth = df_ground_truth.to_dict(orient='records')
print("ground_truth[0]  =  ", ground_truth[0])


def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of relevance flags for that query's ranked results.

    Returns:
        A float in [0, 1]. An empty ``relevance_total`` yields 0.0 instead
        of raising ``ZeroDivisionError``.
    """
    total = len(relevance_total)
    if total == 0:
        # No queries were evaluated, so there is nothing to hit.
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / total



def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For every query only the first relevant result counts: it contributes
    ``1 / rank`` (rank starting at 1). Queries with no relevant result
    contribute 0.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of relevance flags ordered by rank.

    Returns:
        A float in [0, 1]. An empty ``relevance_total`` yields 0.0 instead
        of raising ``ZeroDivisionError``.
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first hit so that several relevant results
                # for one query cannot push the score above 1.
                break

    return total_score / total



def evaluate(ground_truth, search_engine, num_results=5):
    """Compute hit-rate and MRR of a vector search engine on ground-truth data.

    Args:
        ground_truth: Iterable of dicts with a ``'question'`` string and the
            expected document id under ``'document'``.
        search_engine: Object exposing ``search(vector, num_results=...)``
            that returns a ranked list of dicts carrying an ``'id'`` key.
        num_results: Number of results requested from the engine per query.

    Returns:
        Dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        query_vector = embedding_model.encode(q['question'])
        # Honour the caller's num_results instead of a hard-coded 5.
        results = search_engine.search(query_vector, num_results=num_results)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

# Q4: score the in-memory vector search engine against the ground truth.
metrics = evaluate(ground_truth, search_engine, num_results=5)
print(f"Q4:  Hit-rate: {metrics['hit_rate']:.2f}")


# Q5: index the documents in Elasticsearch and run a kNN query

es_client = Elasticsearch('http://localhost:9200')

# Index definition: the plain document fields plus three dense-vector fields
# (question only, text only, question + text), each indexed for cosine
# similarity. dims is 768 for every vector field.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
            "text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

# Recreate the index from scratch so reruns do not accumulate duplicates.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

# Attach the three embeddings to every document (mutates the dicts in
# filtered_documents in place).
for doc in tqdm(filtered_documents):
    question = doc['question']
    text = doc['text']
    qt = question + ' ' + text

    doc['question_vector'] = embedding_model.encode(question)
    doc['text_vector'] = embedding_model.encode(text)
    doc['question_text_vector'] = embedding_model.encode(qt)

for doc in tqdm(filtered_documents):
    es_client.index(index=index_name, document=doc)

# Newly indexed documents only become searchable after a refresh; force one
# so the query below sees every document that was just written.
es_client.indices.refresh(index=index_name)

# kNN query on the combined question + text vector, restricted to the course.
knn = {
    "field": 'question_text_vector',
    "query_vector": user_question_embedding,
    "k": 5,
    "num_candidates": 10000,
    "filter": {
        "term": {
            "course": 'machine-learning-zoomcamp'
        }
    }
}

# Return only the plain fields; the stored vectors are not needed in the hits.
search_query = {
    "knn": knn,
    "_source": ["text", "section", "question", "course", "id"]
}

es_results = es_client.search(
    index=index_name,
    body=search_query
)

print("Q5: What's the ID of the document with the highest score? ", es_results['hits']['hits'][0]['_source']['id'])


# Q6


def elastic_search_knn(field, vector, course, num_results=5):
    """Run a course-filtered kNN query and return the matching documents.

    Args:
        field: Name of the dense-vector field to search on.
        vector: Query vector compared against ``field``.
        course: Value the ``course`` field of a hit must equal.
        num_results: Number of nearest neighbours to request (``k``).

    Returns:
        List with the ``_source`` of every hit, in the order Elasticsearch
        returned them.
    """
    course_filter = {"term": {"course": course}}
    request_body = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": num_results,
            "num_candidates": 10000,
            "filter": course_filter,
        },
        # Only the plain fields are returned; vectors are left out.
        "_source": ["text", "section", "question", "course", "id"],
    }

    response = es_client.search(index=index_name, body=request_body)

    return [hit['_source'] for hit in response['hits']['hits']]



def question_text_vector_knn(q, num_results=5):
    """Search the ``question_text_vector`` field for a ground-truth record.

    Args:
        q: Dict with the keys ``'question'`` and ``'course'``.
        num_results: Number of documents to retrieve.

    Returns:
        Whatever ``elastic_search_knn`` returns for the encoded question.
    """
    question_text, course_name = q['question'], q['course']
    query_vector = embedding_model.encode(question_text)

    return elastic_search_knn(
        'question_text_vector',
        query_vector,
        course_name,
        num_results=num_results,
    )



def evaluate(ground_truth, search_engine, num_results=5):
    """Compute hit-rate and MRR of a search callable on ground-truth data.

    Args:
        ground_truth: Iterable of dicts holding the expected document id
            under ``'document'``; each record is passed to ``search_engine``
            unchanged.
        search_engine: Callable invoked as ``search_engine(q, num_results=...)``
            that returns a ranked list of dicts carrying an ``'id'`` key.
        num_results: Number of results requested per query.

    Returns:
        Dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        # Honour the caller's num_results instead of a hard-coded 5.
        results = search_engine(q, num_results=num_results)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

# Q6: score the Elasticsearch kNN search (question + text vector) against
# the ground truth and report its hit-rate.
metrics = evaluate(ground_truth, question_text_vector_knn, num_results=5)
print(f"Q6: Hit-rate: {metrics['hit_rate']:.2f}")



Q1: First value of the resulting vector: 0.07822261
Number of filtered documents: 375
filtered_documents[0] =  {'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork', 'section': 'General course-related questions', 'question': 'How do I sign up?', 'course': 'machine-learning-zoomcamp', 'id': '0227b872'}
Q2: Shape of X: (375, 768)
Q3: Highest score in the results: 0.65065736
ground_truth[0]  =   {'question': 'Where can I sign up for the course?', 'course': 'machine-learning-zoomcamp', 'document': '0227b872'}


  0%|          | 0/1830 [00:00<?, ?it/s]