## Evaluate a vector retrieval feature against question and answer pairs

The goal is to evaluate the concatenation of question and answer; the same metrics are used as in the text retrieval evaluation. See the [Evaluate a text retrieval](03_evaluate_text_retrieval.ipynb) file.

In [118]:
from services import *
from common.settings import Settings
from common.client_factory import ClientFactory
from services.retrieval_service import RetrievalService
from services.reciprocal_rank_fusion_service import ReciprocalRankFusionService
from common.sentence_transformer_model_factory import SentenceTransformerModelFactory
import pandas as pd
from pandas import DataFrame
import json
import os
from typing import Any, Dict, List
from pprint import pprint
from tqdm import tqdm

import importlib
import retrieval_evaluation_utils as ev_utils
import retrieval_evaluation_ranking_utils as ranking_utils

In [119]:
# Reload both local helper modules so that edits made to them on disk are
# picked up without restarting the kernel.
importlib.reload(ev_utils)
importlib.reload(ranking_utils)

<module 'retrieval_evaluation_ranking_utils' from '/home/jovyan/work/notebook/utils/retrieval_evaluation_ranking_utils.py'>

In [120]:
# Input and output files of this evaluation run.
# NOTE(review): these are absolute paths tied to the /home/jovyan/work layout.
doc_with_ids_path = "/home/jovyan/work/notebook/retrieval_evaluation/dataset_with_doc_ids.csv"
ground_truth_path = "/home/jovyan/work/notebook/retrieval_evaluation/ground_truth.csv"
evaluation_results_path = "/home/jovyan/work/notebook/retrieval_evaluation/evaluation_results.csv"
# Used as the Elasticsearch index name and as the "method" label of the saved results.
test_name = "vector_question_answer"
# Assigned to the settings as the embedding model name.
model_name = "distiluse-base-multilingual-cased-v1"

### Evaluate the vector retrieval feature using MRR and HR@k.

#### Load the dataset and ground truth

In [121]:
# Documents to index: one row per question/answer pair with its document id.
dataset_df: DataFrame

if os.path.exists(doc_with_ids_path):
    dataset_df = pd.read_csv(doc_with_ids_path, delimiter=";")
else:
    # Fallback: an empty frame with the same schema as the CSV. The `answer`
    # column is part of that schema because the indexing step concatenates
    # question and answer.
    columns = ['source_system', 'category', 'question', 'answer', 'document_id']
    dataset_df = pd.DataFrame(columns=columns)

# Preview of the first two rows.
dataset_df.head(2)

Unnamed: 0,source_system,category,question,answer,document_id
0,evdi,Analysekonzept,Wie läuft der Analyseprozess für Immobilienpro...,Der Analyseprozess bei Engel & Völkers Digital...,f2624f5125f9
1,evdi,Analysekonzept,Wie werden die Anlageprojekte bewertet und wie...,Die Bewertung der Anlageprojekte bei Engel & V...,14e6c2e22916


In [122]:
# Ground truth: evaluation questions together with the id of the document
# that the retrieval is expected to return for them.
groud_truth_df: DataFrame
if not os.path.exists(ground_truth_path):
    # No ground-truth file available: continue with an empty frame.
    columns = ['source_system', 'category', 'question', 'document_id']
    groud_truth_df = pd.DataFrame(columns=columns)
else:
    groud_truth_df = pd.read_csv(ground_truth_path, sep=";")

# Preview of the first two rows.
groud_truth_df.head(2)

Unnamed: 0,source_system,category,question,document_id
0,evdi,Analysekonzept,Was sind die wichtigsten Schritte des Analysep...,f2624f5125f9
1,evdi,Analysekonzept,Welche externen Partner sind an der Analyse vo...,f2624f5125f9


In [123]:
# Configure this run: the index name and the embedding model name are taken
# from the notebook-level `test_name` and `model_name` values.
settings = Settings()
settings.index_name = test_name
settings.embedding_model_name = model_name
client_factory = ClientFactory(settings)
es_client = client_factory.create_elasticsearch_client()
# Last expression of the cell: shows the cluster information returned by the server.
es_client.info()

ObjectApiResponse({'name': 'a639f0e3a42f', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'aJrw85pRTECvuVEC8ISEHg', 'version': {'number': '8.9.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '8aa461beb06aa0417a231c345a1b8c38fb498a0d', 'build_date': '2023-07-19T14:43:58.555259655Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

#### Index questions and answers

In [124]:
# Create the embedding model through the project factory and pass it, together
# with the Elasticsearch client and the settings, to the retrieval service
# whose `search` method is evaluated below.
model_factory = SentenceTransformerModelFactory(settings)
embedding_model = model_factory.create_model()
retrieval_service = RetrievalService(es_client, embedding_model, settings)



In [125]:
# Encode one sample question to find out the embedding length; `dimensions`
# becomes the `dims` value of the dense_vector mapping of the index.
# NOTE(review): `dataset_df.iloc[0]` raises IndexError when the dataset is empty.
first_dataset_item = dataset_df.iloc[0]
question = first_dataset_item["question"]
vector_value = embedding_model.encode(question)
dimensions = len(vector_value)

In [126]:
# Index definition: one shard, no replicas, the document fields of the dataset
# and a dense vector holding the embedding of "question + answer".
index_settings = dict(
    settings=dict(
        number_of_shards=1,
        number_of_replicas=0,
    ),
    mappings=dict(
        properties=dict(
            answer=dict(type="text"),
            question=dict(type="text"),
            category=dict(type="text"),
            document_id=dict(type="text"),
            answer_instructions=dict(type="text"),
            source_system=dict(type="keyword"),
            vector_question_answer=dict(
                type="dense_vector",
                dims=dimensions,
                index=True,
                similarity="cosine",
            ),
        ),
    ),
)

# Start from a clean index so that repeated runs do not accumulate documents.
if es_client.indices.exists(index=test_name):
    es_client.indices.delete(index=test_name)

# Settings and mappings are passed as keyword arguments; the catch-all `body`
# parameter is deprecated in the 8.x Python client.
es_client.indices.create(
    index=test_name,
    settings=index_settings["settings"],
    mappings=index_settings["mappings"],
)

for idx, row in tqdm(dataset_df.iterrows(), total=dataset_df.shape[0]):
    document_to_index = row.to_dict()
    # The embedded text is the question and the answer joined by a single space.
    question_answer = document_to_index["question"] + " " + document_to_index["answer"]
    vector = embedding_model.encode(question_answer)
    document_to_index["vector_question_answer"] = vector

    es_client.index(index=test_name, document=document_to_index)

# Indexed documents only become searchable after a refresh. Refresh explicitly
# so the evaluation that follows sees all documents regardless of timing.
es_client.indices.refresh(index=test_name)

100%|██████████| 87/87 [00:23<00:00,  3.63it/s]


In [127]:
# One inner list per ground-truth question; each entry tells whether the hit
# at that rank has the expected document id.
relevance_total: List[List[bool]] = []

In [128]:
# Number of hits requested per query; the hit rate over the full result lists
# is therefore a hit rate at k=5.
top_k = 5

# Rebuild the list from scratch so that re-running this cell does not append a
# second copy of the results.
relevance_total: List[List[bool]] = []

for idx, row in tqdm(groud_truth_df.iterrows(), total=groud_truth_df.shape[0]):
    doc = row.to_dict()
    retrieval_result = retrieval_service.search(doc["question"], top_k)
    vector_result = retrieval_result.vector_result_items

    # A hit is relevant when it carries the document id expected for the question.
    relevance: List[bool] = [
        item.document_id == doc["document_id"] for item in vector_result
    ]
    relevance_total.append(relevance)

100%|██████████| 435/435 [00:31<00:00, 13.77it/s]


In [129]:
# Spot-check the relevance flags of the first 16 queries.
pprint(relevance_total[:16])

[[True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False]]


In [130]:
def calculate_mrr(dataset: List[List[bool]]) -> float:
    """Compute the Mean Reciprocal Rank (MRR) of ranked relevance flags.

    Args:
        dataset: One list per query; element ``i`` is truthy when the result
            at rank ``i + 1`` is relevant.

    Returns:
        The mean over all queries of ``1 / rank`` of the first relevant
        result. A query without a relevant result contributes 0. An empty
        dataset yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    if len(dataset) == 0:
        return 0.0

    total_score = 0.0
    for row in dataset:
        # Truthiness rather than `is True`: flags produced by pandas/numpy
        # comparisons are numpy.bool_ objects, which are never identical to
        # the Python singleton True and would otherwise be skipped.
        first_hit_rank = next(
            (rank for rank, is_relevant in enumerate(row, start=1) if is_relevant),
            None,
        )
        if first_hit_rank is not None:
            total_score += 1 / first_hit_rank

    return total_score / len(dataset)

In [131]:
def calculate_hit_rate(dataset: List[List[bool]]) -> float:
    """Compute the hit rate: the share of queries with at least one relevant result.

    Args:
        dataset: One list of relevance flags per query.

    Returns:
        The number of lists containing at least one truthy flag divided by
        the number of lists. An empty dataset yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    if len(dataset) == 0:
        return 0.0

    # Truthiness test, the same notion of "relevant" that the MRR uses.
    hit_count = sum(1 for row in dataset if any(row))
    return hit_count / len(dataset)

In [132]:
# Hit rate over the complete result lists (5 hits are requested per query);
# it is stored as "HR@K5" in the results.
calculated_hit_rate_at_k5 = calculate_hit_rate(relevance_total)
print(f"Hit Rate value: {calculated_hit_rate_at_k5}")

Hit Rate value: 0.7839080459770115


In [133]:
# Hit rate at k=3 from the shared ranking helper; presumably only the first 3
# entries of each list are considered — see `ranking_utils` to confirm.
calculated_hit_rate_at_k3 = ranking_utils.calculate_hit_rate_at_k(relevance_total, 3)
print(f"Hit Rate at k3 value: {calculated_hit_rate_at_k3}")

Hit Rate at k3 value: 0.7149425287356321


In [134]:
# Mean reciprocal rank over all evaluated ground-truth questions.
calculated_mrr = calculate_mrr(relevance_total)
print(f"MRR value: {calculated_mrr}")

MRR value: 0.6240613026819922


#### Save the results

In [135]:
# Metric label -> computed value; each entry becomes one row of the results.
metric_values = {
    "mrr": calculated_mrr,
    "HR@K5": calculated_hit_rate_at_k5,
    "HR@K3": calculated_hit_rate_at_k3,
}
row_count = len(metric_values)

# Every column except metric/value holds the same value in all rows.
df = pd.DataFrame({
    "source_system": [first_dataset_item["source_system"]] * row_count,
    "method": [test_name] * row_count,
    "metric": list(metric_values.keys()),
    "value": list(metric_values.values()),
    "model": [settings.embedding_model_name] * row_count,
    "description": ["vector evaluation against question/answer pair"] * row_count,
})

# Hand the rows to the shared helper together with the results file path.
ev_utils.add_evaluation_results(df, evaluation_results_path)

#### Clean up Elasticsearch index

In [136]:
# Remove the evaluation index created above, if it still exists.
if es_client.indices.exists(index=test_name):
    es_client.indices.delete(index=test_name)