## Evaluate a vector retrieval feature against question and answer pair.

The goal is to avaluate the concatination of question and answer, the same metrics are used. See the [Evaluate a text retrieval](03_evaluate_text_retrieval.ipynb) file.

In [25]:
from services import *
from common.settings import Settings
from common.client_factory import ClientFactory
from services.retrieval_service import RetrievalService
from services.reciprocal_rank_fusion_service import ReciprocalRankFusionService
from common.sentence_transformer_model_factory import SentenceTransformerModelFactory
import pandas as pd
from pandas import DataFrame
from sentence_transformers import SentenceTransformer
import json
import os
from typing import Any, Dict, List
from pprint import pprint
from tqdm import tqdm

import importlib
import retrieval_evaluation_utils as ev_utils

In [26]:
# Reload the module
importlib.reload(ev_utils)

<module 'retrieval_evaluation_utils' from '/home/jovyan/work/notebook/utils/retrieval_evaluation_utils.py'>

In [27]:
doc_with_ids_path = "/home/jovyan/work/notebook/retrieval_evaluation/dataset_with_doc_ids.csv"
ground_truth_path = "/home/jovyan/work/notebook/retrieval_evaluation/ground_truth.csv"
evaluation_results_path = "/home/jovyan/work/notebook/retrieval_evaluation/evaluation_results.csv"
test_name = "vector_question_answer"

models = [
    "distiluse-base-multilingual-cased-v1",
    "deepset/gbert-base"
]

### Evaluate the vector retrieval feature using MRR and HR@k.

#### Load the dataset and ground truth

In [28]:
dataset_df: DataFrame

if os.path.exists(doc_with_ids_path):
    dataset_df = pd.read_csv(doc_with_ids_path, delimiter=";")
else:
    columns = ['source_system', 'category', 'question', 'document_id']
    dataset_df = pd.DataFrame(columns=columns)

dataset_df[:2]

Unnamed: 0,source_system,category,question,answer,document_id
0,evdi,Analysekonzept,Wie läuft der Analyseprozess für Immobilienpro...,Der Analyseprozess bei Engel & Völkers Digital...,f2624f5125f9
1,evdi,Analysekonzept,Wie werden die Anlageprojekte bewertet und wie...,Die Bewertung der Anlageprojekte bei Engel & V...,14e6c2e22916


In [29]:
groud_truth_df: DataFrame
if os.path.exists(ground_truth_path):
    groud_truth_df = pd.read_csv(ground_truth_path, delimiter=";")
else:
    columns = ['source_system', 'category', 'question', 'document_id']
    groud_truth_df = pd.DataFrame(columns=columns)

groud_truth_df[:2]

Unnamed: 0,source_system,category,question,document_id
0,evdi,Analysekonzept,Was sind die wichtigsten Schritte des Analysep...,f2624f5125f9
1,evdi,Analysekonzept,Welche externen Partner sind an der Analyse vo...,f2624f5125f9


In [30]:
settings = Settings()
settings.index_name = test_name
client_factory = ClientFactory(settings)
es_client = client_factory.create_elasticsearch_client()
es_client.info()

ObjectApiResponse({'name': 'a639f0e3a42f', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'aJrw85pRTECvuVEC8ISEHg', 'version': {'number': '8.9.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '8aa461beb06aa0417a231c345a1b8c38fb498a0d', 'build_date': '2023-07-19T14:43:58.555259655Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

#### Define functions to index questions and answers

In [31]:
def create_embedding_model() -> SentenceTransformer:
    model_factory = SentenceTransformerModelFactory(settings)
    embedding_model = model_factory.create_model()
    return embedding_model

In [32]:
def get_dimensions(embedding_model: SentenceTransformer) -> int:
    first_dataset_item = dataset_df.iloc[0]
    question = first_dataset_item["question"]
    vector_value = embedding_model.encode(question)
    dimensions = len(vector_value)
    return dimensions

In [33]:
def index_dataset(dataset_df: DataFrame, embedding_model: SentenceTransformer, dimensions: int) -> None:
    index_settings = dict(
        settings=dict(
            number_of_shards=1,
            number_of_replicas=0,
        ),
        mappings=dict(
            properties=dict(
                answer=dict(type="text"),
                question=dict(type="text"),
                category=dict(type="text"),
                document_id=dict(type="text"),
                answer_instructions=dict(type="text"),
                source_system=dict(type="keyword"),
                vector_question_answer=dict(
                    type="dense_vector",
                    dims=dimensions,
                    index=True,
                    similarity="cosine",
                ),
            ),
        ),
    )

    if es_client.indices.exists(index=test_name):
        es_client.indices.delete(index=test_name)

    es_client.indices.create(index=test_name, body=index_settings)

    for idx, row in tqdm(dataset_df.iterrows(), total=dataset_df.shape[0]):
        document_to_index = row.to_dict()
        question_answer = (document_to_index["question"] + " " + document_to_index["answer"])
        vector = embedding_model.encode(question_answer)
        document_to_index["vector_question_answer"] = vector

        es_client.index(index=test_name, document=document_to_index)

In [34]:
def calculate_relevance(groud_truth_df: DataFrame, retrieval_service: RetrievalService) -> List[List[bool]]:
    relevance_total: List[List[bool]] = []

    for idx, row in tqdm(groud_truth_df.iterrows(), total=groud_truth_df.shape[0]):
        doc = row.to_dict()
        retrieval_result = retrieval_service.search(doc["question"], 5)
        vector_result = retrieval_result.vector_result_items
    
        relevance: List[bool] = []
        for item in vector_result:
            relevance.append(item.document_id == doc["document_id"])
    
        relevance_total.append(relevance)

    return relevance_total

In [35]:
def calculate_mrr(dataset: List[List[bool]]):
    total_score = 0.0    

    for row in dataset:
        for idx, value in enumerate(row):
            if value is True:
                total_score += 1 / (idx + 1)
                break

    result = total_score / len(dataset)
    return result

In [36]:
def calculate_hit_rate(dataset: List[List[bool]]) -> float:
    total_score = 0.0    

    for row in dataset:
        if True in row:
            total_score += 1
            

    result = total_score / len(dataset)
    return result

#### Save the results

In [37]:
def save_the_result(source_system, calculated_mrr: float, calculated_hit_rate: float, model_name: str, description: str):
    df = pd.DataFrame({
        "source_system": [source_system, source_system],
        "method": [test_name, test_name],
        "metric": ["mrr", "HR@K5"],
        "value": [calculated_mrr, calculated_hit_rate],
        "model": [model_name, model_name],
        "description": [description, description]
    })

    ev_utils.add_evaluation_results(df, evaluation_results_path)

In [38]:
#### Iterate over all models

for model in tqdm(models):
    embedding_model = create_embedding_model()
    retrieval_service = RetrievalService(es_client, embedding_model, settings)

    dimensions = get_dimensions(embedding_model)
    index_dataset(dataset_df, embedding_model, dimensions)

    relevant_items = calculate_relevance(groud_truth_df, retrieval_service)
    print(f"The first relevance items '{relevant_items[:10]}' for the model {model}")

    calculated_hit_rate = calculate_hit_rate(relevant_items)
    print(f"Hit Rate value: {calculated_hit_rate} for the model '{model}'.")

    calculated_mrr = calculate_mrr(relevant_items)
    print(f"MRR value: {calculated_mrr} for the model '{model}'.")

    save_the_result(dataset_df.iloc[0]["source_system"], calculated_mrr, calculated_hit_rate, model, "vector evaluation against question/answer pair")


100%|██████████| 87/87 [00:39<00:00,  2.20it/s]
100%|██████████| 435/435 [00:49<00:00,  8.75it/s]
 50%|█████     | 1/2 [01:59<01:59, 119.68s/it]

The first relevance items '[[True, False, False, False, False], [True, False, False, False, False], [True, False, False, False, False], [True, False, False, False, False], [True, False, False, False, False], [False, True, False, False, False], [True, False, False, False, False], [True, False, False, False, False], [False, True, False, False, False], [True, False, False, False, False]]' for the model distiluse-base-multilingual-cased-v1
Hit Rate value: 0.7839080459770115 for the model 'distiluse-base-multilingual-cased-v1'.
MRR value: 0.7839080459770115 for the model 'distiluse-base-multilingual-cased-v1'.


100%|██████████| 87/87 [00:14<00:00,  5.90it/s]
100%|██████████| 435/435 [00:39<00:00, 11.07it/s]
100%|██████████| 2/2 [02:56<00:00, 88.35s/it] 

The first relevance items '[[True, False, False, False, False], [True, False, False, False, False], [True, False, False, False, False], [True, False, False, False, False], [True, False, False, False, False], [False, True, False, False, False], [True, False, False, False, False], [True, False, False, False, False], [False, True, False, False, False], [True, False, False, False, False]]' for the model deepset/gbert-base
Hit Rate value: 0.7839080459770115 for the model 'deepset/gbert-base'.
MRR value: 0.7839080459770115 for the model 'deepset/gbert-base'.





#### Clean up Elasticsearch index

In [39]:
if es_client.indices.exists(index=test_name):
    es_client.indices.delete(index=test_name)