# What

As originally worked on in #146, the vector database will be used a lot moving forward.

This notebook will be an easy way to evaluate the performance of the vector database.

In the two other notebooks, `#165_basic_safety_issue_rag.ipynb` and `vector_db_exploration.ipynb`, I developed some of this evaluation code.

In [None]:
import voyageai.client
import yaml
from typing import Callable

import pandas as pd
import numpy as np

import lancedb
import voyageai
import importlib
import dotenv

import engine.analyze.Embedding as Embedding

# Load variables from a local .env file into the process environment
# (presumably this supplies the Voyage API key — confirm).
dotenv.load_dotenv()

# Voyage AI client, constructed with no explicit arguments.
vo = voyageai.Client()

# Test dataset

Inside `evaluation_searches.yaml` there is a collection of searches used to test the retrieval performance.

In [None]:
# Load the evaluation searches. The context manager makes sure the file
# handle is closed as soon as the YAML has been parsed.
with open('evaluation_searches.yaml') as searches_file:
    test_set = yaml.safe_load(searches_file)

# Trailing expression so the notebook displays the loaded searches.
test_set

# Evaluation functions

Using what seems to be the industry norm for retrieval, I am going to use the normalized discounted cumulative gain (NDCG) metric.

In [None]:
def NDCG(results: pd.DataFrame, relevant_reports: list, at = 20):
    '''
    Calculates the normalized discounted cumulative gain (NDCG@at) with binary relevance.

    Arguments
    results - a dataframe of all of the safety issues. The rank of a report is taken
        from the first occurrence of it in the `report_id` column.
    relevant_reports - a collection of all of the relevant report IDs. Relevance is
        treated as binary (a report is either relevant or it is not).
    at - the number of top-ranked unique reports to consider.

    Returns
    A float between 0 and 1. A ranking that places every relevant report (or `at` of
    them, if there are more) at the top scores 1.0. Returns 0.0 when there are no
    results, no relevant reports, or `at` is not positive.
    '''
    if at <= 0:
        return 0.0

    relevant = set(relevant_reports)

    # Unique report ids in order of first appearance, truncated to the cutoff.
    ranked_reports = results['report_id'].unique()[:at]

    if not relevant or len(ranked_reports) == 0:
        return 0.0

    # Binary gain: every relevant report contributes 1 / log2(rank + 1).
    dcg = sum(
        1.0 / np.log2(rank + 1)
        for rank, report_id in enumerate(ranked_reports, start=1)
        if report_id in relevant
    )

    # The ideal ranking puts all relevant reports first, so the normaliser only
    # sums over as many positions as there are relevant reports (capped at `at`).
    ideal_hits = min(len(relevant), at)
    idcg = sum(1.0 / np.log2(rank + 1) for rank in range(1, ideal_hits + 1))

    return float(dcg / idcg)

def evaluate_search(search: dict, search_function: Callable[[str, dict], pd.DataFrame], loss_function: Callable[[pd.DataFrame, list, int], float], valid_size = 20, verbose = False) -> "tuple[float, float, pd.DataFrame | None]":
    '''
    Runs a single evaluation search and scores the results.

    Arguments
    search - a dict with the keys `query`, `settings` and `expected_reports`
    search_function - called as `search_function(query, settings)`; returns the results
        dataframe (with a `report_id` column) or None when nothing was found
    loss_function - called as `loss_function(results, expected_report_ids, valid_size)`
    valid_size - how many of the top result rows count when checking which expected
        reports are present
    verbose - print a summary, list the missing reports and display the results

    Returns
    A tuple of (score, fraction of expected reports present, results dataframe).
    When the search function returns None this is (0, 0, None).
    '''
    search_results = search_function(search['query'], search["settings"])
    if search_results is None:
        print("No results found therefore score is 0")
        return 0, 0, None

    expected_report_ids = set(search['expected_reports'])
    # NOTE(review): this looks at the first `valid_size` rows, not the first
    # `valid_size` unique reports — confirm that is the intended window.
    search_report_ids = set(search_results['report_id'].head(valid_size))

    score = loss_function(search_results, expected_report_ids, valid_size)

    # Guard against a search that lists no expected reports (avoids dividing by zero).
    if expected_report_ids:
        percent_present_reports = len(expected_report_ids & search_report_ids) / len(expected_report_ids)
    else:
        percent_present_reports = 0.0

    if verbose:
        print(f"  Percentage of expected reports present in search results: {percent_present_reports} with score: {score}")
        if percent_present_reports != 1.0:
            missing_reports = list(expected_report_ids - search_report_ids)
            print(f"  Missing reports: {missing_reports}")

            # Index label of the first row for each missing report, or None when the
            # report does not appear anywhere in the results.
            missing_positions = []
            for report_id in missing_reports:
                matching_rows = search_results.index[search_results['report_id'] == report_id]
                missing_positions.append(matching_rows[0] if len(matching_rows) else None)
            print(f"  These are at index {missing_positions}")

        display(search_results)

    return score, percent_present_reports, search_results
    

In [None]:
def evaluate_searches(searches, search_function, verbose=False, at=20):
    '''
    Evaluates every search in `searches` and collects the scores in a dataframe.

    Arguments
    searches - an iterable of search dicts, each passed on to `evaluate_search`
    search_function - called as `search_function(query, settings)` for each search
    verbose - print progress for each search and pass the flag on to `evaluate_search`
    at - the cutoff handed to `evaluate_search` as `valid_size`; it is also used in
        the names of the score columns

    Returns
    A dataframe with one row per search and the columns `search_id`, `search`,
    `ndcg@{at}`, `percent_present_@_{at}` and `search_results`. The dataframe is
    empty (but keeps those columns) when no searches were given.
    '''
    ndcg_column = f"ndcg@{at}"
    present_column = f"percent_present_@_{at}"

    test_results = []
    for i, search in enumerate(searches):
        if verbose:
            print(f"{i} Evaluating search: '{search['query']}'")
        ndcg, percent_present, results = evaluate_search(search, search_function, loss_function=NDCG, valid_size=at, verbose = verbose)
        test_results.append({
            "search_id": i,
            "search": search,
            ndcg_column: ndcg,
            present_column: percent_present,
            "search_results": results
        })

    # Naming the columns explicitly keeps them present even when there are no rows.
    test_results_df = pd.DataFrame(
        test_results,
        columns=["search_id", "search", ndcg_column, present_column, "search_results"],
    )

    if test_results_df.empty:
        print("No searches were evaluated")
        return test_results_df

    print(f"Average NDCG@{at}: {test_results_df[ndcg_column].mean()}, with average reports_present: {test_results_df[present_column].mean() * 100}%")

    return test_results_df
    

# Testing

I am going to mainly compare between two different embeddings

In [None]:
import viewer.Searching as Searching

def search_function(query: str, setting_dict, table, embedding_model) -> pd.DataFrame:
    '''
    Runs `query` against `table` through the viewer's search engine.

    Every key in `setting_dict` is prefixed with `setting_` before the settings
    object is built with `Searching.SearchSettings.from_dict`. Returns whatever
    `SearchEngineSearcher.search()` returns.
    '''
    prefixed_settings = {}
    for key, value in setting_dict.items():
        prefixed_settings[f"setting_{key}"] = value

    search = Searching.Search(
        query,
        Searching.SearchSettings.from_dict(prefixed_settings),
    )

    return Searching.SearchEngineSearcher(search, table, embedding_model).search()

In [None]:
# Connect to the LanceDB database at the relative path "vector_db".
db = lancedb.connect("vector_db")

# Trailing expression so the notebook displays the available table names.
db.table_names()

## voyage-3 

In [None]:
# Open the table named "voyage-3" (presumably holds the voyage-3 embeddings — confirm).
voyage_3_table = db.open_table("voyage-3")
# Trailing expression so the notebook displays the row count.
voyage_3_table.count_rows()

In [None]:
# Reload the Searching module so that edits to it are picked up without
# restarting the kernel.
importlib.reload(Searching)
# Evaluate the whole test set against the voyage-3 table. The lambda adapts
# search_function to the two-argument (query, settings) form that
# evaluate_searches passes through to evaluate_search.
evaluate_searches(
    test_set,
    lambda query, settings: search_function(
        query,
        settings,
        voyage_3_table,
        "voyage-3"
    ),
    verbose=False
)

## Voyage-large-2-instruct

In [None]:
# Open the table named "voyage-2"; the cells below query it with the
# "voyage-large-2-instruct" embedding model.
voyage_2_table = db.open_table("voyage-2")


In [None]:
# Evaluate the whole test set against the voyage-2 table using the
# "voyage-large-2-instruct" embedding model. The lambda adapts search_function
# to the two-argument (query, settings) form that evaluate_searches expects.
evaluate_searches(
    test_set,
    lambda query, settings: search_function(
        query,
        settings,
        voyage_2_table,
        "voyage-large-2-instruct"
    ),
    verbose=False
)

## Rerankers


In [None]:
import voyageai

# Voyage AI client used for reranking; this rebinds the `vo` created in the
# first code cell, so this cell can be run on its own.
vo = voyageai.Client()

def reranked_search(query, settings):
    '''
    Searches the voyage-2 table and reorders the hits with the Voyage "rerank-2" model.

    Arguments
    query - the search text
    settings - the settings dict passed on to `search_function`

    Returns
    The top 1,000 search results with an added `reranked_score` column, sorted by
    that score in descending order. If the search returns None or no rows, that
    result is returned unchanged and no reranking request is made.
    '''
    # search_function needs the embedding model as its fourth argument; use the
    # same model the voyage-2 table is queried with elsewhere in this notebook.
    results = search_function(query, settings, voyage_2_table, "voyage-large-2-instruct")
    if results is None or results.empty:
        return results

    # Copy so that adding a column does not write into a slice of the original frame.
    results = results.head(1_000).copy()

    reranking = vo.rerank(query, results["document"].tolist(), model = "rerank-2")

    # The reranker returns its results ordered by relevance, not in input order.
    # Each result's `index` is the position of the document in the submitted
    # list, so use it to put every score back on the row it belongs to.
    scores = [float("nan")] * len(results)
    for reranked in reranking.results:
        scores[reranked.index] = reranked.relevance_score
    results["reranked_score"] = scores

    return results.sort_values("reranked_score", ascending=False)


In [None]:
# Evaluate the whole test set using the reranked search. reranked_search
# already takes (query, settings), so it is passed directly.
evaluate_searches(
    test_set,
    reranked_search,
    verbose=False
)