# Retrieval approach with minsearch

### Imports

In [1]:
import minsearch
import pandas as pd
import json
from tqdm import tqdm 

In [2]:
# Load the document corpus. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default locale encoding.
with open('../../data/documents_with_ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [3]:
# Build the in-memory minsearch index and fit it on the loaded documents.
index = minsearch.Index(
    # Document fields handed to the index as free-text fields.
    text_fields=["text", "section", "title", "chapter"],
    # Document fields handed to the index as keyword fields.
    keyword_fields=["title", "id"],
)

index.fit(documents)

<minsearch.minsearch.Index at 0x7a0dc0618850>

In [4]:
def minsearch_search(query, boost=None, num_results=5):
    """Search the module-level ``index`` for ``query``.

    Args:
        query: Query text forwarded to ``index.search``.
        boost: Optional mapping of field name to weight, forwarded as
            ``boost_dict``. When ``None`` the notebook's original weights
            (``{'question': 3.0, 'section': 0.5}``) are used.
        num_results: Number of results requested from the index
            (defaults to 5, the original hard-coded value).

    Returns:
        The value returned by ``index.search``, unchanged.
    """
    if boost is None:
        # NOTE(review): 'question' is not among the text_fields the index is
        # built with ("text", "section", "title", "chapter"), so this weight
        # presumably has no effect -- confirm which field was intended.
        boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        boost_dict=boost,
        num_results=num_results,
    )

## Evaluation

In [5]:
# Read the ground-truth CSV into a DataFrame.
gt_df = pd.read_csv("../../data/ground_truth_data.csv")

In [6]:
# Convert the DataFrame into a list with one dict per row.
ground_truth = gt_df.to_dict(orient="records")

In [7]:
def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: Sized collection with one entry per query. Each
            entry is a collection of flags, one per returned result; a flag
            equal to ``True`` marks a relevant result.

    Returns:
        The number of entries containing ``True`` divided by the number of
        entries, or ``0.0`` when ``relevance_total`` is empty.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated; report 0.0 instead of dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

In [8]:
def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    Args:
        relevance_total: Sized collection with one entry per query. Each
            entry is an ordered collection of flags, one per returned result
            (best rank first); a flag equal to ``True`` marks a relevant
            result.

    Returns:
        The average over queries of ``1 / rank`` of the first relevant
        result (rank is 1-based), where a query without any relevant result
        contributes 0. Returns ``0.0`` when ``relevance_total`` is empty.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated; report 0.0 instead of dividing by zero.
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, flag in enumerate(line, start=1):
            # Equality (not identity) keeps the same notion of "relevant"
            # as the `True in line` membership test used by hit_rate.
            if flag == True:  # noqa: E712
                total_score += 1 / rank
                # Only the first relevant result counts; without this a
                # query with several hits could score more than 1.
                break

    return total_score / total

In [12]:
# Collect, for every ground-truth record, one list of relevance flags
# (one flag per search result, in the order the search returned them).
relevance_total = []

for q in tqdm(ground_truth):
    # Id the results are compared against for this question.
    doc_id = q['text_id']
    results = minsearch_search(query=q['question'])
    # True at each position whose result id equals doc_id.
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 240/240 [00:00<00:00, 433.96it/s]


In [13]:
# Display both retrieval metrics as a (hit_rate, mrr) tuple.
hit_rate(relevance_total), mrr(relevance_total)

(0.2875, 0.1885416666666667)