In [12]:
import json

with open('documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [17]:
from elasticsearch import Elasticsearch

# Create Elasticsearch client
es_client = Elasticsearch('http://localhost:9200')

# Define index settings with mappings matching your column names
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "Category": {"type": "keyword"},  # Category as a keyword
            "Question": {"type": "text"},     # Question as text (full-text search)
            "Answer": {"type": "text"},       # Answer as text (full-text search)
            "id": {"type": "keyword"},        # id as a keyword (for unique identification)
        }
    }
}

index_name = "eval_text"

# Delete index if it exists
es_client.indices.delete(index=index_name, ignore_unavailable=True)

# Create the new index
es_client.indices.create(index=index_name, body=index_settings)


  es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'eval_text'})

In [18]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 429/429 [00:01<00:00, 260.96it/s]


In [19]:
def elastic_search(query, category=None):
    search_query = {
        "size": 5,  # Limit the search to 5 results
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["Question^3", "Answer", "Category"],  # Search in 'Question' and 'Answer' fields, include category
                        "type": "most_fields"  # Use 'best_fields' for matching
                    }
                }
            }
        }
    }

    # Add a filter if a category is specified
    if category:
        search_query['query']['bool']['filter'] = {
            "term": {
                "Category": category  # Filter based on 'Category'
            }
        }

    # Execute the search query on the 'faq-documents' index
    response = es_client.search(index=index_name, body=search_query)
    
    # Collect the search results
    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs


In [20]:
# elastic_search(
#     query="I just discovered the course. Can I still join?",
#     category="data-engineering-zoomcamp"
# )

In [21]:
import pandas as pd

In [22]:
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [23]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [24]:
ground_truth

[{'Question': 'Sind Ausgaben für kostenpflichtige Literaturdatenbanken förderfähig?',
  'Category': 'General/other',
  'Document': '604174c6'},
 {'Question': 'Gibt es finanzielle Unterstützung für den Zugriff auf Literaturdatenbanken?',
  'Category': 'General/other',
  'Document': '604174c6'},
 {'Question': 'Kann ich die Kosten für Datenbanken in mein Projektbudget aufnehmen?',
  'Category': 'General/other',
  'Document': '604174c6'},
 {'Question': 'Was passiert, wenn ich Literaturdatenbanken für mein Projekt nutzen muss?',
  'Category': 'General/other',
  'Document': '604174c6'},
 {'Question': 'Sind nicht kostenlose Datenbanken auch förderfähig bei Projekten?',
  'Category': 'General/other',
  'Document': '604174c6'},
 {'Question': 'Was ist der Mindestbetrag für die Abschreibung von Geräten?',
  'Category': 'General/other',
  'Document': '904dc228'},
 {'Question': 'Bis zu welcher Summe können Geräte als Verbrauchsmaterial erworben werden?',
  'Category': 'General/other',
  'Document':

In [25]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['Document']
    results = elastic_search(query=q['Question'], category=q['Category'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  response = es_client.search(index=index_name, body=search_query)
100%|██████████| 2145/2145 [00:24<00:00, 88.01it/s]


- We iterate over all our queries in our ground truth, and check if our "ground truth" ID is among our results

In [26]:
example = [
    [True, False, False, False, False], # 1, 
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0 
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0 
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1 
    [False, False, True, False, False],  # 1/3
    [False, False, False, False, False], # 0
]

# 1 => 1
# 2 => 1 / 2 = 0.5
# 3 => 1 / 3 = 0.3333
# 4 => 0.25
# 5 => 0.2
# rank => 1 / rank
# none => 0

In [27]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [28]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [29]:
hit_rate(example)

0.5833333333333334

In [30]:
mrr(example)

0.5277777777777778

- hit-rate (recall)
- Mean Reciprocal Rank (mrr)

In [31]:
hit_rate(relevance_total), mrr(relevance_total)

(0.6088578088578088, 0.46031080031080174)

In [32]:
import minsearch

# Define the index, specifying the text and keyword fields based on your documents
index = minsearch.Index(
    text_fields=["Question", "Answer"],   # Text fields to search (Question and Answer)
    keyword_fields=["Category", "id"]     # Keyword fields (Category and id)
)

# Fit the index with your documents DataFrame
index.fit(documents)


<minsearch.Index at 0x7fed68b35a80>

In [33]:
def minsearch_search(query, category=None):
    # Define the boost for specific fields (giving more weight to 'Question')
    boost = {'Question': 3.0, 'Answer': 1.0}

    # Define the filter, if a category is provided
    filter_dict = {'Category': category} if category else {}

    # Perform the search with the query, filter, and boost
    results = index.search(
        query=query,
        filter_dict=filter_dict,  # Filter by 'Category' if provided
        boost_dict=boost,         # Boost 'Question' more than 'Answer'
        num_results=5             # Limit results to 5
    )

    return results


In [34]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['Document']  # Assuming 'document' refers to the document's ID in the ground truth
    category = q['Category']  # Assuming the ground truth has a 'Category' field

    # Perform the search with the question and category
    results = minsearch_search(query=q['Question'], category=category)

    # Check if the correct document is in the search results
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)


100%|██████████| 2145/2145 [00:04<00:00, 432.50it/s]


In [35]:
hit_rate(relevance_total), mrr(relevance_total)

(0.5738927738927739, 0.43766122766122884)

Compare with ES results:
```
(0.6088578088578088, 0.46031080031080174)
```

In [36]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['Document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

- ES and minsearch again as overview:

In [37]:
evaluate(ground_truth, lambda q: elastic_search(q['Question'], q['Category']))

  response = es_client.search(index=index_name, body=search_query)
100%|██████████| 2145/2145 [00:42<00:00, 49.89it/s]


{'hit_rate': 0.6088578088578088, 'mrr': 0.46031080031080174}

In [38]:
evaluate(ground_truth, lambda q: minsearch_search(q['Question'], q['Category']))

100%|██████████| 2145/2145 [00:05<00:00, 427.71it/s]


{'hit_rate': 0.5738927738927739, 'mrr': 0.43766122766122884}