In [1]:
import json

with open('documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [5]:
from elasticsearch import Elasticsearch

# Create Elasticsearch client
es_client = Elasticsearch('http://localhost:9200')

# Define index settings with mappings matching your column names
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "Category": {"type": "keyword"},  # Category as a keyword
            "Question": {"type": "text"},     # Question as text (full-text search)
            "Answer": {"type": "text"},       # Answer as text (full-text search)
            "id": {"type": "keyword"},        # id as a keyword (for unique identification)
        }
    }
}

index_name = "faq_index"

# Delete index if it exists
es_client.indices.delete(index=index_name, ignore_unavailable=True)

# Create the new index
es_client.indices.create(index=index_name, body=index_settings)


  es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'faq_index'})

In [6]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 833/833 [00:03<00:00, 215.05it/s]


In [7]:
def elastic_search(query, category=None):
    search_query = {
        "size": 5,  # Limit the search to 5 results
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["Question^3", "Answer", "Category"],  # Search in 'Question' and 'Answer' fields, include category
                        "type": "most_fields"  # Use 'best_fields' for matching
                    }
                }
            }
        }
    }

    # Add a filter if a category is specified
    if category:
        search_query['query']['bool']['filter'] = {
            "term": {
                "Category": category  # Filter based on 'Category'
            }
        }

    # Execute the search query on the 'faq-documents' index
    response = es_client.search(index=index_name, body=search_query)
    
    # Collect the search results
    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs


In [8]:
# elastic_search(
#     query="I just discovered the course. Can I still join?",
#     category="data-engineering-zoomcamp"
# )

In [9]:
import pandas as pd

In [11]:
df_ground_truth = pd.read_csv('ground-truth-data_old.csv')

In [12]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [14]:
ground_truth

[{'Question': 'Welche Unterlagen müssen zur Bonitätsprüfung eingereicht werden?',
  'Category': '--Bonitätsprüfung',
  'Document': 'bead1965'},
 {'Question': 'Wer wird zur Bonitätsprüfung aufgefordert?',
  'Category': '--Bonitätsprüfung',
  'Document': 'bead1965'},
 {'Question': 'Was passiert, wenn kein geprüfter Jahresabschluss vorliegt?',
  'Category': '--Bonitätsprüfung',
  'Document': 'bead1965'},
 {'Question': 'Was ist der Zweck einer Bonitätsprüfung?',
  'Category': '--Bonitätsprüfung',
  'Document': 'bead1965'},
 {'Question': 'Wo finde ich weitere Informationen zur Bonitätsprüfung?',
  'Category': '--Bonitätsprüfung',
  'Document': 'bead1965'},
 {'Question': 'Darf das Konsortium nur aus akademischen Institutionen bestehen?',
  'Category': '--Profi-Online',
  'Document': '99541b90'},
 {'Question': 'Können nur Forschungseinrichtungen ein Konsortium bilden?',
  'Category': '--Profi-Online',
  'Document': '99541b90'},
 {'Question': 'Sind ausschließlich Forscher für das Konsortium zu

In [15]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['Document']
    results = elastic_search(query=q['Question'], category=q['Category'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  response = es_client.search(index=index_name, body=search_query)
100%|██████████| 4165/4165 [00:49<00:00, 84.44it/s]


- We iterate over all our queries in our ground truth, and check if our "ground truth" ID is among our results

In [20]:
example = [
    [True, False, False, False, False], # 1, 
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0 
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0 
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1 
    [False, False, True, False, False],  # 1/3
    [False, False, False, False, False], # 0
]

# 1 => 1
# 2 => 1 / 2 = 0.5
# 3 => 1 / 3 = 0.3333
# 4 => 0.25
# 5 => 0.2
# rank => 1 / rank
# none => 0

In [16]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [17]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [21]:
hit_rate(example)

0.5833333333333334

In [22]:
mrr(example)

0.5277777777777778

- hit-rate (recall)
- Mean Reciprocal Rank (mrr)

In [19]:
hit_rate(relevance_total), mrr(relevance_total)

(0.8424969987995198, 0.7043417366946788)

In [25]:
import minsearch

# Define the index, specifying the text and keyword fields based on your documents
index = minsearch.Index(
    text_fields=["Question", "Answer"],   # Text fields to search (Question and Answer)
    keyword_fields=["Category", "id"]     # Keyword fields (Category and id)
)

# Fit the index with your documents DataFrame
index.fit(documents)


<minsearch.Index at 0x7f9fc8919120>

In [26]:
def minsearch_search(query, category=None):
    # Define the boost for specific fields (giving more weight to 'Question')
    boost = {'Question': 3.0, 'Answer': 1.0}

    # Define the filter, if a category is provided
    filter_dict = {'Category': category} if category else {}

    # Perform the search with the query, filter, and boost
    results = index.search(
        query=query,
        filter_dict=filter_dict,  # Filter by 'Category' if provided
        boost_dict=boost,         # Boost 'Question' more than 'Answer'
        num_results=5             # Limit results to 5
    )

    return results


In [29]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['Document']  # Assuming 'document' refers to the document's ID in the ground truth
    category = q['Category']  # Assuming the ground truth has a 'Category' field

    # Perform the search with the question and category
    results = minsearch_search(query=q['Question'], category=category)

    # Check if the correct document is in the search results
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)


100%|██████████| 4165/4165 [00:11<00:00, 374.20it/s]


In [30]:
hit_rate(relevance_total), mrr(relevance_total)

(0.7596638655462185, 0.6267026810724294)

Compare with ES results:
```
(0.8424969987995198, 0.7043417366946788)
```

In [32]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['Document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [34]:
evaluate(ground_truth, lambda q: elastic_search(q['Question'], q['Category']))

  response = es_client.search(index=index_name, body=search_query)
100%|██████████| 4165/4165 [01:24<00:00, 49.28it/s]


{'hit_rate': 0.8424969987995198, 'mrr': 0.7043417366946788}

In [35]:
evaluate(ground_truth, lambda q: minsearch_search(q['Question'], q['Category']))

100%|██████████| 4165/4165 [00:10<00:00, 407.48it/s]


{'hit_rate': 0.7596638655462185, 'mrr': 0.6267026810724294}