## Evaluation of Text Retrieval Techniques for RAG

In [76]:
import json

def load_documents(file_path):
    """
    Load the JSON documents from the specified file path.

    The file is decoded explicitly as UTF-8 instead of the platform's default
    encoding, so non-ASCII text is read the same way on every system.

    Parameters:
    - file_path (str): The path to the JSON file.

    Returns:
    - documents (list): The parsed content of the JSON file (expected to be
      a list of document dicts).

    Raises:
    - OSError: If the file cannot be opened.
    - json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'rt', encoding='utf-8') as f_in:
        return json.load(f_in)

In [77]:
# Example usage: load the documents from a path relative to the notebook's working directory.
file_path = '../data/documents-with-ids.json'
documents = load_documents(file_path)

In [78]:
# Peek at a single record to see which fields each document carries.
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': '1f6520ca'}

In [79]:
from elasticsearch import Elasticsearch

def setup_elasticsearch_index(es_url, index_name):
    """
    Connect to Elasticsearch and (re)create an index for the documents.

    An existing index named ``index_name`` is deleted first (a missing index
    is ignored), then the index is created again with one shard, no replicas
    and the field mappings below.

    Parameters:
    - es_url (str): The URL of the Elasticsearch instance.
    - index_name (str): The name of the index to be created.

    Returns:
    - es_client (Elasticsearch): The Elasticsearch client instance.
    """
    client = Elasticsearch(es_url)

    # `text`, `section` and `question` are mapped as full-text fields;
    # `course` and `id` are mapped as keyword fields.
    field_mappings = {
        "text": {"type": "text"},
        "section": {"type": "text"},
        "question": {"type": "text"},
        "course": {"type": "keyword"},
        "id": {"type": "keyword"},
    }
    shard_settings = {"number_of_shards": 1, "number_of_replicas": 0}
    index_body = {
        "settings": shard_settings,
        "mappings": {"properties": field_mappings},
    }

    # Drop any previous copy of the index, then create it from scratch.
    client.indices.delete(index=index_name, ignore_unavailable=True)
    client.indices.create(index=index_name, body=index_body)

    return client

In [80]:
# Connect to the local Elasticsearch instance and (re)create the index used by this notebook.
es_url = 'http://localhost:9200'
index_name = 'course-questions_aaron'
es_client = setup_elasticsearch_index(es_url, index_name)

In [81]:
from tqdm.auto import tqdm

# Index the documents one request per document; tqdm shows progress.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [82]:
def elastic_search(query, course, num_results=5):
    """
    Search the Elasticsearch index for documents matching a question.

    Relies on the module-level ``es_client`` and ``index_name``.

    Parameters:
    - query (str): The question text to search for.
    - course (str): Only documents whose ``course`` field equals this value
      are returned.
    - num_results (int): Maximum number of hits to return. Defaults to 5,
      the value that was previously hard-coded.

    Returns:
    - result_docs (list): The ``_source`` dict of every hit, in the order
      Elasticsearch returned them.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # `question` is boosted by a factor of 3 relative to the other fields.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [83]:
# Spot-check the search function on a single question before running the full evaluation.
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

In [84]:
import pandas as pd
# Load the ground-truth questions; each row pairs a question with the id of its expected document.
# NOTE(review): the "Unnamed: 0" column in the output is presumably a saved row index — confirm.
df =  pd.read_csv('../data/ground-truth-data.csv')
df.head(3)

Unnamed: 0,question,course,document
0,When does the course begin?,data-engineering-zoomcamp,c02e79ef
1,How can I get the course schedule?,data-engineering-zoomcamp,c02e79ef
2,What is the link for course registration?,data-engineering-zoomcamp,c02e79ef


In [85]:
# Convert the frame to a list with one dict per row, keyed by column name.
ground_truth = df.to_dict(orient='records')

In [86]:
# For every ground-truth question, run the Elasticsearch query and record, for each
# returned hit, whether its id equals the expected document id.
# The result is one list of booleans per question, in ranking order.
relevance_total = []
for q in tqdm(ground_truth):
  doc_id = q['document']
  result = elastic_search(query=q['question'],course=q['course'])
  relevance = [d['id'] == doc_id for d in result]
  relevance_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [87]:
# Show the collected relevance flags; an empty inner list means the search returned no hits for that question.
relevance_total

[[True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, True, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False],
 [False, False, False, False, True],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [],
 [],
 [],
 [],
 [],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, True, False, False],
 [False, True, False, False, False],
 [False, True, False, False, False],
 [True

### Hit Rate (Recall)

Hit Rate is a metric that measures how often a relevant item appears in the top-k results of a search or recommendation list. It is a simple and intuitive metric to understand the effectiveness of an algorithm in bringing relevant items to the user’s attention.

#### Formula:

$$\text{Hit Rate} = \frac{\text{Number of hits}}{\text{Number of queries}}$$

•	 Explanation:

•	 Number of hits: The number of queries for which a relevant item was found in the top-k results.

•	 Number of queries: The total number of queries evaluated.

•	 Interpretation:

•	 A higher hit rate indicates that the algorithm is effective in retrieving relevant items within the top-k results.

•	 Hit Rate is often calculated for different values of k (e.g., Hit Rate@5, Hit Rate@10).


In [88]:
def hit_rate(relevance_total):
    """
    Compute the hit rate: the fraction of queries with at least one relevant result.

    Parameters:
    - relevance_total (list): One list of booleans per query, where each
      boolean says whether the result at that rank is the relevant document.

    Returns:
    - float: Number of queries containing a ``True`` divided by the number of
      queries. Returns 0.0 for an empty input instead of dividing by zero.
    """
    if not relevance_total:
        return 0.0

    # A query counts as a hit when any of its results is flagged relevant.
    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

In [89]:
# Small hand-written relevance matrix: these 20 rows equal the first 20 rows of the
# relevance_total output displayed earlier in this notebook.
# NOTE(review): `example` is not referenced by any later cell visible here.
example = [[True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, True, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False],
 [False, False, False, False, True],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False]]

In [90]:
# Hit rate for the current contents of relevance_total.
# NOTE(review): the variable name `re` would shadow the standard-library `re` module if that is imported in this notebook.
re = hit_rate(relevance_total)
re

0.7393559541819754

### Mean Reciprocal Rank (MRR)

Mean Reciprocal Rank (MRR) is a metric that evaluates the ranking quality of the first relevant item retrieved for a set of queries. It takes into account the position of the first relevant item and gives higher scores to results where the first relevant item appears earlier in the ranked list.

- Formula:

$$\text{MRR} = \frac{1}{|Q|} \sum_{i=1}^{|Q|} \frac{1}{\text{rank}_i}$$

Where:

•	$|Q|$ is the number of queries.

•	$\text{rank}_i$ is the rank position of the first relevant item for the $i$-th query.

•	Explanation:

•	For each query, the reciprocal rank is the inverse of the rank of the first relevant item; if no relevant item is retrieved, the reciprocal rank is taken to be 0.

•	The mean of these reciprocal ranks over all queries gives the MRR.

•	Interpretation:

•	An MRR of 1 means that all first relevant items are ranked at the top (rank 1).

•	An MRR closer to 0 indicates that relevant items are ranked lower in the list.

•	MRR is particularly useful in scenarios where the relevance of items decreases with their position in the ranked list.


In [91]:
def calculate_mrr(queries):
    """
    Compute the Mean Reciprocal Rank (MRR) over a set of queries.

    For each query only the FIRST relevant result contributes, with a score
    of 1 / rank (rank starts at 1). A query with no relevant result
    contributes 0.

    Parameters:
    - queries (list): One list of booleans per query, in ranking order, where
      each boolean says whether the result at that rank is relevant.

    Returns:
    - float: The mean of the per-query reciprocal ranks. Returns 0.0 for an
      empty input instead of dividing by zero.
    """
    if not queries:
        return 0.0

    total_score = 0.0
    for query in queries:
        for position, is_relevant in enumerate(query, start=1):
            if is_relevant:
                total_score += 1 / position
                # Stop at the first relevant result: later relevant results
                # must not add to this query's reciprocal rank.
                break
    return total_score / len(queries)

In [92]:
# MRR for the current contents of relevance_total.
mrr_result = calculate_mrr(relevance_total)
mrr_result

0.6029140551833445

In [93]:
# Remove any previous copy and re-download minsearch.py into the working directory.
# NOTE(review): the URL tracks the `main` branch, so the downloaded code is not pinned to a fixed version.
!rm -f minsearch.py
!wget  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-07-13 17:16:28--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-07-13 17:16:32 (30.2 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [94]:
import sys
import os

In [95]:
# Append the absolute path of ../script to the module search path, then import Index.
# NOTE(review): an earlier cell also downloads minsearch.py into the working directory —
# confirm which copy `from minsearch import Index` is meant to resolve to.
sys.path.append(os.path.abspath(os.path.join('../script')))
from minsearch import  Index

In [96]:
# Build the minsearch index over the same documents, using the same split of
# text fields and keyword fields as the Elasticsearch mapping above.
index = Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.Index at 0x145845910>

In [97]:
def minsearch_search(query, course, num_results=5):
    """
    Search the minsearch index for documents matching a question.

    Relies on the module-level ``index``.

    Parameters:
    - query (str): The question text to search for.
    - course (str): Passed as the ``course`` filter to the index search.
    - num_results (int): Maximum number of results to request. Defaults to 5,
      the value that was previously hard-coded.

    Returns:
    - results: Whatever ``index.search`` returns for these arguments.
    """
    # Per-field boosts; fields not listed here get no explicit boost.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results
    )

In [98]:
# Same relevance collection as for Elasticsearch, but using minsearch_search.
# This rebinds relevance_total, replacing the earlier Elasticsearch results.
relevance_total = []
for q in tqdm(ground_truth):
  doc_id = q['document']
  result = minsearch_search(query=q['question'],course=q['course'])
  relevance = [d['id'] == doc_id for d in result]
  relevance_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [99]:
# Display (hit rate, MRR) for the current contents of relevance_total as a tuple.
hit_rate(relevance_total) , calculate_mrr(relevance_total)

(0.7722066133563864, 0.661454506159499)

In [100]:
def evaluate(ground_truth, search_function):
    """
    Score a search function against the ground-truth questions.

    Parameters:
    - ground_truth (list): Records that each contain a 'document' key holding
      the id of the expected document.
    - search_function (callable): Called with one ground-truth record; must
      return an iterable of result dicts that each contain an 'id' key.

    Returns:
    - dict: {'hit_rate': ..., 'mrr': ...} computed by ``hit_rate`` and
      ``calculate_mrr`` over the per-query relevance flags.
    """
    def relevance_flags(record):
        # Read the expected id first, then flag each returned result against it.
        expected_id = record['document']
        return [found['id'] == expected_id for found in search_function(record)]

    relevance_total = [relevance_flags(record) for record in tqdm(ground_truth)]

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = calculate_mrr(relevance_total)
    return metrics

In [101]:
# Evaluate Elasticsearch; the lambda unpacks a ground-truth record into elastic_search's (query, course) arguments.
evaluate(ground_truth, lambda q: elastic_search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.7395720769397017, 'mrr': 0.6029788920106625}

In [102]:

# Evaluate minsearch; the lambda unpacks a ground-truth record into minsearch_search's (query, course) arguments.
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.7722066133563864, 'mrr': 0.661454506159499}