In [None]:
from google.colab import drive
import pandas as pd
import json

# Mount Google Drive
drive.mount('/content/drive')

# Read json file
# NOTE: this is a relative path, so it resolves against the current working
# directory, which must be the parent of the mount point (/content).
file_path = 'drive/MyDrive/questions_data/documents-with-ids.json'

# Decode explicitly as UTF-8 instead of relying on the runtime's locale default,
# so non-ASCII characters in the documents are read the same way everywhere.
with open(file_path, 'r', encoding='utf-8') as f:
    documents = json.load(f)

print(f"Successfully read data: {file_path}")
print(f"Total entries read: {len(documents)}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Successfully read data: drive/MyDrive/questions_data/documents-with-ids.json
Total entries read: 948


In [None]:
!pip install elasticsearch

Collecting elasticsearch
  Downloading elasticsearch-8.16.0-py3-none-any.whl.metadata (8.8 kB)
Collecting elastic-transport<9,>=8.15.1 (from elasticsearch)
  Downloading elastic_transport-8.15.1-py3-none-any.whl.metadata (3.7 kB)
Downloading elasticsearch-8.16.0-py3-none-any.whl (543 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/543.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/543.1 kB[0m [31m17.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m543.1/543.1 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading elastic_transport-8.15.1-py3-none-any.whl (64 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: elastic-tran

In [None]:
import logging
import os
from elasticsearch import Elasticsearch
from google.colab import userdata
from getpass import getpass

# Elastic Cloud ID and API key are read from Colab's secret store (userdata),
# so no credentials are written into the notebook itself.
ELASTIC_CLOUD_ID = userdata.get('ELASTIC_CLOUD_ID')
ELASTIC_API_KEY = userdata.get('ELASTIC_API_KEY')

# Create the client.
# `request_timeout` is the supported keyword in the 8.x client; the older
# `timeout` keyword is deprecated and emits a DeprecationWarning at this call.
es_client = Elasticsearch(
    cloud_id=ELASTIC_CLOUD_ID,
    api_key=ELASTIC_API_KEY,
    request_timeout=60  # increase timeout (seconds)
)

# Check connection
if es_client.ping():
    print('Successfully connected to Elasticsearch')
    print(es_client.info())
else:
    print('Could not connect to Elasticsearch')

  es_client = Elasticsearch(


Successfully connected to Elasticsearch
{'name': 'instance-0000000000', 'cluster_name': 'bd5c04e72b0c4673b4cbd3db98fdb86b', 'cluster_uuid': 'K11-nm1NQeu_GpxDE5TTJg', 'version': {'number': '8.14.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'd55f984299e0e88dee72ebd8255f7ff130859ad0', 'build_date': '2024-07-07T22:04:49.882652950Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [None]:
# Name of the index that the search cells in this notebook query.
index_name = "course-questions"

# Index definition: one primary shard and no replica copies, plus the field
# mappings. "text" fields are searched by the multi_match query; "course" is a
# keyword because elastic_search filters on it with an exact `term` query.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        },
    },
}

# Drop any previous copy of the index (no error if it does not exist yet), then
# create it again so the cell can be re-run without piling up documents.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [None]:
from tqdm.auto import tqdm

# Index the documents one by one; tqdm renders a progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

# Newly indexed documents only become searchable after a refresh. Force one here
# so the search and evaluation cells below see every document even when the
# notebook is run top to bottom without pauses.
# The trailing `;` keeps the API response out of the cell output.
es_client.indices.refresh(index=index_name);

  0%|          | 0/948 [00:00<?, ?it/s]

In [None]:
def elastic_search(query, course, fields=None, size=5, client=None, index=None):
    """Return the `_source` of the top Elasticsearch hits for a question.

    Args:
        query: Free-text question to match.
        course: Course identifier; only documents whose `course` field equals
            this value are returned (exact `term` filter).
        fields: Fields handed to `multi_match`, optionally with a `^boost`
            suffix. Defaults to ["question^3", "text", "section"].
        size: Maximum number of hits to request. Defaults to 5.
        client: Object exposing `search(index=..., body=...)`. Defaults to the
            notebook-level `es_client`.
        index: Name of the index to query. Defaults to the notebook-level
            `index_name`.

    Returns:
        list: The `_source` dict of every hit, in the order the client
        returned them.
    """
    # Resolve defaults at call time so the function keeps working with the
    # notebook globals while the experiments can override any of them.
    if fields is None:
        fields = ["question^3", "text", "section"]
    if client is None:
        client = es_client
    if index is None:
        index = index_name

    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": list(fields),
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = client.search(index=index, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
# Smoke test: run a single question against one course and inspect the hits.
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp',
  'id': 'a482086d'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud ac

In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive (reports "already mounted" if the first cell has mounted it)
drive.mount('/content/drive')

# Read CSV file
# NOTE: `file_path` is re-bound here; in the first cell it pointed at the documents JSON.
file_path = 'drive/MyDrive/questions_data/ground-truth-data.csv'
df_ground_truth = pd.read_csv(file_path)

# Last expression of the cell: renders the DataFrame as the cell output.
df_ground_truth

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,question,course,document
0,When is the commencement date and time of the ...,data-engineering-zoomcamp,c02e79ef
1,How can I subscribe to the course's public Goo...,data-engineering-zoomcamp,c02e79ef
2,Is there a link to register for the course bef...,data-engineering-zoomcamp,c02e79ef
3,Where can I join the Telegram channel for cour...,data-engineering-zoomcamp,c02e79ef
4,Which Slack channel should I join for the cour...,data-engineering-zoomcamp,c02e79ef
...,...,...,...
60,Where can students watch the live stream of th...,data-engineering-zoomcamp,04aa4897
61,How should students submit questions during th...,data-engineering-zoomcamp,04aa4897
62,Where is the YouTube URL for the Office Hour s...,data-engineering-zoomcamp,04aa4897
63,Why is it not recommended to post questions in...,data-engineering-zoomcamp,04aa4897


In [None]:
# Convert the DataFrame to a list of dictionaries, one per row, keyed by column name.
ground_truth = df_ground_truth.to_dict(orient='records')

In [None]:
# One list of booleans per ground-truth question (filled in by the loop below).
relevance_total = []

for q in tqdm(ground_truth):  # tqdm shows a progress bar while iterating
    # ID of the document this question is expected to retrieve.
    doc_id = q['document']
    # Search with the question text, limited to the question's course.
    results = elastic_search(query=q['question'], course=q['course'])
    # True at each position where the returned document is the expected one.
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Show only the first few records instead of dumping the whole list;
# every record has the same keys, so a short preview is enough.
ground_truth[:5]

[{'question': 'When is the commencement date and time of the course?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': "How can I subscribe to the course's public Google Calendar?",
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'Is there a link to register for the course before it begins?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'Where can I join the Telegram channel for course announcements?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'Which Slack channel should I join for the course discussions?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'What should I know before starting this data engineering course?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f6520ca'},
 {'question': 'Are there any recommended skills or knowledge for this course?',
  'course': 'data-engineering-zoomcamp',
  '

In [None]:
# `relevance` still holds the value from the LAST iteration of the loop above,
# i.e. the result for the final ground-truth question.
#
# Reading the output:
# - The first search result (index 0) has the expected document ID, so it is True.
# - The other four search results (index 1-4) do not match, so they are False.
# This is how search accuracy is evaluated: we check where, if anywhere, the
# expected document appears in the ranked results. In this example it is ranked first.

relevance

[True, False, False, False, False]

In [None]:
# Search results left over from the last loop iteration; these are the
# documents that the `relevance` list shown above was computed from.
results

[{'text': 'The zoom link is only published to instructors/presenters/TAs.\nStudents participate via Youtube Live and submit questions to Slido (link would be pinned in the chat when Alexey goes Live). The video URL should be posted in the announcements channel on Telegram & Slack before it begins. Also, you will see it live on the DataTalksClub YouTube Channel.\nDon’t post your questions in chat as it would be off-screen before the instructors/moderators have a chance to answer it if the room is very active.',
  'section': 'General course-related questions',
  'question': 'Office Hours - What is the video/zoom link to the stream for the “Office Hour” or workshop sessions?',
  'course': 'data-engineering-zoomcamp',
  'id': '04aa4897'},
 {'text': 'We will probably have some calls during the Capstone period to clear some questions but it will be announced in advance if that happens.',
  'section': 'General course-related questions',
  'question': 'Besides the “Office Hour” which are the l

In [None]:
# Why does each row hold 5 booleans, e.g. [False, True, False, False, False]?
# Because the search query sets "size": 5, Elasticsearch returns (at most) the
# 5 best-ranked documents per question. Each boolean records whether the result
# at that rank is the expected document (True) or not (False), which lets us
# assess the ranking quality of the search results.
#
# Only the first 10 rows are displayed to keep the output short.

relevance_total[:10]

[[False, True, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [False, True, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, True, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, 

# Hit Rate (HR) or Recall at k:

## Measures the proportion of queries for which at least one relevant document is retrieved in the top k results.
## Formula: HR@k = (Number of queries with at least one relevant document in top k) / |Q|

In [None]:
# How to read relevance_total:
#
# - Each sublist holds the search results for one question (5 results each).
# - True means the document ID of that search result matches the expected answer.
# - False means it does not match.
#
# For example, [False, True, False, False, False] means:
# - the second search result is the correct document;
# - the other 4 results are not.
#
# This lets us judge the search system:
# - a True near the front means the search ranked the right document well;
# - a True further back, or no True at all, means poorer search effectiveness.
#
# Hand-made example; the trailing comment is each row's hit-rate contribution
# (1 if the row contains a True, otherwise 0).
example = [
    [True, False, False, False, False], # 1
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [True, False, False, False, False], # 1
    [False, False, True, False, False],  # 1
    [False, False, False, False, False], # 0
]

In [None]:
# Number of example queries (rows) -- the denominator |Q| of the hit rate.
len(example)

12

In [None]:
# Hit rate by hand: 7 of the example rows contain at least one True.
7 / len(example)

0.5833333333333334

In [None]:
def hit_rate(relevance_total):
    """Fraction of queries with at least one relevant result (HR@k).

    Args:
        relevance_total: One list of booleans per query; element i is True
            when the i-th returned document is the expected one.

    Returns:
        float: Number of lists containing a True divided by the number of
        lists, or 0.0 when `relevance_total` is empty.
    """
    if len(relevance_total) == 0:
        # No queries were evaluated: report 0.0 instead of dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)

    return hits / len(relevance_total)

In [None]:
# Should equal the hand-computed 7 / len(example) from above.
hit_rate(example)

0.5833333333333334

# Mean Reciprocal Rank (MRR):

## Evaluates the rank position of the first relevant document.
## Formula: MRR = (1 / |Q|) * Σ (1 / rank_i) for i = 1 to |Q|

In [None]:
# Same example as before; the trailing comment is now each row's reciprocal
# rank (1 / position of the first True, or 0 when the row has no True).
example = [
    [True, False, False, False, False],  # 1
    [False, False, False, False, False],  # 0
    [False, False, False, False, False],  # 0
    [False, False, False, False, False],  # 0
    [False, False, False, False, False],  # 0
    [True, False, False, False, False],  # 1
    [True, False, False, False, False],  # 1
    [True, False, False, False, False],  # 1
    [True, False, False, False, False],  # 1
    [True, False, False, False, False],  # 1
    [False, False, True, False, False],  # 1/3
    [False, False, False, False, False],  # 0
]

# MRR evaluates the rank position of the first relevant document:
#
#   rank => 1 / rank
#   none => 0
#
# (Written as comments rather than a bare string literal, so the cell does not
# echo the text as its output.)
#
# for first position we add 1:      1 => 1
# for second position we add 1 / 2: 2 => 1 / 2 = 0.5
# for third position we add 1 / 3:  3 => 1 / 3 = 0.3333
# for fourth position we add 1 / 4: 4 => 0.25
# for fifth position we add 1 / 5:  5 => 0.2

'\nrank => 1 / rank\n'

In [None]:
def mrr(relevance_total):
    """Mean Reciprocal Rank of the first relevant result per query.

    Args:
        relevance_total: One list of booleans per query; element i is True
            when the result at rank i + 1 is the expected document.

    Returns:
        float: Average over all queries of 1 / rank of the first True in the
        query's list (0 for a list without any True), or 0.0 when
        `relevance_total` is empty.
    """
    if len(relevance_total) == 0:
        # No queries were evaluated: report 0.0 instead of dividing by zero.
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score = total_score + 1 / rank
                # Only the FIRST relevant result counts towards MRR; without
                # this break a list with several Trues would be over-counted.
                break

    return total_score / len(relevance_total)

In [None]:
# Expected: six rows score 1 and one row scores 1/3, so (6 + 1/3) / 12 ~= 0.5278.
mrr(example)

0.5277777777777778

# hit-rate (recall) and Mean Reciprocal Rank (mrr)

In [None]:
# Hit rate and MRR for the Elasticsearch results collected in relevance_total.
hit_rate(relevance_total), mrr(relevance_total)

(0.6615384615384615, 0.5248717948717949)

# minsearch

In [None]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-11-14 17:10:33--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-11-14 17:10:34 (72.5 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [None]:
import minsearch

# Build a minsearch index over the same `documents` that were indexed into
# Elasticsearch above, so both engines are evaluated on identical data.
index = minsearch.Index(
    text_fields=["question", "text", "section"],  # same fields the Elasticsearch multi_match query searches
    keyword_fields=["course", "id"]  # `course` is the field minsearch_search filters on
)

# Last expression of the cell: its value is rendered as the cell output.
index.fit(documents)

<minsearch.Index at 0x7c5f54162320>

In [None]:
def minsearch_search(query, course):
    """Search the notebook-level minsearch `index` for a question.

    Args:
        query: Free-text question to search for.
        course: Course identifier passed as the `course` filter.

    Returns:
        Whatever `index.search` returns for the top 5 results, with the
        `question` field boosted by 3.0 and `section` by 0.5.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
    )

In [None]:
# Same evaluation loop as for Elasticsearch, but using minsearch_search.
# `relevance_total` is reset here, so the Elasticsearch results are overwritten.
relevance_total = []

for q in tqdm(ground_truth):
    # ID of the document this question is expected to retrieve.
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    # True at each position where the returned document is the expected one.
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Hit rate and MRR for minsearch (relevance_total now holds the minsearch results).
hit_rate(relevance_total), mrr(relevance_total)

(0.7846153846153846, 0.638974358974359)

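## Compare with ES results:

Elasticsearch (this notebook, computed above): (0.6615384615384615, 0.5248717948717949)

Note: the figures (0.7395720769397017, 0.6032418413658963) quoted here earlier do not match this notebook's Elasticsearch run; they appear to come from a different ground-truth set.

minsearch scores higher on both hit rate and MRR.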

# ground-truth-data.csv

In [None]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth questions.

    Args:
        ground_truth: Iterable of records; each must have a 'document' key
            holding the ID of the document the question should retrieve.
        search_function: Callable taking one ground-truth record and returning
            an iterable of result dicts that each have an 'id' key.

    Returns:
        dict: {'hit_rate': ..., 'mrr': ...} computed over all records.
    """
    def _relevance(record):
        # Look up the expected ID first, then run the search for this record.
        expected_id = record['document']
        return [found['id'] == expected_id for found in search_function(record)]

    per_query = [_relevance(record) for record in tqdm(ground_truth)]

    return dict(
        hit_rate=hit_rate(per_query),
        mrr=mrr(per_query),
    )

Elastic Search

In [None]:
# Each ground-truth record q is a dict; the lambda adapts it to elastic_search(query, course).
evaluate(ground_truth, lambda q: elastic_search(q['question'], q['course']))

  0%|          | 0/65 [00:00<?, ?it/s]

{'hit_rate': 0.6615384615384615, 'mrr': 0.5248717948717949}

minsearch

In [None]:
# Each ground-truth record q is a dict; the lambda adapts it to minsearch_search(query, course).
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

  0%|          | 0/65 [00:00<?, ?it/s]

{'hit_rate': 0.7846153846153846, 'mrr': 0.638974358974359}

# We can experiment with changing the "fields" parameter, e.g. to "question^5", "text" (note: the cell below uses "question^3", "text")

In [None]:
def elastic_search(query, course, fields=None, size=5, client=None, index=None):
    """Return the `_source` of the top Elasticsearch hits for a question.

    This redefinition replaces the earlier `elastic_search`; its default field
    list drops "section" and searches only "question" (boosted by 3) and "text".
    Pass `fields=` to try other combinations without redefining the function.

    NOTE(review): the heading above this cell mentions "question^5", but the
    cell's field list uses "question^3"; the default here keeps "question^3".

    Args:
        query: Free-text question to match.
        course: Course identifier; only documents whose `course` field equals
            this value are returned (exact `term` filter).
        fields: Fields handed to `multi_match`, optionally with a `^boost`
            suffix. Defaults to ["question^3", "text"].
        size: Maximum number of hits to request. Defaults to 5.
        client: Object exposing `search(index=..., body=...)`. Defaults to the
            notebook-level `es_client`.
        index: Name of the index to query. Defaults to the notebook-level
            `index_name`.

    Returns:
        list: The `_source` dict of every hit, in the order the client
        returned them.
    """
    # Resolve defaults at call time so the function keeps working with the
    # notebook globals while the experiments can override any of them.
    if fields is None:
        fields = ["question^3", "text"]
    if client is None:
        client = es_client
    if index is None:
        index = index_name

    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": list(fields),
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = client.search(index=index, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
# Re-evaluate with the redefined elastic_search ("section" dropped from the searched fields).
evaluate(ground_truth, lambda q: elastic_search(q['question'], q['course']))

  0%|          | 0/65 [00:00<?, ?it/s]

{'hit_rate': 0.6615384615384615, 'mrr': 0.5248717948717949}