In [1]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch(
    "http://localhost:9200",
    basic_auth=("elastic", "A9C4z4Pb1jo7FaM*W7uu") #docker password for elastic user
)

try:
    info = es_client.info()
    print("Connected to Elasticsearch!")
    print(info)
except Exception as e:
    print(f"Error connecting: {e}")

Connected to Elasticsearch!
{'name': 'f9a115421d59', 'cluster_name': 'docker-cluster', 'cluster_uuid': '82IQaASQR2mXKzSC26s4cA', 'version': {'number': '9.2.4', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'dfc5c38614c29a598e132c035b66160d3d350894', 'build_date': '2026-01-08T22:07:25.170027027Z', 'build_snapshot': False, 'lucene_version': '10.3.2', 'minimum_wire_compatibility_version': '8.19.0', 'minimum_index_compatibility_version': '8.0.0'}, 'tagline': 'You Know, for Search'}


In [2]:
es_client.info()

ObjectApiResponse({'name': 'f9a115421d59', 'cluster_name': 'docker-cluster', 'cluster_uuid': '82IQaASQR2mXKzSC26s4cA', 'version': {'number': '9.2.4', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'dfc5c38614c29a598e132c035b66160d3d350894', 'build_date': '2026-01-08T22:07:25.170027027Z', 'build_snapshot': False, 'lucene_version': '10.3.2', 'minimum_wire_compatibility_version': '8.19.0', 'minimum_index_compatibility_version': '8.0.0'}, 'tagline': 'You Know, for Search'})

In [3]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    }
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [4]:
import json 
from tqdm import tqdm 

with open('/home/gwm-279/Documents/DTC_AI/artifacts/documents_with_ids.json','r') as f:
          docs = json.load(f)

for doc in tqdm(docs):
    es_client.index(index=index_name,document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

100%|██████████| 948/948 [00:04<00:00, 225.91it/s]


In [5]:
query = 'When is the deadline of course?' 
search_query =  {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}
search_response = es_client.search(index=index_name,body=search_query)

search_response['hits']['hits'][0].get('_source')

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [6]:
import pandas 

gt_questions = pandas.read_csv('/home/gwm-279/Documents/DTC_AI/artifacts/ground-truth-questions.csv')

for index, row in gt_questions[:2].iterrows():
    query = row['question']
    course = row['course']
    document_id = row['document']
    search_query =  {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    search_response = es_client.search(index=index_name,body=search_query)
    print(f"Query: {query} -> id: {document_id}")
    for hit in search_response['hits']['hits']:
        print(f"Retrieved Ids: {hit['_source']['id']}")
        print(f"Score: {hit['_score']}, Question: {hit['_source']['question']}")
    print("-----")


Query: When does the course begin? -> id: c02e79ef
Retrieved Ids: c02e79ef
Score: 30.217829, Question: Course - When will the course start?
Retrieved Ids: 1f6520ca
Score: 19.322718, Question: Course - What are the prerequisites for this course?
Retrieved Ids: 7842b56a
Score: 19.187729, Question: Course - Can I still join the course after the start date?
Retrieved Ids: 63394d91
Score: 18.713572, Question: Course - What can I do before the course starts?
Retrieved Ids: a482086d
Score: 18.713572, Question: Course - Can I follow the course after it finishes?
-----
Query: How can I get the course schedule? -> id: c02e79ef
Retrieved Ids: eb56ae98
Score: 42.350315, Question: Course - Can I get support if I take the course in the self-paced mode?
Retrieved Ids: 63394d91
Score: 36.130344, Question: Course - What can I do before the course starts?
Retrieved Ids: a482086d
Score: 36.130344, Question: Course - Can I follow the course after it finishes?
Retrieved Ids: 7842b56a
Score: 35.25964, Quest

In [7]:
import pandas 
from collections import defaultdict
elasticsearch_results = []

gt_questions = pandas.read_csv('/home/gwm-279/Documents/DTC_AI/artifacts/ground-truth-questions.csv')

for index, row in tqdm(gt_questions.iterrows(), total=len(gt_questions)):
    query = row['question']
    course = row['course']
    document_id = row['document']
    search_query =  {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    search_response = es_client.search(index=index_name,body=search_query)
    elasticsearch_results.append({
        'true_id': document_id,
        'retrieved_ids': [hit['_source']['id'] for hit in search_response['hits']['hits']]
    })
print('all done')
     


100%|██████████| 4627/4627 [00:16<00:00, 286.63it/s]

all done





In [12]:
for res in elasticsearch_results[:5]:
    print(f"Id: {res['true_id']} -> Retrieved Ids: {res['retrieved_ids']}")

Id: c02e79ef -> Retrieved Ids: ['c02e79ef', '1f6520ca', '7842b56a', '63394d91', 'a482086d']
Id: c02e79ef -> Retrieved Ids: ['eb56ae98', '63394d91', 'a482086d', '7842b56a', '2f19301f']
Id: c02e79ef -> Retrieved Ids: ['0e424a44', '1f6520ca', '04aa4897', '29865466', '63394d91']
Id: c02e79ef -> Retrieved Ids: ['0bbf41ec', '63394d91', 'a482086d', '2f19301f', '7842b56a']
Id: c02e79ef -> Retrieved Ids: ['7842b56a', '4eefdd01', '8dc77677', 'a1603359', '63394d91']


In [13]:
def hit_rate(results):
    total = 0
    for res in results:
        if res['true_id'] in res['retrieved_ids']:
            total+= 1 
    return total / len(results)

def mmr(results):
    total = 0
    for res in results:
        if res['true_id'] in res['retrieved_ids']:
            rank = res['retrieved_ids'].index(res['true_id']) + 1
            total += 1 / rank
    return total / len(results)

def recall_at_k(results, k):
    total = 0
    for res in results:
        if res['true_id'] in res['retrieved_ids'][:k]:
            total += 1
    return total / len(results)

In [14]:
print(f"Hit Rate: {hit_rate(elasticsearch_results)}")
print(f"MMR: {mmr(elasticsearch_results)}")
print(f"Recall@3: {recall_at_k(elasticsearch_results, 3)}")

Hit Rate: 0.7393559541819754
MMR: 0.6023233196455591
Recall@3: 0.6699805489518046


In [16]:
import minsearch
Index = minsearch.Index(
    text_fields = ['text','section','question'],
    keyword_fields = ['course']) 

Index.fit(docs)




<minsearch.Index at 0x7eb8271e4490>

In [17]:
minsearch_results = [] 
def search_with_minsearch(gt_questions):

    for index, row in tqdm(gt_questions.iterrows(), total=len(gt_questions)):
        query = row['question']
        course = row['course']
        document_id = row['document']
        num_results = 5
        results = Index.search(query, filter_dict={'course': course}, num_results=num_results)

        minsearch_results.append({
            'true_id': document_id,
            'retrieved_ids': [res['id'] for res in results]
        })
    return minsearch_results

search_with_minsearch(gt_questions)

100%|██████████| 4627/4627 [00:10<00:00, 424.95it/s]


[{'true_id': 'c02e79ef',
  'retrieved_ids': ['c02e79ef',
   'a482086d',
   '7842b56a',
   '63394d91',
   '1f6520ca']},
 {'true_id': 'c02e79ef',
  'retrieved_ids': ['a482086d',
   'c02e79ef',
   '2f19301f',
   'eb56ae98',
   '63394d91']},
 {'true_id': 'c02e79ef',
  'retrieved_ids': ['c02e79ef',
   '1f6520ca',
   '04aa4897',
   '0e424a44',
   '29865466']},
 {'true_id': 'c02e79ef',
  'retrieved_ids': ['c02e79ef',
   'a482086d',
   '2f19301f',
   '63394d91',
   '0bbf41ec']},
 {'true_id': 'c02e79ef',
  'retrieved_ids': ['c02e79ef',
   'bba0da04',
   '7842b56a',
   'a2120335',
   '009ac612']},
 {'true_id': '1f6520ca',
  'retrieved_ids': ['1f6520ca',
   '63394d91',
   'c02e79ef',
   'a482086d',
   'f2945cd2']},
 {'true_id': '1f6520ca',
  'retrieved_ids': ['1f6520ca',
   'f2945cd2',
   '63394d91',
   'c02e79ef',
   'a482086d']},
 {'true_id': '1f6520ca',
  'retrieved_ids': ['1f6520ca',
   'c02e79ef',
   '63394d91',
   'a482086d',
   '7842b56a']},
 {'true_id': '1f6520ca',
  'retrieved_ids': ['1f

In [18]:

for res in minsearch_results[:5]:
    print(f"Id: {res['true_id']} -> Retrieved Ids: {res['retrieved_ids']}")

Id: c02e79ef -> Retrieved Ids: ['c02e79ef', 'a482086d', '7842b56a', '63394d91', '1f6520ca']
Id: c02e79ef -> Retrieved Ids: ['a482086d', 'c02e79ef', '2f19301f', 'eb56ae98', '63394d91']
Id: c02e79ef -> Retrieved Ids: ['c02e79ef', '1f6520ca', '04aa4897', '0e424a44', '29865466']
Id: c02e79ef -> Retrieved Ids: ['c02e79ef', 'a482086d', '2f19301f', '63394d91', '0bbf41ec']
Id: c02e79ef -> Retrieved Ids: ['c02e79ef', 'bba0da04', '7842b56a', 'a2120335', '009ac612']


In [19]:
print(f"Hit Rate: {hit_rate(minsearch_results)}")
print(f"MMR: {mmr(minsearch_results)}")
print(f"Recall@3: {recall_at_k(minsearch_results, 3)}")

Hit Rate: 0.8178085152366544
MMR: 0.6959801167062899
Recall@3: 0.7661551761400476
