### Semantic Search using Elastic Search

![Alt Text](../images/semantic-search-architecture.png)

In [2]:
import json

#### Step 1: Prepare documents

In [3]:
with open('../01-intro/documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [4]:
documents = []
for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [5]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

#### Step 2: Create Embeddings using Pretrained Models
Sentence Transformers documentation here: https://www.sbert.net/docs/sentence_transformer/pretrained_models.html

In [6]:
# Please perform "pip install sentence_transformers==2.7.0"
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-mpnet-base-v2")

2025-06-15 20:54:28.239738: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750017268.673433   16110 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750017268.827795   16110 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750017269.275067   16110 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750017269.275107   16110 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750017269.275113   16110 computation_placer.cc:177] computation placer alr

In [7]:
len(model.encode("This is a simple sentence"))

768

In [8]:
from tqdm import tqdm

In [9]:
#created the dense vector using the pre-trained model
operations = []
for doc in tqdm(documents):
    # Transforming the text into an embedding using the model
    doc["text_vector"] = model.encode(doc["text"]).tolist()
    operations.append(doc)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [03:11<00:00,  4.95it/s]


#### Step 3: Setup ElasticSearch connection

In [12]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

es_client.info()

ObjectApiResponse({'name': 'f5deefe69e86', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'G8E3CnUmTqOWquZBsIQddA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

#### Step 4: Create Mappings and Index

In [10]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} ,
            "text_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
        }
    }
}

In [13]:
index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

  es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

#### Step 5: Add documents into index

In [15]:
for doc in tqdm(operations):
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:17<00:00, 52.87it/s]


#### Step 6: Create end user query

In [24]:
#  Creating embedings for user query
search_term = "Can i still join while the program has started"
vector_search_term = model.encode(search_term)

In [25]:
query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000, 
}

In [26]:
res = es_client.search(index=index_name, knn=query, source=["text", "section", "question", "course"])
res["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'ok0udZcBQWow8RQsFpCx',
  '_score': 0.7349285,
  '_source': {'question': 'Course - Can I still join the course after the start date?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."}},
 {'_index': 'course-questions',
  '_id': 'Vk0udZcBQWow8RQssJIF',
  '_score': 0.7349285,
  '_source': {'question': 'Course - Can I still join the course after the start date?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."}},
 {'_index'

#### Step 7: Perform Keyword search with Semantic Search (Hybrid/Advanced Search)

In [27]:
knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}

In [34]:
# Perform a semantic search along with a filter

response = es_client.search(
    index=index_name,
    query={
        "match": {"course": "mlops-zoomcamp"},
    },
    knn=knn_query,
    size=5,
    source=["text", "section", "question", "course"]
)

In [35]:
response["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'fk0udZcBQWow8RQs7ZV9',
  '_score': 2.302585,
  '_source': {'question': 'Format for questions: [Problem title]',
   'course': 'mlops-zoomcamp',
   'section': '+-General course questions',
   'text': 'MLOps Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course, and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\n[Problem description]\n[Solution description]\n(optional) Added by Name'}},
 {'_index': 'course-questions',
  '_id': 'f00udZcBQWow8RQs7ZWG',
  '_score': 2.302585,
  '_source': {'question': 'What is the expected duration of this course or that for each module?',
   'course': 'mlops-zoomcamp',
   'section': '+-General course questions',
   'text': 'Approximately 3 months. For each module, about 1 week with possible deadline extensions (in total 6~9 weeks), 2 weeks for