### 3.1 Running Elastic search in docker

$ docker run -d --name elasticsearch -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" -e "xpack.security.enabled=false" elasticsearch:8.18.0


# Preparing our documents

In [1]:
import json

with open('documents.json', 'rt') as doc_file:
    docs_raw = json.loads(doc_file.read())
docs_raw

documents = []
for doc in docs_raw:
    for inner_doc in doc['documents']:
        inner_doc['course'] = doc['course']
        documents.append(inner_doc)

        
documents[0]


{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

### 3.2 Creating Embedding using pre-trained models

In [2]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
dim = len(model.encode('simple text'))
dim

384

In [14]:
from tqdm import tqdm
vector_docs = []
for doc in tqdm(documents, total=len(documents)):
    doc['qtn_vector'] = model.encode(doc['question'])
    vector_docs.append(doc)
vector_docs[0]

100%|█████████████████████████████████████████████████████████████████████████████████| 948/948 [00:13<00:00, 71.41it/s]


{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'vector_qtn': [-0.00035094335908070207,
  -0.06201427802443504,
  -0.03799988329410553,
  0.024202896282076836,
  -0.011954830028116703,
  -0.017121000215411186,
  -0.12016370892524719,
  -0.09692652523517609,
  -0.0548442117869854,
  0.03564402088522911,
  -0.02570364810526371,
  0.030503077432513237,
  -0.031116845086216927,
  0.060350146144628525,
  -0.011938951909542084,
  -

### 3.3 Connecting to elasticsearch

In [15]:
from elasticsearch import Elasticsearch
es_client =es_client = Elasticsearch(
    "http://localhost:9200"
)
es_client.info()

ObjectApiResponse({'name': '9be7469cf655', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'eKLD7ikwRq2qebJ48Xy7pg', 'version': {'number': '8.18.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '04e979aa50b657bebd4a0937389308de82c2bdad', 'build_date': '2025-04-10T10:09:16.444104780Z', 'build_snapshot': False, 'lucene_version': '9.12.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

### 3.4 Creating indexes

In [16]:
index_name = 'searchindex'
index_body = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    }, 
    "mappings":{
        "dynamic": False,
        "properties":{
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "qtn_vector": {"type": "dense_vector", "dims": dim, "index": True, "similarity": "cosine"}
        }
    }
}

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_body)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'searchindex'})

### 3.5 Add documents to the index

In [17]:
for doc in tqdm(vector_docs):
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)
    

100%|█████████████████████████████████████████████████████████████████████████████████| 948/948 [00:16<00:00, 57.19it/s]


### 3.6 Create and use query

In [23]:
search_term = "windows or mac"
vector_search_term = model.encode(search_term)
query = {
    "field": "qtn_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 1000
}

res = es_client.search(index=index_name, knn=query, source=['section','course','question','text'])
res['hits']['hits']

[{'_index': 'searchindex',
  '_id': 'kwS5T5YB5jBEMC7UDynN',
  '_score': 0.730978,
  '_source': {'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully',
   'section': 'General course-related questions',
   'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp'}},
 {'_index': 'searchindex',
  '_id': 'PQS5T5YB5jBEMC7ULivq',
  '_score': 0.65726256,
  '_source': {'text': '(Tyler Simpson)',
   'section': '1. Introduction to Machine Learning',
   'question': 'Windows WSL and VS Code\nIf you have a Windows 11 device and would like to use the built in WSL to access linux you can use the Microsoft Learn link Set up a WSL development environment | Microsoft Learn. To connect this to VS Code download the Microsoft verified VS Code extension ‘WSL’ this will allow you to remotely connect to your WSL Ubuntu instance as if it was a virtual machine.',
   'course': 'machine-learnin

### 3.7 Advanced Search Query

In [25]:
knn_query = {
    "field": "qtn_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 1000
}

query = {
    "match": {
        "course": "data-engineering-zoomcamp"
    }
}

res = es_client.search(index=index_name, knn = knn_query, query = query, size = 5, source=['section','course','question','text'])
res['hits']['hits']

[{'_index': 'searchindex',
  '_id': 'kwS5T5YB5jBEMC7UDynN',
  '_score': 1.509892,
  '_source': {'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully',
   'section': 'General course-related questions',
   'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp'}},
 {'_index': 'searchindex',
  '_id': 'hAS5T5YB5jBEMC7UDinT',
  '_score': 1.4291756,
  '_source': {'text': 'You can set it up on your laptop or PC if you prefer to work locally from your laptop or PC.\nYou might face some challenges, especially for Windows users. If you face cnd2\nIf you prefer to work on the local machine, you may start with the week 1 Introduction to Docker and follow through.\nHowever, if you prefer to set up a virtual machine, you may start with these first:\nUsing GitHub Codespaces\nSetting up the environment on a cloudV Mcodespace\nI decided to work on a virtual machine because I have 