* Reference: https://www.youtube.com/watch?v=a4HBKEda_F8

In [None]:
!uv pip install elasticsearch
!uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

In [None]:
# Connect to the local Elasticsearch node and show its cluster info

from pprint import pprint
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
client_info = es.info()  # round-trip to the server, so a dead node is noticed here
# Plain print() for the message: pprint() on a str would show its quoted repr.
print("Connected to Elasticsearch!")
pprint(client_info.body)

In [None]:
# Recreate "test_index" from scratch with default settings
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts="http://localhost:9200")

# ignore_unavailable=True lets the delete pass when the index does not exist yet
es.indices.delete(ignore_unavailable=True, index="test_index")
es.indices.create(index="test_index")

In [None]:
# Create the index with explicit shard and replica counts
# * shards: the number of slices the index's documents are split into
# * replicas: the number of duplicate copies kept of the data

index_settings = {
    "number_of_shards": 3,  # how many pieces the data is split into
    "number_of_replicas": 2,  # how many copies of the data are created
}

es.indices.delete(index="test_index", ignore_unavailable=True)
es.indices.create(index="test_index", settings=index_settings)

In [None]:
# Index a single document
# * Tabular: all documents must have the same fields
# * Mapping: Elasticsearch tries to figure out the data type of each field
#   (it can also be set manually)

es.indices.delete(ignore_unavailable=True, index="test_index")
es.indices.create(index="test_index")

document = dict(
    title="some title",
    text="some text",
    creation_date="2025-05-01",
)
response = es.index(body=document, index="test_index")
response

In [None]:
# insert multiple documents (use for loop)
def fucking_insert_multiple(es, index_name, document):
    """Index one document into ``index_name`` and return the client's response.

    Despite the name, each call issues a single ``es.index`` request; the
    caller loops over its documents to insert several.
    """
    return es.index(body=document, index=index_name)

# Index the same document twice, printing each response.
# NOTE(review): this writes to "my_index" while the neighbouring cells work on
# "test_index" — confirm that is intended.
for doc in (document, document):
    response = fucking_insert_multiple(es, "my_index", doc)
    print(response)

In [None]:
# Show the field mapping currently held by "test_index"
from pprint import pprint

index_mapping = es.indices.get_mapping(index="test_index")
test_index_mappings = index_mapping["test_index"]["mappings"]
pprint(test_index_mappings["properties"])

In [None]:
# Manual mapping: declare every field's type when the index is created
from pprint import pprint

mapping = {
    'properties': {
        'created_on': {'type': 'date'},
        'text': {
            'type': 'text',
            'fields': {
                # keyword sub-field next to the analysed text
                'keyword': {
                    'type': 'keyword',
                    'ignore_above': 256
                }
            }
        },
        'title': {
            'type': 'text',
            'fields': {
                'keyword': {
                    'type': 'keyword',
                    'ignore_above': 256
                }
            }
        }
    }
}

# Recreate the index once, directly with the explicit mapping (no throwaway
# create-then-delete round trip beforehand).
es.indices.delete(index='my_index', ignore_unavailable=True)
es.indices.create(index='my_index', mappings=mapping)

# Read the mapping back and display it to confirm it was applied
index_mapping = es.indices.get_mapping(index='my_index')
pprint(index_mapping["my_index"]["mappings"]["properties"])

# Alternative: create the index without a mapping, then attach it afterwards:
#   es.indices.create(index='my_index')
#   es.indices.put_mapping(index='my_index', body=mapping)

## Data types

In [None]:
# Field data types
# * binary: not searchable and not stored by default
# * boolean: true / false
# * numbers: long, integer, byte, short, ...
# * dates
# * keywords: used to filter or sort (ids, emails, zip codes, ...)

from pprint import pprint
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts='http://localhost:9200')
client_info = es.info()
print('Connected to Elasticsearch!')
pprint(client_info.body)

In [None]:
# Binary field: holds a Base64-encoded value (here, an image file)
binary_mappings = {"properties": {"image_data": {"type": "binary"}}}
es.indices.delete(index='binary_index', ignore_unavailable=True)
es.indices.create(index='binary_index', mappings=binary_mappings)

# Read the image from disk and Base64-encode it
import base64

image_path = "./the-imgae.png"
with open(image_path, "rb") as image_file:
    image_bytes = image_file.read()
image_base64 = base64.b64encode(image_bytes).decode("utf-8")

# Preview of the encoded payload (printed before the document is actually indexed)
print(f"fucking inserted: {image_base64[:100]}...")

# Index the encoded image and print the indexing response
document = {"image_data": image_base64}
response = es.index(index='binary_index', body=document)
print(f"fucking got: {response}")

In [None]:
# Object field: key/value pairs (like JSON data) with individually mapped sub-fields
object_mappings = {
    "properties": {
        "author": {
            "properties": {
                "first_name": {"type": "text"},
                "last_name": {"type": "text"},
            }
        }
    }
}
es.indices.delete(index='object_index', ignore_unavailable=True)
response = es.indices.create(index='object_index', mappings=object_mappings)
print(f"fucking mapping response: {response}")

document = {"author": {"first_name": "Imad", "last_name": "Saddik"}}
response = es.index(index='object_index', body=document)
print(f"fucking document: {response}")

In [None]:
# Flattened field: the whole "author" object is declared as one field of type
# "flattened" instead of mapping each sub-field.
# * Does not preserve the relationship between fields,
#   e.g. for an author with first and last names, the names get mixed up.

flattened_mappings = {"properties": {"author": {"type": "flattened"}}}
es.indices.delete(index='flattened_object_index', ignore_unavailable=True)
response = es.indices.create(
    index='flattened_object_index',
    mappings=flattened_mappings,
)
print(f"fucking mapping response: {response}")

document = {"author": {"first_name": "Imad", "last_name": "Saddik"}}
response = es.index(index='flattened_object_index', body=document)
print(f"fucking document response: {response}")

In [None]:
# Nested field (like nested JSON): preserves the relation between the fields
# of each object in the array
nested_mappings = {"properties": {"user": {"type": "nested"}}}
es.indices.delete(index='nested_object_index', ignore_unavailable=True)
response = es.indices.create(index='nested_object_index', mappings=nested_mappings)
print(f"fucking mapping response: {response}")

# Two user objects stored under the single "user" field
documents = [
    {"first": "John", "last": "Smith"},
    {"first": "Imad", "last": "Saddik"},
]
response = es.index(index='nested_object_index', body={"user": documents})
response


### Text search types
 - text is optimized for search.

In [None]:
# Text field: map "email_body" as full-text and index one sample email
text_mappings = {"properties": {"email_body": {"type": "text"}}}
es.indices.delete(index='text_index', ignore_unavailable=True)
response = es.indices.create(index='text_index', mappings=text_mappings)
print(f"fucking mapping response: {response}")

document = {"email_body": "Hello, this is a test email."}
response = es.index(index='text_index', body=document)
response

In [None]:
# Completion field: enables fast autocomplete suggestions by pre-indexing terms
# in a special way (like a search engine completing the query as you type)

completion_mappings = {"properties": {"suggest": {"type": "completion"}}}
es.indices.delete(index='text_completion_index', ignore_unavailable=True)
response = es.indices.create(
    index='text_completion_index',
    mappings=completion_mappings,
)

print("fucking mapping response: ", es.indices.get_mapping(index='text_completion_index'))

# Each document lists the inputs that should trigger its suggestion
document_1 = {"suggest": {"input": ["Mars", "Planet"]}}
document_2 = {"suggest": {"input": ["Andromeda", "Galaxy"]}}

es.index(index='text_completion_index', body=document_1)
es.index(index='text_completion_index', body=document_2)

In [None]:
# Spatial data types

# 1. Geo point: a single coordinate pair
geo_point_mappings = {"properties": {"location": {"type": "geo_point"}}}
response = es.indices.delete(index='geo_point_index', ignore_unavailable=True)
es.indices.create(index='geo_point_index', mappings=geo_point_mappings)
print(f"fucking mapping response: {es.indices.get_mapping(index='geo_point_index')}")

# The location is given as a GeoJSON-style Point object
point_location = {"type": "Point", "coordinates": [-71.34, 41.12]}
document = {
    "text": "Geopoint as an object using GeoJSON format",
    "location": point_location,
}
response = es.index(index='geo_point_index', body=document)
response

In [None]:
# Geo shape: arbitrary geographic shapes made of multiple coordinates
geo_shape_mappings = {"properties": {"location": {"type": "geo_shape"}}}
es.indices.delete(index='geo_shape_index', ignore_unavailable=True)
response = es.indices.create(index='geo_shape_index', mappings=geo_shape_mappings)
print(f"fucking mapping response: {es.indices.get_mapping(index='geo_shape_index')}")

# A line made of two coordinate pairs
document_1 = {
    "location": {
        "type": "LineString",
        "coordinates": [
            [-77.03653, 38.897676],
            [-77.009051, 38.889939],
        ],
    }
}

# A polygon given as two closed rings (each ring ends on its first point)
document_2 = {
    "location": {
        "type": "Polygon",
        "coordinates": [
            [[100, 0], [101, 0], [101, 1], [100, 1], [100, 0]],
            [[100.2, 0.2], [100.8, 0.2], [100.8, 0.8], [100.2, 0.8], [100.2, 0.2]],
        ],
    }
}

response_1 = es.index(index='geo_shape_index', body=document_1)
print(response_1)
es.index(index='geo_shape_index', body=document_2)

In [None]:
# Point: same document shape as the geo_point example, mapped with type "point"
point_mappings = {"properties": {"location": {"type": "point"}}}
es.indices.delete(index='point_index', ignore_unavailable=True)
es.indices.create(index='point_index', mappings=point_mappings)
print(f"fucking mapping response: {es.indices.get_mapping(index='point_index')}")


document = {
    "location": {
        "type": "Point",
        "coordinates": [-71.34, 41.12],
    }
}

response = es.index(index='point_index', body=document)
response

## Delete Documents

In [None]:
# Start the delete examples from an empty "my_index"
es.indices.delete(ignore_unavailable=True, index='my_index')
es.indices.create(index='my_index')

In [None]:
# Index every dummy document and remember the ids returned for them
import json
from tqdm import tqdm


# Context manager so the data file is closed as soon as it has been parsed
with open("ElasticSearch_Python_Course/data/dummy_data.json") as data_file:
    dummy_data = json.load(data_file)

document_ids = []
for document in tqdm(dummy_data, total=len(dummy_data)):
    response = es.index(index='my_index', body=document)
    document_ids.append(response['_id'])  # id reported in the indexing response
document_ids

In [None]:
# Delete the third indexed document; print any failure instead of stopping the notebook
try:
    response = es.delete(id=document_ids[2], index='my_index')
    print(response)
except Exception as error:
    print(error)

## Get documents

In [None]:
# Connect to the local Elasticsearch node
from pprint import pprint
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts='http://localhost:9200')
client_info = es.info()
print('Connected to Elasticsearch!')
pprint(client_info.body)

In [None]:
# Recreate "my_index" so the get examples start from an empty index
es.indices.delete(ignore_unavailable=True, index='my_index')
es.indices.create(index='my_index')

In [None]:
# Index every dummy document and remember the ids returned for them
import json
from tqdm import tqdm


# Context manager so the data file is closed as soon as it has been parsed
with open("ElasticSearch_Python_Course/data/dummy_data.json") as data_file:
    dummy_data = json.load(data_file)

document_ids = []
for document in tqdm(dummy_data, total=len(dummy_data)):
    response = es.index(index='my_index', body=document)
    document_ids.append(response['_id'])  # id reported in the indexing response
document_ids

In [None]:
# Get operation: fetch the first indexed document by its id
first_document_id = document_ids[0]
response = es.get(index='my_index', id=first_document_id)
response.body

In [None]:
# Try getting a non-existent document: the literal id "id" was never indexed,
# so the lookup is expected to fail and the error is printed.
try:
    response = es.get(id="id", index='my_index')
except Exception as error:
    print(error)

## Count documents

In [None]:
# Count operation: number of documents in the index
response = es.count(index='my_index')
count = response["count"]

print("The number of documents in the index is {}".format(count))

In [None]:
# Count operation with a filter: only documents whose created_on is 2024-09-24
# (gte and lte are the same day, so the range is that single date)
created_on_range = {
    "gte": "2024-09-24",
    "lte": "2024-09-24",
    "format": "yyyy-MM-dd",
}
query = {"range": {"created_on": created_on_range}}

response = es.count(index='my_index', query=query)
count = response["count"]

print("The number of documents in the index is {}".format(count))

## Check exists

In [None]:
# Check whether the index exists; the answer is in the response body
index_to_check = 'my_index'
response = es.indices.exists(index=index_to_check)
response.body

In [None]:
# Check whether the first indexed document exists; the answer is in the response body
response = es.exists(id=document_ids[0], index='my_index')
response.body

## Update document
* retrieves the document
* performs the update
* re-indexes the result under the same id (as a new version of the document)

In [None]:
# Update an existing field with a parameterised script
from pprint import pprint

title_script = {
    "source": "ctx._source.title = params.title",
    "params": {"title": "New Title"},  # value is passed in instead of being inlined
}
response = es.update(index="my_index", id=document_ids[0], script=title_script)
pprint(response.body)

In [None]:
# Fetch the document again to see the updated title
response = es.get(id=document_ids[0], index='my_index')
pprint(response.body)

In [None]:
# Add a new field with a partial-document update
response = es.update(
    index="my_index",
    id=document_ids[0],
    doc={
        "new_value_2": "dummy_value_2",
    },
)
pprint(response.body)

response = es.get(index='my_index', id=document_ids[0])
# print() the heading so the blank lines are real newlines, then pprint() the
# dict itself; pprint() on an f-string would only show the string's repr.
print("\n\ndata with newly added field:")
pprint(response.body)


In [None]:
# Alternative: add the new field with a script instead of a partial document
add_field_script = {
    "source": "ctx._source.new_field = 'dummy_value'",  # assigns the new field
}
response = es.update(index="my_index", id=document_ids[0], script=add_field_script)
pprint(response.body)

In [None]:
# Fetch the document again to see the newly added field
response = es.get(id=document_ids[0], index='my_index')
pprint(response.body)

In [None]:
# Remove a field with a script, then fetch the document to check the result
remove_field_script = {"source": "ctx._source.remove('new_field')"}
response = es.update(index="my_index", id=document_ids[0], script=remove_field_script)
pprint(response.body)

response = es.get(id=document_ids[0], index='my_index')
pprint(response.body)

In [None]:
# Upsert: update the document with id "1", or insert it if it does not exist
response = es.update(
    index="my_index",
    id="1",
    doc={
        "book_id": 1234,
        "book_name": "A book",
    },
    doc_as_upsert=True,  # use `doc` as the new document when the id is missing
)
# print() the label and pprint() the dict; pprint() on an f-string would only
# show the string's quoted repr.
print("fucking saved:")
pprint(response.body)

response = es.count(index='my_index')
print(f"count: {response['count']}")

## bulk operations

In [None]:
# Index every dummy document and remember the ids returned for them
import json
from tqdm import tqdm


# Context manager so the data file is closed as soon as it has been parsed
with open("ElasticSearch_Python_Course/data/dummy_data.json") as data_file:
    dummy_data = json.load(data_file)

document_ids = []
for document in tqdm(dummy_data, total=len(dummy_data)):
    response = es.index(index='my_index', body=document)
    document_ids.append(response['_id'])  # id reported in the indexing response
document_ids

In [None]:
# Update the first and the second document with two separate requests
from pprint import pprint

# First document: overwrite the title through a parameterised script
response = es.update(
    index="my_index",
    id=document_ids[0],
    script={
        "source": "ctx._source.title = params.title",
        "params": {
            "title": "New Title"
        }
    },
)
# print() the label and pprint() the dict; pprint() on an f-string would only
# show the string's quoted repr.
print("fucking updated:")
pprint(response.body)

# Second document: add a new field
response = es.update(
    index="my_index",
    id=document_ids[1],
    script={
        "source": "ctx._source.new_field = 'dummy_value'",
    },
)
print("fucking updated:")
pprint(response.body)

In [None]:
# Delete the third document
response = es.delete(index="my_index", id=document_ids[2])
# print() the label and pprint() the dict; pprint() on an f-string would only
# show the string's quoted repr.
print("fucking deleted:")
pprint(response.body)

In [None]:
# Bulk operations: the index, update and delete steps above in a single request
es.indices.delete(index='my_index', ignore_unavailable=True)
es.indices.create(index='my_index')

# The list alternates an action entry with its source entry; the final delete
# action is listed without a source entry.
bulk_operations = [
    # Action 1 + source 1: index document "1"
    {"index": {"_index": "my_index", "_id": "1"}},
    {
        "title": "Sample Title 1",
        "text": "This is the first sample document text.",
        "created_on": "2024-09-22",
    },
    # Action 2 + source 2: index document "2"
    {"index": {"_index": "my_index", "_id": "2"}},
    {
        "title": "Sample Title 2",
        "text": "Here is another example of a document.",
        "created_on": "2024-09-24",
    },
    # Action 3 + source 3: index document "3"
    {"index": {"_index": "my_index", "_id": "3"}},
    {
        "title": "Sample Title 3",
        "text": "The content of the third document goes here.",
        "created_on": "2024-09-24",
    },
    # Action 4 + source 4: update the title of document "1"
    {"update": {"_id": "1", "_index": "my_index"}},
    {"doc": {"title": "New Title"}},
    # Action 5 + source 5: add a new field to document "2"
    {"update": {"_id": "2", "_index": "my_index"}},
    {"doc": {"new_field": "dummy_value"}},
    # Action 6: delete document "3"
    {"delete": {"_index": "my_index", "_id": "3"}},
]

response = es.bulk(operations=bulk_operations)

pprint(response.body)

In [None]:
# "errors" entry of the bulk response
response["errors"]

## search api (part1)

- index: name of the index (or indexes) to search
- q: simple search query (Lucene-like syntax)
- query: more structured query
- timeout: maximum time to wait for the search result
- size: number of results to return
- from: starting index (offset of the first result)
- sort: how to sort the results
  - _score: sort by relevance
  - _doc: sort by document order


In [None]:
# Recreate the two indexes used by the multi-index search examples:
# drop both first, then create both
es.indices.delete(index='index_1', ignore_unavailable=True)
es.indices.delete(index='index_2', ignore_unavailable=True)

es.indices.create(index='index_1')
es.indices.create(index='index_2')

In [None]:
# Index the same dummy documents sequentially into both indexes
import json
from tqdm import tqdm


# Context manager so the data file is closed as soon as it has been parsed
with open("ElasticSearch_Python_Course/data/dummy_data.json") as data_file:
    dummy_data = json.load(data_file)

for document in tqdm(dummy_data, total=len(dummy_data)):
    response1 = es.index(index='index_1', body=document)
# Only the response of the last indexed document is printed
print(f"fucking response1 :{response1}")

for document in tqdm(dummy_data, total=len(dummy_data)):
    response2 = es.index(index='index_2', body=document)
print(f"fucking response2 :{response2}")

In [None]:
# Search each index on its own, one at a time.
# The second search used to repeat index_1; it now queries index_2.
for index_name in ('index_1', 'index_2'):
    response = es.search(
        index=index_name,
        body={
            "query": {"match_all": {}}
        }
    )

    n_hits = response['hits']['total']['value']
    print(f"Found {n_hits} documents in {index_name}")

In [None]:
# Search several indexes at once by passing a comma-separated list of names
match_all_body = {"query": {"match_all": {}}}
response = es.search(index='index_1,index_2', body=match_all_body)

total_hits = response['hits']['total']
n_hits = total_hits['value']
print(f"Found {n_hits} documents in index_1 and index_2")

In [None]:
# Search every index whose name starts with 'index' by using a wildcard

match_all_body = {"query": {"match_all": {}}}
response = es.search(index='index*', body=match_all_body)

total_hits = response['hits']['total']
n_hits = total_hits['value']
print(f"Found {n_hits} documents in all indexes with name starting with 'index'")

In [None]:
# Search all indexes through the special '_all' name
match_all_body = {"query": {"match_all": {}}}
response = es.search(index='_all', body=match_all_body)

total_hits = response['hits']['total']
n_hits = total_hits['value']
print(f"Found {n_hits} documents in all indexes")

## Search API part 2
- Leaf clauses: match, term, range (these can be combined)
- Compound clauses: bool

- match: full-text search that matches the given text, number, ...
- term: matches an exact value; the field must be mapped as keyword, numeric or date
- range: matches values within a range


In [None]:
# Connecting to elastic
from pprint import pprint
from elasticsearch import Elasticsearch

es = Elasticsearch('http://localhost:9200')
client_info = es.info()
print('Connected to Elasticsearch!')
pprint(client_info.body)

# Initializing new index
es.indices.delete(index='my_index', ignore_unavailable=True)
es.indices.create(index='my_index')

# Inserting documents
import json
from tqdm import tqdm


# Context manager so the data file is closed as soon as it has been parsed
with open("ElasticSearch_Python_Course/data/dummy_data.json") as data_file:
    dummy_data = json.load(data_file)

for document in tqdm(dummy_data, total=len(dummy_data)):
    response = es.index(index='my_index', body=document)
    pprint(response.body)

## Search: leaf clause

In [None]:
# Term query: documents whose created_on is exactly 2024-09-22
term_body = {"query": {"term": {"created_on": "2024-09-22"}}}
response = es.search(index='my_index', body=term_body)

hits = response['hits']
n_hits = hits['total']['value']
print(f"Found {n_hits} documents in my_index")

retrieved_documents = hits['hits']
retrieved_documents

In [None]:
# Match query: documents containing the word "document" in the "text" field
match_body = {"query": {"match": {"text": "document"}}}
response = es.search(index='my_index', body=match_body)

hits = response['hits']
n_hits = hits['total']['value']
print(f"Found {n_hits} documents in my_index")

retrieved_documents = hits['hits']
retrieved_documents

In [None]:
# Range query: documents created on or before 2024-09-23
range_body = {"query": {"range": {"created_on": {"lte": "2024-09-23"}}}}
response = es.search(index='my_index', body=range_body)

hits = response['hits']
n_hits = hits['total']['value']
print(f"Found {n_hits} documents in my_index")

retrieved_documents = hits['hits']
retrieved_documents

## Search: compound clause

In [None]:
# Combine two leaf clauses to find one specific document
# Full-text search on the "text" field for the word "third"
text_clause = {"match": {"text": "third"}}
# Documents created exactly on 2024-09-24 (gte and lte are the same day)
date_clause = {"range": {"created_on": {"gte": "2024-09-24", "lte": "2024-09-24"}}}

# bool combines conditions with logical operators (must, should, must_not, ...);
# every clause listed under "must" has to match.
bool_body = {"query": {"bool": {"must": [text_clause, date_clause]}}}
response = es.search(index='my_index', body=bool_body)

hits = response['hits']
n_hits = hits['total']['value']
print(f"Found {n_hits} documents in my_index")

retrieved_documents = hits['hits']
retrieved_documents

## Search part-3

In [None]:
# Connect to elastic search
from pprint import pprint
from elasticsearch import Elasticsearch

es = Elasticsearch('http://localhost:9200')
client_info = es.info()
print('Connected to Elasticsearch!')
pprint(client_info.body)

# create index
es.indices.delete(index='my_index', ignore_unavailable=True)
es.indices.create(index='my_index')


# Build a large list of duplicate documents
import json


# Context manager so the data file is closed as soon as it has been parsed
with open("ElasticSearch_Python_Course/data/dummy_data_2.json") as data_file:
    dummy_data = json.load(data_file)

# Each pass appends the list to itself, so 10 passes multiply its length
# by 2**10 = 1024.
for _ in range(10):
    dummy_data += dummy_data

print(f"len dummy data: {len(dummy_data)}")



In [None]:
# Bulk API: index all those documents in one request

# Every document contributes two entries: the index action, then the document itself
operations = []
for document in dummy_data:
    operations += [{'index': {'_index': 'my_index'}}, document]

es.bulk(operations=operations)

## searching: size + from

In [None]:
# Pagination: skip the first 10 matches and return the next 10
paged_body = {
    "query": {"match_all": {}},
    "size": 10,  # how many results (documents) to return in this request
    "from": 10,  # the number of documents to skip (offset) before returning results
}
response = es.search(index="my_index", body=paged_body)

for hit in response['hits']['hits']:
    source = hit['_source']
    print(source)

In [None]:
# Timeout: abort after the timeout duration; a partial result might still be returned
# NOTE(review): assumes the indexed documents have a "message" field — confirm
# against dummy_data_2.json.

timeout_body = {
    "query": {"match": {"message": "search keyword"}},
    "timeout": "10s",
}
response = es.search(index="my_index", body=timeout_body)

response.body

In [None]:
# Aggregation: average the value of the "age" field across all documents.
# The result of the aggregation is stored under the `avg_age` key.
aggregation_body = {
    "query": {"match_all": {}},
    "aggs": {"avg_age": {"avg": {"field": "age"}}},
}
response = es.search(index="my_index", body=aggregation_body)

avg_age_result = response['aggregations']['avg_age']
average_age = avg_age_result['value']
print(f"Average Age: {average_age}")

In [None]:
# Combining size, from, timeout and aggs in one search request
combined_body = {
    # query: find documents matching "important keyword" in the "message" field
    "query": {"match": {"message": "important keyword"}},
    # aggregation: find the max price and store it under the max_price key
    "aggs": {"max_price": {"max": {"field": "price"}}},
    "size": 5,  # return 5 results
    "from": 20,  # skip the first 20 documents
    "timeout": "5s",  # time out after 5 seconds
}
response = es.search(index="my_index", body=combined_body)

for hit in response['hits']['hits']:
    source = hit['_source']
    print(source)

max_price_result = response['aggregations']['max_price']
max_price = max_price_result['value']
print(f"Max Price: {max_price}")

## Dense vector field
- a vector of numeric values
- "dense" means mostly non-zero values
- does not support sorting or aggregation
- queried with kNN (nearest-neighbour) search instead
- the mapping for vector fields has to be defined manually



In [6]:
# NOTE(review): `!` commands appear to run in a separate shell process, so this
# activation presumably does not change the interpreter the kernel imports
# from — which would explain the ModuleNotFoundError recorded below. Confirm,
# then install the package into (or select) the kernel's own environment.
!source .venv/bin/activate
import elasticsearch

ModuleNotFoundError: No module named 'elasticsearch'

In [4]:
import elasticsearch

ModuleNotFoundError: No module named 'elasticsearch'

In [None]:
# Connect to the local Elasticsearch node
from pprint import pprint
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts='http://localhost:9200')
client_info = es.info()
print('Connected to Elasticsearch!')
pprint(client_info.body)

In [None]:
# A dense vector field requires a manual mapping
dense_vector_mappings = {
    "properties": {
        "sides_length": {"type": "dense_vector", "dims": 4},  # 4-element vector
        "shape": {"type": "keyword"},
    }
}
es.indices.delete(index='my_index', ignore_unavailable=True)
es.indices.create(index="my_index", mappings=dense_vector_mappings)

In [None]:
# Valid case: a 1D array with as many values as the mapping's dims (4)
from pprint import pprint

square = {
    "shape": "square",
    "sides_length": [5, 5, 5, 5],
}
response = es.index(index='my_index', id=1, document=square)

pprint(response.body)

pprint(es.indices.get_mapping(index='my_index').body)

In [None]:
# Invalid case: a 2D array (Elasticsearch does not support multi-dimensional
# arrays), so this request is expected to fail
nested_square = {
    "shape": "square",
    "sides_length": [[5, 5], [5, 5]],
}
response = es.index(index='my_index', id=2, document=nested_square)

pprint(response.body)

## Embeddings

- Embedding transforms the text into vector of numbers
- GPU is recommended for using open source embedding models
- embedding models can be language specific or multi lingual

In [None]:
# Connect to the local Elasticsearch node
from pprint import pprint
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts='http://localhost:9200')
client_info = es.info()
print('Connected to Elasticsearch!')
pprint(client_info.body)

In [None]:
# Because the field holds vectors, the index is created first with an explicit
# dense_vector mapping (no dims given here)
embedding_mappings = {"properties": {"embedding": {"type": "dense_vector"}}}
es.indices.delete(index='my_index', ignore_unavailable=True)
es.indices.create(index="my_index", mappings=embedding_mappings)

In [None]:
# Define the embedding model used to turn text into vectors
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')
model

In [None]:
# Pick the device and move the model onto it
import torch

# Use the GPU when CUDA is available, otherwise stay on the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

model = model.to(device)
model

In [None]:
# Load the documents to embed
import json

# Context manager so the data file is closed as soon as it has been parsed
with open("ElasticSearch_Python_Course/data/dummy_data.json") as data_file:
    documents = json.load(data_file)
documents

In [None]:
# Embed each document's text and bulk-insert the documents with their vectors
from tqdm import tqdm
from pprint import pprint


def get_embedding(text):
    """Return the embedding that the global ``model`` produces for ``text``."""
    embedding = model.encode(text)
    return embedding


operations = []
for document in tqdm(documents, total=len(documents)):
    # Copy the document and attach the embedding of its "text" field
    source = dict(document)
    source['embedding'] = get_embedding(document['text'])
    # Action entry first, then the source entry
    operations.extend(({'index': {'_index': 'my_index'}}, source))

response = es.bulk(operations=operations)
pprint(response.body)

In [None]:
# Retrieve the documents back to verify that the text was embedded correctly
match_all_body = {'query': {'match_all': {}}}
response = es.search(index='my_index', body=match_all_body)

hits = response["hits"]
pprint(hits["hits"])

In [None]:
# Show the mapping of "my_index" after the embedded documents were indexed
index_to_inspect = 'my_index'
response = es.indices.get_mapping(index=index_to_inspect)
pprint(response.body)

## kNN search
- only applicable to dense vector fields
- kNN (k-nearest neighbours) is also used for classification and regression tasks

In [None]:
# Connect to the local Elasticsearch node
from pprint import pprint
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts='http://localhost:9200')
client_info = es.info()
print('Connected to Elasticsearch!')
pprint(client_info.body)

In [None]:
# Prepare the index: recreate "my_index" with a dense_vector "embedding" field
embedding_mappings = {"properties": {"embedding": {"type": "dense_vector"}}}
es.indices.delete(index='my_index', ignore_unavailable=True)
es.indices.create(index="my_index", mappings=embedding_mappings)

In [None]:
# Embedding model used to turn text into vectors
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')
model

In [None]:
# Pick the device and move the model onto it
import torch

# Use the GPU when CUDA is available, otherwise stay on the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

model = model.to(device)
model

In [None]:
# Load the astronomy documents to embed
import json


# Context manager so the data file is closed as soon as it has been parsed
with open("ElasticSearch_Python_Course/data/astronomy.json") as data_file:
    documents = json.load(data_file)
documents

In [None]:
# Embed each document's content and bulk-insert the documents with their vectors
from tqdm import tqdm
from pprint import pprint


def get_embedding(text):
    """Return the embedding that the global ``model`` produces for ``text``."""
    embedding = model.encode(text)
    return embedding


operations = []
for document in tqdm(documents, total=len(documents)):
    # Copy the document and attach the embedding of its "content" field
    source = dict(document)
    source['embedding'] = get_embedding(document['content'])
    # Action entry first, then the source entry
    operations.extend(({'index': {'_index': 'my_index'}}, source))

response = es.bulk(operations=operations)
pprint(response.body)

In [None]:
# Retrieve the documents to verify that the text was converted to a dense vector
match_all_body = {'query': {'match_all': {}}}
response = es.search(index='my_index', body=match_all_body)

hits = response["hits"]
pprint(hits["hits"])


In [None]:
# Show the mapping of "my_index" after the embedded documents were indexed
index_to_inspect = 'my_index'
response = es.indices.get_mapping(index=index_to_inspect)
pprint(response.body)

## Search after parameters