In [1]:
from elasticsearch import Elasticsearch
from pprint import pprint

es = Elasticsearch("http://localhost:9200/")
client_info = es.info()
pprint(client_info.body)

{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'S88XwI8mTkSFeJBJ4J532w',
 'name': 'c0d7ef8c0833',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2024-08-05T10:05:34.233336849Z',
             'build_flavor': 'default',
             'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '9.11.1',
             'minimum_index_compatibility_version': '7.0.0',
             'minimum_wire_compatibility_version': '7.17.0',
             'number': '8.15.0'}}


In [2]:
es.indices.delete(index="my_index",ignore_unavailable=True)
es.indices.create(index="my_index",
                  settings={
                      "index":{
                      "number_of_shards": 3,
                      "number_of_replicas": 2
                      }
                  })

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

In [3]:
documents = {
    'title':'title',
    'text':'text',
    'created_on':'2024-09-22'
}

response = es.index(index="my_index",document=documents)
response

ObjectApiResponse({'_index': 'my_index', '_id': 'CuQN1ZMBZRxjKM60a6Ti', '_version': 1, 'result': 'created', '_shards': {'total': 3, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

In [4]:
dummy_data = [
  {
    "title": "Sample Title 1",
    "text": "This is the first sample document text.",
    "created_on": "2024-09-22"
  },
  {
    "title": "Sample Title 2",
    "text": "Here is another example of a document.",
    "created_on": "2024-09-24"
  },
  {
    "title": "Sample Title 3",
    "text": "The content of the third document goes here.",
    "created_on": "2024-09-24"
  }
]

In [5]:
def insert_document(document):
    response = es.index(index="my_index",document=documents)
    return response

def print_info(response):
    print(f"""Document ID: {response['_id']} is {
        response['result']} and is split into {response['_shards']['total']} shards""")
    
for document in dummy_data:
    response = insert_document(document)
    print_info(response)

Document ID: C-QN1ZMBZRxjKM60bKQ0 is created and is split into 3 shards
Document ID: DOQN1ZMBZRxjKM60bKQ9 is created and is split into 3 shards
Document ID: DeQN1ZMBZRxjKM60bKRG is created and is split into 3 shards


In [6]:
index_mapping = es.indices.get_mapping(index="my_index")
pprint(index_mapping)

ObjectApiResponse({'my_index': {'mappings': {'properties': {'created_on': {'type': 'date'}, 'text': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'title': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}}})


In [7]:
es.indices.delete(index="my_index",ignore_unavailable=True)
es.indices.create(index="my_index")

mapping = {
    'properties': {
        'created_on': {'type': 'date'},
        'text': {
            'type': 'text',
            'fields': {
                'keyword': {
                    'type': 'keyword',
                    'ignore_above': 256
                }
            }
        },
        'title': {
            'type': 'text',
            'fields': {
                'keyword': {
                    'type': 'keyword',
                    'ignore_above': 256
                }
            }
        }
    }
}

es.indices.put_mapping(index="my_index",body=mapping)
index_mapping = es.indices.get_mapping(index="my_index")
pprint(index_mapping["my_index"]["mappings"]["properties"])

{'created_on': {'type': 'date'},
 'text': {'fields': {'keyword': {'ignore_above': 256, 'type': 'keyword'}},
          'type': 'text'},
 'title': {'fields': {'keyword': {'ignore_above': 256, 'type': 'keyword'}},
           'type': 'text'}}


In [8]:
es.indices.delete(index='other_common_data_types_index',
                  ignore_unavailable=True)
es.indices.create(
    index='other_common_data_types_index',
    mappings={
        "properties": {
            "book_reference": {
                "type": "keyword"
            },
            "price": {
                "type": "float"
            },
            "publish_date": {
                "type": "date"
            },
            "is_available": {
                "type": "boolean"
            },
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'other_common_data_types_index'})

In [9]:
document = {
    "book_reference": "978-1617294433",
    "price": 44.99,
    "publish_date": "2021-06-30",
    "is_available": True
}
response = es.index(index='other_common_data_types_index', body=document)
response

ObjectApiResponse({'_index': 'other_common_data_types_index', '_id': 'DuQN1ZMBZRxjKM60bqSQ', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

In [10]:
es.indices.delete(index='object_index', ignore_unavailable=True)
es.indices.create(
    index='object_index',
    mappings={
        "properties": {
            "author": {
                "properties": {
                    "first_name": {
                        "type": "text"
                    },
                    "last_name": {
                        "type": "text"
                    }
                }
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'object_index'})

In [11]:
document = {
    "author": {
        "first_name": "Imad",
        "last_name": "Saddik"
    }
}
response = es.index(index='object_index', body=document)
response

ObjectApiResponse({'_index': 'object_index', '_id': 'D-QN1ZMBZRxjKM60b6R-', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

In [12]:
es.indices.delete(index='flattened_object_index', ignore_unavailable=True)
es.indices.create(
    index='flattened_object_index',
    mappings={
        "properties": {
            "author": {
                "type": "flattened"
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'flattened_object_index'})

In [13]:
document = {
    "author": {
        "first_name": "Imad",
        "last_name": "Saddik"
    }
}
response = es.index(index='flattened_object_index', body=document)
response

ObjectApiResponse({'_index': 'flattened_object_index', '_id': 'EOQN1ZMBZRxjKM60cKRs', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

In [14]:
es.indices.delete(index='nested_object_index', ignore_unavailable=True)
es.indices.create(
    index='nested_object_index',
    mappings={
        "properties": {
            "user": {
                "type": "nested",
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'nested_object_index'})

In [15]:
documents = [
    {
        "first": "John",
        "last": "Smith"
    },
    {
        "first": "Imad",
        "last": "Saddik"
    }
]
response = es.index(index='nested_object_index', body={"user": documents})
response

ObjectApiResponse({'_index': 'nested_object_index', '_id': 'EeQN1ZMBZRxjKM60caQw', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

In [16]:
es.indices.delete(index='text_index', ignore_unavailable=True)
es.indices.create(
    index='text_index',
    mappings={
        "properties": {
            "email_body": {
                "type": "text"
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'text_index'})

In [17]:
document = {
    "email_body": "Hello, this is a test email."
}
response = es.index(index='text_index', body=document)
response

ObjectApiResponse({'_index': 'text_index', '_id': 'EuQN1ZMBZRxjKM60cqRR', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

In [18]:
es.indices.delete(index='text_completion_index', ignore_unavailable=True)
es.indices.create(
    index='text_completion_index',
    mappings={
        "properties": {
            "suggest": {
                "type": "completion"
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'text_completion_index'})

In [19]:
document_1 = {
    "suggest": {
        "input": ["Mars", "Planet"]
    }
}

document_2 = {
    "suggest": {
        "input": ["Andromeda", "Galaxy"]
    }
}

es.index(index='text_completion_index', body=document_1)
es.index(index='text_completion_index', body=document_2)

ObjectApiResponse({'_index': 'text_completion_index', '_id': 'FOQN1ZMBZRxjKM60c6Q3', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1})

In [20]:
es.indices.delete(index='geo_point_index', ignore_unavailable=True)
es.indices.create(
    index='geo_point_index',
    mappings={
        "properties": {
            "location": {
                "type": "geo_point"
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'geo_point_index'})

In [21]:
document = {
    "text": "Geopoint as an object using GeoJSON format",
    "location": {
        "type": "Point",
        "coordinates": [
            -71.34,
            41.12
        ]
    }
}
response = es.index(index='geo_point_index', body=document)
response

ObjectApiResponse({'_index': 'geo_point_index', '_id': 'FeQN1ZMBZRxjKM60dKQD', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

In [22]:
es.indices.delete(index='geo_shape_index', ignore_unavailable=True)
es.indices.create(
    index='geo_shape_index',
    mappings={
        "properties": {
            "location": {
                "type": "geo_shape"
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'geo_shape_index'})

In [23]:
document_1 = {
    "location": {
        "type": "LineString",
        "coordinates": [
            [
                -77.03653,
                38.897676
            ],
            [
                -77.009051,
                38.889939
            ]
        ]
    }
}
document_2 = {
    "location": {
        "type": "Polygon",
        "coordinates": [
            [
                [100, 0],
                [101, 0],
                [101, 1],
                [100, 1],
                [100, 0],
            ],
            [
                [100.2, 0.2],
                [100.8, 0.2],
                [100.8, 0.8],
                [100.2, 0.8],
                [100.2, 0.2],
            ]
        ]
    }
}

es.index(index='geo_shape_index', body=document_1)
es.index(index='geo_shape_index', body=document_2)

ObjectApiResponse({'_index': 'geo_shape_index', '_id': 'F-QN1ZMBZRxjKM60daRd', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1})

In [24]:
es.indices.delete(index='point_index', ignore_unavailable=True)
es.indices.create(
    index='point_index',
    mappings={
        "properties": {
            "location": {
                "type": "point"
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'point_index'})

In [25]:
document = {
    "location": {
        "type": "Point",
        "coordinates": [
            -71.34,
            41.12
        ]
    }
}

response = es.index(index='point_index', body=document)
response

ObjectApiResponse({'_index': 'point_index', '_id': 'GOQN1ZMBZRxjKM60dqQz', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

In [26]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200/")
es.indices.delete(index="my_index",ignore_unavailable=True)
es.indices.create(index="my_index")

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

In [27]:
import json
from tqdm import tqdm

document_ids = []
for document in tqdm(dummy_data,total=len(dummy_data)):
    response = es.index(index="my_index",document=document)
    document_ids.append(response['_id'])

100%|██████████| 3/3 [00:00<00:00, 48.99it/s]


In [28]:
# response = es.delete(index="my_index", id=document_ids[0])

In [29]:
pprint(response.body)

{'_id': 'G-QN1ZMBZRxjKM60d6RD',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 2,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_version': 1,
 'result': 'created'}


In [30]:
response = es.get(index="my_index", id=document_ids[0])
pprint(response.body)

{'_id': 'GeQN1ZMBZRxjKM60d6QT',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 0,
 '_source': {'created_on': '2024-09-22',
             'text': 'This is the first sample document text.',
             'title': 'Sample Title 1'},
 '_version': 1,
 'found': True}


In [31]:
response = es.count(index="my_index")
print(response['count'])

0


In [32]:
response

ObjectApiResponse({'count': 0, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

In [33]:
query = {
    "range": {
        "created_on": {
            "gte": "2024-09-24",
            "lte": "2024-09-24",
            "format": "yyyy-MM-dd"
        }
    }
}

response = es.count(index='my_index', query=query)
count = response["count"]

print(f"The number of documents in the index is {count}")

The number of documents in the index is 0


In [34]:
response = es.indices.exists(index="my_index")
response.body

True

In [35]:
response = es.update(
    index="my_index",
    id= document_ids[0],
    script={
        "source": "ctx._source.title = params.title",
        "params": {"title": "New Title"}
    }
)
pprint(response.body)

{'_id': 'GeQN1ZMBZRxjKM60d6QT',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 3,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_version': 2,
 'result': 'updated'}


In [36]:
response = es.update(
    index="my_index",
    id=document_ids[0],
    script={
        "source":"ctx._source.new_field = 'dmm'"
    }
)
pprint(response.body)

{'_id': 'GeQN1ZMBZRxjKM60d6QT',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 4,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_version': 3,
 'result': 'updated'}


In [37]:
response = es.get(index="my_index",id=document_ids[0])
pprint(response.body)

{'_id': 'GeQN1ZMBZRxjKM60d6QT',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 4,
 '_source': {'created_on': '2024-09-22',
             'new_field': 'dmm',
             'text': 'This is the first sample document text.',
             'title': 'New Title'},
 '_version': 3,
 'found': True}


In [38]:
response = es.update(index="my_index",id=document_ids[0],doc={"next_field":"dmm2"})

In [39]:
response = es.get(index="my_index",id=document_ids[0])
pprint(response.body)

{'_id': 'GeQN1ZMBZRxjKM60d6QT',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 5,
 '_source': {'created_on': '2024-09-22',
             'new_field': 'dmm',
             'next_field': 'dmm2',
             'text': 'This is the first sample document text.',
             'title': 'New Title'},
 '_version': 4,
 'found': True}


In [40]:
response = es.update(index="my_index", id=document_ids[0], script={"source":"ctx._source.remove('new_field')"})

In [41]:
response = es.get(index="my_index", id=document_ids[0])
pprint(response.body)

{'_id': 'GeQN1ZMBZRxjKM60d6QT',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 6,
 '_source': {'created_on': '2024-09-22',
             'next_field': 'dmm2',
             'text': 'This is the first sample document text.',
             'title': 'New Title'},
 '_version': 5,
 'found': True}


In [42]:
response = es.update(
    index="my_index",
    id="1",
    doc={
        "book_id":1234,
        "book_name": "A book"
    },
    doc_as_upsert=True
)

pprint(response.body)

{'_id': '1',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 7,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_version': 1,
 'result': 'created'}


In [43]:
response = es.get(index="my_index", id='1')
pprint(response.body)

{'_id': '1',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 7,
 '_source': {'book_id': 1234, 'book_name': 'A book'},
 '_version': 1,
 'found': True}


In [44]:
es.count(index="my_index")

ObjectApiResponse({'count': 0, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

In [45]:
es.indices.delete(index="my_index", ignore_unavailable=True)
es.indices.create(index="my_index")

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

In [46]:
response = es.bulk(
    operations=[
        {
            "index": {
                "_index": "my_index",
                "_id": "1"
            }
        },
        # Source 1
        {
            "title": "Sample Title 1",
            "text": "This is the first sample document text.",
            "created_on": "2024-09-22"
        },
        # Action 2
        {
            "index": {
                "_index": "my_index",
                "_id": "2"
            }
        },
        # Source 2
        {
            "title": "Sample Title 2",
            "text": "Here is another example of a document.",
            "created_on": "2024-09-24"
        },
        # Action 3
        {
            "index": {
                "_index": "my_index",
                "_id": "3"
            }
        },
        # Source 3
        {
            "title": "Sample Title 3",
            "text": "The content of the third document goes here.",
            "created_on": "2024-09-24"
        },
        # Action 4
        {
            "update": {
                "_id": "1",
                "_index": "my_index"
            }
        },
        # Source 4
        {
            "doc": {
                "title": "New Title"
            }
        },
        # Action 5
        {
            "update": {
                "_id": "2",
                "_index": "my_index"
            }
        },
        # Source 5
        {
            "doc": {
                "new_field": "dummy_value"
            }
        },
        # Action 6
        {
            "delete": {
                "_index": "my_index",
                "_id": "3"
            }
        },
    ]
)

pprint(response.body)

{'errors': False,
 'items': [{'index': {'_id': '1',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 0,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': '2',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 1,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': '3',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 2,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
        

In [47]:
es.indices.delete(index="index_1", ignore_unavailable=True)
es.indices.create(index="index_1")

es.indices.delete(index="index_2", ignore_unavailable=True)
es.indices.create(index="index_2")

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'index_2'})

In [48]:
for document in tqdm(dummy_data, total=len(dummy_data)):
    es.index(index="index_1",document=document)

for document in tqdm(dummy_data, total=len(dummy_data)):
    es.index(index="index_2",document=document)

100%|██████████| 3/3 [00:00<00:00, 54.95it/s]
100%|██████████| 3/3 [00:00<00:00, 56.33it/s]


In [49]:
response = es.search(
    index="index_1",
    body={
        "query": {"match_all":{}}
    }
)

pprint(f"Found {response['hits']['total']['value']} document in index_1")

'Found 0 document in index_1'


In [50]:
response = es.search(
    index="index_2",
    body={
        "query": {"match_all":{}}
    }
)

pprint(f"Found {response['hits']['total']['value']} document in index_2")

'Found 0 document in index_2'


In [51]:
response = es.search(
    index="index_1,index_2",
    body={
        "query": {"match_all":{}}
    }
)

pprint(f"Found {response['hits']['total']['value']} document in index_1,index_2")

'Found 0 document in index_1,index_2'


In [52]:
response = es.search(
    index="index*",
    body={
        "query": {"match_all":{}}
    }
)

pprint(f"Found {response['hits']['total']['value']} document in index_1,index_2")

'Found 0 document in index_1,index_2'


In [53]:
response = es.search(
    index="my_index",
    body={
        "query": {
            "term": {
                "created_on":"2024-09-22"
            }
        }
    }
)

pprint(response.body)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 1}


In [54]:
response = es.search(
    index="my_index",
    body={"query": {"match": {"text": "document"}}}
)

pprint(response.body)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 1}


In [55]:
response = es.search(
    index="my_index",
    body={
        "query": {
            "range":{
                "created_on": {
                    "lte":"2024-09-22"
                }
            }
        }
    }
)

pprint(response.body)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 2}


In [56]:
response = es.search(
    index="my_index",
    body={
        "query": {
            "bool": {
                "must": [
                    {"match": {"text": "third"}},
                    {"range":{
                        "created_on":{
                            "gte":"2024-09-24",
                            "lte":"2024-09-24"
                        }
                    }}
                ]
            }
        }
    }
)

pprint(response.body)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 1}


In [57]:
dummy_data_2=[
    {
        "message": "This is an important keyword search result.",
        "age": 25,
        "price": 100.0
    },
    {
        "message": "Another search result with an important keyword.",
        "age": 30,
        "price": 150.0
    },
    {
        "message": "Keyword match in this result as well.",
        "age": 40,
        "price": 200.0
    },
    {
        "message": "Important keyword again in this document.",
        "age": 35,
        "price": 120.0
    },
    {
        "message": "Final document with the important keyword.",
        "age": 28,
        "price": 180.0
    }
]



In [58]:
operations = []
for document in dummy_data_2:
    operations.append({"index":{"_index":"my_index"}})
    operations.append(document)

es.bulk(operations=operations)

ObjectApiResponse({'errors': False, 'took': 19779961, 'items': [{'index': {'_index': 'my_index', '_id': 'IuQN1ZMBZRxjKM60fKSG', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 6, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_index', '_id': 'I-QN1ZMBZRxjKM60fKSG', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 7, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_index', '_id': 'JOQN1ZMBZRxjKM60fKSG', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 8, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_index', '_id': 'JeQN1ZMBZRxjKM60fKSG', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 9, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_index', '_id': 'JuQN1ZMBZRxjKM60fKSG', '_version': 1, 'result': 'created', '_shards': {

In [59]:
response = es.search(
    index="my_index",
    body={"query":{
        "match_all":{}
    },
    "size":10,
    "from":10
    }
)

for hit in response['hits']['hits']:
    print(hit['_source'])

In [60]:
response = es.search(
    index="my_index",
    body={"query": {"match": {"message": "search keyword"}},
          "timeout":"10s"}
)

pprint(response.body)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 1}


In [61]:
response = es.search(
    index="my_index",
    body={"query": {"match_all": {}},
          "aggs": {
              "avg_age":{
                  "avg":{
                      "field":"age"
                  }
              }
          }}
)

pprint(response['aggregations']['avg_age']['value'])

None


In [62]:
response = es.search(
    index="my_index",
    body={
        "query": {
            "match": {
                "message": "important keyword"
            }
        },
        "aggs": {
            "max_price": {
                "max": {
                    "field": "price"
                }
            }
        },
        "size": 5,
        "from": 20,
        "timeout": "5s"
    }
)

for hit in response['hits']['hits']:
    print(hit['_source'])

pprint(response['aggregations']['max_price']['value'])

None


In [63]:
es.indices.delete(index="my_index", ignore_unavailable=True)
es.indices.create(index="my_index",
                  mappings={
                      "properties":{
                          "sides_length":{
                              "type":"dense_vector",
                              "dims":4
                          },
                          "shape":{
                              "type":"keyword"
                          }
                      }
                  })

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

In [64]:
response = es.index(
    index="my_index",
    id=1,
    document={
        "shape":"square",
        "sides_length":[5,5,5,5]
    }
)

pprint(response.body)

{'_id': '1',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 0,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_version': 1,
 'result': 'created'}


In [65]:
es.indices.get_mapping(index="my_index").body

{'my_index': {'mappings': {'properties': {'shape': {'type': 'keyword'},
    'sides_length': {'type': 'dense_vector',
     'dims': 4,
     'index': True,
     'similarity': 'cosine',
     'index_options': {'type': 'int8_hnsw',
      'm': 16,
      'ef_construction': 100}}}}}}

In [66]:
es.indices.delete(index="my_index",ignore_unavailable=True)
es.indices.create(index="my_index",
                  mappings={
                      "properties": {
                          "embedding":{
                              "type":"dense_vector"
                          }
                      }
                  })

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

In [67]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

In [68]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [70]:
def get_embedding(text):
    return model.encode(text)

In [72]:
operations = []

for document in tqdm(dummy_data, total=len(dummy_data)):
    operations.append({"index":{"_index":"my_index"}})
    operations.append({
        **document,
        "embedding":get_embedding(document['text'])
    })

response = es.bulk(operations=operations)
pprint(response.body)

100%|██████████| 3/3 [00:00<00:00, 92.32it/s]

{'errors': False,
 'items': [{'index': {'_id': 'KuQW1ZMBZRxjKM60TKTs',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 3,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': 'K-QW1ZMBZRxjKM60TKTs',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 4,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': 'LOQW1ZMBZRxjKM60TKTs',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 5,
                      '_shards': {'failed': 0, 'successful': 1, '




In [73]:
response = es.search(
    index="my_index",
    body={
        "query": {
            "match_all": {}
        }
    }
)

pprint(response['hits']['hits'])

[{'_id': 'J-QW1ZMBZRxjKM60JqRr',
  '_index': 'my_index',
  '_score': 1.0,
  '_source': {'created_on': '2024-09-22',
              'embedding': [-0.04355228319764137,
                            0.06440841406583786,
                            -0.005080128088593483,
                            0.03445187211036682,
                            0.04063338786363602,
                            0.014603231102228165,
                            -0.019641702994704247,
                            0.049041084945201874,
                            0.03582879900932312,
                            0.011970664374530315,
                            0.04181138426065445,
                            0.08254100382328033,
                            -0.00032650609500706196,
                            -0.037260282784700394,
                            -0.009786654263734818,
                            0.0391247496008873,
                            0.030936775729060173,
                            -0.0744

In [74]:
response = es.indices.get_mapping(
    index="my_index"
)
pprint(response.body)

{'my_index': {'mappings': {'properties': {'created_on': {'type': 'date'},
                                          'embedding': {'dims': 384,
                                                        'index': True,
                                                        'index_options': {'ef_construction': 100,
                                                                          'm': 16,
                                                                          'type': 'int8_hnsw'},
                                                        'similarity': 'cosine',
                                                        'type': 'dense_vector'},
                                          'text': {'fields': {'keyword': {'ignore_above': 256,
                                                                          'type': 'keyword'}},
                                                   'type': 'text'},
                                          'title': {'fields': {'keyword': {'ignore_above':

In [75]:
es.indices.delete(index="my_index", ignore_unavailable=True)
es.indices.create(
    index="my_index",
    mappings={
        "properties": {
            "embedding":{
                "type": "dense_vector",
            }
        }
        
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

In [76]:
documents=[
  {
    "id": 1,
    "title": "The Solar System",
    "content": "The Solar System consists of the Sun and the objects that orbit it, including eight planets, their moons, dwarf planets, and countless small bodies like asteroids and comets."
  },
  {
    "id": 2,
    "title": "Black Holes",
    "content": "A black hole is a region of space where the gravitational pull is so strong that nothing, not even light, can escape from it. They are formed when massive stars collapse under their own gravity."
  },
  {
    "id": 3,
    "title": "Galaxies",
    "content": "Galaxies are vast systems that consist of stars, stellar remnants, interstellar gas, dust, and dark matter. The Milky Way is the galaxy that contains our Solar System."
  },
  {
    "id": 4,
    "title": "The Big Bang Theory",
    "content": "The Big Bang Theory is the leading explanation about how the universe began. It suggests that the universe was once in an extremely hot and dense state and has been expanding ever since."
  },
  {
    "id": 5,
    "title": "Exoplanets",
    "content": "Exoplanets, or extrasolar planets, are planets that exist outside our solar system. They vary greatly in size and composition and are often found using methods like the transit method and radial velocity."
  },
  {
    "id": 6,
    "title": "The Life Cycle of Stars",
    "content": "Stars are born from clouds of gas and dust in space. They undergo a life cycle that includes stages such as main sequence, red giant, and, ultimately, either a supernova explosion or a gentle fade into a white dwarf."
  },
  {
    "id": 7,
    "title": "Astrobiology",
    "content": "Astrobiology is the study of the origin, evolution, distribution, and future of life in the universe. It combines elements of biology, chemistry, and planetary science."
  },
  {
    "id": 8,
    "title": "Dark Matter",
    "content": "Dark matter is a type of matter that does not emit light or energy. It cannot be observed directly but is believed to make up about 27% of the universe's total mass and energy."
  },
  {
    "id": 9,
    "title": "The Expanding Universe",
    "content": "The universe has been expanding since the Big Bang. Observations of distant galaxies show that they are moving away from us, which supports the idea of an expanding universe."
  },
  {
    "id": 10,
    "title": "Space Exploration",
    "content": "Space exploration involves the use of space technology to explore outer space. It includes missions to planets, moons, and other celestial bodies, aiming to discover more about the universe."
  }
]

In [77]:
def get_embedding(text):
    return model.encode(text)

operations = []
for document in tqdm(documents, total=len(documents)):
    operations.append({"index":{"_index":"my_index"}})
    operations.append({
        **document,
        "embedding":get_embedding(document['content'])
    })

response = es.bulk(operations=operations)
pprint(response.body)

100%|██████████| 10/10 [00:00<00:00, 95.38it/s]

{'errors': False,
 'items': [{'index': {'_id': 'LeQv1ZMBZRxjKM60E6Tx',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 0,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': 'LuQv1ZMBZRxjKM60E6Tx',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 1,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': 'L-Qv1ZMBZRxjKM60E6Tx',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 2,
                      '_shards': {'failed': 0, 'successful': 1, '


