In [49]:
from pprint import pprint
from elasticsearch import Elasticsearch

es=Elasticsearch("http://localhost:9200")
client_info = es.info()
print("Connected to Elasticsearch!")
pprint(client_info.body)

Connected to Elasticsearch!
{'cluster_name': 'es-docker-cluster',
 'cluster_uuid': '43JLb8LZRh2z6LwR5bBHMw',
 'name': 'elasticsearch',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2024-12-11T12:08:05.663969764Z',
             'build_flavor': 'default',
             'build_hash': '2b6a7fed44faa321997703718f07ee0420804b41',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '9.12.0',
             'minimum_index_compatibility_version': '7.0.0',
             'minimum_wire_compatibility_version': '7.17.0',
             'number': '8.17.0'}}


# Create index

In this method, the mappings which define the structure of documents within an index are infered automatically

In [50]:
es.indices.delete(index="my_index",ignore_unavailable=True)
es.indices.create(index="my_index")

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

`Shards`: Elasticsearch divides the data in an index into multiple shards. Each shard is a self-contained index that Elasticsearch can distribute across multiple nodes in a cluster. Shards are managed automatically but configured when creating the index.

`Replicas`: For fault tolerance and high availability, an index can have replica shards, which are copies of the primary shards.

In [51]:
es.indices.delete(index="my_index",ignore_unavailable=True)
es.indices.create(
    index="my_index",
    settings={
        "index": {
            "number_of_shards":3,
        "number_of_replicas":2
        },
    },
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

In [52]:
document={
    "title":"title",
    "text":"text",
    "created_on": "2024-09-22"
}
response=es.index(index="my_index",body=document)
response

ObjectApiResponse({'_index': 'my_index', '_id': 'ZzU6ipwBlpHV-l7Ftm2Y', '_version': 1, 'result': 'created', '_shards': {'total': 3, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

In [53]:
print(response['result'])
print(response["_shards"])
print(response["_id"])
print(response["_index"])

created
{'total': 3, 'successful': 1, 'failed': 0}
ZzU6ipwBlpHV-l7Ftm2Y
my_index


In [54]:
import json

dummy_data = json.load(open("../data/dummy_data.json"))
dummy_data

[{'title': 'Sample Title 1',
  'text': 'This is the first sample document text.',
  'created_on': '2024-09-22'},
 {'title': 'Sample Title 2',
  'text': 'Here is another example of a document.',
  'created_on': '2024-09-24'},
 {'title': 'Sample Title 3',
  'text': 'The content of the third document goes here.',
  'created_on': '2024-09-24'}]

In [55]:
def insert_document(document):
    response=es.index(index="my_index",body=document)
    return response 

def print_info(response):
    print(f"""Document ID: {response['_id']} is '{
          response["result"]}' and is split into {response['_shards']['total']} shards.""")


for document in dummy_data:
    response = insert_document(document)
    print_info(response)

Document ID: aDU6ipwBlpHV-l7Ftm3I is 'created' and is split into 3 shards.
Document ID: aTU6ipwBlpHV-l7Ftm3O is 'created' and is split into 3 shards.
Document ID: ajU6ipwBlpHV-l7Ftm3U is 'created' and is split into 3 shards.


# Print mapping

In [56]:
index_mapping = es.indices.get_mapping(index='my_index')
pprint(index_mapping["my_index"]["mappings"]["properties"])

{'created_on': {'type': 'date'},
 'text': {'fields': {'keyword': {'ignore_above': 256, 'type': 'keyword'}},
          'type': 'text'},
 'title': {'fields': {'keyword': {'ignore_above': 256, 'type': 'keyword'}},
           'type': 'text'}}


# Manual mapping

In [57]:
es.indices.delete(index='my_index', ignore_unavailable=True)
es.indices.create(index='my_index')

mapping = {
    'properties': {
        'created_on': {'type': 'date'},
        'text': {
            'type': 'text',
            'fields': {
                'keyword': {
                    'type': 'keyword',
                    'ignore_above': 256
                }
            }
        },
        'title': {
            'type': 'text',
            'fields': {
                'keyword': {
                    'type': 'keyword',
                    'ignore_above': 256
                }
            }
        }
    }
}
es.indices.put_mapping(index="my_index", body=mapping)

index_mapping = es.indices.get_mapping(index='my_index')
pprint(index_mapping["my_index"]["mappings"]["properties"])

{'created_on': {'type': 'date'},
 'text': {'fields': {'keyword': {'ignore_above': 256, 'type': 'keyword'}},
          'type': 'text'},
 'title': {'fields': {'keyword': {'ignore_above': 256, 'type': 'keyword'}},
           'type': 'text'}}


# Common types

###  Binary

In [58]:
es.indices.delete(index='binary_index', ignore_unavailable=True)
es.indices.create(
    index='binary_index',
    mappings={
        "properties": {
            "image_data": {
                "type": "binary"
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'binary_index'})

In [59]:
import base64

image_path = "../images/field_data_types_docs.png"
with open(image_path, "rb") as image_file:
    image_bytes = image_file.read()
    image_base64 = base64.b64encode(image_bytes).decode("utf-8")

image_base64[:100]

'iVBORw0KGgoAAAANSUhEUgAAB4AAAAJTCAYAAADpMAvgAAAABHNCSVQICAgIfAhkiAAAABl0RVh0U29mdHdhcmUAZ25vbWUtc2Ny'

In [60]:
document = {
    "image_data": image_base64
}
response = es.index(index='binary_index', body=document)
response

ObjectApiResponse({'_index': 'binary_index', '_id': 'azU6ipwBlpHV-l7FuG01', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

### Others

In [61]:
es.indices.delete(index='other_common_data_types_index',
                  ignore_unavailable=True)
es.indices.create(
    index='other_common_data_types_index',
    mappings={
        "properties": {
            "book_reference": {
                "type": "keyword"
            },
            "price": {
                "type": "float"
            },
            "publish_date": {
                "type": "date"
            },
            "is_available": {
                "type": "boolean"
            },
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'other_common_data_types_index'})

In [62]:
document = {
    "book_reference": "978-1617294433",
    "price": 44.99,
    "publish_date": "2021-06-30",
    "is_available": True
}
response = es.index(index='other_common_data_types_index', body=document)
response

ObjectApiResponse({'_index': 'other_common_data_types_index', '_id': 'bDU6ipwBlpHV-l7FuG3o', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

# Object types

### Object

In [63]:
es.indices.delete(index='object_index', ignore_unavailable=True)
es.indices.create(
    index='object_index',
    mappings={
        "properties": {
            "author": {
                "properties": {
                    "first_name": {
                        "type": "text"
                    },
                    "last_name": {
                        "type": "text"
                    }
                }
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'object_index'})

In [64]:
document = {
    "author": {
        "first_name": "Imad",
        "last_name": "Saddik"
    }
}
response = es.index(index='object_index', body=document)
response

ObjectApiResponse({'_index': 'object_index', '_id': 'bTU6ipwBlpHV-l7FuW19', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

## Flattened object

In [65]:
es.indices.delete(index='flattened_object_index', ignore_unavailable=True)
es.indices.create(
    index='flattened_object_index',
    mappings={
        "properties": {
            "author": {
                "type": "flattened"
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'flattened_object_index'})

In [66]:
document = {
    "author": {
        "first_name": "Imad",
        "last_name": "Saddik"
    }
}
response = es.index(index='flattened_object_index', body=document)
response

ObjectApiResponse({'_index': 'flattened_object_index', '_id': 'bjU6ipwBlpHV-l7Fum0t', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

## Nested object

In [67]:
es.indices.delete(index='nested_object_index', ignore_unavailable=True)
es.indices.create(
    index='nested_object_index',
    mappings={
        "properties": {
            "user": {
                "type": "nested",
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'nested_object_index'})

In [68]:
documents = [
    {
        "first": "John",
        "last": "Smith"
    },
    {
        "first": "Imad",
        "last": "Saddik"
    }
]
response = es.index(index='nested_object_index', body={"user": documents})
response

ObjectApiResponse({'_index': 'nested_object_index', '_id': 'bzU6ipwBlpHV-l7Fum3F', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

# Text search types

In [69]:
es.indices.delete(index='text_index', ignore_unavailable=True)
es.indices.create(
    index="text_index",
    mappings={
        "properties":{
            "email_body":{
                "type":"text"
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'text_index'})

In [70]:
document = {
    "email_body": "Hello, this is a test email."
}
response = es.index(index='text_index', body=document)
response

ObjectApiResponse({'_index': 'text_index', '_id': 'cDU6ipwBlpHV-l7Fu21v', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

# Completion

In [71]:
es.indices.delete(index='text_completion_index', ignore_unavailable=True)
es.indices.create(
    index='text_completion_index',
    mappings={
        "properties": {
            "suggest": {
                "type": "completion"
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'text_completion_index'})

In [72]:
document_1 = {
    "suggest": {
        "input": ["Mars", "Planet"]
    }
}

document_2 = {
    "suggest": {
        "input": ["Andromeda", "Galaxy"]
    }
}

es.index(index='text_completion_index', body=document_1)
es.index(index='text_completion_index', body=document_2)

ObjectApiResponse({'_index': 'text_completion_index', '_id': 'cjU6ipwBlpHV-l7FvG0E', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1})

#  Spatial data types

In [73]:
es.indices.delete(index='geo_point_index', ignore_unavailable=True)
es.indices.create(
    index='geo_point_index',
    mappings={
        "properties": {
            "location": {
                "type": "geo_point"
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'geo_point_index'})

In [74]:
document = {
    "text": "Geopoint as an object using GeoJSON format",
    "location": {
        "type": "Point",
        "coordinates": [
            -71.34,
            41.12
        ]
    }
}
response = es.index(index='geo_point_index', body=document)
response

ObjectApiResponse({'_index': 'geo_point_index', '_id': 'czU6ipwBlpHV-l7FvG2r', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

# Geo shape

In [75]:
es.indices.delete(index='geo_shape_index', ignore_unavailable=True)
es.indices.create(
    index='geo_shape_index',
    mappings={
        "properties": {
            "location": {
                "type": "geo_shape"
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'geo_shape_index'})

In [76]:
document_1 = {
    "location": {
        "type": "LineString",
        "coordinates": [
            [
                -77.03653,
                38.897676
            ],
            [
                -77.009051,
                38.889939
            ]
        ]
    }
}
document_2 = {
    "location": {
        "type": "Polygon",
        "coordinates": [
            [
                [100, 0],
                [101, 0],
                [101, 1],
                [100, 1],
                [100, 0],
            ],
            [
                [100.2, 0.2],
                [100.8, 0.2],
                [100.8, 0.8],
                [100.2, 0.8],
                [100.2, 0.2],
            ]
        ]
    }
}

es.index(index='geo_shape_index', body=document_1)
es.index(index='geo_shape_index', body=document_2)

ObjectApiResponse({'_index': 'geo_shape_index', '_id': 'dTU6ipwBlpHV-l7FvW1j', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1})

#  Point

In [77]:
es.indices.delete(index='point_index', ignore_unavailable=True)
es.indices.create(
    index='point_index',
    mappings={
        "properties": {
            "location": {
                "type": "point"
            }
        }
    }
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'point_index'})

In [78]:
document = {
    "location": {
        "type": "Point",
        "coordinates": [
            -71.34,
            41.12
        ]
    }
}

response = es.index(index='point_index', body=document)
response

ObjectApiResponse({'_index': 'point_index', '_id': 'djU6ipwBlpHV-l7Fvm0A', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

In [79]:
import json
from tqdm import tqdm


document_ids = []
dummy_data = json.load(open("../data/dummy_data.json"))
for document in tqdm(dummy_data, total=len(dummy_data)):
    response = es.index(index='my_index', body=document)
    document_ids.append(response['_id'])

100%|██████████| 3/3 [00:00<00:00, 186.17it/s]


In [80]:
document_ids

['dzU6ipwBlpHV-l7Fvm0N', 'eDU6ipwBlpHV-l7Fvm0U', 'eTU6ipwBlpHV-l7Fvm0Y']

In [81]:
response = es.delete(index='my_index', id=document_ids[0])
pprint(response.body)

{'_id': 'dzU6ipwBlpHV-l7Fvm0N',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 3,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_version': 2,
 'result': 'deleted'}


In [82]:
try:
    response = es.delete(index='my_index', id="id")
except Exception as e:
    print(e)

NotFoundError(404, "{'_index': 'my_index', '_id': 'id', '_version': 1, 'result': 'not_found', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 4, '_primary_term': 1}")


# Get operation
This is an example of a successful operation. If the ID exists in the document, the get operation won't return any errors

In [83]:
response = es.get(index='my_index', id=document_ids[1])
pprint(response.body)

{'_id': 'eDU6ipwBlpHV-l7Fvm0U',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 1,
 '_source': {'created_on': '2024-09-24',
             'text': 'Here is another example of a document.',
             'title': 'Sample Title 2'},
 '_version': 1,
 'found': True}


# Count operation

In [84]:
response = es.count(index='my_index')
count = response["count"]

print(f"The number of documents in the index is {count}")

The number of documents in the index is 0


In [85]:
query = {
    "range": {
        "created_on": {
            "gte": "2024-09-24",
            "lte": "2024-09-24",
            "format": "yyyy-MM-dd"
        }
    }
}

response = es.count(index='my_index', query=query)
count = response["count"]

print(f"The number of documents in the index is {count}")

The number of documents in the index is 0


# Exists API

In [86]:

response = es.indices.exists(index='my_index')
response.body

True

In [87]:
response = es.exists(index='my_index', id=document_ids[1])
response.body

True

# Update API

### If documents exists in the index

In [88]:
response = es.update(
    index="my_index",
    id=document_ids[1],
    script={
        "source": "ctx._source.title = params.title",
        "params": {
            "title": "New Title"
        }
    },
)
pprint(response.body)

{'_id': 'eDU6ipwBlpHV-l7Fvm0U',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 5,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_version': 2,
 'result': 'updated'}


In [89]:
response = es.get(index='my_index', id=document_ids[1])
pprint(response.body)

{'_id': 'eDU6ipwBlpHV-l7Fvm0U',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 5,
 '_source': {'created_on': '2024-09-24',
             'text': 'Here is another example of a document.',
             'title': 'New Title'},
 '_version': 2,
 'found': True}


# Add a new field
To add a new field, you can either use the script argument or the doc argument.

In [90]:
response = es.update(
    index="my_index",
    id=document_ids[1],
    script={
        "source": "ctx._source.new_field = 'dummy_value'",
    },
)
pprint(response.body)

{'_id': 'eDU6ipwBlpHV-l7Fvm0U',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 6,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_version': 3,
 'result': 'updated'}


In [91]:
response = es.get(index='my_index', id=document_ids[1])
pprint(response.body)

{'_id': 'eDU6ipwBlpHV-l7Fvm0U',
 '_index': 'my_index',
 '_primary_term': 1,
 '_seq_no': 6,
 '_source': {'created_on': '2024-09-24',
             'new_field': 'dummy_value',
             'text': 'Here is another example of a document.',
             'title': 'New Title'},
 '_version': 3,
 'found': True}


# Bulk API

In [92]:
es.indices.delete(index='my_index', ignore_unavailable=True)
es.indices.create(index='my_index')

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

In [93]:
response = es.bulk(
    operations=[
        # Action 1
        {
            "index": {
                "_index": "my_index",
                "_id": "1"
            }
        },
        # Source 1
        {
            "title": "Sample Title 1",
            "text": "This is the first sample document text.",
            "created_on": "2024-09-22"
        },
        # Action 2
        {
            "index": {
                "_index": "my_index",
                "_id": "2"
            }
        },
        # Source 2
        {
            "title": "Sample Title 2",
            "text": "Here is another example of a document.",
            "created_on": "2024-09-24"
        },
        # Action 3
        {
            "index": {
                "_index": "my_index",
                "_id": "3"
            }
        },
        # Source 3
        {
            "title": "Sample Title 3",
            "text": "The content of the third document goes here.",
            "created_on": "2024-09-24"
        },
        # Action 4
        {
            "update": {
                "_id": "1",
                "_index": "my_index"
            }
        },
        # Source 4
        {
            "doc": {
                "title": "New Title"
            }
        },
        # Action 5
        {
            "update": {
                "_id": "2",
                "_index": "my_index"
            }
        },
        # Source 5
        {
            "doc": {
                "new_field": "dummy_value"
            }
        },
        # Action 6
        {
            "delete": {
                "_index": "my_index",
                "_id": "3"
            }
        },
    ],
)

pprint(response.body)


{'errors': False,
 'items': [{'index': {'_id': '1',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 0,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': '2',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 1,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': '3',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 2,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
        

In [94]:
response.body["errors"]

False

# Searching

In [97]:
response = es.search(
    index='my_index',
    body={
        "query": {"match_all": {}}
    }
)

n_hits = response['hits']['total']['value']
print(f"Found {n_hits} documents in index_1")

Found 2 documents in index_1


In [104]:
response['hits']

{'total': {'value': 2, 'relation': 'eq'},
 'max_score': 1.0,
 'hits': [{'_index': 'my_index',
   '_id': '1',
   '_score': 1.0,
   '_source': {'title': 'New Title',
    'text': 'This is the first sample document text.',
    'created_on': '2024-09-22'}},
  {'_index': 'my_index',
   '_id': '2',
   '_score': 1.0,
   '_source': {'title': 'Sample Title 2',
    'text': 'Here is another example of a document.',
    'created_on': '2024-09-24',
    'new_field': 'dummy_value'}}]}