In [None]:
!pip install elasticsearch

Collecting elasticsearch
  Downloading elasticsearch-9.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting elastic-transport<9,>=8.15.1 (from elasticsearch)
  Downloading elastic_transport-8.17.1-py3-none-any.whl.metadata (3.8 kB)
Downloading elasticsearch-9.0.0-py3-none-any.whl (895 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m895.8/895.8 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading elastic_transport-8.17.1-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.0/65.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: elastic-transport, elasticsearch
Successfully installed elastic-transport-8.17.1 elasticsearch-9.0.0


In [None]:
#connecting to a server
from pprint import pprint
from elasticsearch import Elasticsearch

# Open a client against the node at localhost:9200, then display what the
# server reports about itself as a quick connectivity check.
es = Elasticsearch(hosts='http://localhost:9200')
client_info = es.info()

print('Connected to Elasticsearch!')
pprint(client_info.body)

In [None]:
# Create the index with explicit sharding and replication settings, dropping
# any previous copy first.
# NOTE(review): Elasticsearch index names must be lowercase, so this mixed-case
# name is expected to be rejected (invalid_index_name_exception). Renaming it
# has to be done in every cell that refers to the index in one change.
index_settings = {
    "index": {
        "number_of_shards": 3,    # how many pieces the data is split into
        "number_of_replicas": 2,  # how many copies of the data
    }
}

es.indices.delete(index='AfricaGenderData', ignore_unavailable=True)
es.indices.create(index="AfricaGenderData", settings=index_settings)

In [None]:
# Index one hand-written sample document (no id is passed to the call) and
# display the response as the cell output.
document = dict(
    title='title',
    description='text',
    level='2024-09-22',
    geo='Africa',
    link='Female',
    source='Male',
)
response = es.index(index='AfricaGenderData', body=document)
response

In [None]:
# Confirm the write by echoing selected fields of the index response,
# one per line, in a fixed order.
for field in ("result", "_shards", "_id", "_index"):
    print(response[field])

In [None]:
#Using a json source
import json

# Load the source records from disk. The context manager closes the file handle
# as soon as parsing is done, and UTF-8 is requested explicitly so decoding does
# not depend on the platform's default locale encoding.
with open("../data/agdp_data.json", encoding="utf-8") as agdp_file:
    agdp_data = json.load(agdp_file)
agdp_data

In [None]:
#function to print after inserting
def insert_document(document):
    """Index ``document`` into 'AfricaGenderData' and return the client's response.

    Uses the notebook-level ``es`` client; no document id is passed.
    """
    return es.index(index='AfricaGenderData', body=document)


def print_info(response):
    """Print a one-line summary of an index response.

    Reads ``response['_id']``, ``response['result']`` and
    ``response['_shards']['total']`` and prints them in a fixed sentence.
    """
    doc_id = response['_id']
    outcome = response["result"]
    # NOTE(review): `_shards.total` presumably counts the shard copies the write
    # targets rather than how the data is partitioned - confirm the wording.
    shard_total = response['_shards']['total']
    print(f"Document ID: {doc_id} is '{outcome}' and is split into {shard_total} shards.")


# Insert every record and report each outcome. The generated ids are collected
# in `document_ids` because the delete / get-by-id cells that follow read that
# list; without this they fail with NameError on a fresh top-to-bottom run.
document_ids = []
for document in agdp_data:
    response = insert_document(document)
    document_ids.append(response['_id'])
    print_info(response)

In [None]:
#check the mapping
from pprint import pprint

# Fetch the mapping and show its field definitions. The response is looked up
# under the same index name that was requested (it was previously read under
# "my_index", a key this request does not return).
index_mapping = es.indices.get_mapping(index='AfricaGenderData')
pprint(index_mapping["AfricaGenderData"]["mappings"]["properties"])

In [None]:
# Mapping manually, variant 1: create an empty index, then attach the mapping
# with a separate put_mapping call.
mapping = {
    'properties': {
        'created_on': {'type': 'date'},
        'text': {
            'type': 'text',
            'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}},
        },
        'title': {
            'type': 'text',
            'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}},
        },
    }
}

# Drop any existing 'my_index', recreate it empty, then apply the mapping.
es.indices.delete(index='my_index', ignore_unavailable=True)
es.indices.create(index='my_index')
es.indices.put_mapping(index='my_index', body=mapping)

# Read the mapping back and display its field definitions.
index_mapping = es.indices.get_mapping(index='my_index')
pprint(index_mapping["my_index"]["mappings"]["properties"])

In [None]:
# Mapping manually, variant 2: hand the mapping to indices.create directly
# instead of issuing a separate put_mapping call.
# 'text' and 'title' share one shape: a text field with a 'keyword' sub-field.
mapping = {
    'properties': {
        'created_on': {'type': 'date'},
        **{
            field_name: {
                'type': 'text',
                'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}},
            }
            for field_name in ('text', 'title')
        },
    }
}

# Drop any existing 'my_index' and recreate it with the mapping attached.
es.indices.delete(index='my_index', ignore_unavailable=True)
es.indices.create(index='my_index', mappings=mapping)

# Read the mapping back and display its field definitions.
index_mapping = es.indices.get_mapping(index='my_index')
pprint(index_mapping["my_index"]["mappings"]["properties"])

In [None]:
# Delete the first tracked document and display the server's confirmation.
# Expects `document_ids` to have been filled by a cell that has already run.
from pprint import pprint

response = es.delete(index='AfricaGenderData', id=document_ids[0])
pprint(response.body)

In [None]:
# Fetch a single document by id and display it.
# The delete cell directly above removes document_ids[0], so asking for that
# same id here would raise NotFoundError; read the next tracked document.
# Assumes at least two documents have been indexed.
from pprint import pprint

response = es.get(index='AfricaGenderData', id=document_ids[1])
pprint(response.body)

In [None]:
# Count the documents in the index: the count API response carries the total
# under the "count" key, which is pulled out and printed below.
response = es.count(index='AfricaGenderData')
count = response["count"]

print(f"The number of documents in the index is {count}")

In [None]:
# Check whether the index exists. The name must match the one used by the rest
# of the notebook exactly; the earlier spelling 'AfricGenderData' never could.
response = es.indices.exists(index='AfricaGenderData')
response.body  # this should return True

In [None]:
#check if a document exists within the index
import json
from tqdm import tqdm

# Index the example records, remembering each generated id, then confirm the
# first one can be found. The records are written to the same index that the
# existence check below - and the update cells that reuse `document_ids` -
# query; writing them to a different index makes those lookups miss.
document_ids = []
with open("../data/exampledata.json", encoding="utf-8") as example_file:
    dummy_data = json.load(example_file)  # file is closed once parsing is done

for document in tqdm(dummy_data, total=len(dummy_data)):
    response = es.index(index='AfricaGenderData', body=document)
    document_ids.append(response['_id'])

response = es.exists(index='AfricaGenderData', id=document_ids[0])
response.body  # should return True

In [None]:
#updating a document
from pprint import pprint

# Overwrite the title through a script that takes the new value as a parameter.
title_script = {
    "source": "ctx._source.title = params.title",
    "params": {"title": "New Title"},
}
response = es.update(index="AfricaGenderData", id=document_ids[0], script=title_script)
pprint(response.body)

# Re-read the document to check the updated title.
response = es.get(index='AfricaGenderData', id=document_ids[0])
pprint(response.body)

In [None]:
# Adding a new field, shown two ways on the same document.

# Option 1: a script that assigns the new field.
field_script = {"source": "ctx._source.new_field = 'dummy_value'"}
response = es.update(index="AfricaGenderData", id=document_ids[0], script=field_script)
pprint(response.body)

# Option 2 (alternative): pass the new field as a partial `doc`.
extra_fields = {"new_value_2": "dummy_value_2"}
response = es.update(index="AfricaGenderData", id=document_ids[0], doc=extra_fields)
pprint(response.body)

# Re-read the document to check both updates.
response = es.get(index='AfricaGenderData', id=document_ids[0])
pprint(response.body)

In [None]:
# Removing a field: the script calls remove('new_field') on the document's _source.
removal_script = {"source": "ctx._source.remove('new_field')"}
response = es.update(index="AfricaGenderData", id=document_ids[0], script=removal_script)
pprint(response.body)

In [None]:
# Upsert: update document "1" with these fields; doc_as_upsert=True asks for
# the document to be inserted if it doesn't exist yet.
book_fields = {
    "book_id": 1234,
    "book_name": "A book",
}
response = es.update(index="AfricaGenderData", id="1", doc=book_fields, doc_as_upsert=True)

In [None]:
# Doing multiple operations at once. `operations` is a flat list: each action
# entry is followed by its payload entry (index -> the document, update -> the
# partial doc); the delete action has no payload.
# Every action targets 'my_index'. Document "1" in particular must be indexed
# there, because Action 4 updates id "1" in 'my_index'; indexing it into a
# different index leaves that update with nothing to modify and the bulk
# response then reports errors.
response = es.bulk(
    operations=[
        # Action 1: index document "1"
        {"index": {"_index": "my_index", "_id": "1"}},
        # Source 1
        {
            "title": "Sample Title 1",
            "text": "This is the first sample document text.",
            "created_on": "2024-09-22",
        },
        # Action 2: index document "2"
        {"index": {"_index": "my_index", "_id": "2"}},
        # Source 2
        {
            "title": "Sample Title 2",
            "text": "Here is another example of a document.",
            "created_on": "2024-09-24",
        },
        # Action 3: index document "3"
        {"index": {"_index": "my_index", "_id": "3"}},
        # Source 3
        {
            "title": "Sample Title 3",
            "text": "The content of the third document goes here.",
            "created_on": "2024-09-24",
        },
        # Action 4: change the title of document "1"
        {"update": {"_id": "1", "_index": "my_index"}},
        # Source 4
        {"doc": {"title": "New Title"}},
        # Action 5: add a field to document "2"
        {"update": {"_id": "2", "_index": "my_index"}},
        # Source 5
        {"doc": {"new_field": "dummy_value"}},
        # Action 6: delete document "3" (no payload entry follows)
        {"delete": {"_index": "my_index", "_id": "3"}},
    ],
)

pprint(response.body)

In [None]:
response.body["errors"] # should be False to show zero errors