In [1]:
import json
import logging
import elasticsearch
from elasticsearch import helpers

In [2]:
def connect_elasticssearch():
    config = {
        'host': 'localhost',
        'port': '9200'
    }
    _es = elasticsearch.Elasticsearch([config,], timeout=300)
    if _es.ping():
        print('Conectado!')
    else:
        print('Não há conexão com o Elasticsearch')
    return _es

if __name__ == '__main__':
  logging.basicConfig(level=logging.ERROR)


In [3]:
def create_index(es_object, index_name):
    created = False
    #index settings
    settings = {
        "settings":{
            "number_of_shards": 1,
            "number_of_replicas": 0
        }
    }
    
    try:
        if not es_object.indices.exists(index_name):
                # Ignore 400 means to ignore "Index Already Exist" error.
                es_object.indices.create(index=index_name, body=settings)
                print('Created Index')
        created = True
    except Exception as ex:
        print(str(ex))
    finally:
            return created


In [4]:
def get_indices(es_object):
    return es_object.indices.get_alias("*")

In [5]:
def delete_all_indices(es_object):
    
    for key in es_object.indices.get_alias("*").keys():
        es_object.indices.delete(index=key)

In [6]:
def get_data_from_file(filename):
    data = [l.strip() for l in open(filename, encoding="utf8", errors='ignore')]
    return data

In [7]:
def bulk_insert_from_file(es_object, filename):
    data = get_data_from_file(filename)
    response = helpers.bulk(es_object, 
                            data,
                            index = 'accounts',
                            doc_type = '_doc'
                           )

In [14]:
def get_all_documents(es_object, index_name, size):
    
    body = {
        'size': size,
        'query': {
            'match_all': {}
            },
        'sort': [{ 'account_number': 'asc' }] 
    }

    response = es_object.search(index = index_name, doc_type = '_doc', body = body)

    return [doc for doc in response['hits']['hits']]

In [17]:
def get_n_documents(es_object, index_name, size, start):
    body = {
        'query': {
            'match_all': {}
        },
        'sort': [{'account_number': 'asc'}],
        'from': start,
        'size': size
    }
    
    response = es_object.search(index = index_name, doc_type = '_doc', body = body)
    
    return [doc for doc in response['hits']['hits']]

In [21]:
def get_document_by_address(es_object, index_name, address):
    body = {
        'query':{
            'match':{
                'address': address
            }
        }
    }
    
    response = es_object.search(index = index_name, body = body)
    
    return [doc for doc in response['hits']['hits']]

In [23]:
def get_document_by_address_strict(es_object, index_name, address):
    body = {
        'query':{
            'match_phrase':{
                'address': address
            }
        }
    }
    
    response = es_object.search(index = index_name, body = body)
    
    return [doc for doc in response['hits']['hits']]

In [27]:
def get_document_by_age_and_state(es_object, index_name, age, state):
    body = {
        'query': {
            'bool': {
                'must': [
                    {
                        'match' : {'age': age}
                    }
                ],
                'must_not': [
                    {'match': {'state': state}}
                ]
            }
        }
    }
    
    response = es_object.search(index = index_name, body = body)
    return [doc for doc in response['hits']['hits']]

In [31]:
def get_documents_in_balance_range(es_object, index_name, gte, lte):
    body = {
        'query': {
            'bool': {
                'must': {
                    'match_all': {}
                },
                'filter': {
                    'range':{
                        'balance':{
                            'gte': gte,
                            'lte': lte
                        }
                    }
                }
            } 
        }
    }
    
    response = es_object.search(index = index_name, body = body)
    return [doc for doc in response['hits']['hits']]

In [45]:
def aggregate_accounts_by_state(es_object, index_name):
    body = {
        'size': 0,
        'aggs': {
            'group_by_state': {
                'terms': {
                    'field': 'state.keyword'
                }
            }
        }
    }
    
    response = es_object.search(index = index_name, body = body)
    print(response)
    return response['aggregations']['group_by_state']

In [9]:
es_object = connect_elasticssearch()

Conectado!


In [30]:
create_index(es_object, 'accounts')

Created Index


True

In [31]:
get_indices(es_object)

{'accounts': {'aliases': {}}}

In [66]:
bulk_insert_from_file(es_object, 'accounts.json')

In [15]:
get_all_documents(es_object, "accounts", 1000)

[{'_index': 'accounts',
  '_type': '_doc',
  '_id': '4XRYanQBXWwj1NjsoQA7',
  '_score': None,
  '_source': {'account_number': 0,
   'balance': 16623,
   'firstname': 'Bradshaw',
   'lastname': 'Mckenzie',
   'age': 29,
   'gender': 'F',
   'address': '244 Columbus Place',
   'employer': 'Euron',
   'email': 'bradshawmckenzie@euron.com',
   'city': 'Hobucken',
   'state': 'CO'},
  'sort': [0]},
 {'_index': 'accounts',
  '_type': '_doc',
  '_id': 'snRaanQBXWwj1Njsbwhf',
  '_score': None,
  '_source': {'account_number': 0,
   'balance': 16623,
   'firstname': 'Bradshaw',
   'lastname': 'Mckenzie',
   'age': 29,
   'gender': 'F',
   'address': '244 Columbus Place',
   'employer': 'Euron',
   'email': 'bradshawmckenzie@euron.com',
   'city': 'Hobucken',
   'state': 'CO'},
  'sort': [0]},
 {'_index': 'accounts',
  '_type': '_doc',
  '_id': 'gnRfanQBXWwj1Njs_hCW',
  '_score': None,
  '_source': {'account_number': 0,
   'balance': 16623,
   'firstname': 'Bradshaw',
   'lastname': 'Mckenzie',
 

In [18]:
get_n_documents(es_object, "accounts", 10, 10)

[{'_index': 'accounts',
  '_type': '_doc',
  '_id': 'YnRfanQBXWwj1Njs_Q3Z',
  '_score': None,
  '_source': {'account_number': 2,
   'balance': 28838,
   'firstname': 'Roberta',
   'lastname': 'Bender',
   'age': 22,
   'gender': 'F',
   'address': '560 Kingsway Place',
   'employer': 'Chillium',
   'email': 'robertabender@chillium.com',
   'city': 'Bennett',
   'state': 'LA'},
  'sort': [2]},
 {'_index': 'accounts',
  '_type': '_doc',
  '_id': 'MnRganQBXWwj1NjsHhX5',
  '_score': None,
  '_source': {'account_number': 2,
   'balance': 28838,
   'firstname': 'Roberta',
   'lastname': 'Bender',
   'age': 22,
   'gender': 'F',
   'address': '560 Kingsway Place',
   'employer': 'Chillium',
   'email': 'robertabender@chillium.com',
   'city': 'Bennett',
   'state': 'LA'},
  'sort': [2]},
 {'_index': 'accounts',
  '_type': '_doc',
  '_id': 'cXRYanQBXWwj1NjsoQK-',
  '_score': None,
  '_source': {'account_number': 3,
   'balance': 44947,
   'firstname': 'Levine',
   'lastname': 'Burks',
   'age'

In [22]:
get_document_by_address(es_object, 'accounts', 'mill lane')

[{'_index': 'accounts',
  '_type': '_doc',
  '_id': 'h3NYanQBXWwj1NjsoP-Y',
  '_score': 9.61595,
  '_source': {'account_number': 136,
   'balance': 45801,
   'firstname': 'Winnie',
   'lastname': 'Holland',
   'age': 38,
   'gender': 'M',
   'address': '198 Mill Lane',
   'employer': 'Neteria',
   'email': 'winnieholland@neteria.com',
   'city': 'Urie',
   'state': 'IL'}},
 {'_index': 'accounts',
  '_type': '_doc',
  '_id': 'WHRaanQBXWwj1Njsbwce',
  '_score': 9.61595,
  '_source': {'account_number': 136,
   'balance': 45801,
   'firstname': 'Winnie',
   'lastname': 'Holland',
   'age': 38,
   'gender': 'M',
   'address': '198 Mill Lane',
   'employer': 'Neteria',
   'email': 'winnieholland@neteria.com',
   'city': 'Urie',
   'state': 'IL'}},
 {'_index': 'accounts',
  '_type': '_doc',
  '_id': 'KHRfanQBXWwj1Njs_g8_',
  '_score': 9.61595,
  '_source': {'account_number': 136,
   'balance': 45801,
   'firstname': 'Winnie',
   'lastname': 'Holland',
   'age': 38,
   'gender': 'M',
   'addre

In [24]:
get_document_by_address_strict(es_object, 'accounts', 'mill lane')

[{'_index': 'accounts',
  '_type': '_doc',
  '_id': 'h3NYanQBXWwj1NjsoP-Y',
  '_score': 9.61595,
  '_source': {'account_number': 136,
   'balance': 45801,
   'firstname': 'Winnie',
   'lastname': 'Holland',
   'age': 38,
   'gender': 'M',
   'address': '198 Mill Lane',
   'employer': 'Neteria',
   'email': 'winnieholland@neteria.com',
   'city': 'Urie',
   'state': 'IL'}},
 {'_index': 'accounts',
  '_type': '_doc',
  '_id': 'WHRaanQBXWwj1Njsbwce',
  '_score': 9.61595,
  '_source': {'account_number': 136,
   'balance': 45801,
   'firstname': 'Winnie',
   'lastname': 'Holland',
   'age': 38,
   'gender': 'M',
   'address': '198 Mill Lane',
   'employer': 'Neteria',
   'email': 'winnieholland@neteria.com',
   'city': 'Urie',
   'state': 'IL'}},
 {'_index': 'accounts',
  '_type': '_doc',
  '_id': 'KHRfanQBXWwj1Njs_g8_',
  '_score': 9.61595,
  '_source': {'account_number': 136,
   'balance': 45801,
   'firstname': 'Winnie',
   'lastname': 'Holland',
   'age': 38,
   'gender': 'M',
   'addre

In [28]:
get_document_by_age_and_state(es_object, 'accounts', 40, 'ID')

[{'_index': 'accounts',
  '_type': '_doc',
  '_id': '7XNYanQBXWwj1Njsnvzr',
  '_score': 1.0,
  '_source': {'account_number': 474,
   'balance': 35896,
   'firstname': 'Obrien',
   'lastname': 'Walton',
   'age': 40,
   'gender': 'F',
   'address': '192 Ide Court',
   'employer': 'Suremax',
   'email': 'obrienwalton@suremax.com',
   'city': 'Crucible',
   'state': 'UT'}},
 {'_index': 'accounts',
  '_type': '_doc',
  '_id': '73NYanQBXWwj1Njsnvzr',
  '_score': 1.0,
  '_source': {'account_number': 479,
   'balance': 31865,
   'firstname': 'Cameron',
   'lastname': 'Ross',
   'age': 40,
   'gender': 'M',
   'address': '904 Bouck Court',
   'employer': 'Telpod',
   'email': 'cameronross@telpod.com',
   'city': 'Nord',
   'state': 'MO'}},
 {'_index': 'accounts',
  '_type': '_doc',
  '_id': 'C3NYanQBXWwj1Njsnv3s',
  '_score': 1.0,
  '_source': {'account_number': 549,
   'balance': 1932,
   'firstname': 'Jacqueline',
   'lastname': 'Maxwell',
   'age': 40,
   'gender': 'M',
   'address': '444 S

In [32]:
get_documents_in_balance_range(es_object, 'accounts', 20000, 30000)

[{'_index': 'accounts',
  '_type': '_doc',
  '_id': 'Q3NYanQBXWwj1Njsnvzq',
  '_score': 1.0,
  '_source': {'account_number': 49,
   'balance': 29104,
   'firstname': 'Fulton',
   'lastname': 'Holt',
   'age': 23,
   'gender': 'F',
   'address': '451 Humboldt Street',
   'employer': 'Anocha',
   'email': 'fultonholt@anocha.com',
   'city': 'Sunriver',
   'state': 'RI'}},
 {'_index': 'accounts',
  '_type': '_doc',
  '_id': 'WXNYanQBXWwj1Njsnvzq',
  '_score': 1.0,
  '_source': {'account_number': 102,
   'balance': 29712,
   'firstname': 'Dena',
   'lastname': 'Olson',
   'age': 27,
   'gender': 'F',
   'address': '759 Newkirk Avenue',
   'employer': 'Hinway',
   'email': 'denaolson@hinway.com',
   'city': 'Choctaw',
   'state': 'NJ'}},
 {'_index': 'accounts',
  '_type': '_doc',
  '_id': 'ZXNYanQBXWwj1Njsnvzq',
  '_score': 1.0,
  '_source': {'account_number': 133,
   'balance': 26135,
   'firstname': 'Deena',
   'lastname': 'Richmond',
   'age': 36,
   'gender': 'F',
   'address': '646 Und

In [46]:
aggregate_accounts_by_state(es_object, 'accounts')

{'took': 5, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 8000, 'relation': 'eq'}, 'max_score': None, 'hits': []}, 'aggregations': {'group_by_state': {'doc_count_error_upper_bound': 0, 'sum_other_doc_count': 2972, 'buckets': [{'key': 'TX', 'doc_count': 120}, {'key': 'MD', 'doc_count': 112}, {'key': 'ID', 'doc_count': 108}, {'key': 'AL', 'doc_count': 100}, {'key': 'ME', 'doc_count': 100}, {'key': 'TN', 'doc_count': 100}, {'key': 'WY', 'doc_count': 100}, {'key': 'DC', 'doc_count': 96}, {'key': 'MA', 'doc_count': 96}, {'key': 'ND', 'doc_count': 96}]}}}


{'doc_count_error_upper_bound': 0,
 'sum_other_doc_count': 2972,
 'buckets': [{'key': 'TX', 'doc_count': 120},
  {'key': 'MD', 'doc_count': 112},
  {'key': 'ID', 'doc_count': 108},
  {'key': 'AL', 'doc_count': 100},
  {'key': 'ME', 'doc_count': 100},
  {'key': 'TN', 'doc_count': 100},
  {'key': 'WY', 'doc_count': 100},
  {'key': 'DC', 'doc_count': 96},
  {'key': 'MA', 'doc_count': 96},
  {'key': 'ND', 'doc_count': 96}]}