In [17]:
from elasticsearch import Elasticsearch
from utils import _pprint

es = Elasticsearch()
# usually put IP:Port as params to locate cluster,
# here localhost:9200 by default

In [47]:
# small query to fecth no of docs in our index
idxn = 'book-index'

doc_count = es.count( index = idxn,
                    body = {
                        'query' : {
                            'match_all' : {}
                        }
                    })

In [48]:
doc_count

{'count': 20,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}

In [12]:
# update each document with an _id field,
# as using this 'low level' field to filter is deprecated

es.update_by_query( index = idxn,
                  body = {
                      "script" : {
                          "source" : "ctx._source.book_id = doc['_id'].value",
                          "lang" : "painless"
                      }
                  })

RequestError: RequestError(400, 'script_exception', 'compile error')

In [29]:
def _get_book_fields():
    fields_as_keys = es.search( index = idxn,
                    body = {
                        'query' : {
                            'match_all' : {}
                        }
                    }).get('hits').get('hits')[0].\
                    get('_source').keys()
    return fields_as_keys

In [30]:
_get_book_fields()

dict_keys(['_author_str', '_num_pages', '_book_type', '_name', '_pages'])

In [75]:
# basics

def _count_all_docs():
    doc_count = es.count( index = idxn,
                    body = {
                        'query' : {
                            'match_all' : {}
                        }
                    }).get('count')
    return doc_count

def _query_to_matchlist(query_obj):
    ''' strips a query dsl result of its metadata,
        to only return the list of results
    '''
    return query_obj.get('hits').get('hits')
def _all_docs_regexp():
    ''' uses regexp query to get all docs in index'''
    _all_docs = es.search( index = idxn,
                    body = {
                        'query' : {
                            "regexp" : {
                                "_name" : {
                                    "value" : "@",
                                    "flags" : "ALL"
                                }
                            }
                        }
                    })
    return _query_to_matchlist(_all_docs)

def _all_docs_by_id(metadata=False):
    ''' as strangely the classical method fails,
        another way is to count all docs, then
        iterate over all ids.
        Supposes one knows the _id field is an int,
        with no skipped values in [1, _num_docs]
    '''
    doc_count = _count_all_docs()
    if metadata:
        docs = [
            es.get( index=idxn, id=i)
            for i in range(0,doc_count)
        ]
    else: 
        docs = [
            es.get( index=idxn, id=i).get('_source')
            for i in range(0,doc_count)
        ]
    return docs

In [76]:
for doc in _all_docs_by_id(metadata=True):
    print(doc['_id']) ; _pprint(doc , depth = 2)

print( 4 * '-=- ', 'Now without metadata', 4 * '-=- ')

for doc in _all_docs_by_id(metadata=False):
    _pprint(doc , depth = 1)

0
{   '_id': '0',
    '_index': 'book-index',
    '_primary_term': 2,
    '_seq_no': 41,
    '_source': {   '_author_str': ' George Bernard Shaw',
                   '_book_type': 'drama',
                   '_name': 'Pygmalion',
                   '_num_pages': 144,
                   '_pages': [...]},
    '_type': '_doc',
    '_version': 1,
    'found': True}
1
{   '_id': '1',
    '_index': 'book-index',
    '_primary_term': 2,
    '_seq_no': 42,
    '_source': {   '_author_str': ' William Shakespeare',
                   '_book_type': 'drama',
                   '_name': 'Hamlet',
                   '_num_pages': 184,
                   '_pages': [...]},
    '_type': '_doc',
    '_version': 2,
    'found': True}
2
{   '_id': '2',
    '_index': 'book-index',
    '_primary_term': 2,
    '_seq_no': 43,
    '_source': {   '_author_str': 'author of the work,',
                   '_book_type': 'drama',
                   '_name': 'Macbeth',
                   '_num_pages': 97,
           

In [73]:
# nasty queries which use python to update fields,
# instead of using the much-much faster update_by_query
# method & confusing Query DSL ...

def _nasty_add_book_id():
    for i,doc in enumerate(_all_docs_by_id(metadata=False)):
        doc['book_id'] = i
        es.index( index = idxn,
                body = doc,
                id = i)

In [77]:
_nasty_add_book_id() # call nasty updater

for doc in _all_docs_by_id(metadata=False):
    _pprint(doc , depth = 1)
    
# check there if field added

{   '_author_str': ' George Bernard Shaw',
    '_book_type': 'drama',
    '_name': 'Pygmalion',
    '_num_pages': 144,
    '_pages': [...],
    'book_id': 0}
{   '_author_str': ' William Shakespeare',
    '_book_type': 'drama',
    '_name': 'Hamlet',
    '_num_pages': 184,
    '_pages': [...],
    'book_id': 1}
{   '_author_str': 'author of the work,',
    '_book_type': 'drama',
    '_name': 'Macbeth',
    '_num_pages': 97,
    '_pages': [...],
    'book_id': 2}
{   '_author_str': ' Oscar Wilde',
    '_book_type': 'drama',
    '_name': 'The Importance of Being Earnest',
    '_num_pages': 79,
    '_pages': [...],
    'book_id': 3}
{   '_author_str': ' Edgar Allan Poe',
    '_book_type': 'thriller',
    '_name': 'Collected Works of Poe',
    '_num_pages': 208,
    '_pages': [...],
    'book_id': 4}
{   '_author_str': ' Edgar Allan Poe',
    '_book_type': 'thriller',
    '_name': 'Collected Works of Poe',
    '_num_pages': 208,
    '_pages': [...],
    'book_id': 5}
{   '_author_str': ' F

In [82]:
_count_all_docs()

20