# TP Elasticsearch


**Alumno: Ivan Fabio Gutierrez Pinedo DNI 33437838**

Utilizando el archivo G20 provisto durante la clase (https://bit.ly/g20_Diplo2020), indexar los tweets del archivo, asegurándose de definir el mapping de forma correcta para los campos de interés. Notar que el mapping correcto no sólo depende del tipo de datos, sino de las consultas que deben hacerse sobre los mismos. De ser necesario o conveniente, se podría elegir indexar un campo de más de una forma.
Los campos de interés son:
*  “text”, “created_at”, “id”, “user.location”, “user.followers_count”, “place.bounding_box”,  “source”, “entities.hashtags”, “timestamp_ms”, “retweeted”
Las consultas de interés son:
1. Poder buscar teniendo en cuenta el patrón de mayúsculas o minúsculas, por ejemplo: “President” o “PRESIDENT” en el campo ‘text’, solo debe devolver documentos que tengan la palabra escrita con el mismo patrón de mayúsculas y minúsculas.
2. Poder buscar por palabras que compartan una misma raíz, por ejemplo: “pray”, haría un match con tweets que contengan: “prays”, “prayer”, “praying”, “prayers” en el campo ‘text’.
3. Consulta para poder buscar una ubicación en el campo ‘user.location’ y que el matching sea exacto, por ejemplo para las consultas “York” y “California”, no debería devolver documentos en donde user.location sea “New York” o “LA, California” por ejemplo.
4. Consulta para poder buscar tweets cuyos usuarios hayan abierto sus cuentas en un rango de fechas.
5. Consulta para poder buscar tweets que hayan sido posteados en un área específica,por ejemplo, en el área de “Washington DC” usando el campo “place.bounding_box”.

Investigacion para realizar el TP 
* https://www.youtube.com/watch?v=ma3BC8aPBfE&ab_channel=AmineM.Boulouma

In [296]:
from elasticsearch import Elasticsearch, helpers
import os, uuid
import json
from datetime import datetime
import dateutil.parser

In [304]:
# Configuration

mapping = {
  "properties": {
    "text": {
      "type": "text",
      "analyzer": "my_analyzer",
      "fields": {
        "case_sensitive": {
          "type": "text",
          "analyzer": "case_sensitive"
        }        
      }      
    },
    "created_at": {
      "type": "date",
      "format": "E MMM dd HH:mm:ss Z yyyy"
    },
    "user": {
      "properties": {
        "location": { "type": "keyword" }
      }
    },
    "place": {
      "properties": {
        "bounding_box": {
          "type": "geo_shape"
          }
      }
    }  
  }
}

setting = {
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "my_stemmer"
          ]
        },
        "stem_analysis": {
          "tokenizer": "whitespace",
          "filter": [ "stemmer" ]
        },
      "case_sensitive" : {
             "type" : "custom",
             "tokenizer":    "standard",
             "filter": ["stop", "porter_stem" ]                    
         }
      },
      "filter": {
        "my_stemmer": {
          "type": "stemmer",
          "language": "english"#"light_english"#"english"
        }
      }
    },
    "index.mapping.total_fields.limit": 2000,
  }
}

In [305]:
mapping

{'properties': {'text': {'type': 'text',
   'analyzer': 'my_analyzer',
   'fields': {'case_sensitive': {'type': 'text',
     'analyzer': 'case_sensitive'}}},
  'created_at': {'type': 'date', 'format': 'E MMM dd HH:mm:ss Z yyyy'},
  'user': {'properties': {'location': {'type': 'keyword'}}},
  'place': {'properties': {'bounding_box': {'type': 'geo_shape'}}}}}

In [306]:
index_name = 'g20_index'
es = Elasticsearch(HOST="http://localhost", PORT=9200)
exist_index = es.indices.exists(index_name)
if exist_index :
    es.indices.delete(index_name)
es.indices.create(index_name, body=setting)



{'acknowledged': True, 'shards_acknowledged': True, 'index': 'g20test'}

In [307]:
es.indices.put_mapping(body=mapping, index=index_name)

{'acknowledged': True}

In [308]:
es.indices.put_settings(json.dumps(index_setting), index=index_name)
es.indices.get_settings(index_name)

{'g20test': {'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}},
    'mapping': {'total_fields': {'limit': '2000'}},
    'number_of_shards': '1',
    'provided_name': 'g20test',
    'creation_date': '1611447431632',
    'analysis': {'filter': {'my_stemmer': {'type': 'stemmer',
       'language': 'english'}},
     'analyzer': {'my_analyzer': {'filter': ['lowercase', 'my_stemmer'],
       'tokenizer': 'standard'},
      'stem_analysis': {'filter': ['stemmer'], 'tokenizer': 'whitespace'},
      'case_sensitive': {'filter': ['stop', 'porter_stem'],
       'type': 'custom',
       'tokenizer': 'standard'}}},
    'number_of_replicas': '1',
    'uuid': 'AmgD7lv4QDW3DxhvC0efHg',
    'version': {'created': '7100099'}}}}}

In [309]:
def get_data_from_file(file_name):
    if "/" in file_name or chr(92) in file_name:
        file = open(file_name, encoding="utf8", errors='ignore')
    else:
        # use the script_path() function to get path if none is passed
        file = open('./' + str(file_name), encoding="utf8", errors='ignore')
    data = [line.strip() for line in file]
    file.close()
    return data

def bulk_g20_ndjson_data(json_file, _index):
    json_list = get_data_from_file(json_file)
    index = None
    dict_index = None
    for doc in json_list:
        if '{"index"' in doc:
            dict_index = json.loads(doc)
        if '{"index"' not in doc and index != None:        
            yield {
                "_index": _index,
                #"_type": doc_type,
                "_id": dict_index.get('index').get('_id'),
                "_source": doc
            }
            dict_index = None
            
def bulk_g20_from_dict_data(g20_dict, _index):      
    for key, doc in g20_dict.items():
          
        yield {
            "_index": _index,
            #"_type": doc_type,
            "_id": key,
            "_source": doc
        }
        
def convert_g20_to_dict(file_name):    
    data = get_data_from_file('g20.json')
    index_list = []
    doc_list = []
    for row in data:
        if '{"index"' in row:
            dict_index = json.loads(row)
            index_list.append(dict_index)
        else :
            doc_list.append(row)


    g20_dict = { index_list[i].get('index').get('_id') : doc_list[i] for i in range(0, len(index_list)) } 
    return g20_dict


In [310]:
data = convert_g20_to_dict('g20.json')
helpers.bulk(es, bulk_g20_from_dict_data(data, index_name))

BulkIndexError: ('7 document(s) failed to index.', [{'index': {'_index': 'g20test', '_type': '_doc', '_id': '1059862707782987777', 'status': 400, 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse field [place.bounding_box] of type [geo_shape]', 'caused_by': {'type': 'x_content_parse_exception', 'reason': 'Failed to build [geojson] after last required field arrived', 'caused_by': {'type': 'illegal_argument_exception', 'reason': 'first and last points of the linear ring must be the same (it must close itself): x[0]=72.74484 x[3]=73.003648 y[0]=18.845343 y[3]=18.845343'}}}, 'data': '{"quote_count": 0, "contributors": null, "truncated": false, "text": "@realDonaldTrump  : Mr. Trump please make America great again, Vote for Obama\\u2019s party.", "is_quote_status": false, "in_reply_to_status_id": null, "reply_count": 0, "id": 1059862707782987777, "favorite_count": 0, "source": "<a href=\\"http://twitter.com/download/iphone\\" rel=\\"nofollow\\">Twitter for iPhone</a>", "retweeted": false, "coordinates": null, "timestamp_ms": "1541525935489", "entities": {"user_mentions": [{"id": 25073877, "indices": [0, 16], "id_str": "25073877", "screen_name": "realDonaldTrump", "name": "Donald J. Trump"}], "symbols": [], "hashtags": [], "urls": []}, "in_reply_to_screen_name": "realDonaldTrump", "id_str": "1059862707782987777", "retweet_count": 0, "in_reply_to_user_id": 25073877, "favorited": false, "user": {"follow_request_sent": null, "profile_use_background_image": true, "default_profile_image": false, "id": 110503319, "default_profile": false, "verified": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/469106999678861313/RQXTYjtr_normal.jpeg", "profile_sidebar_fill_color": "F6FFD1", "profile_text_color": "333333", "followers_count": 333, "profile_sidebar_border_color": "FFF8AD", "id_str": "110503319", "profile_background_color": "FFF04D", "listed_count": 5, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme19/bg.gif", "utc_offset": null, "statuses_count": 6944, "description": "I dont follow people on Twitter anymore, I realised they are lost too.", "friends_count": 746, "location": "Colaba, Mumbai", "profile_link_color": "0099CC", "profile_image_url": "http://pbs.twimg.com/profile_images/469106999678861313/RQXTYjtr_normal.jpeg", "following": null, "geo_enabled": true, "profile_background_image_url": "http://abs.twimg.com/images/themes/theme19/bg.gif", "name": "nader dabestani", "lang": "en", "profile_background_tile": false, "favourites_count": 66, "screen_name": "naderdabestani", "notifications": null, "url": null, "created_at": "Mon Feb 01 19:46:05 +0000 2010", "contributors_enabled": false, "time_zone": null, "protected": false, "translator_type": "none", "is_translator": false}, "geo": null, "in_reply_to_user_id_str": "25073877", "lang": "en", "created_at": "Tue Nov 06 17:38:55 +0000 2018", "filter_level": "low", "in_reply_to_status_id_str": null, "place": {"country_code": "IN", "url": "https://api.twitter.com/1.1/geo/id/7929cea6bd5b32bd.json", "country": "India", "place_type": "city", "bounding_box": {"type": "Polygon", "coordinates": [[[72.74484, 18.845343], [72.74484, 19.502937], [73.003648, 19.502937], [73.003648, 18.845343]]]}, "full_name": "Mumbai, India", "attributes": {}, "id": "7929cea6bd5b32bd", "name": "Mumbai"}}'}}, {'index': {'_index': 'g20test', '_type': '_doc', '_id': '1059862741937254401', 'status': 400, 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse field [place.bounding_box] of type [geo_shape]', 'caused_by': {'type': 'x_content_parse_exception', 'reason': 'Failed to build [geojson] after last required field arrived', 'caused_by': {'type': 'illegal_argument_exception', 'reason': 'first and last points of the linear ring must be the same (it must close itself): x[0]=88.758028 x[3]=92.124652 y[0]=26.698986 y[3]=26.698986'}}}, 'data': '{"quote_count": 0, "contributors": null, "truncated": true, "text": "@narendramodi @MamataOfficial @RamNanthKobind @amitabhbigb @AmitShahArmy @KBCsony This Deepawali-Ek Diya Saheedo ke\\u2026 https://t.co/vufxKBlKUS", "is_quote_status": false, "in_reply_to_status_id": null, "reply_count": 0, "id": 1059862741937254401, "favorite_count": 0, "entities": {"user_mentions": [{"id": 18839785, "indices": [0, 13], "id_str": "18839785", "screen_name": "narendramodi", "name": "Narendra Modi"}, {"id": 2526794479, "indices": [14, 29], "id_str": "2526794479", "screen_name": "MamataOfficial", "name": "Mamata Banerjee"}, {"id": 876764558215462914, "indices": [30, 45], "id_str": "876764558215462914", "screen_name": "RamNanthKobind", "name": "Ram Nath Kovind"}, {"id": 135122652, "indices": [46, 58], "id_str": "135122652", "screen_name": "amitabhbigb", "name": "Amitabh Bachhan"}, {"id": 2268748494, "indices": [59, 72], "id_str": "2268748494", "screen_name": "AmitShahArmy", "name": "Amit Shah Army"}, {"id": 339600819, "indices": [73, 81], "id_str": "339600819", "screen_name": "KBCsony", "name": "KBC"}], "symbols": [], "hashtags": [], "urls": [{"url": "https://t.co/vufxKBlKUS", "indices": [117, 140], "expanded_url": "https://twitter.com/i/web/status/1059862741937254401", "display_url": "twitter.com/i/web/status/1\\u2026"}]}, "retweeted": false, "coordinates": null, "timestamp_ms": "1541525943632", "source": "<a href=\\"http://twitter.com/download/iphone\\" rel=\\"nofollow\\">Twitter for iPhone</a>", "in_reply_to_screen_name": "narendramodi", "id_str": "1059862741937254401", "display_text_range": [0, 140], "retweet_count": 0, "in_reply_to_user_id": 18839785, "favorited": false, "user": {"follow_request_sent": null, "profile_use_background_image": true, "default_profile_image": false, "id": 89105979, "default_profile": true, "verified": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/832484784064471040/uHBK_qPh_normal.jpg", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "followers_count": 22, "profile_sidebar_border_color": "C0DEED", "id_str": "89105979", "profile_background_color": "C0DEED", "listed_count": 0, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "utc_offset": null, "statuses_count": 60, "description": "Panchayat Member at Jaigaon II Gram Panchayat", "friends_count": 27, "location": "West Bengal, India", "profile_link_color": "1DA1F2", "profile_image_url": "http://pbs.twimg.com/profile_images/832484784064471040/uHBK_qPh_normal.jpg", "following": null, "geo_enabled": true, "profile_banner_url": "https://pbs.twimg.com/profile_banners/89105979/1493631626", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "name": "JAYANT MUNDHRA", "lang": "en", "profile_background_tile": false, "favourites_count": 35, "screen_name": "jaybtn", "notifications": null, "url": null, "created_at": "Wed Nov 11 04:23:14 +0000 2009", "contributors_enabled": false, "time_zone": null, "protected": false, "translator_type": "none", "is_translator": false}, "geo": null, "in_reply_to_user_id_str": "18839785", "possibly_sensitive": false, "lang": "hi", "extended_tweet": {"display_text_range": [0, 148], "entities": {"user_mentions": [{"id": 18839785, "indices": [0, 13], "id_str": "18839785", "screen_name": "narendramodi", "name": "Narendra Modi"}, {"id": 2526794479, "indices": [14, 29], "id_str": "2526794479", "screen_name": "MamataOfficial", "name": "Mamata Banerjee"}, {"id": 876764558215462914, "indices": [30, 45], "id_str": "876764558215462914", "screen_name": "RamNanthKobind", "name": "Ram Nath Kovind"}, {"id": 135122652, "indices": [46, 58], "id_str": "135122652", "screen_name": "amitabhbigb", "name": "Amitabh Bachhan"}, {"id": 2268748494, "indices": [59, 72], "id_str": "2268748494", "screen_name": "AmitShahArmy", "name": "Amit Shah Army"}, {"id": 339600819, "indices": [73, 81], "id_str": "339600819", "screen_name": "KBCsony", "name": "KBC"}], "symbols": [], "hashtags": [], "urls": [], "media": [{"expanded_url": "https://twitter.com/jaybtn/status/1059862741937254401/photo/1", "display_url": "pic.twitter.com/LXRfSIc736", "url": "https://t.co/LXRfSIc736", "media_url_https": "https://pbs.twimg.com/media/DrVjhY9WoAASaW2.jpg", "id_str": "1059862712602238976", "sizes": {"large": {"h": 768, "resize": "fit", "w": 1024}, "small": {"h": 510, "resize": "fit", "w": 680}, "medium": {"h": 768, "resize": "fit", "w": 1024}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "indices": [149, 172], "type": "photo", "id": 1059862712602238976, "media_url": "http://pbs.twimg.com/media/DrVjhY9WoAASaW2.jpg"}]}, "extended_entities": {"media": [{"expanded_url": "https://twitter.com/jaybtn/status/1059862741937254401/photo/1", "display_url": "pic.twitter.com/LXRfSIc736", "url": "https://t.co/LXRfSIc736", "media_url_https": "https://pbs.twimg.com/media/DrVjhY9WoAASaW2.jpg", "id_str": "1059862712602238976", "sizes": {"large": {"h": 768, "resize": "fit", "w": 1024}, "small": {"h": 510, "resize": "fit", "w": 680}, "medium": {"h": 768, "resize": "fit", "w": 1024}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "indices": [149, 172], "type": "photo", "id": 1059862712602238976, "media_url": "http://pbs.twimg.com/media/DrVjhY9WoAASaW2.jpg"}]}, "full_text": "@narendramodi @MamataOfficial @RamNanthKobind @amitabhbigb @AmitShahArmy @KBCsony This Deepawali-Ek Diya Saheedo ke naam At Jaigaon-Border Of Bhutan https://t.co/LXRfSIc736"}, "created_at": "Tue Nov 06 17:39:03 +0000 2018", "filter_level": "low", "in_reply_to_status_id_str": null, "place": {"country_code": "BT", "url": "https://api.twitter.com/1.1/geo/id/3a6b696db46e9e1d.json", "country": "Bhutan", "place_type": "country", "bounding_box": {"type": "Polygon", "coordinates": [[[88.758028, 26.698986], [88.758028, 28.323778], [92.124652, 28.323778], [92.124652, 26.698986]]]}, "full_name": "Bhutan", "attributes": {}, "id": "3a6b696db46e9e1d", "name": "Bhutan"}}'}}, {'index': {'_index': 'g20test', '_type': '_doc', '_id': '1059862800229679105', 'status': 400, 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse field [place.bounding_box] of type [geo_shape]', 'caused_by': {'type': 'x_content_parse_exception', 'reason': 'Failed to build [geojson] after last required field arrived', 'caused_by': {'type': 'illegal_argument_exception', 'reason': 'first and last points of the linear ring must be the same (it must close itself): x[0]=-94.507512 x[3]=-94.46981 y[0]=39.194245 y[3]=39.194245'}}}, 'data': '{"quote_count": 0, "contributors": null, "truncated": false, "text": "@realDonaldTrump Who cares", "is_quote_status": false, "in_reply_to_status_id": 1059827782434328582, "reply_count": 0, "id": 1059862800229679105, "favorite_count": 0, "entities": {"user_mentions": [{"id": 25073877, "indices": [0, 16], "id_str": "25073877", "screen_name": "realDonaldTrump", "name": "Donald J. Trump"}], "symbols": [], "hashtags": [], "urls": []}, "retweeted": false, "coordinates": null, "timestamp_ms": "1541525957530", "source": "<a href=\\"http://twitter.com/download/iphone\\" rel=\\"nofollow\\">Twitter for iPhone</a>", "in_reply_to_screen_name": "realDonaldTrump", "id_str": "1059862800229679105", "display_text_range": [17, 26], "retweet_count": 0, "in_reply_to_user_id": 25073877, "favorited": false, "user": {"follow_request_sent": null, "profile_use_background_image": true, "default_profile_image": false, "id": 35817002, "default_profile": true, "verified": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/1004171132826783744/0CIpJVGU_normal.jpg", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "followers_count": 63, "profile_sidebar_border_color": "C0DEED", "id_str": "35817002", "profile_background_color": "C0DEED", "listed_count": 0, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "utc_offset": null, "statuses_count": 964, "description": null, "friends_count": 95, "location": "Liberty, MO", "profile_link_color": "1DA1F2", "profile_image_url": "http://pbs.twimg.com/profile_images/1004171132826783744/0CIpJVGU_normal.jpg", "following": null, "geo_enabled": true, "profile_banner_url": "https://pbs.twimg.com/profile_banners/35817002/1519325637", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "name": "Djackstra", "lang": "en", "profile_background_tile": false, "favourites_count": 4590, "screen_name": "Djackstra", "notifications": null, "url": null, "created_at": "Mon Apr 27 18:27:27 +0000 2009", "contributors_enabled": false, "time_zone": null, "protected": false, "translator_type": "none", "is_translator": false}, "geo": null, "in_reply_to_user_id_str": "25073877", "lang": "en", "created_at": "Tue Nov 06 17:39:17 +0000 2018", "filter_level": "low", "in_reply_to_status_id_str": "1059827782434328582", "place": {"country_code": "US", "url": "https://api.twitter.com/1.1/geo/id/003a09e92547a85d.json", "country": "United States", "place_type": "city", "bounding_box": {"type": "Polygon", "coordinates": [[[-94.507512, 39.194245], [-94.507512, 39.210172], [-94.46981, 39.210172], [-94.46981, 39.194245]]]}, "full_name": "Claycomo, MO", "attributes": {}, "id": "003a09e92547a85d", "name": "Claycomo"}}'}}, {'index': {'_index': 'g20test', '_type': '_doc', '_id': '1059862802964328448', 'status': 400, 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse field [place.bounding_box] of type [geo_shape]', 'caused_by': {'type': 'x_content_parse_exception', 'reason': 'Failed to build [geojson] after last required field arrived', 'caused_by': {'type': 'illegal_argument_exception', 'reason': 'first and last points of the linear ring must be the same (it must close itself): x[0]=-84.321948 x[3]=-75.40012 y[0]=33.752879 y[3]=33.752879'}}}, 'data': '{"quote_count": 0, "quoted_status_permalink": {"url": "https://t.co/GzNmAz4RQW", "expanded": "https://twitter.com/republocratist/status/1059860337493794816", "display": "twitter.com/republocratist\\u2026"}, "contributors": null, "truncated": false, "text": "@realDonaldTrump", "is_quote_status": true, "in_reply_to_status_id": null, "reply_count": 0, "id": 1059862802964328448, "favorite_count": 0, "entities": {"user_mentions": [{"id": 25073877, "indices": [0, 16], "id_str": "25073877", "screen_name": "realDonaldTrump", "name": "Donald J. Trump"}], "symbols": [], "hashtags": [], "urls": []}, "quoted_status_id": 1059860337493794816, "retweeted": false, "coordinates": null, "timestamp_ms": "1541525958182", "quoted_status": {"quote_count": 0, "contributors": null, "truncated": true, "text": "JUST IN: AZ POLLING PLACE In Heavily Republican Precinct Turns Away Voters\\u2026Writes Their Names and Cell Numbers On S\\u2026 https://t.co/PEy3hyMsPC", "is_quote_status": false, "in_reply_to_status_id": null, "reply_count": 0, "id": 1059860337493794816, "favorite_count": 1, "entities": {"user_mentions": [], "symbols": [], "hashtags": [], "urls": [{"url": "https://t.co/PEy3hyMsPC", "indices": [117, 140], "expanded_url": "https://twitter.com/i/web/status/1059860337493794816", "display_url": "twitter.com/i/web/status/1\\u2026"}]}, "retweeted": false, "coordinates": null, "source": "<a href=\\"https://ifttt.com\\" rel=\\"nofollow\\">IFTTT</a>", "in_reply_to_screen_name": null, "id_str": "1059860337493794816", "display_text_range": [0, 140], "retweet_count": 2, "in_reply_to_user_id": null, "favorited": false, "user": {"follow_request_sent": null, "profile_use_background_image": true, "default_profile_image": false, "id": 796776554915385344, "default_profile": true, "verified": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/796777139601145856/TQl8_sFK_normal.jpg", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "followers_count": 1415, "profile_sidebar_border_color": "C0DEED", "id_str": "796776554915385344", "profile_background_color": "F5F8FA", "listed_count": 13, "profile_background_image_url_https": "", "utc_offset": null, "statuses_count": 11454, "description": null, "friends_count": 4235, "location": null, "profile_link_color": "1DA1F2", "profile_image_url": "http://pbs.twimg.com/profile_images/796777139601145856/TQl8_sFK_normal.jpg", "following": null, "geo_enabled": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/796776554915385344/1478801435", "profile_background_image_url": "", "name": "republocratist", "lang": "en", "profile_background_tile": false, "favourites_count": 30, "screen_name": "republocratist", "notifications": null, "url": null, "created_at": "Thu Nov 10 18:08:28 +0000 2016", "contributors_enabled": false, "time_zone": null, "protected": false, "translator_type": "none", "is_translator": false}, "geo": null, "in_reply_to_user_id_str": null, "possibly_sensitive": false, "lang": "en", "extended_tweet": {"display_text_range": [0, 151], "entities": {"user_mentions": [], "symbols": [], "hashtags": [], "urls": [], "media": [{"expanded_url": "https://twitter.com/republocratist/status/1059860337493794816/photo/1", "display_url": "pic.twitter.com/rwdXVzCdZS", "url": "https://t.co/rwdXVzCdZS", "media_url_https": "https://pbs.twimg.com/media/DrVhXDyXcAEZZ2f.jpg", "id_str": "1059860336097062913", "sizes": {"small": {"h": 417, "resize": "fit", "w": 680}, "large": {"h": 454, "resize": "fit", "w": 740}, "medium": {"h": 454, "resize": "fit", "w": 740}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "indices": [152, 175], "type": "photo", "id": 1059860336097062913, "media_url": "http://pbs.twimg.com/media/DrVhXDyXcAEZZ2f.jpg"}]}, "extended_entities": {"media": [{"expanded_url": "https://twitter.com/republocratist/status/1059860337493794816/photo/1", "display_url": "pic.twitter.com/rwdXVzCdZS", "url": "https://t.co/rwdXVzCdZS", "media_url_https": "https://pbs.twimg.com/media/DrVhXDyXcAEZZ2f.jpg", "id_str": "1059860336097062913", "sizes": {"small": {"h": 417, "resize": "fit", "w": 680}, "large": {"h": 454, "resize": "fit", "w": 740}, "medium": {"h": 454, "resize": "fit", "w": 740}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "indices": [152, 175], "type": "photo", "id": 1059860336097062913, "media_url": "http://pbs.twimg.com/media/DrVhXDyXcAEZZ2f.jpg"}]}, "full_text": "JUST IN: AZ POLLING PLACE In Heavily Republican Precinct Turns Away Voters\\u2026Writes Their Names and Cell Numbers On Scratch Paper\\u2026\\u201dWe\\u2019ll get back to you\\u201d https://t.co/rwdXVzCdZS"}, "created_at": "Tue Nov 06 17:29:30 +0000 2018", "filter_level": "low", "in_reply_to_status_id_str": null, "place": null}, "source": "<a href=\\"http://twitter.com/download/iphone\\" rel=\\"nofollow\\">Twitter for iPhone</a>", "in_reply_to_screen_name": "realDonaldTrump", "id_str": "1059862802964328448", "retweet_count": 0, "in_reply_to_user_id": 25073877, "favorited": false, "user": {"follow_request_sent": null, "profile_use_background_image": false, "default_profile_image": false, "id": 141950326, "default_profile": false, "verified": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/978985864310218752/5EKcbxZJ_normal.jpg", "profile_sidebar_fill_color": "000000", "profile_text_color": "000000", "followers_count": 4760, "profile_sidebar_border_color": "000000", "id_str": "141950326", "profile_background_color": "000000", "listed_count": 11, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "utc_offset": null, "statuses_count": 40067, "description": "Chicago Cubs, Blackhawks, Bears, Bulls, Fire, ECU sports and UNC sports...even the Whitesox, ProLife, Pro 2 Admendment, father, husband, christian. #MAGA", "friends_count": 5217, "location": null, "profile_link_color": "1B95E0", "profile_image_url": "http://pbs.twimg.com/profile_images/978985864310218752/5EKcbxZJ_normal.jpg", "following": null, "geo_enabled": true, "profile_banner_url": "https://pbs.twimg.com/profile_banners/141950326/1519405645", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "name": "Chef Boutwell", "lang": "en", "profile_background_tile": false, "favourites_count": 39886, "screen_name": "chefboutwell", "notifications": null, "url": null, "created_at": "Sun May 09 14:02:46 +0000 2010", "contributors_enabled": false, "time_zone": null, "protected": false, "translator_type": "none", "is_translator": false}, "geo": null, "in_reply_to_user_id_str": "25073877", "lang": "und", "created_at": "Tue Nov 06 17:39:18 +0000 2018", "quoted_status_id_str": "1059860337493794816", "filter_level": "low", "in_reply_to_status_id_str": null, "place": {"country_code": "US", "url": "https://api.twitter.com/1.1/geo/id/3b98b02fba3f9753.json", "country": "United States", "place_type": "admin", "bounding_box": {"type": "Polygon", "coordinates": [[[-84.321948, 33.752879], [-84.321948, 36.588118], [-75.40012, 36.588118], [-75.40012, 33.752879]]]}, "full_name": "North Carolina, USA", "attributes": {}, "id": "3b98b02fba3f9753", "name": "North Carolina"}}'}}, {'index': {'_index': 'g20test', '_type': '_doc', '_id': '1059862812527345664', 'status': 400, 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse field [place.bounding_box] of type [geo_shape]', 'caused_by': {'type': 'x_content_parse_exception', 'reason': 'Failed to build [geojson] after last required field arrived', 'caused_by': {'type': 'illegal_argument_exception', 'reason': 'first and last points of the linear ring must be the same (it must close itself): x[0]=-85.551867 x[3]=-85.449557 y[0]=42.854485 y[3]=42.854485'}}}, 'data': '{"quote_count": 0, "quoted_status_permalink": {"url": "https://t.co/5zaS9tG6BY", "expanded": "https://twitter.com/icecube/status/1059832143843606528", "display": "twitter.com/icecube/status\\u2026"}, "contributors": null, "truncated": false, "text": "@realDonaldTrump i Know u still hurting from when Em got you but check this shit out.", "is_quote_status": true, "in_reply_to_status_id": null, "reply_count": 0, "id": 1059862812527345664, "favorite_count": 0, "entities": {"user_mentions": [{"id": 25073877, "indices": [0, 16], "id_str": "25073877", "screen_name": "realDonaldTrump", "name": "Donald J. Trump"}], "symbols": [], "hashtags": [], "urls": []}, "quoted_status_id": 1059832143843606528, "retweeted": false, "coordinates": null, "timestamp_ms": "1541525960462", "quoted_status": {"quote_count": 165, "contributors": null, "truncated": false, "text": "Cuffs are ready... Get out there and vote. #ArrestThePresident droppin this Friday #EverythangsCorrupt https://t.co/VwQaOiyyzB", "is_quote_status": false, "in_reply_to_status_id": null, "reply_count": 160, "id": 1059832143843606528, "favorite_count": 3072, "entities": {"user_mentions": [], "symbols": [], "hashtags": [{"indices": [43, 62], "text": "ArrestThePresident"}, {"indices": [83, 102], "text": "EverythangsCorrupt"}], "urls": [], "media": [{"additional_media_info": {"monetizable": false, "embeddable": true, "description": "", "title": ""}, "expanded_url": "https://twitter.com/icecube/status/1059832143843606528/video/1", "display_url": "pic.twitter.com/VwQaOiyyzB", "url": "https://t.co/VwQaOiyyzB", "media_url_https": "https://pbs.twimg.com/media/DrVGAFbX0AARWls.jpg", "id_str": "1059827544877359104", "sizes": {"large": {"h": 360, "resize": "fit", "w": 640}, "small": {"h": 360, "resize": "fit", "w": 640}, "medium": {"h": 360, "resize": "fit", "w": 640}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "indices": [103, 126], "type": "photo", "id": 1059827544877359104, "media_url": "http://pbs.twimg.com/media/DrVGAFbX0AARWls.jpg"}]}, "retweeted": false, "coordinates": null, "source": "<a href=\\"https://studio.twitter.com\\" rel=\\"nofollow\\">Media Studio</a>", "in_reply_to_screen_name": null, "id_str": "1059832143843606528", "display_text_range": [0, 102], "retweet_count": 1220, "in_reply_to_user_id": null, "favorited": false, "user": {"follow_request_sent": null, "profile_use_background_image": false, "default_profile_image": false, "id": 19671129, "default_profile": false, "verified": true, "profile_image_url_https": "https://pbs.twimg.com/profile_images/490531474018676736/r4CIt4u9_normal.jpeg", "profile_sidebar_fill_color": "252429", "profile_text_color": "666666", "followers_count": 4678888, "profile_sidebar_border_color": "181A1E", "id_str": "19671129", "profile_background_color": "1A1B1F", "listed_count": 7277, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme9/bg.gif", "utc_offset": null, "statuses_count": 4581, "description": "West Coast Warlord", "friends_count": 118, "location": "Los Angeles, CA", "profile_link_color": "FF0000", "profile_image_url": "http://pbs.twimg.com/profile_images/490531474018676736/r4CIt4u9_normal.jpeg", "following": null, "geo_enabled": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/19671129/1539275490", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme9/bg.gif", "name": "Ice Cube", "lang": "en", "profile_background_tile": true, "favourites_count": 745, "screen_name": "icecube", "notifications": null, "url": "https://big3.com/tickets/", "created_at": "Wed Jan 28 19:47:00 +0000 2009", "contributors_enabled": false, "time_zone": null, "protected": false, "translator_type": "none", "is_translator": false}, "geo": null, "in_reply_to_user_id_str": null, "possibly_sensitive": false, "lang": "en", "created_at": "Tue Nov 06 15:37:28 +0000 2018", "filter_level": "low", "in_reply_to_status_id_str": null, "place": null, "extended_entities": {"media": [{"additional_media_info": {"monetizable": false, "embeddable": true, "description": "", "title": ""}, "expanded_url": "https://twitter.com/icecube/status/1059832143843606528/video/1", "display_url": "pic.twitter.com/VwQaOiyyzB", "url": "https://t.co/VwQaOiyyzB", "media_url_https": "https://pbs.twimg.com/media/DrVGAFbX0AARWls.jpg", "video_info": {"aspect_ratio": [16, 9], "duration_millis": 43227, "variants": [{"url": "https://video.twimg.com/amplify_video/1059827544877359104/vid/1280x720/lgF_vw9FfHJGlfMP.mp4?tag=8", "bitrate": 2176000, "content_type": "video/mp4"}, {"url": "https://video.twimg.com/amplify_video/1059827544877359104/pl/EEMfV9D8fqLrnUhj.m3u8?tag=8", "content_type": "application/x-mpegURL"}, {"url": "https://video.twimg.com/amplify_video/1059827544877359104/vid/320x180/EpsR8TS9KlPIptBr.mp4?tag=8", "bitrate": 288000, "content_type": "video/mp4"}, {"url": "https://video.twimg.com/amplify_video/1059827544877359104/vid/640x360/upqUfXTpHicolNuA.mp4?tag=8", "bitrate": 832000, "content_type": "video/mp4"}]}, "id_str": "1059827544877359104", "sizes": {"large": {"h": 360, "resize": "fit", "w": 640}, "small": {"h": 360, "resize": "fit", "w": 640}, "medium": {"h": 360, "resize": "fit", "w": 640}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "indices": [103, 126], "type": "video", "id": 1059827544877359104, "media_url": "http://pbs.twimg.com/media/DrVGAFbX0AARWls.jpg"}]}}, "source": "<a href=\\"http://twitter.com/download/iphone\\" rel=\\"nofollow\\">Twitter for iPhone</a>", "in_reply_to_screen_name": "realDonaldTrump", "id_str": "1059862812527345664", "retweet_count": 0, "in_reply_to_user_id": 25073877, "favorited": false, "user": {"follow_request_sent": null, "profile_use_background_image": true, "default_profile_image": false, "id": 107485339, "default_profile": true, "verified": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/694022193345335296/XXymNUIq_normal.jpg", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "followers_count": 145, "profile_sidebar_border_color": "C0DEED", "id_str": "107485339", "profile_background_color": "C0DEED", "listed_count": 4, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "utc_offset": null, "statuses_count": 13833, "description": "Grand Rapids Mi", "friends_count": 227, "location": null, "profile_link_color": "1DA1F2", "profile_image_url": "http://pbs.twimg.com/profile_images/694022193345335296/XXymNUIq_normal.jpg", "following": null, "geo_enabled": true, "profile_banner_url": "https://pbs.twimg.com/profile_banners/107485339/1419109882", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "name": "Oscar", "lang": "en", "profile_background_tile": false, "favourites_count": 12254, "screen_name": "OscarCortez30", "notifications": null, "url": null, "created_at": "Fri Jan 22 18:28:29 +0000 2010", "contributors_enabled": false, "time_zone": null, "protected": false, "translator_type": "none", "is_translator": false}, "geo": null, "in_reply_to_user_id_str": "25073877", "lang": "en", "created_at": "Tue Nov 06 17:39:20 +0000 2018", "quoted_status_id_str": "1059832143843606528", "filter_level": "low", "in_reply_to_status_id_str": null, "place": {"country_code": "US", "url": "https://api.twitter.com/1.1/geo/id/00e12cc87130c6bf.json", "country": "United States", "place_type": "city", "bounding_box": {"type": "Polygon", "coordinates": [[[-85.551867, 42.854485], [-85.551867, 42.971813], [-85.449557, 42.971813], [-85.449557, 42.854485]]]}, "full_name": "Forest Hills, MI", "attributes": {}, "id": "00e12cc87130c6bf", "name": "Forest Hills"}}'}}, {'index': {'_index': 'g20test', '_type': '_doc', '_id': '1059862846576697344', 'status': 400, 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse field [place.bounding_box] of type [geo_shape]', 'caused_by': {'type': 'x_content_parse_exception', 'reason': 'Failed to build [geojson] after last required field arrived', 'caused_by': {'type': 'illegal_argument_exception', 'reason': 'first and last points of the linear ring must be the same (it must close itself): x[0]=-73.727776 x[3]=-71.786994 y[0]=40.950918 y[3]=40.950918'}}}, 'data': '{"quote_count": 0, "contributors": null, "truncated": false, "text": "@WhiteHouse 2 6 bags https://t.co/CQ9EVqkGAn", "is_quote_status": false, "in_reply_to_status_id": null, "reply_count": 0, "id": 1059862846576697344, "favorite_count": 0, "entities": {"user_mentions": [{"id": 822215673812119553, "indices": [0, 11], "id_str": "822215673812119553", "screen_name": "WhiteHouse", "name": "The White House"}], "symbols": [], "hashtags": [], "urls": [], "media": [{"expanded_url": "https://twitter.com/MR8DONRIKOCZAE/status/1059862846576697344/photo/1", "display_url": "pic.twitter.com/CQ9EVqkGAn", "url": "https://t.co/CQ9EVqkGAn", "media_url_https": "https://pbs.twimg.com/media/DrVjoQ2X4AAKg4O.jpg", "id_str": "1059862830684561408", "sizes": {"small": {"h": 453, "resize": "fit", "w": 680}, "large": {"h": 1280, "resize": "fit", "w": 1920}, "medium": {"h": 800, "resize": "fit", "w": 1200}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "indices": [21, 44], "type": "photo", "id": 1059862830684561408, "media_url": "http://pbs.twimg.com/media/DrVjoQ2X4AAKg4O.jpg"}]}, "retweeted": false, "coordinates": null, "timestamp_ms": "1541525968580", "source": "<a href=\\"http://twitter.com\\" rel=\\"nofollow\\">Twitter Web Client</a>", "in_reply_to_screen_name": "WhiteHouse", "id_str": "1059862846576697344", "display_text_range": [0, 20], "retweet_count": 0, "in_reply_to_user_id": 822215673812119553, "favorited": false, "user": {"follow_request_sent": null, "profile_use_background_image": true, "default_profile_image": false, "id": 2544795223, "default_profile": false, "verified": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/1058893931746869249/KBkJ8UQ0_normal.jpg", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "followers_count": 322, "profile_sidebar_border_color": "000000", "id_str": "2544795223", "profile_background_color": "050C0F", "listed_count": 51, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "utc_offset": null, "statuses_count": 86337, "description": "http://KICKSTARTER.COM/PROOFOFLYFE\\nhttp://SOUNDCLOUD.COM/DON-CZAE\\nhttp://SOUNDCLOUD.COM/RIKOCZAE58", "friends_count": 500, "location": "MRDONRIKOCZAE@AOL.COM", "profile_link_color": "4A913C", "profile_image_url": "http://pbs.twimg.com/profile_images/1058893931746869249/KBkJ8UQ0_normal.jpg", "following": null, "geo_enabled": true, "profile_banner_url": "https://pbs.twimg.com/profile_banners/2544795223/1446841480", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "name": "DONRIKOCZAE", "lang": "en", "profile_background_tile": true, "favourites_count": 49772, "screen_name": "MR8DONRIKOCZAE", "notifications": null, "url": "http://WWW.MYSPACE.COM/DONLYFEMUSIC", "created_at": "Wed Jun 04 00:38:19 +0000 2014", "contributors_enabled": false, "time_zone": null, "protected": false, "translator_type": "none", "is_translator": false}, "geo": null, "in_reply_to_user_id_str": "822215673812119553", "possibly_sensitive": false, "lang": "en", "created_at": "Tue Nov 06 17:39:28 +0000 2018", "filter_level": "low", "in_reply_to_status_id_str": null, "place": {"country_code": "US", "url": "https://api.twitter.com/1.1/geo/id/e86b380cfefcced5.json", "country": "United States", "place_type": "admin", "bounding_box": {"type": "Polygon", "coordinates": [[[-73.727776, 40.950918], [-73.727776, 42.050588], [-71.786994, 42.050588], [-71.786994, 40.950918]]]}, "full_name": "Connecticut, USA", "attributes": {}, "id": "e86b380cfefcced5", "name": "Connecticut"}, "extended_entities": {"media": [{"expanded_url": "https://twitter.com/MR8DONRIKOCZAE/status/1059862846576697344/photo/1", "display_url": "pic.twitter.com/CQ9EVqkGAn", "url": "https://t.co/CQ9EVqkGAn", "media_url_https": "https://pbs.twimg.com/media/DrVjoQ2X4AAKg4O.jpg", "id_str": "1059862830684561408", "sizes": {"small": {"h": 453, "resize": "fit", "w": 680}, "large": {"h": 1280, "resize": "fit", "w": 1920}, "medium": {"h": 800, "resize": "fit", "w": 1200}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "indices": [21, 44], "type": "photo", "id": 1059862830684561408, "media_url": "http://pbs.twimg.com/media/DrVjoQ2X4AAKg4O.jpg"}]}}'}}, {'index': {'_index': 'g20test', '_type': '_doc', '_id': '1059862884149350401', 'status': 400, 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse field [place.bounding_box] of type [geo_shape]', 'caused_by': {'type': 'x_content_parse_exception', 'reason': 'Failed to build [geojson] after last required field arrived', 'caused_by': {'type': 'illegal_argument_exception', 'reason': 'first and last points of the linear ring must be the same (it must close itself): x[0]=-75.563587 x[3]=-73.88506 y[0]=38.788657 y[3]=38.788657'}}}, 'data': '{"quote_count": 0, "contributors": null, "truncated": false, "text": "@realDonaldTrump You\\u2019re a LOSER! Only ones cheating are your FOLLOWERS", "is_quote_status": false, "in_reply_to_status_id": 1059470847751131138, "reply_count": 0, "id": 1059862884149350401, "favorite_count": 0, "entities": {"user_mentions": [{"id": 25073877, "indices": [0, 16], "id_str": "25073877", "screen_name": "realDonaldTrump", "name": "Donald J. Trump"}], "symbols": [], "hashtags": [], "urls": []}, "retweeted": false, "coordinates": null, "timestamp_ms": "1541525977538", "source": "<a href=\\"http://twitter.com/download/iphone\\" rel=\\"nofollow\\">Twitter for iPhone</a>", "in_reply_to_screen_name": "realDonaldTrump", "id_str": "1059862884149350401", "display_text_range": [17, 70], "retweet_count": 0, "in_reply_to_user_id": 25073877, "favorited": false, "user": {"follow_request_sent": null, "profile_use_background_image": true, "default_profile_image": false, "id": 870820904, "default_profile": true, "verified": false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/378800000326214972/327b321ca82a1fc008f8b2043b89750a_normal.jpeg", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "followers_count": 82, "profile_sidebar_border_color": "C0DEED", "id_str": "870820904", "profile_background_color": "C0DEED", "listed_count": 2, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "utc_offset": null, "statuses_count": 806, "description": null, "friends_count": 230, "location": "New Jersey", "profile_link_color": "1DA1F2", "profile_image_url": "http://pbs.twimg.com/profile_images/378800000326214972/327b321ca82a1fc008f8b2043b89750a_normal.jpeg", "following": null, "geo_enabled": true, "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "name": "karen reinecke", "lang": "en", "profile_background_tile": false, "favourites_count": 325, "screen_name": "ernislost", "notifications": null, "url": null, "created_at": "Wed Oct 10 00:15:54 +0000 2012", "contributors_enabled": false, "time_zone": null, "protected": false, "translator_type": "none", "is_translator": false}, "geo": null, "in_reply_to_user_id_str": "25073877", "lang": "en", "created_at": "Tue Nov 06 17:39:37 +0000 2018", "filter_level": "low", "in_reply_to_status_id_str": "1059470847751131138", "place": {"country_code": "US", "url": "https://api.twitter.com/1.1/geo/id/65b4760a2b411e11.json", "country": "United States", "place_type": "admin", "bounding_box": {"type": "Polygon", "coordinates": [[[-75.563587, 38.788657], [-75.563587, 41.357424], [-73.88506, 41.357424], [-73.88506, 38.788657]]]}, "full_name": "New Jersey, USA", "attributes": {}, "id": "65b4760a2b411e11", "name": "New Jersey"}}'}}])

---
1. Poder buscar teniendo en cuenta el patrón de mayúsculas o minúsculas, por ejemplo: “President” o “PRESIDENT” en el campo ‘text’, solo debe devolver documentos que tengan la palabra escrita con el mismo patrón de mayúsculas y minúsculas.


In [273]:
word = "PRESIDENT"
search_body = {
  "query": {
    "match": { "text.case_sensitive": word } 
  } 
  , "_source": ["text"]

}
es.search(body=search_body, index=index_name)

{'took': 5,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 6, 'relation': 'eq'},
  'max_score': 8.57464,
  'hits': [{'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059864699968372737',
    '_score': 8.57464,
    '_source': {'text': '@realDonaldTrump Great Job Mr. PRESIDENT! Finally.. https://t.co/QePjSzlIi8'}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059868987855724544',
    '_score': 7.782443,
    '_source': {'text': '@WhiteHouse @realDonaldTrump #HELLO..MY PRESIDENT PRAY FOR..EVER..CONQUER..SMILE..SECOND..45.'}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059869552916512771',
    '_score': 7.3309164,
    '_source': {'text': '@realDonaldTrump \nPOTUS:  It is time for obstructionists to work with you and accept that YOU are the PRESIDENT!\nLO… https://t.co/ETZ0A7d5y4'}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059869654993256449',
    '_score': 6.928910

In [274]:
word = "President"
search_body = {
  "query": {
    "match": { "text.case_sensitive": word } 
  } 
  , "_source": ["text"]

}
es.search(body=search_body, index=index_name)

{'took': 4,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 580, 'relation': 'eq'},
  'max_score': 3.9671803,
  'hits': [{'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059868781495894021',
    '_score': 3.9671803,
    '_source': {'text': '@realDonaldTrump Come on, Mr President!!!!'}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059868897493557248',
    '_score': 3.9671803,
    '_source': {'text': '@realDonaldTrump Weak Baby President.'}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059869201450569734',
    '_score': 3.9671803,
    '_source': {'text': '@realDonaldTrump Ok Mr. President!'}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059868532975026178',
    '_score': 3.8114316,
    '_source': {'text': '@realDonaldTrump Thank you President Klump!'}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059868985058041856',
    '_score': 3.8114316,
    '_sou

---
2. Poder buscar por palabras que compartan una misma raíz, por ejemplo: “pray”, haría un match con tweets que contengan: “prays”, “prayer”, “praying”, “prayers” en el campo ‘text’.


In [251]:
#Testing custom analyzer
analisis = {
  "analyzer" : "my_analyzer",
  "text" : "pray prayed praying prayer prayers"
}
es.indices.analyze(body=analisis, index=index_name)

{'tokens': [{'token': 'prai',
   'start_offset': 0,
   'end_offset': 4,
   'type': '<ALPHANUM>',
   'position': 0},
  {'token': 'prai',
   'start_offset': 5,
   'end_offset': 11,
   'type': '<ALPHANUM>',
   'position': 1},
  {'token': 'prai',
   'start_offset': 12,
   'end_offset': 19,
   'type': '<ALPHANUM>',
   'position': 2},
  {'token': 'prayer',
   'start_offset': 20,
   'end_offset': 26,
   'type': '<ALPHANUM>',
   'position': 3},
  {'token': 'prayer',
   'start_offset': 27,
   'end_offset': 34,
   'type': '<ALPHANUM>',
   'position': 4}]}

In [275]:
word = "praying"
search_body = {
  "query": {
    "match": { "text": word } 
  } 
  , "_source": ["text"]

}
es.search(body=search_body, index=index_name)

{'took': 5,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 11, 'relation': 'eq'},
  'max_score': 7.68461,
  'hits': [{'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059868987855724544',
    '_score': 7.68461,
    '_source': {'text': '@WhiteHouse @realDonaldTrump #HELLO..MY PRESIDENT PRAY FOR..EVER..CONQUER..SMILE..SECOND..45.'}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059864553037684736',
    '_score': 7.4821825,
    '_source': {'text': '@WhiteHouse @realDonaldTrump The American People are praying for you Mr President 🙏🇺🇸'}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059862775965446144',
    '_score': 7.4586577,
    '_source': {'text': '@POTUS   🙏🏻🙏🏻Pray For President Trump🙏🏻🙏🏻🙏🏻\n🙏🏻🙏🏻🙏🏻🙏🏻Pray For America🙏🏻🙏🏻🙏🏻🙏🏻🙏🏻\nIn the Mighty Name of Jesus Release… https://t.co/HY2yIhB7LR'}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059867694256795655',
    '_score

---
3. Consulta para poder buscar una ubicación en el campo ‘user.location’ y que el matching sea exacto, por ejemplo para las consultas “York” y “California”, no debería devolver documentos en donde user.location sea “New York” o “LA, California” por ejemplo.


In [277]:
location_string = 'California'
search_body = {
  "query": {
    "match": {
      "user.location": location_string
    }
  }
  , "_source": ["id", "user.location"]
}

es.search(body=search_body, index=index_name)

{'took': 4,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 5, 'relation': 'eq'},
  'max_score': 6.9929314,
  'hits': [{'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059867162557394944',
    '_score': 6.9929314,
    '_source': {'id': 1059867162557394944,
     'user': {'location': 'California'}}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059863755603947520',
    '_score': 6.9929314,
    '_source': {'id': 1059863755603947520,
     'user': {'location': 'California'}}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059867742466039808',
    '_score': 6.9929314,
    '_source': {'id': 1059867742466039808,
     'user': {'location': 'California'}}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059868585873395713',
    '_score': 6.9929314,
    '_source': {'id': 1059868585873395713,
     'user': {'location': 'California'}}},
   {'_index': 'g20test',
    '_type': '_doc',
  

---
4. Consulta para poder buscar tweets cuyos usuarios hayan abierto sus cuentas en un rango de fechas.


In [295]:
date_format ="%a %b %d %H:%M:%S %z %Y"

date_to = dateutil.parser.parse("2018-11-05T08:27:18-0000")
date_from = dateutil.parser.parse("2018-11-07T08:27:18-0000")

date_to_str = date_to.strftime(date_format)
date_from_str = date_from.strftime(date_format)

print(date_to_str)
print(date_from_str)

Mon Nov 05 08:27:18 +0000 2018
Wed Nov 07 08:27:18 +0000 2018


In [294]:
search_body = {
  "query": {
    "bool": {
      "must": [
        {
          "range": {
            "created_at": {
              "gte": date_to_str,
              "lte": date_from_str
            }
          }
        }
      ]
    }
  }
  , "_source": ["created_at"]
}

es.search(body=search_body, index=index_name)

{'took': 15,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 9567, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059866770843025408',
    '_score': 1.0,
    '_source': {'created_at': 'Tue Nov 06 17:55:04 +0000 2018'}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059866771212115971',
    '_score': 1.0,
    '_source': {'created_at': 'Tue Nov 06 17:55:04 +0000 2018'}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059866771006676992',
    '_score': 1.0,
    '_source': {'created_at': 'Tue Nov 06 17:55:04 +0000 2018'}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059866771547738113',
    '_score': 1.0,
    '_source': {'created_at': 'Tue Nov 06 17:55:04 +0000 2018'}},
   {'_index': 'g20test',
    '_type': '_doc',
    '_id': '1059866772696940544',
    '_score': 1.0,
    '_source': {'created_at': 'Tue Nov 06 17:55:04 +000

---
5. Consulta para poder buscar tweets que hayan sido posteados en un área específica,por ejemplo, en el área de “Washington DC” usando el campo “place.bounding_box”.