# Elasticsearch

In [None]:
import os
from elasticsearch import Elasticsearch, helpers, NotFoundError
import json
from datetime import datetime

In [None]:
# Connection credentials. Each value can be overridden through an environment
# variable so that real secrets do not have to be edited into (or committed
# with) the notebook; the literals remain only as fallbacks for the local demo.
USER = os.environ.get("ES_USER", "elastic")
PWD = os.environ.get("ES_PASSWORD", "mXwp5dz4")
API_KEY = os.environ.get(
    "ES_API_KEY",
    "a1JVeVo1VUJBdUJsalpERXYwNXg6RnpOeEZvUnRTTC0xZVJDQ0ZacHhRdw==",
)

In [None]:
# Open a client for the Elasticsearch instance at localhost:9200,
# authenticating with HTTP basic auth.
client = Elasticsearch(
    "http://localhost:9200",
    basic_auth=(USER, PWD),
)

In [None]:
# Sanity check: request basic cluster information to confirm that the
# connection and the credentials work.
client.info()

In [None]:
# Names of the Elasticsearch indices queried throughout the notebook.
nyc_index: str = "nyc_restaurants"  # restaurant inspection records
nyc_wiki_index: str = "nyc_restaurants_wiki"  # documents searched via their "wiki" field

### New York City Restaurants JSON dataset

### Boolean operators

- `must` operator: AND
- `should` operator: OR
- `must_not` operator: NOT

#### Q1: Find all Italian restaurants that were inspected in Manhattan.

In [None]:
# Q1: every clause listed under "must" has to match (logical AND).
italian_in_manhattan = [
    {"match": {"CUISINE DESCRIPTION": "Italian"}},
    {"match": {"BORO": "Manhattan"}},
    # Only keep documents that actually have a DBA value.
    {"exists": {"field": "DBA"}},
]
query = {
    "query": {"bool": {"must": italian_in_manhattan}},
    "_source": ["DBA", "BORO", "CUISINE DESCRIPTION"],
    "size": 54,
}

response = client.search(index=nyc_index, body=query)
hits = response["hits"]
print(hits["total"]["value"])
for hit in hits["hits"]:
    print(hit["_source"])

#### Q2: Find all inspected restaurants that have Italian or Mexican in their "CUISINE DESCRIPTION".

In [None]:
# Q2: "should" clauses are alternatives (logical OR); with
# "minimum_should_match": 1 at least one of them has to match.
cuisine_alternatives = [
    {"match": {"CUISINE DESCRIPTION": "Italian"}},
    {"match": {"CUISINE DESCRIPTION": "Mexican"}},
]
query = {
    "query": {
        "bool": {
            "should": cuisine_alternatives,
            "minimum_should_match": 1,
        }
    },
    "_source": ["DBA", "BORO", "CUISINE DESCRIPTION"],
    "size": 200,
}

response = client.search(index=nyc_index, body=query)
hits = response["hits"]
print(hits["total"]["value"])
for hit in hits["hits"]:
    print(hit["_source"])

#### Q3: Find all inspected restaurants that are not in the Bronx.

In [None]:
# Q3: keep documents that have a BORO value ("must") and drop the ones whose
# BORO matches "Bronx" ("must_not", logical NOT).
# NOTE: no "size" is set, so only Elasticsearch's default page of hits
# (10 unless configured otherwise) is printed, even if the total is larger.
has_borough = {"exists": {"field": "BORO"}}
in_bronx = {"match": {"BORO": "Bronx"}}
query = {
    "query": {
        "bool": {
            "must": [has_borough],
            "must_not": [in_bronx],
        }
    },
    "_source": ["DBA", "BORO"],
}

response = client.search(index=nyc_index, body=query)
hits = response["hits"]
print(hits["total"]["value"])
for hit in hits["hits"]:
    print(hit["_source"])

#### Q4: Find all inspected restaurants that have Latin and American in their "CUISINE DESCRIPTION".

In [None]:
# Q4 (variant 1): with "operator": "and" every term of the query string
# ("Latin" and "American") has to be present in the field.
latin_american = {
    "query": "Latin American",
    "fields": ["CUISINE DESCRIPTION"],
    "operator": "and",
}
query = {
    "query": {"multi_match": latin_american},
    "size": 200,
}

response = client.search(index=nyc_index, body=query)
hits = response["hits"]
print(hits["total"]["value"])
for hit in hits["hits"]:
    source = hit["_source"]
    print(source["DBA"], source["CUISINE DESCRIPTION"])

In [None]:
# Q4 (variant 2): "match_phrase" looks for the terms as a phrase, i.e. in this
# order, instead of as independent terms.
phrase_clause = {"CUISINE DESCRIPTION": "Latin American"}
query = {
    "query": {"match_phrase": phrase_clause},
    "size": 200,
}

response = client.search(index=nyc_index, body=query)
hits = response["hits"]
print(hits["total"]["value"])
for hit in hits["hits"]:
    source = hit["_source"]
    print(source["DBA"], source["CUISINE DESCRIPTION"])

### Aggregations

#### Q5: How many restaurants are listed in the dataset?

In [None]:
# Q5: "value_count" counts the values found in DBA.keyword across all
# documents (it does not de-duplicate names). "size": 0 suppresses the hit
# list so that only the aggregation result is returned.
dba_value_count = {"value_count": {"field": "DBA.keyword"}}
query = {
    "query": {"match_all": {}},
    "size": 0,
    "aggs": {"total_restaurants": dba_value_count},
}

response = client.search(index=nyc_index, body=query)
aggregations = response["aggregations"]
total_count = aggregations["total_restaurants"]["value"]
print(f"Total number of restaurants: {total_count}")

#### Q6: Find the total score of all restaurant inspections in the dataset.

In [None]:
# Q6: sum the SCORE field over all documents; no hits are requested.
score_sum = {"sum": {"field": "SCORE"}}
query = {
    "size": 0,
    "aggs": {"total_score": score_sum},
}

response = client.search(index=nyc_index, body=query)
aggregations = response["aggregations"]
print(aggregations["total_score"]["value"])

#### Q7: What is the average score of restaurant inspections?

In [None]:
# Q7: arithmetic mean of the SCORE field over all documents.
score_avg = {"avg": {"field": "SCORE"}}
query = {
    "size": 0,
    "aggs": {"average_score": score_avg},
}

response = client.search(index=nyc_index, body=query)
aggregations = response["aggregations"]
print(aggregations["average_score"]["value"])

#### Q8: What is the minimum score of restaurant inspections?

In [None]:
# Q8: smallest SCORE value found in the index.
score_min = {"min": {"field": "SCORE"}}
query = {
    "size": 0,
    "aggs": {"min_score": score_min},
}

response = client.search(index=nyc_index, body=query)
aggregations = response["aggregations"]
print(aggregations["min_score"]["value"])

#### Q9: What is the maximum score of restaurant inspections?

In [None]:
# Q9: largest SCORE value found in the index.
score_max = {"max": {"field": "SCORE"}}
query = {
    "size": 0,
    "aggs": {"max_score": score_max},
}

response = client.search(index=nyc_index, body=query)
aggregations = response["aggregations"]
print(aggregations["max_score"]["value"])

#### Q10: What are the top 20 most common cuisine types among the inspected restaurants?

In [None]:
# Q10: a "terms" aggregation groups documents by the exact (keyword) cuisine
# value; "size": 20 inside the aggregation limits the output to 20 buckets.
top_cuisines = {
    "terms": {
        "field": "CUISINE DESCRIPTION.keyword",
        "size": 20,
    }
}
query = {
    "size": 0,
    "aggs": {"cuisine_count": top_cuisines},
}

response = client.search(index=nyc_index, body=query)
cuisine_buckets = response["aggregations"]["cuisine_count"]["buckets"]
for bucket in cuisine_buckets:
    print(bucket["key"], ":", bucket["doc_count"])

#### Q11: How many restaurants fall into each inspection score range (intervals of 5)?

In [None]:
# Q11: bucket the documents by SCORE in fixed-width intervals of 5; each
# bucket is printed as "<bucket key> : <number of documents>".
score_buckets = {
    "histogram": {
        "field": "SCORE",
        "interval": 5,
    }
}
query = {
    "size": 0,
    "aggs": {"score_histogram": score_buckets},
}

response = client.search(index=nyc_index, body=query)
histogram_buckets = response["aggregations"]["score_histogram"]["buckets"]
for bucket in histogram_buckets:
    print(bucket["key"], ":", bucket["doc_count"])

#### Q12: How many unique cuisine types have been inspected?

In [None]:
# Q12: "cardinality" reports the number of distinct values of the keyword
# cuisine field (Elasticsearch computes this count approximately).
distinct_cuisines = {"cardinality": {"field": "CUISINE DESCRIPTION.keyword"}}
query = {
    "size": 0,
    "aggs": {"unique_cuisines": distinct_cuisines},
}

response = client.search(index=nyc_index, body=query)
aggregations = response["aggregations"]
print(aggregations["unique_cuisines"]["value"])

### Wikidump dataset

#### Q13: Find all wikidump documents that mention "India".

In [None]:
# Q13: full-text match of "India" against the "wiki" field; only that field
# is returned for each hit.
mentions_india = {"match": {"wiki": "India"}}
query = {
    "query": mentions_india,
    "_source": ["wiki"],
    "size": 1000,
}

response = client.search(index=nyc_wiki_index, body=query)
hits = response["hits"]

print(hits["total"]["value"])

# Print each matching document followed by a visual separator line.
for hit in hits["hits"]:
    print(hit["_source"])
    print("__________________")

### Nested fields

- documentation: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-nested-query.html

In [None]:
# Start from a clean slate: drop "test-index" if an earlier run left it
# behind. A missing index is expected on the first run and is only reported.
try:
    client.indices.delete(index="test-index")
except NotFoundError:
    print("Index doesn't exist!")

In [None]:
# Three sample documents, each holding a group name and an array of user
# objects with "first" and "last" names.
doc1 = {
    "group": "nogroup",
    "user": [
        {"first": "Jon", "last": "Snow"},
    ],
}

doc2 = {
    "group": "Starks",
    "user": [
        {"first": "Eddard", "last": "Stark"},
        {"first": "Arya", "last": "Stark"},
        {"first": "Sansa", "last": "Stark"},
    ],
}

doc3 = {
    "group": "Arryns",
    "user": [
        {"first": "Jon", "last": "Arryn"},
        {"first": "Lysa", "last": "Arryn"},
    ],
}

# Every document needs its own id: indexing with an id that already exists
# replaces the stored document, so ids 1, 2 and 3 are assigned in order.
for doc_id, doc in enumerate((doc1, doc2, doc3), start=1):
    resp = client.index(index="test-index", id=doc_id, document=doc)

# Refresh explicitly so that a search issued immediately afterwards (e.g. when
# running all cells at once) already sees the freshly indexed documents.
client.indices.refresh(index="test-index")

# Response of the last index call (doc3).
print(resp)

In [None]:
# Search "test-index" for documents whose "user.first" field matches "Jon".
first_name_is_jon = {"match": {"user.first": "Jon"}}
query = {
    "query": first_name_is_jon,
    "size": 1000,
}
response = client.search(index="test-index", body=query)
# Bare expression: the notebook displays the full response as the cell output.
response

### Geo queries: `geo_distance`

- documentation: https://www.elastic.co/guide/en/elasticsearch/reference/current/geo-queries.html

In [None]:
# Drop "nyc_restaurants_geo" if an earlier run left it behind, so that it can
# be recreated with an explicit mapping. A missing index is only reported.
try:
    client.indices.delete(index="nyc_restaurants_geo")
except NotFoundError:
    print("Index doesn't exist!")

In [None]:
# Explicit mapping for the geo-enabled copy of the index: "location" is
# declared as a geo_point, the raw coordinates stay plain floats.
coordinate_fields = {
    "location": {"type": "geo_point"},
    "Latitude": {"type": "float"},
    "Longitude": {"type": "float"},
}
geo_mapping = {"mappings": {"properties": coordinate_fields}}

client.indices.create(index="nyc_restaurants_geo", body=geo_mapping)

# Copy the documents of "nyc_restaurants" into the new index, deriving
# "location" from the existing coordinates on the way. The array form of a
# geo_point is ordered [longitude, latitude].
build_location = {
    "source": "ctx._source['location'] = [ctx._source['Longitude'], ctx._source['Latitude']]"
}
reindex_body = {
    "source": {"index": "nyc_restaurants"},
    "dest": {"index": "nyc_restaurants_geo"},
    "script": build_location,
}
client.reindex(body=reindex_body)

#### Q15: Find all inspected restaurants within 5 km of Central Park (NYC) and plot them.

In [None]:
# Q15: "geo_distance" keeps documents whose "location" lies within the given
# distance of the reference point.
central_park = {
    "lat": 40.7851,
    "lon": -73.9654,
}
within_5km = {
    "geo_distance": {
        "distance": "5km",
        "location": central_park,
    }
}
query = {
    "query": within_5km,
    "_source": ["DBA", "CUISINE DESCRIPTION", "location"],
    "size": 600,
}

response = client.search(index="nyc_restaurants_geo", body=query)
hits = response["hits"]
print(hits["total"]["value"])
# for hit in response['hits']['hits']:
#     print(f"Restaurant: {hit['_source']['DBA']}, Cuisine: {hit['_source']['CUISINE DESCRIPTION']}, Location: {hit['_source']['location']}")