# Search 🔎

In [None]:
# Settings shared by the cells below.

# Index that this notebook creates, fills and queries.
INDEX_NAME = 'library'
# Mapping type: the key under "mappings" at index creation and the `doc_type` of client calls.
TYPE_NAME = 'books'

In [None]:
from elasticsearch import Elasticsearch
import requests as r



# Client used by every cell of this notebook.
# NOTE(review): the host is spelled `elsatic` in every URL of this notebook —
# presumably the service/host name; confirm it is not a typo of `elastic`.
client = Elasticsearch('http://elsatic:9200')
# Uncomment to drop the index and start from scratch.
# client.indices.delete(index=INDEX_NAME)

# Index definition: a single shard without replicas, plus an explicit mapping.
# NOTE(review): the documents produced by generate_data() carry `text` and
# `pages` fields that are not listed here, while `description` is never
# indexed — confirm whether `description` was meant to be `text`.
data = {
    "settings" : {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        TYPE_NAME: {
          "properties": {
            "title": {
              "type": "text",
              "fielddata": True,
            },
            "tag": {
              "type": "keyword"
            },
            "description": {
              "type": "text",
              "fielddata": True,
            },
            "author": {
              "type": "text",
              "fielddata": True,
            },
            "city": {
              "type": "text",
              "fielddata": True,
            },
            "date_of_birth": {
              "type":   "date",
              "format": "yyyy-MM-dd"
            }
          }
        }
      }
}

# Creating an index that already exists raises an error, so create it only
# when it is missing; this keeps the cell re-runnable (Restart & Run All).
if client.indices.exists(index=INDEX_NAME):
    print(f'Index {INDEX_NAME!r} already exists, skipping creation')
else:
    res = client.indices.create(index=INDEX_NAME, body=data)
    print(res)

In [None]:
# Fetch the mapping of INDEX_NAME/TYPE_NAME over the REST API and print the raw response body.
res = r.get(f'http://elsatic:9200/{INDEX_NAME}/{TYPE_NAME}/_mapping?pretty')
print(res.text)

In [None]:
# Talking point: the "took" field of the search response
# (presumably the query time reported by Elasticsearch — confirm).

import json

# Run a match_all query and keep the whole response in `res`.
res = client.search(
    index=INDEX_NAME,
    doc_type=TYPE_NAME,
    body={
        "query": {
             "match_all": {},
        }
    }
)

# Dump the complete response (not only the hits) as sorted, indented JSON.
print(json.dumps(res, indent=4, sort_keys=True))

### Generate data

In [None]:
from faker import Faker


# Shared generator of fake field values.
fake = Faker()


def generate_data(count=1000000, chunk_size=500):
    """
    Fill the index with randomly generated book documents.

    Documents get the ids ``0 .. count - 1`` and are written to
    ``INDEX_NAME`` / ``TYPE_NAME`` through the bulk API, so the data set is
    sent in batches instead of one HTTP request per document.

    Args:
        count: number of documents to generate; defaults to the original
            1,000,000.
        chunk_size: number of documents sent per bulk request.

    Raises:
        elasticsearch.helpers.BulkIndexError: if Elasticsearch rejects any
            of the documents.
    """
    # Local import: `helpers` ships with the `elasticsearch` package the
    # notebook already depends on.
    from elasticsearch import helpers

    def actions():
        # Lazily yield one bulk action per document so that the whole data
        # set is never held in memory at once.
        for idx in range(count):
            yield {
                '_index': INDEX_NAME,
                '_type': TYPE_NAME,
                '_id': idx,
                '_source': {
                    'title': fake.text(max_nb_chars=20),
                    'text': fake.text(max_nb_chars=15000),
                    'tag': fake.word().lower(),
                    'author': fake.name(),
                    'pages': int(fake.numerify()),
                    'city': fake.city(),
                    'date_of_birth': fake.date_of_birth(),
                },
            }

    helpers.bulk(client, actions(), chunk_size=chunk_size)

    print('Done!')


generate_data()

### Items count

In [None]:
# Print the document count returned by the `_cat/count` endpoint.
# NOTE(review): the URL names no index, so presumably the figure covers every
# index on the node rather than only INDEX_NAME — confirm.
res = r.get('http://elsatic:9200/_cat/count')

print(res.text)

### Select all

In [None]:
# Talking points: the query DSL and the `_source` of each hit.

# match_all query without an explicit "size".
res = client.search(
    index=INDEX_NAME,
    doc_type=TYPE_NAME,
    body={
        "query": {
             "match_all": {},
        }
    }
)


# Number of hits contained in the response, followed by the hits themselves.
print(len(res['hits']['hits']))
print(json.dumps(res['hits']['hits'], indent=4, sort_keys=True))

### Pagination

In [None]:
# Request 60 hits via "size" and print how many came back.
res = client.search(
    index=INDEX_NAME,
    doc_type=TYPE_NAME,
    body={
        "query": {
             "match_all": {},
        },
        "size": 60,
    }
)


print(len(res['hits']['hits']))

In [None]:
# Talking point: "from 10000" — presumably what happens once `from` + `size`
# goes past 10,000 results; confirm against the index settings.

# Page through the results: "from" is the offset of the first hit, "size" the page length.
res = client.search(
    index=INDEX_NAME,
    doc_type=TYPE_NAME,
    body={
        "query": {
             "match_all": {},
        },
        "size": 2,
        "from": 0
    }
)


print(len(res['hits']['hits']))
print(json.dumps(res['hits']['hits'], indent=4, sort_keys=True))

### Sort

In [None]:
# Talking point kept from the original: "avg".
# NOTE(review): nothing in this cell computes an average — confirm what it refers to.

# Return the matches ordered by the `pages` field, largest first.
res = client.search(
    index=INDEX_NAME,
    doc_type=TYPE_NAME,
    body={
        "query": {
             "match_all": {},
        },
        "sort": [{"pages": "desc"}]
    }
)


print(len(res['hits']['hits']))
# Uncomment to inspect the sorted hits.
# print(json.dumps(res['hits']['hits'], indent=4, sort_keys=True))

### Script Field

In [None]:
# Talking points: `_source`, sorting, and "from 10 000".

# Add a computed field `some_value` to each of the 2 requested hits.
res = client.search(
    index=INDEX_NAME,
    doc_type=TYPE_NAME,
    body={
        "query": {
             "match_all": {},
        },
        "size": 2,
        "script_fields" : {
            "some_value" : {
                "script" : {
                    "lang": "painless",
                    # The script reads `pages` from the document source and divides it by 100.0.
                    "source": "params['_source']['pages'] / 100.0"
                }
            },
         },
    }
)


print(json.dumps(res['hits']['hits'], indent=4, sort_keys=True))

### Highlight

In [None]:
# Talking point: the `operator` option of the match query.
import json

# Query string; also read by the plotting cell that follows the helper definitions.
QUERY = "Dog are very interesting?"

# Match QUERY against the `text` field and ask for highlighted fragments of it.
res = client.search(
    index=INDEX_NAME,
    doc_type=TYPE_NAME,
    body={
        "query": {
             "match": {
                 "text": {
                     "query" : QUERY,
                 }
             },
        },
        "highlight" : {
             "fields" : {
                  "text" : {}
             }
          },
#         "_source": False,
        "size": 100,
    }
)

# Show the highlight of the first hit; an empty result would otherwise make
# the `[0]` lookup fail with an IndexError.
if res['hits']['hits']:
    print(json.dumps(res['hits']['hits'][0]['highlight'], indent=2, sort_keys=True))
else:
    print(f'No documents matched {QUERY!r}')

In [None]:
import matplotlib.pyplot as plt
import numpy as np

from collections import defaultdict

# only for the retina monitor
%config InlineBackend.figure_format='retina'

def plot_score(res: dict) -> None:
    """
    Plot the score distribution of an Elasticsearch search response.

    Prints the number of hits, then draws the `_score` of every hit in the
    order the hits appear in ``res['hits']['hits']`` (x axis: position of the
    hit, y axis: its score).
    """
    hits = res['hits']['hits']
    print(len(hits))

    _, axes = plt.subplots(figsize=(7, 7))
    axes.plot([float(hit["_score"]) for hit in hits])

    axes.set_xlabel('doc')
    axes.set_ylabel('_score')
    axes.set_title('Scoring')
    axes.grid(True)
    plt.show()

def plot_token_frequency(res: dict, query: str, fields: list) -> None:
    """
    Plot, for each token of the query, how often it occurs in every hit.

    The query is split into tokens by the Elasticsearch ``_analyze`` API
    using the ``standard`` tokenizer. For every hit in
    ``res['hits']['hits']`` the occurrences of each token are counted as
    case-sensitive substrings in the listed ``_source`` fields and summed
    over those fields. One line per distinct token is drawn, with the
    position of the hit on the x axis.

    Args:
        res: search response containing ``hits.hits``.
        query: query string to tokenize.
        fields: names of the ``_source`` fields to count in; a field that is
            missing from a hit, or whose value is not a string, counts as
            zero occurrences.

    Raises:
        requests.HTTPError: if the ``_analyze`` request fails.
    """
    data = {
      "tokenizer": "standard",
      "text": query
    }

    # `json=` already sets the JSON Content-Type header on the request.
    res_token = r.post('http://elsatic:9200/_analyze', json=data)
    # Fail here with a clear HTTP error instead of a KeyError on 'tokens' below.
    res_token.raise_for_status()

    # De-duplicate while keeping the query order: a token repeated in the
    # query would otherwise be appended twice per hit and plotted twice.
    tokens = list(dict.fromkeys(
        token['token'] for token in res_token.json()['tokens']
    ))

    y_dict = defaultdict(list)

    for hit in res['hits']['hits']:
        # `_source` can be absent (e.g. when source filtering is switched on).
        source = hit.get("_source") or {}

        for token in tokens:
            total = 0

            for field in fields:
                value = source.get(field)
                if isinstance(value, str):
                    total += value.count(token)

            y_dict[token].append(total)

    plt.figure(figsize=(7.5, 7))

    for token in tokens:
        plt.plot(y_dict[token], label=token)

    plt.xlabel('doc')
    plt.ylabel('occurrences')
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.grid(True)
    plt.show()
    

In [None]:
# Plot the hits of the most recent search (`res`) against its query string (`QUERY`).
plot_score(res)
plot_token_frequency(res, QUERY, ['text'])

### Query DSL

In [None]:
# Talking points: match, match_phrase and prefix queries
# (presumably variants to try in place of "match" below).

QUERY = "Less manage"

# Match QUERY against the `text` field, requesting 100 hits.
res = client.search(
    index=INDEX_NAME,
    doc_type=TYPE_NAME,
    body={
        "query": {
             "match": {
                 "text": {
                     "query": QUERY,
                 },
             },
        },
        "size": 100
    }
)

# Score per hit, then occurrences of each query token in the `text` of every hit.
plot_score(res)
plot_token_frequency(res, QUERY, ['text'])

In [None]:
# Multi match
# Talking points: field boosting with ^, and the phrase / most_fields types.

QUERY = "this is Alan"

# Run the same query string against both the `text` and the `author` field.
res = client.search(
    index=INDEX_NAME,
    doc_type=TYPE_NAME,
    body={
      "query": {
        "multi_match" : {
          "query": QUERY, 
          "fields": [ "text", "author" ] 
        }
      },
      "size": 100
    }
)

# Score per hit, then token occurrences summed over `text` and `author`.
plot_score(res)
plot_token_frequency(res, QUERY, ['text', 'author',])

In [None]:
# Talking points: the multi_match types most_fields, cross_fields, phrase and phrase_prefix.

# multi_match of type "phrase" for "you need" over the `text` and `title` fields.
res = client.search(
    index=INDEX_NAME,
    doc_type=TYPE_NAME,
    body={
      "query": {
        "multi_match" : {
          "query": "you need", 
          "type": "phrase",  
          "fields": [ "text", "title", ],
        }
      },
      "size": 100
    }
)

plot_score(res)

# Bool query




```
query = {
  "query": {
    "bool" : {
      "must" : {
        "term" : ...
      },
      "filter": {
        "term" : ...
      },
      "must_not" : {
        "range" : {
          "pages" : { "gte" : 10, "lte" : 20 }
        }
      },
      "should" : [
        { "term" : ... },
        { "term" : ... }
      ]
    }
  }
}
```


**Bool:**
 - must
 - must_not
 - should
 - filter

In [None]:
# Bool query: two `should` clauses on the date of birth.
#
# The clauses target `date_of_birth`, the only date field that the index
# mapping declares and that the generated documents contain; the field name
# `date` used previously exists nowhere in the index, so it could never match.

res = client.search(
    index=INDEX_NAME,
    doc_type=TYPE_NAME,
    body={
      "query": {
        "bool": {
            "should": [
              {
                # Exact date, with a boost of 1.1 for documents that match it.
                "term" : {
                  "date_of_birth": {
                    "value": '2017-06-12',
                    "boost": 1.1
                  }
                }
              },
              {
                # Either of the two following days.
                "terms" : {
                  "date_of_birth": ['2017-06-13', '2017-06-14']
                }
              }
            ]
        },
      },
      "size": 100
    }
)


In [None]:
# Function Score Query

# match_all wrapped in a function_score: documents whose `author` matches
# "Bob" get the weight 23, those matching "Alan" the weight 15.
query = {
    "query": {
        "function_score": {
          "query": { "match_all": {} },
          "boost": "2", 
          "functions": [
              {
                  "filter": { "match": { "author": "Bob" } },
                  "weight": 23
              },
              {
                  "filter": { "match": { "author": "Alan" } },
                  "weight": 15
              },
          ],
        }
    },
    "size": 100
}

res = client.search(
    index=INDEX_NAME,
    doc_type=TYPE_NAME,
    body=query
)


# Show how the weights shape the scores of the returned hits.
plot_score(res)

In [None]:
import requests as r
    
# Print the mapping of the `bank` index.
# NOTE(review): no cell of this notebook creates `bank` — presumably it exists
# from another exercise; confirm, or point this at INDEX_NAME.
res = r.get('http://elsatic:9200/bank/_mapping?pretty')

print(res.text)
# client.search(index=INDEX_NAME, body={})

