In [None]:
from pprint import pprint
from elasticsearch import Elasticsearch

# Open a client against the local single-node cluster and confirm it answers.
es = Elasticsearch('http://localhost:9200')
cluster_info = es.info()  # raises if the cluster cannot be reached
print('Connected to Elasticsearch!')
pprint(cluster_info.body)

In [None]:
#Creating an index with data
import json

from pprint import pprint

# Recreate the index from scratch so this cell can be re-run safely.
es.indices.delete(index='my_index', ignore_unavailable=True)
es.indices.create(index='my_index')

# Read the sample documents; the context manager closes the file handle
# and the explicit encoding avoids platform-dependent decoding.
with open("../data/astronomy.json", encoding="utf-8") as data_file:
    astronomy_documents = json.load(data_file)

# The bulk API expects an action line followed by the document source.
operations = []
for document in astronomy_documents:
    operations.append({'index': {'_index': 'my_index'}})
    operations.append(document)

# refresh=True makes the documents visible to the count/SQL cells that run
# immediately afterwards instead of waiting for the next periodic refresh.
response = es.bulk(operations=operations, refresh=True)
pprint(response.body)

In [None]:
# Ask Elasticsearch how many documents 'my_index' currently holds.
count_response = es.count(index='my_index')
document_count = count_response.body['count']
print('Number of documents in index:', document_count)

In [None]:
# Run a SQL statement through the Elasticsearch SQL endpoint.
# LIMIT 5 means only the first five titles (ordered by id) come back.
sql_statement = "SELECT title FROM my_index ORDER BY id LIMIT 5"
query = {"query": sql_statement}

response = es.sql.query(body=query)
# Each entry in 'rows' is a list holding the selected column values.
for title_row in response['rows']:
    print(title_row)

In [None]:
# Choose the response format of a SQL query.
# 'json' is requested here; the original note lists txt, csv and tsv as other options.
query = {"query": "SELECT * FROM my_index"}

response = es.sql.query(format='json', body=query)
print(response)

In [None]:
# Filtering: combine a SQL statement with a Query DSL filter.
# Both go inside the request body; passing `body=` together with a separate
# `filter=` argument triggers a DeprecationWarning in elasticsearch-py 8.x.
query = {
    "query": "SELECT * FROM my_index",
    "filter": {
        "term": {
            # Exact match on the title.
            # NOTE(review): assumes the dynamic mapping created a `title.keyword` sub-field - confirm.
            "title.keyword": "Black Holes"
        }
    },
}

response = es.sql.query(body=query)
print(response)

In [None]:
# Pagination: `fetch_size` caps the number of rows returned per page.
# It is a body field, so it lives in the body next to the SQL statement rather
# than being mixed in as a separate keyword argument (deprecated in 8.x).
query = {
    "query": "SELECT * FROM my_index ORDER BY id DESC",
    "fetch_size": 5,
}

response = es.sql.query(
    body=query,
    format='json',
)
# Displayed as the cell output.
response.body

In [None]:
# Translate a SQL statement into the equivalent Query DSL request
# without executing it.
sql_to_translate = (
    "SELECT * FROM my_index WHERE content LIKE '%universe%' ORDER BY id DESC LIMIT 20"
)
translate_query = {"query": sql_to_translate}

translated_query = es.sql.translate(body=translate_query)
# Displayed as the cell output.
translated_query.body

In [None]:
#ILM Policy #readmore

from pprint import pprint

# Index Lifecycle Management policy with two phases: hot and delete.
hot_phase = {
    "actions": {
        # Roll over to a new index once the current one is 5 minutes old.
        "rollover": {"max_age": "5m"},
    }
}
delete_phase = {
    # The delete phase starts at an age of 20 minutes.
    "min_age": "20m",
    "actions": {"delete": {}},
}
policy = {"phases": {"hot": hot_phase, "delete": delete_phase}}

# Create (or overwrite) the policy under the given name.
response = es.ilm.put_lifecycle(policy=policy, name="cpu_usage_policy_v2")
pprint(response.body)


In [None]:
#Analyzers

#HTML filter removes unwanted html tags
from pprint import pprint

# html_strip removes HTML tags and decodes HTML entities (e.g. &apos; -> ').
# The keyword tokenizer is named explicitly so the request is a well-defined
# custom analyzer that emits the filtered text as one token, consistent with
# the mapping char-filter example that follows.
response = es.indices.analyze(
    tokenizer="keyword",
    char_filter=[
        "html_strip"
    ],
    text="I&apos;m so happy</b>!</p>",
)
pprint(response.body)

In [None]:
# The mapping char filter replaces characters according to "key => value" rules.
# Here each Arabic-Indic digit is rewritten as its ASCII digit.
digit_rules = [
    "٠ => 0",
    "١ => 1",
    "٢ => 2",
    "٣ => 3",
    "٤ => 4",
    "٥ => 5",
    "٦ => 6",
    "٧ => 7",
    "٨ => 8",
    "٩ => 9"
]
digit_mapping_filter = {"type": "mapping", "mappings": digit_rules}

# The keyword tokenizer keeps the whole text as a single token.
response = es.indices.analyze(
    text="I saw comet Tsuchinshan Atlas in ٢٠٢٤",
    char_filter=[digit_mapping_filter],
    tokenizer="keyword",
)
pprint(response.body)

In [None]:
# A tokenizer splits text into individual tokens; this uses the standard tokenizer.
sample_text = "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
response = es.indices.analyze(text=sample_text, tokenizer="standard")

# Show every token with the type reported by the analyze API.
for entry in response.body["tokens"]:
    print(f"Token: '{entry['token']}', Type: {entry['type']}")

In [None]:
# The lowercase tokenizer emits lower-cased tokens.
sample_text = "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
response = es.indices.analyze(text=sample_text, tokenizer="lowercase")

# Show every token with the type reported by the analyze API.
for entry in response.body["tokens"]:
    print(f"Token: '{entry['token']}', Type: {entry['type']}")

In [None]:
# The whitespace tokenizer splits the text wherever whitespace occurs,
# leaving case and punctuation untouched.
sample_text = "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
response = es.indices.analyze(text=sample_text, tokenizer="whitespace")

# Show every token with the type reported by the analyze API.
for entry in response.body["tokens"]:
    print(f"Token: '{entry['token']}', Type: {entry['type']}")

In [None]:
# The apostrophe token filter drops the apostrophe and everything after it
# (e.g. "dog's" -> "dog").
analyze_request = {
    "tokenizer": "standard",
    "filter": ["apostrophe"],
    "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.",
}
response = es.indices.analyze(**analyze_request)

for entry in response.body["tokens"]:
    print(f"Token: '{entry['token']}'")

In [None]:
# The decimal_digit token filter converts Unicode digits (here Arabic-Indic)
# to their ASCII 0-9 equivalents.
analyze_request = {
    "tokenizer": "standard",
    "filter": ["decimal_digit"],
    "text": "I saw comet Tsuchinshan Atlas in ٢٠٢٤",
}
response = es.indices.analyze(**analyze_request)

for entry in response.body["tokens"]:
    print(f"Token: '{entry['token']}'")

In [None]:
# The reverse token filter reverses the characters of each token.
analyze_request = {
    "tokenizer": "standard",
    "filter": ["reverse"],
    "text": "I saw comet Tsuchinshan Atlas in ٢٠٢٤",
}
response = es.indices.analyze(**analyze_request)

for entry in response.body["tokens"]:
    print(f"Token: '{entry['token']}'")

In [None]:
# Run the built-in standard analyzer over the sample sentence.
sample_text = "I saw comet Tsuchinshan Atlas in ٢٠٢٤"
response = es.indices.analyze(text=sample_text, analyzer="standard")

for entry in response.body["tokens"]:
    print(f"Token: '{entry['token']}'")

In [None]:
# The stop analyzer removes stop words from the token stream.
sample_text = "I saw comet Tsuchinshan Atlas in ٢٠٢٤"
response = es.indices.analyze(text=sample_text, analyzer="stop")

for entry in response.body["tokens"]:
    print(f"Token: '{entry['token']}'")

In [None]:
# The keyword analyzer returns the whole input as one single token.
sample_text = "I saw comet Tsuchinshan Atlas in ٢٠٢٤"
response = es.indices.analyze(text=sample_text, analyzer="keyword")

for entry in response.body["tokens"]:
    print(f"Token: '{entry['token']}'")

In [None]:
#working with synonyms
from pprint import pprint


# Synonym rules: comma-separated terms are treated as equivalent, while
# "a, b => c" rewrites the terms on the left to the term on the right.
synonym_rules = [
    "car, automobile, vehicle",
    "tv, television",
    "smartphone, mobile, cell phone",
    "jupyter, jupyter notebook, jupyterlab",
    "jupiter, mars, earth, venus, mercury, saturn, uranus, neptune => planet"
]

# Custom analysis chain: a synonym token filter plus an analyzer that uses it.
analysis_settings = {
    "filter": {
        "synonym_filter": {
            "type": "synonym",
            "synonyms": synonym_rules
        }
    },
    "analyzer": {
        "synonym_analyzer": {
            "tokenizer": "standard",
            # lowercase first, then expand synonyms
            "filter": ["lowercase", "synonym_filter"]
        }
    }
}

# The `description` field is analyzed with the synonym-aware analyzer.
field_mappings = {
    "properties": {
        "description": {
            "type": "text",
            "analyzer": "synonym_analyzer"
        }
    }
}

settings = {
    "settings": {"analysis": analysis_settings},
    "mappings": field_mappings
}

# Drop any previous copy of the index so the cell can be re-run, then create it.
index_name = "my_synonym_index"
es.indices.delete(index=index_name, ignore_unavailable=True)
response = es.indices.create(index=index_name, body=settings)
pprint(response.body)

In [None]:
# Add documents to the index, then search using a synonym.
# The index is empty when it is created, so without this indexing step the
# search below can never return a hit.
sample_documents = [
    {"description": "A red car parked outside the house"},
    {"description": "The automobile industry is changing fast"},
    {"description": "A new television for the living room"},
]

synonym_operations = []
for document in sample_documents:
    synonym_operations.append({"index": {"_index": index_name}})
    synonym_operations.append(document)

# refresh=True makes the documents searchable right away.
es.bulk(operations=synonym_operations, refresh=True)

# "vehicle" is declared equivalent to "car" and "automobile" in the synonym
# filter, so documents mentioning either term are expected to match.
query = {
    "query": {
        "match": {
            "description": "vehicle"
        }
    }
}

response = es.search(index=index_name, body=query)

print("Search Results:")
for hit in response["hits"]["hits"]:
    print(hit["_source"])

In [None]:
# Using synonyms only at search time: documents are indexed without synonym
# expansion, and the synonym filter is applied to the query text instead.
settings = {
    "settings": {
        "analysis": {
            "filter": {
                "synonym_filter": {
                    "type": "synonym",
                    "synonyms": [
                        "car, automobile, vehicle",
                        "tv, television"
                    ]
                }
            },
            "analyzer": {
                "index_analyzer": {
                    "tokenizer": "standard",
                    "filter": ["lowercase"]  # no synonym filter at index time
                },
                "search_analyzer": {
                    "tokenizer": "standard",
                    "filter": ["lowercase", "synonym_filter"]  # synonyms applied to queries
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "description": {
                "type": "text",
                "analyzer": "index_analyzer",
                "search_analyzer": "search_analyzer"  # analyzer that carries the synonym filter
            }
        }
    }
}

# ignore_unavailable=True keeps the cell re-runnable when the index does not
# exist yet, matching the other delete calls in this notebook.
es.indices.delete(index=index_name, ignore_unavailable=True)
response = es.indices.create(index=index_name, body=settings)
pprint(response.body)

Options

In [None]:
# Make the output human readable by passing human=True.
response = es.cluster.stats(human=True)
jvm_stats = response["nodes"]["jvm"]
pprint(jvm_stats)


In [None]:
# Date math in a range query: bounds are resolved relative to a fixed date
# and to the moment the query runs.
# NOTE(review): the mapping created for this index defines only `description`;
# confirm that documents carrying `created_on` exist, otherwise nothing matches.
created_on_range = {
    "gte": "2024-09-22||+1d/d",  # anchor date plus one day, rounded to the day: 2024-09-23
    "lte": "now/d"  # current day at execution time (2024-11-16 when originally run)
}
range_query = {
    "query": {
        "range": {
            "created_on": created_on_range
        }
    }
}

response = es.search(index=index_name, body=range_query)
hits = response['hits']['hits']
print(f"Found {len(hits)} documents")
