In [9]:
from dotenv import load_dotenv
import os
from elasticsearch import helpers  # For bulk Data Uploading
from elasticsearch import Elasticsearch  # Base function for interacting with Elasticsearch
from elasticsearch import RequestError
from pprint import pprint
from load_func import *

# Silence urllib3's InsecureRequestWarning for the rest of the session.
# NOTE(review): presumably needed because the client is created with
# verify_certs=False further down - confirm before removing.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

## Connect to Elasticsearch
This is done using your own API key, generated using Kibana, which should be stored locally on your machine. We stored our API key in a .env file in the same folder as this script, and use the Python module dotenv to load it as an environment variable.

In [10]:
# Load the variables stored in the local .env file into the environment
load_dotenv()

# Fail early with an actionable message: without this check a missing key is
# passed to the client as None and only surfaces later as a server-side error.
api_key = os.getenv('API_KEY')
if not api_key:
    raise RuntimeError("API_KEY is not set; add it to the .env file or export it before running this notebook.")

# NOTE(review): verify_certs=False turns off TLS certificate validation.
# Acceptable for a local test instance only; pass the cluster CA certificate
# instead when connecting to anything that is not localhost.
client = Elasticsearch("https://localhost:9200/", api_key=api_key, verify_certs=False)

#test client
print(client.info())

{'name': 'alhena.ster.kuleuven.be', 'cluster_name': 'elasticsearch', 'cluster_uuid': '9KoZG2x-QPS21BzKMnLzqw', 'version': {'number': '8.15.3', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': 'f97532e680b555c3a05e73a74c28afb666923018', 'build_date': '2024-10-09T22:08:00.328917561Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


  _transport = transport_class(


## Create the dictionaries containing all the mappings

We read the mappings list from the metadata.csv file, which is generated externally from the shared Excel file, and the list is then transformed into a dictionary of mappings with the proper formatting.

In [None]:
# name of the index the documents are written to
INDEX_NAME = "wind"

# set to True to replace documents that already exist in the index
UPDATE = False

# the mappings list is read from metadata.csv, looked up in the current working directory
FILE_NAME = "metadata.csv"
DIR_NAME = os.getcwd()
FILE_PATH = os.path.join(DIR_NAME, FILE_NAME)

# parse the csv file, then build the dictionary of mappings from its content
data, header = read_csv(FILE_PATH)
mappings = create_mapping(data, header)
#pprint(mappings)

Add other settings to the index parameters

In [12]:
"""
Index definition combining the index settings with the field mappings.
"settings" is technically optional on a simple local host, but it can be tuned to optimise search performance when the database is hosted on a cluster and searched by multiple users.
"mappings" is required when fields must be mapped explicitly.
"""

index_definition = dict(
    settings=dict(number_of_shards=1),
    mappings=dict(properties=mappings),
)

Create the index using the parameters set above.

In [13]:
""" create index if it does not exist """

# Check this one index directly rather than listing the aliases of every
# index in the cluster and searching the result for the name.
if client.indices.exists(index=INDEX_NAME):
    print("Index already exists, delete it if you want to recreate it")
else:
    client.indices.create(index=INDEX_NAME, body=index_definition)


  if INDEX_NAME in client.indices.get_alias(index="*"):


## Kibana

Once the index is created in Elasticsearch, you can view it in Kibana: go to **Management**, scroll down the left menu bar to the **Kibana** subsection, and click **Data Views**. From there, click **Create data view** in the top right to integrate the new index into the Kibana interface. This will allow you to see how Kibana interprets the index you have created.

## Loading data
Now that the index is created and visible in Kibana, we can start mapping models to the index.

In [14]:
# Define directories and file names here.
# The paths below are machine-specific defaults. Override them without editing
# the notebook by setting PHANTOM_WIND_DIR and PHANTOM_MODEL_LIST_DIR in the
# environment (for instance in the .env file read by load_dotenv()).
DIR = os.getenv("PHANTOM_WIND_DIR", "/Users/camille/Documents/runs/phantom/database/wind")
PREFIX = "wind"

# read model list from external file
list_dir = os.getenv("PHANTOM_MODEL_LIST_DIR", "/Users/camille/Documents/PhantomDatabase/")
list_name = "model_list.txt"
MODELS = read_model_list(os.path.join(list_dir, list_name))


### Load information from .setup and .in files
We can upload Documents of interest by indexing them using the parameters in their data files, for instance .setup and .in files. We can load multiple models at a time, which is preferable of course.



In [15]:
# Generate the operations list to upload multiple documents.
# Nothing is written to or removed from the index in this cell: existing
# documents flagged for update are overwritten by the bulk upload itself, so a
# failure while loading or checking a model cannot leave the index without it.

operations = []
update_count = 0
skip_count = 0
for model in MODELS:
    action = {"_index": INDEX_NAME, "_op_type": "index"}
    # check if document already exists (not named `id`, which would shadow the built-in)
    existing_id = query_document(client, INDEX_NAME, model)

    # existing documents are skipped unless updates were requested
    if existing_id and not UPDATE:
        skip_count += 1
        continue
    if existing_id:
        # reuse the document id so the "index" operation replaces it in place
        update_count += 1
        action["_id"] = existing_id

    # load data
    modelData = LoadDoc(DIR, model, PREFIX, index_definition)
    # check that all the entries are correctly filled
    CheckEntries(model, modelData)
    action["_source"] = modelData
    operations.append(action)

if UPDATE and update_count > 0:
    print(f'{update_count}/{len(MODELS)} documents already exist and will be updated.')
elif skip_count > 0:
    print(f'{skip_count}/{len(MODELS)} documents already exist and will be skipped.')
else:
    print(f'All {len(MODELS)} documents will be uploaded.')
#pprint(operations)



All 339 documents will be uploaded.


In [16]:
# Push the prepared operations to Elasticsearch through the bulk helper;
# the returned value is displayed as the cell output
helpers.bulk(client=client, actions=operations, refresh=True)

(339, [])