# FULL-TEXT INDEXING

## This notebook defines a step-by-step procedure for indexing data in Elasticsearch.

First, we import the required libraries and set the configuration variables.

In [2]:
import sys
sys.path.append('../')
from elasticsearch import Elasticsearch
from mappings import mappings
import json
import os
from utils import data_utils, index_utils, embed_utils
import numpy as np
from textwrap import fill
from dotenv import load_dotenv
from itertools import islice

# Load environment variables from the .env file
load_dotenv()

# Get sensitive configuration from .env file or define as environment variables
ELASTICSEARCH_URL = os.getenv('ELASTICSEARCH_URL')
ELASTIC_USERNAME = os.getenv('ELASTIC_USERNAME')
ELASTIC_PASSWORD = os.getenv('ELASTIC_PASSWORD')

# Fail fast with a readable message: os.getenv returns None for an unset
# variable, and these values are handed directly to the Elasticsearch client
# when the connection is created further down.
_missing_settings = [
    name
    for name, value in (
        ('ELASTICSEARCH_URL', ELASTICSEARCH_URL),
        ('ELASTIC_USERNAME', ELASTIC_USERNAME),
        ('ELASTIC_PASSWORD', ELASTIC_PASSWORD),
    )
    if value is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# Define other configuration variables
INDEX_NAME = 'fulltext_test_index'

## Data preparation

Point variables to the XML and XSD files of the grants data.

In [3]:
# Input paths (relative to the notebook's working directory):
# the grants XML data and the XSD schema it is validated against.
GRANTS_FILE = '../data/grants.xml'
GRANTS_SCHEMA = '../data/grants-20230530.xsd'

Validate the XML file against the schema defined in the XSD file.

Go through the errors, if any, to ensure they are not critical.

In [4]:
# Validate the grants XML file against the XSD schema.
# The validation result is reported in the cell output.
data_utils.validate_xml_with_xsd(GRANTS_FILE, GRANTS_SCHEMA)

XML is valid according to XSD


Convert the XML data to a Python dict using `parse_xml_to_dict` from `data_utils`.

In [5]:
# Parse the grants XML file into a nested dict.
dict_data = data_utils.parse_xml_to_dict(GRANTS_FILE)

# Optional (disabled): dump the parsed data to grants.json and load it back,
# e.g. to inspect the parsed structure on disk.
# with open('grants.json', 'w') as f: 
#     json.dump(dict_data, f)
# with open('grants.json', 'r') as f: 
#     json_data = json.load(f)

# The individual grant records live under dict_data['grants_data']['grant'].
print(f"Number of grants = {len(dict_data['grants_data']['grant'])}")

Number of grants = 10


Clean the data using `clean_dict_data` from `data_utils`. This function renames some fields and converts others to the required format.

In [6]:
# Replace dict_data with its cleaned version.
# NOTE(review): this cell rebinds its own input, so re-running it passes
# already-cleaned data to clean_dict_data again — confirm that is safe, or
# re-run the parsing cell first.
dict_data = data_utils.clean_dict_data(dict_data)

In [7]:
# Show the field names of the first grant record, wrapped at 200 characters.
print(fill(str(dict_data['grants_data']['grant'][0].keys()), width=200))

dict_keys(['@id', 'url', 'amount_info', 'site_grant_type', 'modified_date', 'application_url', 'title', 'all_titles', 'submission_info', 'all_grant_source_urls', 'status', 'description',
'eligibility', 'categories_display', 'limited_grant_info', 'user_categories', 'submit_date', 'is_limited', 'site_categories', 'cost_sharing', 'grant_source_url', 'deadlines', 'amounts', 'all_types',
'all_applicant_types', 'locations', 'sponsors'])


# Indexing

In [8]:
# Connect to the Elasticsearch container using the URL and basic-auth
# credentials read from the environment in the configuration cell.
# request_timeout is presumably in seconds (per the elasticsearch-py client) — confirm.
ESclient = Elasticsearch(
  ELASTICSEARCH_URL,
  basic_auth = (ELASTIC_USERNAME, ELASTIC_PASSWORD),
  request_timeout = 60
)

# Query the cluster info; this doubles as a connectivity check, and the
# response is displayed as the cell output.
ESclient.info()

ObjectApiResponse({'name': '601bb54c777c', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'FS5t3wmfS3y-5pkG0ZWv-A', 'version': {'number': '8.14.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '93a57a1a76f556d8aee6a90d1a95b06187501310', 'build_date': '2024-06-10T23:35:17.114581191Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

Create the index with the settings defined in `mappings.py`.

In [9]:
# Uncomment the next line to delete an existing index of the same name first.
# ESclient.indices.delete(index = INDEX_NAME)
# Create the index INDEX_NAME using the imported `mappings` object.
index_utils.create_index(ESclient, INDEX_NAME, mappings)

Index 'fulltext_test_index' created successfully.


Index data

In [10]:
# NOTE(review): the indexing cells in this notebook call embed_utils.batched
# rather than this local definition; confirm which one is intended.
def batched(iterable, n):
    """Yield successive tuples of up to ``n`` items drawn from ``iterable``.

    Every tuple holds exactly ``n`` items except possibly the last one, which
    holds whatever remains. An empty iterable yields nothing.

    Raises:
        ValueError: if ``n`` is smaller than 1. Because this is a generator
            function, the error is raised when iteration starts.
    """
    # Example: batched('ABCDEFG', 3) --> ABC DEF G
    if n < 1:
        raise ValueError('n must be at least one')
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            # An empty slice means the source is exhausted.
            return
        yield chunk

In [12]:
# Build the indexing actions for all grants, then group them into batches of 100.
# Note: this uses embed_utils.batched, not the `batched` function defined in this notebook.
body = index_utils.construct_indexing_actions(dict_data, INDEX_NAME)
batches = embed_utils.batched(body, 100)
# Disabled: len() only works if `batches` is a sized collection, not a generator.
# print(f"Total batches = {len(batches)}")

In [13]:
# Send each batch to Elasticsearch via the bulk helper and report progress.
# chunk_size matches the batch size of 100 used when the batches were built.
for n,batch in enumerate(batches):
    index_utils.bulk_index_documents(ESclient, batch, chunk_size=100)
    print(f"Batch {n} done")

Successfully indexed 10 documents.
Batch 0 done


In [14]:
# Refresh the index first — presumably so the just-indexed documents are
# reflected in the statistics (see the Elasticsearch refresh API) — then
# display the full index statistics as the cell output.
ESclient.indices.refresh(index=INDEX_NAME)
ESclient.indices.stats(index=INDEX_NAME)

ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_all': {'primaries': {'docs': {'count': 10, 'deleted': 0, 'total_size_in_bytes': 204168}, 'shard_stats': {'total_count': 1}, 'store': {'size_in_bytes': 204417, 'total_data_set_size_in_bytes': 204417, 'reserved_in_bytes': 0}, 'indexing': {'index_total': 10, 'index_time_in_millis': 58, 'index_current': 0, 'index_failed': 0, 'delete_total': 0, 'delete_time_in_millis': 0, 'delete_current': 0, 'noop_update_total': 0, 'is_throttled': False, 'throttle_time_in_millis': 0, 'write_load': 0.00016270218107412567}, 'get': {'total': 0, 'time_in_millis': 0, 'exists_total': 0, 'exists_time_in_millis': 0, 'missing_total': 0, 'missing_time_in_millis': 0, 'current': 0}, 'search': {'open_contexts': 0, 'query_total': 0, 'query_time_in_millis': 0, 'query_current': 0, 'fetch_total': 0, 'fetch_time_in_millis': 0, 'fetch_current': 0, 'scroll_total': 0, 'scroll_time_in_millis': 0, 'scroll_current': 0, 'suggest_total': 0, 'suggest_time_in

## Error Handling

In [15]:
# Check if all docs are indexed: number of grants in dict_data minus the
# primary-shard document count reported by the index stats.
# A result of 0 means the two counts match.
len(dict_data['grants_data']['grant']) - ESclient.indices.stats(index=INDEX_NAME)['_all']['primaries']['docs']['count']

0

In [16]:
# Find ids of docs which are not indexed.
# `ids` holds the '@id' of every grant; get_missing_ids presumably returns the
# subset of those ids that is absent from the index — verify in index_utils.
ids = [item['@id'] for item in dict_data['grants_data']['grant']]
failed_ids = index_utils.get_missing_ids(ESclient, INDEX_NAME, ids)
print(len(failed_ids))

0


In [19]:
# Build indexing actions only for the grants whose ids are in failed_ids,
# and group them into batches of 100 for a retry.
# Note: this rebinds `body` and `batches` from the first indexing pass.
body = index_utils.construct_actions_from_ids(dict_data, failed_ids, INDEX_NAME)
batches = embed_utils.batched(body, 100)

In [20]:
# Retry pass: bulk-index the batches built from failed_ids.
# This is the same loop as the first indexing pass; with no failed ids there
# is nothing to send and the cell prints nothing.
for n,batch in enumerate(batches):
    index_utils.bulk_index_documents(ESclient, batch, chunk_size=100)
    print(f"Batch {n} done")

In [21]:
# Re-check after the retry: print how many of `ids` get_missing_ids still reports.
print(len(index_utils.get_missing_ids(ESclient, INDEX_NAME, ids)))

0
