# SEMANTIC INDEXING

## This notebook defines a step-by-step procedure for semantic indexing data in ElasticSearch.

First we import the required libraries and set the important variables

In [1]:
import sys
sys.path.append('../')
from elasticsearch import Elasticsearch, exceptions, NotFoundError, helpers
from time import sleep
import xmltodict
from mappings import mappings
from ingest_pipeline import get_ingest_pipeline
import json
import os
from openai import OpenAI
from lxml import etree
from utils import data_utils, index_utils, embed_utils
import numpy as np
from textwrap import fill
from dotenv import load_dotenv
from tiktoken import get_encoding
from itertools import islice

# Load environment variables from the .env file
load_dotenv()

# Get sensitive configuration from .env file or define as environment variables
ELASTICSEARCH_URL = os.getenv('ELASTICSEARCH_URL')
ELASTIC_USERNAME = os.getenv('ELASTIC_USERNAME')
ELASTIC_PASSWORD = os.getenv('ELASTIC_PASSWORD')
OPENAI_KEY = os.getenv('OPENAI_KEY')

# Define other configuration variables
INDEX_NAME = 'sem_index'


OAIclient = OpenAI(api_key=OPENAI_KEY)

## Data preparation

Point variables to the XML and XSD files of the grants data.

In [2]:
GRANTS_FILE = '../data/grants.xml'
GRANTS_SCHEMA = '../data/grants-20230530.xsd'

Validate the XML file using the schema defined in XSD file.

Go through the errors, if any, to ensure they are not critical.

In [3]:
# Validate the XML file using schema
data_utils.validate_xml_with_xsd(GRANTS_FILE, GRANTS_SCHEMA)

XML is valid according to XSD


Convert XML data to dict format using function in data_utils.

In [4]:
dict_data = data_utils.parse_xml_to_dict(GRANTS_FILE)

# Print data
# with open('grants.json', 'w') as f: 
#     json.dump(dict_data, f)
# with open('grants.json', 'r') as f: 
#     json_data = json.load(f)

print(f"Number of grants = {len(dict_data['grants_data']['grant'])}")

Number of grants = 10


Clean the data using function in data_utils. This function renames some fields and converts some to required format.

In [5]:
dict_data = data_utils.clean_dict_data(dict_data)

In [6]:
print(fill(str(dict_data['grants_data']['grant'][0].keys()), width=200))

dict_keys(['@id', 'url', 'amount_info', 'site_grant_type', 'modified_date', 'application_url', 'title', 'all_titles', 'submission_info', 'all_grant_source_urls', 'status', 'description',
'eligibility', 'categories_display', 'limited_grant_info', 'user_categories', 'submit_date', 'is_limited', 'site_categories', 'cost_sharing', 'grant_source_url', 'deadlines', 'amounts', 'all_types',
'all_applicant_types', 'locations', 'sponsors'])


### Find token limits for longer fields in the documents. For simplicity, we will truncate the fields before creating embeddings for them. 

In [7]:
def find_token_limit_index(tokenizer, text, max_tokens=8191):
    tokens = tokenizer.encode(text)
    # Return the full length of the text if it has fewer or equal tokens
    if len(tokens) <= max_tokens:
        return len(text)  

    truncated_tokens = tokens[:max_tokens]
    truncated_text = tokenizer.decode(truncated_tokens)
    
    # Find the last character index of the truncated text in the original text
    index = text.find(truncated_text) + len(truncated_text)
    return index


tokenizer = get_encoding("cl100k_base")
for data in dict_data['grants_data']['grant']:
    data['description_truncate_length'] = find_token_limit_index(tokenizer, data['description'])
    data['submission_info_truncate_length'] = find_token_limit_index(tokenizer, data['submission_info'])
    data['eligibility_truncate_length'] = find_token_limit_index(tokenizer, data['eligibility'])

Connect to your ElasticSearch client.

In [8]:
# Connecting to ElasticSearch container
ESclient = Elasticsearch(
  ELASTICSEARCH_URL,
  basic_auth = (ELASTIC_USERNAME, ELASTIC_PASSWORD),
  request_timeout = 60
)

ESclient.info()

ObjectApiResponse({'name': '601bb54c777c', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'FS5t3wmfS3y-5pkG0ZWv-A', 'version': {'number': '8.14.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '93a57a1a76f556d8aee6a90d1a95b06187501310', 'build_date': '2024-06-10T23:35:17.114581191Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

Create index with settings as defined in mappings.py. Alter the "embeddings" field to according to your model

In [9]:
# ESclient.indices.delete(index = INDEX_NAME)
index_utils.create_index(ESclient, INDEX_NAME, mappings)

Index 'sem_index' created successfully.


Create an inference endpoint within the ElasticSearch for embeddings

In [10]:
# Add OpenAI inference endpoint for embeddings
INFERENCE_ID = "openai-embeddings"
EMBEDDING_MODEL = "text-embedding-3-large"
try: 
    resp = ESclient.inference.put_model(
        task_type = "text_embedding",
        inference_id = INFERENCE_ID,
        body = {
            "service": "openai",
            "service_settings": {
                "api_key": OPENAI_KEY,
                "model_id": EMBEDDING_MODEL,
            },
        },
    )
    print(resp)
except Exception as e:
    print(f"Unexpected error: {e}")


Unexpected error: BadRequestError(400, 'resource_already_exists_exception', 'Inference model [openai-embeddings] already exists')


Create an ingestion pipeline that adds embeddings to documents as they are indexed. It is defined in ingest_pipeline.py

In [11]:
PIPELINE_ID = "sem_embedding_pipeline"
pipeline_json = get_ingest_pipeline(INFERENCE_ID)

# ESclient.ingest.delete_pipeline(id=pipeline_id)
try:
    resp = ESclient.ingest.put_pipeline(
        id=PIPELINE_ID,
        body = pipeline_json
    )
    print(resp)
except Exception as e:
    print(f"Unexpected error: {e}") 

{'acknowledged': True}


Index data

In [12]:
def batched(iterable, n):
    """Batch data into tuples of length n. The last batch may be shorter."""
    # batched('ABCDEFG', 3) --> ABC DEF G
    if n < 1:
        raise ValueError('n must be at least one')
    it = iter(iterable)
    while (batch := tuple(islice(it, n))):
        yield batch

In [13]:
body = index_utils.construct_indexing_actions(dict_data, INDEX_NAME, pipeline_id=PIPELINE_ID)
batches = embed_utils.batched(body, 100)
# print(f"Total batches = {len(batches)}")

In [14]:
for n,batch in enumerate(batches):
    index_utils.bulk_index_documents(ESclient, batch, chunk_size=100)
    print(f"Batch {n} done")

Successfully indexed 10 documents.
Batch 0 done


In [16]:
ESclient.indices.refresh(index=INDEX_NAME)
ESclient.indices.stats(index=INDEX_NAME)

ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_all': {'primaries': {'docs': {'count': 10, 'deleted': 0, 'total_size_in_bytes': 859139}, 'shard_stats': {'total_count': 1}, 'store': {'size_in_bytes': 859471, 'total_data_set_size_in_bytes': 859471, 'reserved_in_bytes': 0}, 'indexing': {'index_total': 10, 'index_time_in_millis': 72, 'index_current': 0, 'index_failed': 0, 'delete_total': 0, 'delete_time_in_millis': 0, 'delete_current': 0, 'noop_update_total': 0, 'is_throttled': False, 'throttle_time_in_millis': 0, 'write_load': 0.0007957248541119825}, 'get': {'total': 0, 'time_in_millis': 0, 'exists_total': 0, 'exists_time_in_millis': 0, 'missing_total': 0, 'missing_time_in_millis': 0, 'current': 0}, 'search': {'open_contexts': 0, 'query_total': 0, 'query_time_in_millis': 0, 'query_current': 0, 'fetch_total': 0, 'fetch_time_in_millis': 0, 'fetch_current': 0, 'scroll_total': 0, 'scroll_time_in_millis': 0, 'scroll_current': 0, 'suggest_total': 0, 'suggest_time_in_

## Error Handling

In [17]:
# Check if all docs are indexed
len(dict_data['grants_data']['grant']) - ESclient.indices.stats(index=INDEX_NAME)['_all']['primaries']['docs']['count']

0

In [18]:
# find ids of docs which are not indexed
ids = [item['@id'] for item in dict_data['grants_data']['grant']]
failed_ids = index_utils.get_missing_ids(ESclient, INDEX_NAME, ids)
print(len(failed_ids))

0


In [20]:
body = index_utils.construct_actions_from_ids(dict_data, failed_ids, INDEX_NAME, pipeline_id=PIPELINE_ID)
batches = embed_utils.batched(body, 100)

In [21]:
for n,batch in enumerate(batches):
    index_utils.bulk_index_documents(ESclient, batch, chunk_size=100)
    print(f"Batch {n} done")

In [22]:
print(len(index_utils.get_missing_ids(ESclient, INDEX_NAME, ids)))

0


In [23]:
# find ids of docs that where creation of embeddings failed
failed_ids = index_utils.get_failed_embedding_ids(ESclient, INDEX_NAME)
print(len(failed_ids))

0


In [24]:
body = index_utils.construct_actions_from_ids(dict_data, failed_ids, INDEX_NAME, pipeline_id=PIPELINE_ID)
batches = embed_utils.batched(body, 100)

In [25]:
for n,batch in enumerate(batches):
    index_utils.bulk_index_documents(ESclient, batch, chunk_size=100)
    print(f"Batch {n} done")

In [26]:
print(len(index_utils.get_failed_embedding_ids(ESclient, INDEX_NAME)))

0
