In [5]:
import os
import re

from openai import OpenAI

from utils import data_utils

# Take the OpenAI key from the OPENAI_API_KEY environment variable so the
# secret never has to be typed into (or committed with) the notebook.
# The empty-string fallback preserves the previous value when it is unset.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY', ''))
# Location of the grants XML file parsed later in the notebook.
GRANTS_FILE = './data/grants.xml'

In [6]:
from elasticsearch import Elasticsearch, exceptions, helpers
import json
import os

# Name of the Elasticsearch index
index_name = "sem_temp_index"

# Password comes from ELASTIC_PASSWORD; 'pass123' is used when it is unset.
elastic_password = os.environ.get('ELASTIC_PASSWORD', 'pass123')

# Connect to elasticsearch container
ESclient = Elasticsearch("http://localhost:9200/", basic_auth=('elastic', elastic_password))

# Last expression of the cell: displays the server's info response.
ESclient.info()

ObjectApiResponse({'name': '601bb54c777c', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'FS5t3wmfS3y-5pkG0ZWv-A', 'version': {'number': '8.14.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '93a57a1a76f556d8aee6a90d1a95b06187501310', 'build_date': '2024-06-10T23:35:17.114581191Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [7]:
# Parse the grants XML into a dict, then pass it through the project's cleaner.
dict_data = data_utils.clean_dict_data(
    data_utils.parse_xml_to_dict(GRANTS_FILE)
)

In [11]:
def get_prompt(grant):
    """Build the chat messages that request a structured summary of one grant.

    Args:
        grant: The grant record to summarise. It is interpolated into the user
            message using its default string formatting, wrapped in ``[...]``.

    Returns:
        A two-element list of chat messages: a ``system`` message that sets the
        summarizer role, followed by a ``user`` message that lists the eight
        numbered summary fields (with word limits) and embeds the grant data.
    """
    system_content = "You are a highly skilled summarizer specialized in grants. You are provided with detailed information about a grant and your task is to create a structured summary according to a specified format. The summary must be accurate and informative, following the given structure precisely."

    # The field list is numbered 1-8; "Activity" previously repeated the
    # number 7 already used by "Categories".
    user_content = (
        "Based on the provided grant information, generate a structured summary with the following fields:\n\n"
        "1. Title: [The name or title of the grant.](Limit: 10 words)\n"
        "2. Amount: [The funding details, including the minimum and maximum amounts, if specified.](Limit: 40 words)\n"
        "3. Deadline: [The submission deadline(s) for the grant application.](Limit: 15 words)\n"
        "4. Description: [A very detailed overview of the grant's purpose and objectives. This should be comprehensive and informative.](Limit: 200 words)\n"
        "5. Eligibility: [The detailed criteria for applicants to be eligible for the grant, including any specific requirements.](Limit: 50 words)\n"
        "6. Sponsor: [The organization or entity sponsoring the grant.](Limit: 20 words)\n"
        "7. Categories: [The areas or fields the grant supports.](Limit: 20 words)\n"
        "8. Activity: [The EXACT activity/activities the grant funds. This should be comprehensive and accurate](Limit: 70 words)\n\n"
        "Ensure that the description and eligibility sections are detailed and comprehensive. Reply in JSON format while following word limits.\n"
        "Here is the information for the grant:\n"
        f"[{grant}]"
    )

    return [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]


# One chat completion per grant; the reply text is stored under the grant's '@id'.
summaries = {}
for grant in dict_data['grants_data']['grant']:
    completion = client.chat.completions.create(
        model='gpt-4o',
        messages=get_prompt(grant),
        temperature=0.5,
    )
    # Only the first choice's message text is kept.
    summaries[grant['@id']] = completion.choices[0].message.content

In [20]:
# Reduce each model reply to the span from its first '{' to its last '}',
# dropping any text the model put around the JSON object.
# Replies without such a span (or with no text at all) are left untouched and
# reported, instead of aborting the cell with an AttributeError on `None`.
unparsed_ids = []
for grant_id, raw_reply in list(summaries.items()):
    json_span = re.search(r'\{.*\}', raw_reply or '', re.DOTALL)
    if json_span is None:
        unparsed_ids.append(grant_id)
        continue
    summaries[grant_id] = json_span.group(0)

if unparsed_ids:
    print(f"No JSON object found in the summary for grant id(s): {unparsed_ids}")

In [13]:
# Mapping fragment that declares `normalized_info` as a text field.
mapping = {"properties": {"normalized_info": {"type": "text"}}}

# Update the mapping
ESclient.indices.put_mapping(index=index_name, body=mapping)


ObjectApiResponse({'acknowledged': True})

In [21]:
# Write each grant's summary into the `normalized_info` field of the document
# whose id is the grant's '@id'.
grants = dict_data['grants_data']['grant']

# Fail before the first write if any grant lacks a summary, so the index is
# never left half-updated by a KeyError in the middle of the loop.
missing_ids = [grant['@id'] for grant in grants if grant['@id'] not in summaries]
if missing_ids:
    raise KeyError(f"No summary available for grant id(s): {missing_ids}")

for grant in grants:
    grant_id = grant['@id']  # avoids shadowing the builtin `id`
    ESclient.update(
        index=index_name,
        id=grant_id,
        body={"doc": {"normalized_info": summaries[grant_id]}},
    )

In [22]:
from elasticsearch.helpers import scan

# Match every document, asking for the 'embeddings' field to be left out.
match_all_query = {
    "query": {"match_all": {}},
    "_source_excludes": ['embeddings'],
}

# Use the scan helper to retrieve all documents
results = scan(
    client=ESclient,
    index=index_name,
    query=match_all_query,
    scroll='2m',  # Time to keep the search context alive
    size=1000,  # Number of documents per batch
)

# Drain the scan iterator, keeping only each hit's `_source` payload.
all_docs = [hit['_source'] for hit in results]

print(f"Retrieved {len(all_docs)} documents.")

Retrieved 10 documents.


In [23]:
# Display the collected `_source` payloads as this cell's output.
all_docs

[{'description': "Submissions for the 2023 Student Ethics Essay Award (SEEA) competition are now being accepted. Authors of winning essays will be notified in May 2023. See [SEEA Award recipients](https://www.asha.org/practice/ethics/student-ethics-essay-award-recipients/) to view previous winners and read their winning essays.\n\nThe SEEA program is conducted as part of ASHA's efforts to enhance ethics education activities. The goal of the program is to encourage students to think about ethical decision making and create greater awareness of situations that could pose ethical dilemmas as they prepare to start careers in audiology, speech-language pathology, or speech, language, and hearing sciences.\n\nThe essay competition is open to students who are enrolled in any undergraduate, post baccalaureate, or entry-level graduate program (U.S. only) in communication sciences and disorders (CSD). Former and current members of ASHA's Board of Ethics review the essays and select three winning