In [None]:
%matplotlib inline

import os
import ssl

import opensearchpy.helpers
import pandas as pd
from matplotlib import pyplot as plt

# databases
from opensearchpy import OpenSearch

# import elasticsearch
# import elasticsearch.helpers
# from elasticsearch.connection import create_ssl_context

# Configuration

In [None]:
# ES connector config
#
# Each setting is read from an environment variable when it is set, so real
# credentials never have to be written into the notebook. The fallbacks are
# the demo values this notebook has always used.
es_user = os.environ.get('ES_USER', 'admin')
es_pass = os.environ.get('ES_PASS', 'admin')
es_host = os.environ.get('ES_HOST', 'elasticsearch-1')
es_port = int(os.environ.get('ES_PORT', '9200'))  # environment values are strings; the port is kept as an int
auth = (es_user, es_pass)

# Init connectors (OpenSearch/OpenDistro version)

In [4]:

# Build the client from the values defined in the configuration cell.
es = OpenSearch(
    hosts = [{'host': es_host, 'port': es_port}],  # use the configured port instead of repeating the literal
    http_compress = True, # enables gzip compression for request bodies
    http_auth = auth,
    # client_cert = client_cert_path,
    # client_key = client_key_path,
    # NOTE(review): TLS is switched off here, so the certificate-related
    # options below presumably have no effect -- confirm before enabling SSL.
    use_ssl = False,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)

# check status: stop here if ping() reports failure, so later cells do not
# run against an unusable client
print('checking ES connection ...')
if not es.ping():
    raise Exception("Connection failed")
print('connected to ES')

checking ES connection ...


Exception: Connection failed

# Example queries

In [None]:
def print_plot(cui_occ):
    """Draw a bar chart of the counts in ``cui_occ``, largest count first.

    Parameters
    ----------
    cui_occ : dict
        Maps a label (used as the x tick text) to its count (used as the
        bar height).

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # order the (label, count) pairs by count, descending
    ranked = sorted(cui_occ.items(), key=lambda pair: pair[1], reverse=True)
    labels = [label for label, _ in ranked]
    heights = [count for _, count in ranked]
    positions = range(len(labels))

    # display the results
    plt.figure(figsize=(25,8))

    plt.bar(positions, heights, align='center')
    plt.xticks(positions, labels, rotation=90)

    # same tick styling on both axes
    for axis_name in ('x', 'y'):
        plt.tick_params(axis=axis_name, which='major', labelsize=20)

    plt.show()

## ElasticSearch

### Direct data manipulations

In [None]:
# example 1:
# retrieve the number of documents satisfying the query criteria
#
# an example query -- search for keyword 'cancer' in documents
query_body_text = {
    "query": {
        "match": {
            "document": "cancer"}
    }
}
index_to_query_text = 'medical_reports_text'

documents = es.search(index=index_to_query_text, body=query_body_text)

# `hits.total` is a plain integer on older Elasticsearch releases, but an
# object of the form {"value": ..., "relation": ...} on Elasticsearch 7+ and
# OpenSearch. Handle both so the cell prints a number, as the comment promises.
# NOTE(review): when "relation" is "gte" the value is only a lower bound --
# confirm whether an exact count is needed here.
total_hits = documents['hits']['total']
if isinstance(total_hits, dict):
    total_hits = total_hits['value']

print(total_hits)

In [None]:
# example 2:
# - retrieve all the documents matching the query criteria
# - calculate the occurrences of each concept name
# - visualize the results

index_to_query_medcat = "medical_reports_anns_medcat_medmen_cancer"
query_body_medcat = {"query": {"match": {"nlp.source_value": "skin"}}}

# query the elasticsearch; the hits are consumed one at a time by the loop below
results = opensearchpy.helpers.scan(es, index=index_to_query_medcat, query=query_body_medcat)

# calculate the occurrences
bio_cui_occ = {}   # concept name -> number of hits carrying it
bio_tui_arr = {}   # tui -> set of concept names seen with that tui
for item in results:
    cui = item['_source']['pretty_name']
    bio_cui_occ[cui] = bio_cui_occ.get(cui, 0) + 1

    # hits without a 'tui' field only contribute to the counts
    if 'tui' not in item["_source"]:
        continue
    tui = item['_source']['tui']
    bio_tui_arr.setdefault(tui, set()).add(cui)


print_plot(bio_cui_occ)

In [None]:
# example 3:
# - wrap the steps of example 2 in a reusable function
# - retrieve all the documents matching the query criteria
# - calculate the CUI code occurrences
# - visualize the results (done in the next cell)

def get_tui_cui_occ(es, index_name, cui_field_name, tui_field_name, query_field_name, query_phrase):
    """Tally the values of one field, grouped by a second field, over all hits of a match query.

    Parameters
    ----------
    es : client object handed to ``opensearchpy.helpers.scan``
    index_name : name of the index to scan
    cui_field_name : ``_source`` key whose values are counted (must be present in every hit)
    tui_field_name : ``_source`` key used for grouping (hits without it are skipped for grouping)
    query_field_name : field the ``match`` query is run against
    query_phrase : text to match in ``query_field_name``

    Returns
    -------
    tuple
        ``(cui_occ, tui_arr)`` where ``cui_occ`` maps each counted value to its
        number of occurrences and ``tui_arr`` maps each grouping value to the
        set of counted values seen together with it.
    """
    # query the elasticsearch
    match_query = {"query": {"match": {query_field_name: query_phrase}}}
    hits = opensearchpy.helpers.scan(es, index=index_name, query=match_query)

    # calculate the occurrences
    cui_occ = {}
    tui_arr = {}
    for hit in hits:
        doc = hit['_source']
        concept = doc[cui_field_name]
        cui_occ[concept] = cui_occ.get(concept, 0) + 1

        if tui_field_name in doc:
            tui_arr.setdefault(doc[tui_field_name], set()).add(concept)

    return (cui_occ, tui_arr)

In [None]:
# run the helper for the phrase 'skin'; keyword arguments make clear which
# string is the index, which are field names and which is the search phrase
medcat_results = get_tui_cui_occ(
    es,
    index_name='medical_reports_anns_medcat_medmen_cancer',
    cui_field_name='cui',
    tui_field_name='tui',
    query_field_name='source_value',
    query_phrase='skin',
)

# element 0 of the returned tuple holds the per-CUI counts
print_plot(medcat_results[0])

### Using Pandas

In [None]:
# index and query template
#
index_to_query_medcat = 'medical_reports_anns_medcat_medmen_cancer'
query_body_medcat = {"query": {"match": {"source_value": "skin"}}}


# query ES again to get a fresh set of results
#
results = opensearchpy.helpers.scan(es, index=index_to_query_medcat, query=query_body_medcat)


# create a data frame from the results: one row per hit, built from its `_source`
#
es_df = pd.DataFrame([item['_source'] for item in results])
es_df.head()

NotFoundError: NotFoundError(404, 'index_not_found_exception', 'no such index [medical_reports_anns_medcat_medmen_cancer]', medical_reports_anns_medcat_medmen_cancer, index_or_alias)

In [None]:
# keep only a subset of the columns, then select the rows whose TUI is 'T191'
es_df_sub = es_df.filter(items=['docid', 'source_value', 'cui', 'tui'])
rows = es_df_sub[es_df_sub['tui'] == 'T191']

# show at most the first ten matching rows
print(rows.head(10))

NameError: name 'es_df' is not defined