In [None]:
import csv
import json
import os
import re
import time

import pandas as pd
import requests
from dotenv import load_dotenv
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from requests.auth import HTTPBasicAuth
from sentence_transformers import SentenceTransformer


## Define ES Client

In [None]:
# Read the Elasticsearch connection settings from the environment.
# load_dotenv() runs first so the lookups below see whatever it loaded.
load_dotenv()
ES_ENDPOINT, ES_USER, ES_PASSWORD = (
    os.environ.get(setting_name)
    for setting_name in ("ES_ENDPOINT", "ES_USER", "ES_PASSWORD")
)

In [None]:
# create an elasticsearch client
class Elastic:
    """
    A convenience object to send HTTP requests to Elasticsearch
    """
    def __init__(self, endpoint, username, password, timeout=30):
        """
        @param endpoint: the URL of the Elasticsearch instance
        @param username: the Elasticsearch username
        @param password: the Elasticsearch password
        @param timeout: seconds to wait for a response before giving up
                        (passed to every request; None waits indefinitely)
        """
        # charset is a parameter of Content-Type, not an HTTP header of its own
        self.header = {'Content-Type': 'application/json; charset=UTF-8'}
        self.endpoint = endpoint
        self.username = username
        self.password = password
        self.timeout = timeout
        # Dispatch table from HTTP verb name to the requests function that sends it
        self.methods_mapping = {'get': requests.get,
                                'put': requests.put,
                                'post': requests.post,
                                'delete': requests.delete}

    def curl(self, method, handle, json=None):
        """
        Sends an HTTP request to the Elasticsearch instance
        @param method: can be 'get', 'put', 'post', 'delete' (case-insensitive)
        @param handle: the API handle to be appended to the Elasticsearch url
        @param json: the json payload of the HTTP request
        @return: whatever the underlying HTTP function returns (the response)
        @raise KeyError: if method is not one of the supported verbs
        """
        http_method = self.methods_mapping[method.lower()]
        # Join with exactly one slash, whether or not the endpoint ends with
        # one or the handle starts with one.
        url = f"{str(self.endpoint).rstrip('/')}/{handle.lstrip('/')}"
        # A (user, password) tuple is the requests shorthand for HTTP basic auth.
        # The timeout stops an unreachable cluster from blocking the kernel forever.
        return http_method(url,
                           auth=(self.username, self.password),
                           headers=self.header,
                           json=json,
                           timeout=self.timeout)

In [None]:
# Build the REST helper from the connection settings read above
e = Elastic(ES_ENDPOINT, ES_USER, ES_PASSWORD)

## Create ICD Code index

In [None]:
# Define mappings, settings, and types for the index
create_index_json = {
    "mappings": {
        "properties": {
            "icd_code": {
                "type": "keyword",  # point 4
            },
            "icd_description": {
                "type": "text",
                # Reference the analyzer declared under settings.analysis below;
                # without this the custom analyzer is defined but never applied.
                "analyzer": "std_danish",
            },
            "icd_embedding": {
                "type": "dense_vector",
                # NOTE(review): must equal the embedding model's output size - confirm
                "dims": 768,
            },
        }
    },
    "settings": {
        "number_of_shards": 4,
        "number_of_replicas": 3,  # point 3
        "index.max_result_window": 20000,
        # "-1" turns off periodic refresh, so newly indexed documents only
        # become searchable after an explicit refresh.
        "index.refresh_interval": "-1",  # point 2
        "index": {
            "similarity": {
                "default": {
                    "type": "BM25", "b": 0.75, "k1": 1.2  # point 5
                }
            }
        },
        "analysis": {
            "analyzer": {
                # Standard tokenisation with the built-in Danish stopword list
                "std_danish": {"type": "standard", "stopwords": "_danish_"}
            }
        },
    },
}

index_name = 'icd_codes'

# create an index
# r = e.curl('put', index_name, json=create_index_json)
# r.json()

In [None]:
# Load data and embed text
icd_codes = {}
# Explicit encoding so the Danish descriptions decode the same on every
# platform (NOTE(review): assumes the CSV is UTF-8 - confirm). newline=''
# is what the csv module asks for when reading.
with open('../../data/icd_codes_danish/d_diagnosis_codes.csv',
          encoding='utf-8', newline='') as codes:
    for row in csv.reader(codes, delimiter=';'):
        if len(row) < 2:
            # Blank or truncated line: no code/description pair to read
            continue
        raw_code, icd_description = row[0], row[1]
        icd_code = raw_code[1:]  # First char is not ICD
        if icd_code == '':
            continue
        # A code that appears more than once keeps its last description
        icd_codes[icd_code] = icd_description

model = SentenceTransformer('Geotrend/distilbert-base-da-cased')
# One embedding per description, in the same order as icd_codes
embeddings = model.encode(list(icd_codes.values()), normalize_embeddings=True)

In [None]:
# One document per ICD code: the code, its description and its embedding.
# icd_codes and embeddings are walked in parallel, so entry k of one
# is paired with entry k of the other.
docs = [
    {
        'icd_code': code,
        'icd_description': description,
        'icd_embedding': vector,
    }
    for (code, description), vector in zip(icd_codes.items(), embeddings.tolist())
]
# The ICD code of each document, in the same order as docs
doc_ids = [entry['icd_code'] for entry in docs]

In [None]:
# Use the elasticsearch wrapper client to bulk index the data
# NOTE(review): newer elasticsearch-py releases prefer basic_auth over
# http_auth - confirm against the installed client version before changing.
es = Elasticsearch([ES_ENDPOINT], http_auth=(ES_USER, ES_PASSWORD))

# One bulk action per document; the ICD code is used as the document id
actions = [
    {
        "_index": index_name,
        "_id": doc_id,
        "_source": doc,
    }
    for doc_id, doc in zip(doc_ids, docs)
]

# the API takes care of chunking them optimally
# Keep the result instead of discarding it so the outcome can be inspected.
indexed_count, bulk_errors = bulk(es, actions)  # point 1

# Refresh whenever we have indexed the data. The index definition in this
# notebook sets refresh_interval to "-1", so without an explicit refresh the
# new documents would not show up in searches.
refresh_result = es.indices.refresh(index=index_name)

# Cell output: summary of the indexing run
{"indexed": indexed_count, "errors": bulk_errors, "refresh": refresh_result}