In [None]:
from opensearchpy import OpenSearch

In [None]:
# Initialize the connection to OpenSearch.
import os

# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook. The fallbacks
# ('localhost', 9200, admin/admin) keep the notebook working unchanged when no
# variable is set.
host = os.environ.get('OPENSEARCH_HOST', 'localhost')
port = int(os.environ.get('OPENSEARCH_PORT', '9200'))
auth = (
    os.environ.get('OPENSEARCH_USER', 'admin'),
    os.environ.get('OPENSEARCH_PASSWORD', 'admin'),
)

client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_auth=auth,
    use_ssl=True,
    # NOTE(review): certificate verification is switched off; enable it (and
    # configure the CA certificates) before pointing this at a non-local cluster.
    verify_certs=False,
    timeout=100,
)

# Check status: fails here, early, if the cluster is unreachable.
print(client.info())

In [None]:
# Create the vector index (k-NN enabled) that will hold the embeddings.

index_name = "med_data"
index_body = {
    "settings": {
        "index": {
            "knn": True,
            "knn.algo_param.ef_search": 100
        }
    },
    "mappings": {
        "properties": {
            "vector": {
                "type": "knn_vector",
                "dimension": 1024,     # That's the output dimension of the e5 model
                "method": {
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "nmslib",
                    "parameters": {
                        "ef_construction": 128,
                        "m": 24
                    }
                }
            }
        }
    }
}

# Creating an index that already exists raises an error, which would break
# every re-run of this cell. Only create the index when it is missing.
if client.indices.exists(index=index_name):
    response = None
    print(f"Index '{index_name}' already exists; skipping creation.")
else:
    response = client.indices.create(index=index_name, body=index_body)
    print(response)

In [None]:
import pickle

# Load the data produced by the embedding/strategy_3 step. Despite the .txt
# extension the file is opened in binary mode and read as a pickle.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were generated by this project.
file_path = '../embedding/strategy_3/data.txt'

with open(file_path, mode='rb') as fh:
    loaded_list = pickle.load(fh)

# Show how many entries were loaded.
print(len(loaded_list))


In [None]:
def divide_list(input_list, n):
    """Split ``input_list`` into ``n`` consecutive chunks of near-equal size.

    The first ``len(input_list) % n`` chunks receive one extra element, so
    chunk sizes differ by at most one. Element order is preserved. When ``n``
    is larger than ``len(input_list)`` the trailing chunks are empty lists.

    Args:
        input_list: Sequence to split (anything supporting ``len`` and slicing).
        n: Number of chunks to produce; must be a positive integer.

    Returns:
        A list containing exactly ``n`` slices of ``input_list``.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        # n == 0 would otherwise surface as an opaque ZeroDivisionError, and a
        # negative n would silently return [] and drop every element.
        raise ValueError(f"n must be a positive integer, got {n}")

    chunk_size, remainder = divmod(len(input_list), n)

    start = 0
    result = []

    for i in range(n):
        # The first `remainder` chunks absorb the leftover elements.
        end = start + chunk_size + (1 if i < remainder else 0)
        result.append(input_list[start:end])
        start = end

    return result


In [None]:
# Partition the loaded entries into 600 batches; each batch is sent as one bulk request.
result = divide_list(input_list=loaded_list, n=600)

In [None]:
import uuid
from tqdm import tqdm

# Size of the first batch, as a quick sanity check of the partitioning.
print(len(result[0]))

# Position i of every chunk is stored in the document under DOC_FIELDS[i].
# NOTE(review): the non-vector names look like PubMed/MEDLINE field tags - confirm
# against the embedding step that produced the data.
DOC_FIELDS = ("vector", "text", "PMID", "TI", "PB", "FAU", "FED", "DP", "OTO", "ISBN")

data_for_bulk_insert = []
failed_items = []  # per-document failures reported by the bulk API

# NOTE(review): every document gets a fresh random UUID as its _id, so running
# this cell again indexes all documents a second time instead of overwriting them.
for batch in tqdm(result):
    if not batch:
        # Happens when there are fewer entries than batches; an empty bulk
        # body would be rejected, and there is nothing to index anyway.
        continue

    # The bulk body alternates an action line with the document it applies to.
    data_for_bulk_insert = []
    for chunk in batch:
        data_for_bulk_insert.append({"index": {"_index": index_name, "_id": str(uuid.uuid4())}})
        data_for_bulk_insert.append({field: chunk[position] for position, field in enumerate(DOC_FIELDS)})

    response = client.bulk(body=data_for_bulk_insert)

    # A bulk request can succeed as a whole while individual documents fail;
    # those failures are only visible in the response body.
    if response.get("errors"):
        failed_items.extend(
            item["index"]
            for item in response.get("items", [])
            if "error" in item.get("index", {})
        )

    data_for_bulk_insert = []

if failed_items:
    print(f"{len(failed_items)} documents failed to index; first error: {failed_items[0]['error']}")