In [3]:
from elasticsearch import Elasticsearch
import json
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer


In [15]:
es_client = Elasticsearch('http://localhost:9200') 


In [16]:
response = es_client.info()
print(response)

{'name': 'w-along-llmproject-7526af7482814462acb59e22664fd36f-546f87dfjs2', 'cluster_name': 'elasticsearch', 'cluster_uuid': '5fAvx7RkR0O_tZxiYwB-2w', 'version': {'number': '8.9.1', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': 'a813d015ef1826148d9d389bd1c0d781c6e349f0', 'build_date': '2023-08-10T05:02:32.517455352Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [17]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "answer": {"type": "text"},
            "question": {"type": "text"},
            "id": {"type": "keyword"},
            "qa_text_embeddings": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "diabetes-questions"

es_client.indices.create(index=index_name, body=index_settings)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [diabetes-questions/GiLqkqfgSFSZUr_54V3YvQ] already exists')

In [18]:
### Load Documents
with open('diabetes_data_with_vectors', 'r') as f_in:
    diabetes_data_with_vectors = json.load(f_in)

In [19]:
for doc in tqdm(diabetes_data_with_vectors):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/1071 [00:00<?, ?it/s]

In [9]:
def elastic_search_knn(field, vector):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
    }

    search_query = {
        "knn": knn,
        "_source": ["answer", "question", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    result_docs = []

    for hit in es_results['hits']['hits']:
        result_docs.append((hit['_source']['id'],hit['_source']))

    return result_docs

In [10]:
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

user_question = "What are the recommended food for a diabetes patients?"

user_question_embedding = embedding_model.encode(user_question)



In [20]:
elastic_search_knn('qa_text_embeddings',user_question_embedding)


[('b86f8575',
  {'question': 'What types of foods are emphasized in a healthy eating plan for diabetes?',
   'answer': 'A healthy eating plan for diabetes emphasizes a variety of foods including breads, cereals, rice, whole grains, fruits, vegetables, meat and meat substitutes, dairy products, and healthy fats.',
   'id': 'b86f8575'}),
 ('b86f8575',
  {'question': 'What types of foods are emphasized in a healthy eating plan for diabetes?',
   'answer': 'A healthy eating plan for diabetes emphasizes a variety of foods including breads, cereals, rice, whole grains, fruits, vegetables, meat and meat substitutes, dairy products, and healthy fats.',
   'id': 'b86f8575'}),
 ('c070f69a',
  {'question': 'What are the key components of a healthy eating plan for diabetes?',
   'answer': 'A healthy eating plan for diabetes emphasizes breads, cereals, rice, whole grains, fruits, vegetables, meat and meat substitutes, dairy products, and healthy fats, while focusing on appropriate portion sizes.',
