In [1]:
!pip install elasticsearch sentence_transformers tqdm 

Collecting elasticsearch
  Downloading elasticsearch-8.15.0-py3-none-any.whl.metadata (8.7 kB)
Collecting sentence_transformers
  Using cached sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting elastic-transport<9,>=8.13 (from elasticsearch)
  Downloading elastic_transport-8.15.0-py3-none-any.whl.metadata (3.6 kB)
Downloading elasticsearch-8.15.0-py3-none-any.whl (523 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m523.3/523.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
Downloading elastic_transport-8.15.0-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: elastic-transport, elasticsearch, sentence_transformers
Successfully installed elastic-transport-8.15.0 elasticsearch-8.15.0 sentence_transformers-3.0.1


In [1]:
import json
import pickle

import pandas as pd
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm


In [2]:
# Client for the Elasticsearch node expected at localhost:9200; every later
# cell talks to the cluster through this object.
es_client = Elasticsearch('http://localhost:9200') 


In [3]:
# Fetch basic cluster information and print it; this doubles as a
# connectivity check before any index is touched.
response = es_client.info()
print(response)

{'name': 'w-along-llmproject-7526af7482814462acb59e22664fd36f-5974f449r2d', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'FOLbzfWySoGx267TM6asDg', 'version': {'number': '8.9.1', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': 'a813d015ef1826148d9d389bd1c0d781c6e349f0', 'build_date': '2023-08-10T05:02:32.517455352Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [4]:
# Index definition: a single shard without replicas, the text fields of each
# Q&A pair, and one 768-dimensional vector field indexed for cosine-similarity
# search.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "answer": {"type": "text"},
            "question": {"type": "text"},
            "id": {"type": "keyword"},
            "qa_text_embeddings": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "diabetes-questions"
try:
    # Check for the index explicitly instead of passing `ignore=400`: that
    # per-call transport option is deprecated in the 8.x client, and ignoring
    # every 400 would also hide a rejected mapping or setting.
    if not es_client.indices.exists(index=index_name):
        es_client.indices.create(
            index=index_name,
            settings=index_settings["settings"],
            mappings=index_settings["mappings"],
        )
    print(f"Index '{index_name}' created or already exists.")
except Exception as e:
    # Best-effort cell: report the problem (e.g. cluster unreachable) without
    # aborting the notebook run.
    print(f"An error occurred: {e}")

Index 'diabetes-questions' created or already exists.


  es_client.indices.create(index=index_name, body=index_settings, ignore=400)


In [6]:
### Load Documents
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's default encoding.
with open('diabetes_data_with_vectors', 'r', encoding='utf-8') as f_in:
    diabetes_data_with_vectors = json.load(f_in)

In [7]:
# Send the loaded documents to the index one request at a time; tqdm shows
# the progress.
# NOTE(review): no `id=` is passed, so re-running this cell presumably adds
# every document again under a new auto-generated id — confirm before re-running
# against an existing index.
for doc in tqdm(diabetes_data_with_vectors):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/1071 [00:00<?, ?it/s]

In [9]:
def elastic_search_knn(field, vector, top_k=5):
    """Return the nearest indexed documents to ``vector`` using Elasticsearch kNN search.

    Args:
        field: Name of the vector field to search; sent as the kNN ``field``.
        vector: Query embedding; sent as the kNN ``query_vector``.
        top_k: Number of neighbours to return. Defaults to 5, the value that
            used to be hard-coded, so existing two-argument calls are unchanged.

    Returns:
        A list of dicts with the keys ``"Question"``, ``"Answer"`` and ``"ID"``,
        in the order Elasticsearch returned the hits.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": top_k,
            "num_candidates": 10000,
        },
        # Without an explicit size the response is capped at the default page
        # size, which would silently truncate larger top_k values.
        "size": top_k,
        "_source": ["answer", "question", "id"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [
        {
            "Question": hit['_source']['question'],
            "Answer": hit['_source']['answer'],
            "ID": hit['_source']['id'],
        }
        for hit in es_results['hits']['hits']
    ]

In [14]:
def hybrid_search(query_text, query_vector, top_k=5):
    """Search by text match, scoring the matches by vector similarity.

    A ``bool``/``should`` query over the ``question`` and ``answer`` fields
    selects the candidate documents. A ``script_score`` script then assigns
    each candidate ``cosineSimilarity(query_vector, qa_text_embeddings) + 1.0``.
    The script does not read ``_score``, so the text query decides which
    documents are candidates while the ordering comes from the cosine term.

    Args:
        query_text: Text matched against the ``question`` and ``answer`` fields.
        query_vector: Query embedding passed to the script as a parameter.
        top_k: Number of hits requested (the request ``size``).

    Returns:
        A list of dicts with the keys ``"Question"``, ``"Answer"`` and ``"ID"``,
        in the order Elasticsearch returned the hits.
    """
    text_candidates = {
        "bool": {
            "should": [
                {"match": {"question": query_text}},
                {"match": {"answer": query_text}}
            ]
        }
    }
    # The +1.0 offset presumably keeps the score non-negative (cosine
    # similarity ranges over [-1, 1]).
    vector_script = {
        "source": "cosineSimilarity(params.query_vector, 'qa_text_embeddings') + 1.0",
        "params": {"query_vector": query_vector}
    }
    request_body = {
        "size": top_k,
        "query": {
            "script_score": {
                "query": text_candidates,
                "script": vector_script
            }
        }
    }

    hits = es_client.search(index=index_name, body=request_body)['hits']['hits']

    return [
        {
            "Question": hit['_source']['question'],
            "Answer": hit['_source']['answer'],
            "ID": hit['_source']['id'],
        }
        for hit in hits
    ]

In [11]:
# Sentence-embedding model used to encode every query in this notebook.
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

# Example question reused by the search demos in the following cells.
user_question = "What are the recommended food for a diabetes patients?"

# Embedding of the example question, passed to both search functions below.
user_question_embedding = embedding_model.encode(user_question)



In [12]:
# Demo: vector-only (kNN) search for the example question.
elastic_search_knn('qa_text_embeddings',user_question_embedding)


[{'Question': 'What types of foods are emphasized in a healthy eating plan for diabetes?',
  'Answer': 'A healthy eating plan for diabetes emphasizes a variety of foods including breads, cereals, rice, whole grains, fruits, vegetables, meat and meat substitutes, dairy products, and healthy fats.',
  'ID': 'b86f8575'},
 {'Question': 'What are the key components of a healthy eating plan for diabetes?',
  'Answer': 'A healthy eating plan for diabetes emphasizes breads, cereals, rice, whole grains, fruits, vegetables, meat and meat substitutes, dairy products, and healthy fats, while focusing on appropriate portion sizes.',
  'ID': 'c070f69a'},
 {'Question': 'Question: Can you list some specific examples of foods rich in healthy fats that are suitable for individuals with diabetes?',
  'Answer': 'Answer:  Yes, some great examples of healthy fat-rich foods suitable for individuals with diabetes include olive oil, avocados, almonds, walnuts, salmon, tuna, and flaxseeds. These foods provide e

In [29]:
# Demo: first hit returned by the hybrid search for the same question.
hybrid_search(user_question, user_question_embedding)[0]

{'Question': 'What types of foods are emphasized in a healthy eating plan for diabetes?',
 'Answer': 'A healthy eating plan for diabetes emphasizes a variety of foods including breads, cereals, rice, whole grains, fruits, vegetables, meat and meat substitutes, dairy products, and healthy fats.',
 'ID': 'b86f8575'}

In [53]:
# Load the retrieval evaluation set; the evaluator functions read the `id`
# and `question` fields of each row. (`pandas` is imported in the notebook's
# import cell rather than mid-notebook.)
data = pd.read_csv('retrieval_evaluation.csv')

In [54]:
# One dict per CSV row (orient='records'); despite the name, this is a list
# of dicts, which is what the evaluator functions iterate over.
retrieval_evaluation_dict = data.to_dict(orient = 'records')


In [46]:
def hit_rate_function(relevance_total):
    """Return the fraction of queries that retrieved their target document.

    Args:
        relevance_total: One list of booleans per query; an entry is True
            when the result at that rank is the expected document.

    Returns:
        Share of queries with at least one True entry, or 0.0 when
        ``relevance_total`` is empty.
    """
    if len(relevance_total) == 0:
        # No queries were evaluated: report 0.0 instead of dividing by zero.
        return 0.0

    queries_with_hit = sum(1 for line in relevance_total if True in line)
    return queries_with_hit / len(relevance_total)

def mrr_function(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query only the first relevant result counts: it contributes
    ``1 / rank`` (ranks start at 1). A query with no relevant result
    contributes 0.

    Args:
        relevance_total: One list of booleans per query; an entry is True
            when the result at that rank is the expected document.

    Returns:
        The mean of the per-query reciprocal ranks, or 0.0 when
        ``relevance_total`` is empty.
    """
    if len(relevance_total) == 0:
        # No queries were evaluated: report 0.0 instead of dividing by zero.
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first relevant result; counting later ones too
                # would no longer be MRR and could push the score above 1.
                break

    return total_score / len(relevance_total)


In [49]:
def retrieval_evaluator_hybrid(data_dictionary):
    """Evaluate ``hybrid_search`` on a set of labelled questions.

    Each record's question is embedded and searched; every returned document
    is marked relevant when its ``ID`` equals the record's ``id``.

    Args:
        data_dictionary: Iterable of records with the keys ``'id'`` and
            ``'question'``.

    Returns:
        Tuple ``(hit_rate, mrr)`` as computed by ``hit_rate_function`` and
        ``mrr_function`` over the per-question relevance lists.
    """
    per_query_relevance = []

    for record in tqdm(data_dictionary):
        expected_id = record['id']
        query_embedding = embedding_model.encode(record['question'])
        retrieved = hybrid_search(record['question'], query_embedding)
        per_query_relevance.append([doc['ID'] == expected_id for doc in retrieved])

    hit_rate_value = hit_rate_function(per_query_relevance)
    mrr_value = mrr_function(per_query_relevance)
    return hit_rate_value, mrr_value


def retrieval_evaluator_vector(data_dictionary):
    """Evaluate vector-only search (``elastic_search_knn``) on labelled questions.

    Each record's question is embedded and searched against the
    ``qa_text_embeddings`` field; every returned document is marked relevant
    when its ``ID`` equals the record's ``id``.

    Args:
        data_dictionary: Iterable of records with the keys ``'id'`` and
            ``'question'``.

    Returns:
        Tuple ``(hit_rate, mrr)`` as computed by ``hit_rate_function`` and
        ``mrr_function`` over the per-question relevance lists.
    """
    relevance_by_query = []

    for entry in tqdm(data_dictionary):
        target_id = entry['id']
        query_embedding = embedding_model.encode(entry['question'])
        neighbours = elastic_search_knn('qa_text_embeddings', query_embedding)
        relevance_by_query.append([doc['ID'] == target_id for doc in neighbours])

    return hit_rate_function(relevance_by_query), mrr_function(relevance_by_query)

In [55]:
# Score the hybrid retriever on the evaluation set; the evaluator returns
# the hit rate first and the MRR second.
hitrate_hybrid,mrr_hybrid = retrieval_evaluator_hybrid(retrieval_evaluation_dict)


  0%|          | 0/160 [00:00<?, ?it/s]

In [56]:
# Score the vector-only retriever on the same evaluation set; the evaluator
# returns the hit rate first and the MRR second.
hitrate_vector,mrr_vector = retrieval_evaluator_vector(retrieval_evaluation_dict)


  0%|          | 0/160 [00:00<?, ?it/s]

In [57]:
# Report the two metrics computed for the hybrid retriever.
print(f'Hit Rate for hybrid search Is : {hitrate_hybrid}')
print(f'MRR for hybrid search : {mrr_hybrid}')

Hit Rate for hybrid search Is : 0.64375
MRR for hybrid search : 0.46052083333333343


In [58]:
# Report the two metrics computed for the vector-only retriever.
print(f'Hit Rate for vector search Is : {hitrate_vector}')
print(f'MRR for vector search : {mrr_vector}')

Hit Rate for vector search Is : 0.6375
MRR for vector search : 0.4592708333333335


In [59]:
# Clean up: drop the index. The delete API takes no request body, and a
# missing index is reported as 404, so that is the status to tolerate here.
# Transport options go through `.options()` because the per-call `ignore=`
# argument is deprecated in the 8.x client.
es_client.options(ignore_status=404).indices.delete(index=index_name)

  es_client.indices.delete(index=index_name, body=index_settings, ignore=400)


ObjectApiResponse({'acknowledged': True})

In [None]:
# Serialize the client object to es_client.pkl. `pickle` comes from the
# notebook's import cell; without that import this cell fails with NameError.
# NOTE(review): an Elasticsearch client wraps live connection state and may
# not be picklable — confirm this cell actually succeeds. Storing the
# connection URL and rebuilding the client is likely the more robust option.
with open('es_client.pkl', 'wb') as client_file:
    pickle.dump(es_client, client_file)