### Load Data

In [35]:
import pandas as pd
from tqdm.auto import tqdm
import json
import os
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from sentence_transformers import SentenceTransformer
import cohere

# Load the .env file
load_dotenv()


True

### Instantiate Vector DB and LLM

In [7]:
api_key = os.getenv('QDRANT_API_KEY')


client = QdrantClient(
    url="https://8999b86c-f8b2-4d60-bdfa-8c68d39daae7.europe-west3-0.gcp.cloud.qdrant.io:6333", 
    api_key=api_key,timeout=200
)

print(api_key)

r09oP5L3SaVcMi_MvBC3yBOuRpcYChNeZjzTiEHr5cyoyHiP00CyEw


In [14]:
#Instantiate LLM

cohere_api_key = os.getenv('COHERE_API_KEY')
co = cohere.Client(cohere_api_key)


HmZ8nzR3n7wSfzoWAKbW3Iy0iYbkpg4lsR7OL0s4


In [8]:
#Load Data
with open('diabetes_data_with_vectors', 'r') as file:
    diabetes_data_with_vectors = json.load(file)

### Develop Prompt

In [141]:
prompt_template = '''
You are emulating a diabetes patient.
Formulate just 5 insightful questions based on the following diabetes conversation record. 
Use the context provided to generate relevant questions, but avoid directly copying words from the response.
Be as brief as possible
Dont't repeat question
Record:
Context: {question}
return result in this format
[question1, question2, question3]

Dont include the instruction or anyother thing in the response
Return only the generated response as a python list


'''

In [142]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)
    
    result = co.chat(message= prompt).text

    json_response = result
    return json_response

In [144]:
import random
#Select 25 Random Data to evaluate Retrieval
sampledata = random.sample(diabetes_data_with_vectors, 35)


In [145]:
#Generate The List

results = {}
for doc in tqdm(sampledata): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions
    if doc_id == 'bedc96fa':
        print(questions)

  0%|          | 0/35 [00:00<?, ?it/s]

### Create Dataframe for retrieval evaluation

In [150]:
results[ids[33]] = "['How can I manage my diabetes effectively on a daily basis?', 'What are the potential risks if I dont inform others about my diabetes?', 'Are there any legal implications if I dont disclose my diabetes to, say, my employer or my school?', 'In what specific emergency situations would sharing my diabetes status be beneficial?', 'How can I explain my diabetes to those close to me in a simple and understandable way?']"

In [151]:
import ast

In [152]:
ids= list(results.keys())
main_rows = []

for document in tqdm(ids):
    for question in ast.literal_eval(results[document]):
        main_rows.append((question,document))
        
        
df = pd.DataFrame(main_rows, columns=["question", "id"])


  0%|          | 0/35 [00:00<?, ?it/s]

In [153]:
df.to_csv('retrieval_evaluation.csv', index=False)

### Evaluate Retrieval

In [154]:
def search_query(query_vector):
    hits = client.search(
        collection_name="diabetes",
        query_vector=query_vector,
        limit=5
    )
    return hits

In [155]:
data = pd.read_csv('retrieval_evaluation.csv')

In [156]:
retrieval_evaluation_dict = data.to_dict(orient = 'records')


In [None]:
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [206]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)


In [216]:
def retrieval_evaluator(data_dictionary):
    relevance_total = []

    for question in tqdm(data_dictionary):
        question_id = question['id']
        vector = embedding_model.encode(question['question'])
        results = search_query(vector)
        relevance = [d.payload['id'] == question_id for d in results]
        relevance_total.append(relevance)
        
    return hit_rate(relevance_total),mrr(relevance_total)

In [217]:
hitrate,mrr = retrieval_evaluator(retrieval_evaluation_dict)
    

  0%|          | 0/173 [00:00<?, ?it/s]

In [218]:
print(f'Hit Rate Is : {hitrate}')
print(f'MRR Is : {mrr}')

Hit Rate Is : 0.47398843930635837
MRR Is : 0.26425818882466284


In [209]:
print(hit_rate(result))
print(mrr(result))

0.4508670520231214
0.2784200385356455
