In [5]:
import json
import random
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import os
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from dotenv import load_dotenv
import cohere

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


In [6]:
# Load environment variables
load_dotenv()

# Initialize cohere clientCOHERE_API_KEY
cohere_api_key = os.getenv('COHERE_API_KEY')

co = cohere.Client(cohere_api_key)

# Initialize models
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

api_key = os.getenv('QDRANT_API_KEY')

client = QdrantClient(
    url="https://8999b86c-f8b2-4d60-bdfa-8c68d39daae7.europe-west3-0.gcp.cloud.qdrant.io:6333", 
    api_key=api_key, 
    timeout=200
)

def search_query(query_vector):
    hits = client.search(
        collection_name="diabetes",
        query_vector=query_vector,
        limit=5
    )
    return hits

def generate_answer(prompt, context):
    if context[0].score < 0.4:
        return 'I\'m a diabetes chat bot assistant ask questions related to diabetes alone....Thank You.'
    else:
        context_str = "\n".join([f"Question: {doc.payload['question']}\nAnswer: {doc.payload['answer']}" for doc in context])
        full_prompt = f"{prompt}\n\nContext:\n{context_str}\n\nAnswer:"
        response = co.chat(message=full_prompt)
        return response

def rag_function(user_question):
    user_question_embedding = embedding_model.encode(user_question)
    context = search_query(user_question_embedding)
    prompt = "As a diabetes consultant, provide a brief answer based on the following context and return only the answer"
    try:
        answer = generate_answer(prompt, context).text
    except:
        answer = generate_answer(prompt, context)
    return answer



In [7]:
with open('diabetes_data_with_vectors', 'r') as f:
    data = json.load(f)

In [9]:

# Select 30 random entries
sampled_data = random.sample(data, 30)

# Prepare lists to store data
ids = []
questions = []
original_answers = []
llm_answers = []
cosine_similarities = []

# Process each sampled entry
for entry in sampled_data:
    question = entry['question']
    original_answer = entry['answer']
    question_id = entry['id']
    
    # Generate LLM answer
    llm_answer = rag_function(question)
    
    # Compute embeddings
    original_embedding = embedding_model.encode(original_answer)
    llm_embedding = embedding_model.encode(llm_answer)
    
    # Compute cosine similarity
    similarity = cosine_similarity([original_embedding], [llm_embedding])[0][0]
    
    # Store data
    ids.append(question_id)
    questions.append(question)
    original_answers.append(original_answer)
    llm_answers.append(llm_answer)
    cosine_similarities.append(similarity)

# Create DataFrame
df = pd.DataFrame({
    'ID': ids,
    'Question': questions,
    'Original Answer': original_answers,
    'LLM Answer': llm_answers,
    'Cosine Similarity': cosine_similarities
})


In [10]:
df.head()

Unnamed: 0,ID,Question,Original Answer,LLM Answer,Cosine Similarity
0,033bfec1,How can individuals with diabetes contribute t...,"Answer: Supporting diabetes organizations, par...",Individuals with diabetes can play a vital rol...,0.839034
1,8b266304,"Beyond healthy eating and exercise, what other...",Answer: Raising awareness about diabetes and ...,"Awareness, education, and early intervention a...",0.798016
2,72822bbc,What are some crucial aspects to consider when...,Answer: It's essential to take necessary vacci...,"When traveling with diabetes, it is crucial to...",0.779293
3,1812b7b3,In what ways does smoking cessation contribute...,Quitting smoking improves overall health and r...,Quitting smoking improves blood sugar control ...,0.966807
4,abcc536e,What type of data does continuous glucose moni...,Answer: Continuous glucose monitoring provides...,Continuous glucose monitoring provides real-ti...,0.954846


In [11]:
df['Cosine Similarity'].mean()

0.8522774