# MCQ_Questions_Evaluation

In [1]:
import os
import sys

# Use the current working directory as the base
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import json
import pandas as pd
from weaviate_rag.rag_system import KGRAGSystem
import ollama
from ragas import evaluate
from ragas.metrics import Faithfulness, AnswerRelevancy, ContextRecall
import re
from ragas import EvaluationDataset

  from .autonotebook import tqdm as notebook_tqdm
INFO:datasets:PyTorch version 2.5.1 available.
INFO:datasets:TensorFlow version 2.18.0 available.


In [2]:
# Load your JSON data
with open('/Users/alexlecu/PycharmProjects/LLMKGraph/backend/evaluation/data/grok_evaluation_dataset_mini/MCQ_Questions_mini.json', 'r') as f:
    data = json.load(f)

# Convert to DataFrame for RAGAS
df = pd.DataFrame(data)
df = df[['question', 'answer']]

# Later, add retrieved contexts and generated answers after querying your RAG
df['contexts'] = None  # Will be filled with retrieved passages
df['generated_answer'] = None  # Will be filled with RAG responses

In [3]:
def retrieve(user_input):
    rag_system = KGRAGSystem()
    kg_result = rag_system.query(user_input)

    return [kg_result["context"]]

In [28]:
def generate_answer(question, context, model):
    system_prompt = f"""
    You are a trusted medical research assistant specializing in age-related macular degeneration (AMD). Your task is to provide thorough, accurate, and detailed answers about AMD research based on the following additional relevant data:
    
    {context}
    
    Please adhere to these guidelines when formulating your response:
    
    1. Express Uncertainty Transparently:
    If the available information is insufficient to answer confidently, acknowledge this and specify what additional data or details would be needed to provide a more complete response.
    2. Maintain Accuracy and Integrity:
    Base your answer solely on verified data and the provided context. Do not fabricate any information or references.
    3. Communicate Professionally:
    Present your response in a clear, well-organized, and professional manner, ensuring complex information is accessible and easy to understand.
    
    Important: Your response must consist of only the correct answer—nothing more. Do not include any additional explanation, commentary, or extraneous text
    """
    
    response = ollama.chat(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question}
        ],
        stream=False
    )
    return response['message']['content']

In [29]:
def remove_think_tags(response):
    cleaned_content = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)

    return cleaned_content

In [30]:
dataset = []

for index, row in df.iterrows():
    context = retrieve(row["question"])
    response = remove_think_tags(generate_answer(row["question"], context, "deepseek-r1"))

    dataset.append(
        {
            "user_input":row["question"],
            "retrieved_contexts":context,
            "response":response,
            "reference":row["answer"]
        }
    )

INFO:httpx:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/schema "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
INFO:KGRAG:Starting hybrid search for: What part of the eye does AMD primarily affect? a) Cornea b) Lens c) Macula d) Optic nerve
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET http://local

In [31]:
print(dataset)

[{'user_input': 'What part of the eye does AMD primarily affect? a) Cornea b) Lens c) Macula d) Optic nerve', 'retrieved_contexts': ['Bevacizumab (TREATMENT) affects Macular Area Of The Eye (BODY_PART). Ranibizumab (TREATMENT) affects Macular Area Of The Eye (BODY_PART). Side Effects (SYMPTOM) presents with Inflammation Of The Eye (SYMPTOM). Age Related Macular Degeneration (DISEASE) affects Retinal Pigment Epithelium (BODY_PART). Choroidal Neovascularization (DISEASE) can cause Retinal Pigment Epithelium (BODY_PART). Age Related Macular Degeneration (DISEASE) affects Retinal Pigment Epithelium (BODY_PART). Group 4 (DISEASE) affects Retinal Pigment Epithelium (BODY_PART). Choroidal Neovascular Membrane (BIOMARKER) affects Retinal Pigment Epithelium (BODY_PART). Clinical Morphological Examination (TEST) diagnoses Eye Sections (BODY_PART). Corticosteroid Injection (TREATMENT) affects Eye (BODY_PART). Age Related Macular Degeneration (DISEASE) presents with Eye (BODY_PART). Mp (BIOMARKER)

In [46]:
responses = []
references = []

In [47]:
for entry in dataset:
    match_response = re.search(r'[a-d]\)', entry["response"])
    match_reference = re.search(r'[a-d]\)', entry["reference"])
    responses.append(match_response.group(0).replace("\n\n", ""))
    references.append(match_reference.group(0))
    print(entry["response"].replace("\n\n", "") + " " + str(entry["reference"]))

AMD primarily affects the Macula (option c). c) Macula
d) Neither a) Dry AMD
b) Blurred central vision b) Blurred central vision
c) Smoking c) Smoking
The leading cause of vision loss in older adults among the listed options is AMD (Age-Related Macular Degeneration).Answer: c) AMD c) AMD


In [48]:
print(responses)
print(references)

print(type(responses[1]))
print(type(references[1]))

['c)', 'd)', 'b)', 'c)', 'c)']
['c)', 'a)', 'b)', 'c)', 'c)']
<class 'str'>
<class 'str'>


In [50]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(references, responses)

In [51]:
# Print evaluation results
print("\nEvaluation Metrics:")
print(f"Accuracy: {accuracy:.2f} ({accuracy * 100:.1f}%)")


Evaluation Metrics:
Accuracy: 0.80 (80.0%)


In [52]:
# Log incorrect answers for analysis
print("\nIncorrect Answers:")
for i, (reference, response, item) in enumerate(zip(references, responses, dataset)):
    if response != reference:
        print(f"Question {i+1}: {item['user_input']}")
        print(f"Ground Truth: {reference}, RAG Predicted: {response}")
        print("-" * 50)


Incorrect Answers:
Question 2: Which type of AMD is more common? a) Dry AMD b) Wet AMD c) Both equally d) Neither
Ground Truth: a), RAG Predicted: d)
--------------------------------------------------
