# True or False Questions - Evaluation

In [41]:
import os
import sys

# Use the current working directory as the base
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import json
import pandas as pd
from weaviate_rag.rag_system import KGRAGSystem
import ollama
from ragas import evaluate
from ragas.metrics import Faithfulness, AnswerRelevancy, ContextRecall
import re
from ragas import EvaluationDataset

In [10]:
# Load your JSON data
with open('/Users/alexlecu/PycharmProjects/LLMKGraph/backend/evaluation/data/grok_evaluation_dataset_mini/True_or_False_questions_mini.json', 'r') as f:
    data = json.load(f)

# Convert to DataFrame for RAGAS
df = pd.DataFrame(data)
df = df[['question', 'answer']]

# Later, add retrieved contexts and generated answers after querying your RAG
df['contexts'] = None  # Will be filled with retrieved passages
df['generated_answer'] = None  # Will be filled with RAG responses

In [22]:
def retrieve(user_input):
    rag_system = KGRAGSystem()
    kg_result = rag_system.query(user_input)

    return [kg_result["context"]]

In [28]:
def generate_answer(question, context, model):
    system_prompt = f"""
    You are a trusted medical research assistant specializing in age-related macular degeneration (AMD). Your task is to provide thorough, accurate, and detailed answers about AMD research based on the following additional relevant data:
    
    {context}
    
    Please adhere to these guidelines when formulating your response:
    
    1. Express Uncertainty Transparently:
    If the available information is insufficient to answer confidently, acknowledge this and specify what additional data or details would be needed to provide a more complete response.
    2. Maintain Accuracy and Integrity:
    Base your answer solely on verified data and the provided context. Do not fabricate any information or references.
    3. Communicate Professionally:
    Present your response in a clear, well-organized, and professional manner, ensuring complex information is accessible and easy to understand.
    
    Important: Respond only with "True" or "False." Do not include additional information beyond this.
    """
    
    response = ollama.chat(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question}
        ],
        stream=False
    )
    return response['message']['content']

In [38]:
def remove_think_tags(response):
    cleaned_content = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)

    return cleaned_content

In [39]:
dataset = []

for index, row in df.iterrows():
    context = retrieve(row["question"])
    response = remove_think_tags(generate_answer(row["question"], context, "deepseek-r1"))

    dataset.append(
        {
            "user_input":row["question"],
            "retrieved_contexts":context,
            "response":response,
            "reference":row["answer"]
        }
    )

INFO:httpx:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/schema "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
INFO:KGRAG:Starting hybrid search for: Age-related macular degeneration (AMD) affects the cornea of the eye.
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/meta "HT

In [40]:
print(dataset)

[{'user_input': 'Age-related macular degeneration (AMD) affects the cornea of the eye.', 'retrieved_contexts': ['Age Related Macular Degeneration (DISEASE) can cause Blindness (SYMPTOM). Age Related Macular Degeneration (DISEASE) can cause Choroidal Neovascular Membrane Formation (PROGRESSION). Age Related Macular Degeneration (DISEASE) presents with Ill Defined Choroidal Neovascularization (DISEASE). Age Related Macular Degeneration (DISEASE) presents with Well Defined Choroidal Neovascularization (BIOMARKER). Age Related Macular Degeneration (DISEASE) can cause Legal Blindness (SYMPTOM). Manipulate Mechanisms (TREATMENT) treats Age Related Macular Degeneration (DISEASE). Medical History (TREATMENT) diagnoses Age Related Macular Degeneration (DISEASE). Physical Examination (TREATMENT) diagnoses Age Related Macular Degeneration (DISEASE). Eye Examination (TREATMENT) diagnoses Age Related Macular Degeneration (DISEASE). Study (TREATMENT) improves Age Related Macular Degeneration (DISEAS

In [77]:
responses = []
references = []

In [78]:
for entry in dataset:
    responses.append(bool(entry["response"].replace("\n\n", "")))
    references.append(entry["reference"])
    print(entry["response"].replace("\n\n", "") + " " + str(entry["reference"]))

False False
False True
False True
False False
False True


In [79]:
print(responses)
print(references)

print(type(responses[1]))
print(type(references[1]))

[True, True, True, True, True]
[False, True, True, False, True]
<class 'bool'>
<class 'bool'>


In [80]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate metrics
accuracy = accuracy_score(references, responses)
precision = precision_score(references, responses)
recall = recall_score(references, responses)
f1 = f1_score(references, responses)

In [82]:
# Print evaluation results
print("\nEvaluation Metrics:")
print(f"Accuracy: {accuracy:.2f} ({accuracy * 100:.1f}%)")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")


Evaluation Metrics:
Accuracy: 0.60 (60.0%)
Precision: 0.60
Recall: 1.00
F1-Score: 0.75


In [92]:
# Log incorrect answers for analysis
print("\nIncorrect Answers:")
for i, (reference, response, item) in enumerate(zip(references, responses, dataset)):
    if response != reference:
        print(f"Question {i+1}: {item['user_input']}")
        print(f"Ground Truth: {reference}, RAG Predicted: {response}")
        print("-" * 50)


Incorrect Answers:
Question 1: Age-related macular degeneration (AMD) affects the cornea of the eye.
Ground Truth: False, RAG Predicted: True
--------------------------------------------------
Question 4: Wet AMD progresses more slowly than dry AMD.
Ground Truth: False, RAG Predicted: True
--------------------------------------------------
