In [1]:
import os
import sys

# Use the current working directory as the base
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import json
import pandas as pd
from weaviate_rag.rag_system import KGRAGSystem
import ollama
from ragas import evaluate
from ragas.metrics import Faithfulness, AnswerRelevancy, ContextRecall
import re
from ragas import EvaluationDataset

  from .autonotebook import tqdm as notebook_tqdm
INFO:datasets:PyTorch version 2.5.1 available.
INFO:datasets:TensorFlow version 2.18.0 available.


In [2]:
# Load your JSON data
with open('/Users/alexlecu/PycharmProjects/LLMKGraph/backend/evaluation/data/grok_evaluation_dataset_mini/OpenEnded_Questions_mini.json', 'r') as f:
    data = json.load(f)

# Convert to DataFrame for RAGAS
df = pd.DataFrame(data)
df = df[['question', 'answer']]

# Later, add retrieved contexts and generated answers after querying your RAG
df['contexts'] = None  # Will be filled with retrieved passages
df['generated_answer'] = None  # Will be filled with RAG responses

In [6]:
questions = [entry['question'] for entry in data]
ground_truths = [entry['answer'] for entry in data]

In [3]:
def retrieve(user_input):
    rag_system = KGRAGSystem()
    kg_result = rag_system.query(user_input)

    return [kg_result["context"]]

In [9]:
contexts = [retrieve(q) for q in questions]

INFO:httpx:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/schema "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
INFO:KGRAG:Starting hybrid search for: What is age-related macular degeneration (AMD)?
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET http://localhost:8080/v1/meta "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 O

In [4]:
def generate_answer(question, context, model):
    system_prompt = f"""
    You are a trusted medical research assistant specializing in age-related macular degeneration (AMD). Your task is to provide thorough, accurate, and detailed answers about AMD research based on the following additional relevant data:
    
    {context}
    
    Please adhere to these guidelines when formulating your response:
    
    1. Express Uncertainty Transparently:
    If the available information is insufficient to answer confidently, acknowledge this and specify what additional data or details would be needed to provide a more complete response.
    2. Maintain Accuracy and Integrity:
    Base your answer solely on verified data and the provided context. Do not fabricate any information or references.
    3. Communicate Professionally:
    Present your response in a clear, well-organized, and professional manner, ensuring complex information is accessible and easy to understand.
    
    Begin your response below.
    """
    
    response = ollama.chat(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question}
        ],
        stream=False
    )
    return response['message']['content']

In [11]:
answers = [generate_answer(q, c, "deepseek-r1") for q, c in zip(questions, contexts)]

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


In [13]:
def remove_think_tags(response):
    cleaned_content = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)

    return cleaned_content

In [16]:
answers = [remove_think_tags(q) for q in answers]

In [23]:
type(questions)

list

In [24]:
from datasets import Dataset

evaluation_dataset = Dataset.from_dict({
    "user_input": questions,
    "response": answers,
    "reference": ground_truths,
    "retrieved_contexts": contexts
})

In [25]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [27]:
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper

llm = ChatOpenAI(model="gpt-4o-mini")
evaluator_llm = LangchainLLMWrapper(llm)

In [28]:
from ragas import evaluate
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness

result = evaluate(dataset=evaluation_dataset,metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness()],llm=evaluator_llm)


  user_id = json.load(open(uuid_filepath))["userid"]
Evaluating:   0%|                                                                                                                                                                                                                                                                           | 0/15 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Evaluating:   7%|█████████████████▎                                                                                                                                                                                                                                                 | 1/15 [00:01<00:26,  1.91s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Evaluating:  20%|███████████████████████████████████████████

In [29]:
result

{'context_recall': 0.1667, 'faithfulness': 0.3729, 'factual_correctness': 0.2520}