In [1]:
import pandas as pd
import ast
from ragas import evaluate
from ragas.metrics import Faithfulness, AnswerRelevancy, ContextRecall
from ragas import EvaluationDataset
from datasets import Dataset
from dotenv import load_dotenv
from ragas.metrics import (
    answer_correctness,
    answer_relevancy,
    faithfulness,
    context_precision,
    context_recall,
    answer_similarity,
)
import os

# Load variables from a local .env file into the process environment before any
# evaluation code runs (presumably the LLM/embedding API credentials that ragas
# needs — confirm against the .env actually in use).
load_dotenv()

True

In [2]:
def parse_context(context):
    """Normalise one raw ``context`` cell into a list of context strings.

    Args:
        context: The raw cell value. May already be a list/tuple of contexts,
            a string holding a Python literal (e.g. ``"['a', 'b']"``), a plain
            text string, a missing value (``None`` / NaN), or any other scalar.

    Returns:
        list[str]: One string per context.
            * list/tuple input (or a string that parses to one) -> each item
              converted with ``str``.
            * string that parses to any other literal -> single-item list.
            * string that is not a valid literal -> a warning is printed and
              the stripped text is returned as a single-item list.
            * ``None``, NaN, or a blank string -> empty list, so a missing
              cell is not turned into a fake context such as ``'nan'``.
            * any other value -> ``[str(context)]``.
    """
    # Already a sequence of contexts: just make sure every item is a string.
    if isinstance(context, (list, tuple)):
        return [str(c) for c in context]

    # Missing cell. NaN is the only float that is not equal to itself.
    if context is None or (isinstance(context, float) and context != context):
        return []

    if not isinstance(context, str):
        return [str(context)]

    context = context.strip()
    if not context:
        return []

    try:
        parsed = ast.literal_eval(context)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
        # literal_eval is documented to raise any of these on malformed input;
        # fall back to treating the text itself as the single context.
        print(f"Warning: Failed to parse context '{context}' with error: {e}")
        return [str(context)]

    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]
    return [str(parsed)]


In [3]:
# Location of the 2-hop evaluation CSV. The default is the original local path;
# set the RAGAS_EVAL_CSV environment variable to run the notebook on another
# machine without editing this cell.
csv_path = os.environ.get(
    'RAGAS_EVAL_CSV',
    '/Users/alexlecu/Documents/ExecutedNotebook_Llama3.2+CSVs/New/ragas_2hop_evaluation_data.csv',
)
df = pd.read_csv(csv_path)

# Rename the CSV's columns to the names used by the rest of the notebook
# ('retrieved_contexts' and 'response') in a single call.
df = df.rename(columns={
    'context': 'retrieved_contexts',
    'generated_answer': 'response',
})

In [4]:
# Convert every raw context cell into a list of strings with parse_context.
df['retrieved_contexts'] = df['retrieved_contexts'].map(parse_context)

In [5]:
# Wrap the prepared DataFrame in a `datasets.Dataset`; this is the object passed
# to `evaluate` as its `dataset` argument.
evaluation_dataset = Dataset.from_pandas(df)

In [6]:
# Metrics to score, in the order they are handed to ragas.
ragas_metrics = [
    answer_correctness,
    answer_relevancy,
    answer_similarity,
    faithfulness,
    context_precision,
    context_recall,
]

# Run the evaluation over the whole dataset with the metrics selected above.
result = evaluate(dataset=evaluation_dataset, metrics=ragas_metrics)

Evaluating:   0%|          | 0/3000 [00:00<?, ?it/s]

In [7]:
# Display the evaluation result; its repr shows one aggregate score per metric.
result

{'answer_correctness': 0.5482, 'answer_relevancy': 0.9032, 'semantic_similarity': 0.9231, 'faithfulness': 0.8576, 'context_precision': 0.8900, 'context_recall': 0.5020}

In [8]:
# Convert the evaluation result into a pandas DataFrame
# (presumably one row per evaluated sample — confirm against the ragas version in use).
results_df = result.to_pandas()

In [9]:
# Save the scores as CSV; the path is relative to the current working directory
# and the DataFrame index is not written.
results_df.to_csv('2hop_ragas_evaluation.csv', index=False)