In [None]:
import pandas as pd
import ast
from ragas import evaluate
from ragas.metrics import Faithfulness, AnswerRelevancy, ContextRecall
from ragas import EvaluationDataset
from datasets import Dataset
from dotenv import load_dotenv
from ragas.metrics import (
    answer_correctness,
    answer_relevancy,
    faithfulness,
    context_precision,
    context_recall,
    answer_similarity,
)
import os

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API keys that the ragas
# LLM/embedding backends read at evaluation time — confirm.
load_dotenv()

In [None]:
def parse_context(context):
    """Normalise one raw ``context`` cell into a list of strings.

    Args:
        context: The raw cell value. Supported shapes:

            * ``None`` or NaN (what pandas yields for an empty CSV cell):
              treated as "no context".
            * ``list`` / ``tuple``: each element is converted with ``str``.
            * ``str``: stripped and parsed as a Python literal with
              ``ast.literal_eval``; a list/tuple literal is expanded
              element-wise, any other literal becomes a single item.
            * anything else: wrapped as a single stringified item.

    Returns:
        list[str]: The contexts. Empty for missing values. When a string
        cannot be parsed as a literal, the stripped string itself is returned
        as the only item (best-effort fallback) and a warning is printed.
    """
    # Missing cells: None, or NaN (the only float that is not equal to itself).
    # Without this guard they would turn into the bogus contexts "None"/"nan".
    if context is None or (isinstance(context, float) and context != context):
        return []
    if isinstance(context, (list, tuple)):
        return [str(c) for c in context]
    if not isinstance(context, str):
        return [str(context)]

    context = context.strip()
    try:
        parsed = ast.literal_eval(context)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        # literal_eval is documented to raise any of these on malformed input
        # (e.g. TypeError for "{[1]: 2}"); fall back to the raw text instead
        # of aborting the whole column transformation.
        print(f"Warning: Failed to parse context '{context}' with error: {e}")
        return [context]

    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]
    return [str(parsed)]


In [None]:
import re

def remove_think_tags(response):
    """Strip ``<think>...</think>`` blocks from a model response.

    Args:
        response: The generated answer text. ``None`` or NaN (what pandas
            yields for an empty CSV cell) is treated as an empty response
            instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        str: ``response`` with every closed ``<think>...</think>`` span
        removed (non-greedy, spanning newlines). Text outside the tags,
        including surrounding whitespace and any unclosed ``<think>``, is
        left untouched. Missing input yields ``''``.
    """
    # NaN is the only float that is not equal to itself.
    if response is None or (isinstance(response, float) and response != response):
        return ''

    # DOTALL lets '.' match newlines so multi-line reasoning blocks are removed.
    return re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)

In [None]:
# Load the 2-hop evaluation set and, in a single pass, rename its columns to
# the `retrieved_contexts` / `response` names that the following cells use.
df = pd.read_csv('data/2hop_ragas.csv').rename(
    columns={
        'context': 'retrieved_contexts',
        'generated_answer': 'response',
    }
)

In [None]:
# Convert every raw context cell into a list of strings via parse_context,
# overwriting the column in place.
df['retrieved_contexts'] = df['retrieved_contexts'].apply(parse_context)

In [None]:
# Remove <think>...</think> blocks from every generated answer, overwriting
# the column in place.
df['response'] = df['response'].apply(remove_think_tags)

In [None]:
# Wrap the prepared frame in a `datasets.Dataset`; this object is what gets
# passed to evaluate() in the next cell.
evaluation_dataset = Dataset.from_pandas(df)

In [None]:
# Score the dataset with six ragas metrics: three answer-level
# (correctness, relevancy, similarity), faithfulness, and two context-level
# (precision, recall).
# NOTE(review): presumably this calls an external LLM/embedding service and
# needs reference-answer columns in the dataset — confirm against the CSV.
result = evaluate(
    dataset=evaluation_dataset,
    metrics=[
        answer_correctness,
        answer_relevancy,
        answer_similarity,
        faithfulness,
        context_precision,
        context_recall,
    ])

In [None]:
# Display the object returned by evaluate() as the cell output.
result

In [None]:
# Convert the evaluation result to a DataFrame through its to_pandas() method.
# NOTE(review): presumably one row per evaluated sample — confirm.
results_df = result.to_pandas()

In [None]:
# Persist the scores as CSV, relative to the current working directory,
# without writing the DataFrame index as a column.
results_df.to_csv('2hop_ragas_smollm_evaluation.csv', index=False)