In [None]:
# The Faithfulness metric is used in this notebook.

import pandas as pd
from openai import AsyncOpenAI
from ragas import experiment, EvaluationDataset
from ragas.llms import llm_factory
from ragas.embeddings import HuggingFaceEmbeddings 
from ragas.dataset_schema import SingleTurnSample
from ragas.backends import LocalCSVBackend
from ragas.metrics.collections import (
    Faithfulness     
)
from ragas.metrics.collections.faithfulness import metric

In [2]:
# Async OpenAI-style client aimed at a server on localhost:11434
# (presumably a local Ollama instance, given the placeholder key -- confirm).
client = AsyncOpenAI(
    base_url="http://localhost:11434/v1",
    api_key="ollama",
)

# Judge LLM handed to the ragas metric; it talks to the client above.
llm = llm_factory(
    "qwen2.5:3b",
    provider="openai",
    client=client,
)
# Embeddings are not used by the cells shown here; kept for reference.
#embeddings = HuggingFaceEmbeddings(model="sentence-transformers/all-MiniLM-L6-v2")

###### Since ragas does not provide traces and reasons through the **ascore** method and the **MetricResult** class, I rewrote both by inheriting from the original classes.

In [18]:
from ragas.metrics.result import MetricResult

class DetailedMetricResult(MetricResult):
    """MetricResult whose text representations also show ``reason`` and ``traces``.

    Only ``__repr__`` and ``__str__`` are overridden; construction and every
    other behaviour come from ``MetricResult``.
    """

    def __repr__(self):
        """Return a one-line representation listing value, reason and traces."""
        # Assembled explicitly instead of trimming the last character of the
        # parent's repr: that trick only works when the parent repr ends with
        # ")", and would otherwise cut off part of the value.
        return (
            f"{type(self).__name__}(value={self.value!r}, "
            f"reason={self.reason!r}, traces={self.traces!r})"
        )

    def __str__(self):
        """Return a multi-line, human-readable summary (value, reason, traces)."""
        return (f"Value: {self.value}\n"
                f"Reason: {self.reason}\n"
                f"Traces: {self.traces}")

In [19]:
from ragas.metrics.result import MetricResult
from typing import List

class TraceableFaithfulness(Faithfulness):
    """Faithfulness metric whose score also carries reasons and traces.

    ``ascore`` is overridden so the result exposes the statements that were
    judged, the verdict given to each one, and the judge's reasons. Statement
    creation, verdict creation and score computation are delegated to the
    inherited ``_create_statements``, ``_create_verdicts`` and
    ``_compute_score`` helpers.
    """

    async def ascore(
        self, user_input: str, response: str, retrieved_contexts: List[str]
    ) -> DetailedMetricResult:
        """Score how well ``response`` is supported by ``retrieved_contexts``.

        Args:
            user_input: The question that was asked.
            response: The answer to be checked.
            retrieved_contexts: Context passages the answer is checked against.

        Returns:
            A ``DetailedMetricResult`` whose ``value`` is the score (NaN when
            no statements could be generated), whose ``reason`` is the
            comma-joined per-statement reasons, and whose ``traces`` hold the
            judged statements under ``["input"]["statements"]`` and their
            verdicts under ``["output"]["verdicts"]``.

        Raises:
            ValueError: If ``response``, ``user_input`` or
                ``retrieved_contexts`` is empty or missing.
        """
        # Input validation
        if not response:
            raise ValueError(
                "response is missing. Please add response to the test sample."
            )
        if not user_input:
            raise ValueError(
                "user_input is missing. Please add user_input to the test sample."
            )
        if not retrieved_contexts:
            raise ValueError(
                "retrieved_contexts is missing. Please add retrieved_contexts to the test sample."
            )

        # Break response into atomic statements
        statements = await self._create_statements(user_input, response)

        if not statements:
            # No statements generated - score is NaN like legacy. The result
            # keeps the declared type and the usual traces layout (with empty
            # lists) so callers can index traces without a None check.
            return DetailedMetricResult(
                value=float("nan"),
                reason="No statements were generated from the response.",
                traces={
                    "input": {"statements": []},
                    "output": {"verdicts": []},
                },
            )

        # Join all contexts and evaluate statements against them
        context_str = "\n".join(retrieved_contexts)
        verdicts = await self._create_verdicts(statements, context_str)

        # Compute faithfulness score
        score = self._compute_score(verdicts)

        # Report the statements as echoed back with their verdicts, so that
        # statement, verdict and reason stay aligned index by index.
        judged = verdicts.statements

        return DetailedMetricResult(
            value=float(score),
            reason=", ".join([v.reason for v in judged]),
            traces={
                "input": {"statements": [v.statement for v in judged]},
                "output": {"verdicts": [v.verdict for v in judged]},
            },
        )

# Metric instance used by the evaluation and prompt-inspection cells below;
# it is judged by the `llm` configured in the setup cell.
faithfulness_metric = TraceableFaithfulness(llm=llm)

In [20]:
# Two hand-written records: one on-topic answer and one answer that has
# nothing to do with its retrieved context.
samples = [
    dict(
        user_input="What is Ragas 0.3?",
        retrieved_contexts=["Ragas 0.3 is an evaluation framework for RAG pipelines."],
        response="Ragas 0.3 is a tool to evaluate LLM applications.",
        reference="Ragas 0.3 is a library for evaluating LLM applications.",
    ),
    dict(
        user_input="How do I install Ragas?",
        retrieved_contexts=["To install ragas, run pip install ragas."],
        # Deliberately unrelated to both the question and the context.
        response="The weather is quite nice for a walk today.",
        reference="Install using pip install ragas.",
    ),
]

# Wrap the records in a ragas dataset by way of a DataFrame.
dataset = EvaluationDataset.from_pandas(pd.DataFrame(samples))

In [21]:
@experiment(
    name_prefix="ragas_faithfullness_test_",
    backend=LocalCSVBackend(root_dir=".")
)
async def run_evaluation(row):
    """Score one dataset row for faithfulness and flatten the result.

    Args:
        row: A dataset row exposing ``user_input``, ``response`` and
            ``retrieved_contexts`` attributes.

    Returns:
        A dict with the question, the faithfulness score, the reasoning text,
        the judged statements joined with " | ", and the verdict list
        rendered as a string.
    """
    f_result = await faithfulness_metric.ascore(
        user_input=row.user_input,
        response=row.response,
        retrieved_contexts=row.retrieved_contexts,
    )

    # A result may come back without traces (traces is None), so fall back to
    # empty statements/verdicts instead of raising on the subscript.
    traces = f_result.traces or {}
    statements = traces.get("input", {}).get("statements", [])
    verdicts = traces.get("output", {}).get("verdicts", [])

    return {
        "user_input": row.user_input,
        "faithfulness_score": f_result.value,
        "reasoning": f_result.reason,
        "statements": " | ".join(statements),
        "verdicts": str(verdicts),
    }

# Run the experiment over `dataset`. Top-level `await` works here because
# this is a notebook cell.
# NOTE(review): results are presumably also written as CSV under "." by the
# LocalCSVBackend configured on the decorator -- confirm.
results = await run_evaluation.arun(dataset=dataset)

# Show the collected rows as a DataFrame.
print("Evaluation Results")
print(results.to_pandas())

Running experiment: 100%|██████████| 2/2 [00:51<00:00, 25.94s/it]

Evaluation Results
                user_input  faithfulness_score  \
0       What is Ragas 0.3?                 0.0   
1  How do I install Ragas?                 0.0   

                                           reasoning  \
0  The context only mentions that Ragas 0.3 is an...   
1  There is no mention of the weather in the cont...   

                                          statements verdicts  
0  Ragas 0.3 is described as a tool used for eval...   [0, 0]  
1  The weather is described as being quite nice. ...   [0, 0]  





**Creation of Statements Prompt**
###### To see the ragas prompt for creating statements from the answer, this cell can be used.

In [6]:
from ragas.metrics.collections.faithfulness.util import StatementGeneratorInput

# Example question/answer pair used only to render the prompt text below.
sample_input = StatementGeneratorInput(
    question="What is Ragas 0.3?", 
    answer="Ragas 0.3 is a tool to evaluate LLM applications."
)

# Render the statement-generation prompt of the metric for the sample input
# and print it, so the exact text sent to the LLM can be inspected.
print("FULL RAGAS PROMPT")
print(faithfulness_metric.statement_generator_prompt.to_string(sample_input))

FULL RAGAS PROMPT
Given a question and an answer, analyze the complexity of each sentence in the answer. Break down each sentence into one or more fully understandable statements. Ensure that no pronouns are used in any statement.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
{"description": "Structured output for statement generation.", "properties": {"statements": {"description": "The generated statements from the answer", "items": {"type": "string"}, "title": "Statements", "type": "array"}}, "required": ["statements"], "title": "StatementGeneratorOutput", "type": "object"}Do not use single quotes in your response but double quotes,properly escaped with a backslash.

--------EXAMPLES-----------
Example 1
Input: {
    "question": "Who was Albert Einstein and what is he best known for?",
    "answer": "He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists