In [1]:
import pandas as pd
from typing import List
from ragas.metrics.collections import ContextPrecision
from openai import AsyncOpenAI
from typing import List
from ragas import EvaluationDataset, experiment
from ragas.llms import llm_factory
from ragas.backends import LocalCSVBackend
from ragas.metrics.collections import ContextPrecision
from ragas.metrics.result import MetricResult
from ragas_utils import DetailedMetricResult
from ragas.metrics.collections.context_precision.util import (
    ContextPrecisionInput, 
    ContextPrecisionOutput
)

In [7]:
class TraceableContextPrecision(ContextPrecision):
    """ContextPrecision variant that also reports per-context reasons and verdicts."""

    async def ascore(
        self, user_input: str, reference: str, retrieved_contexts: List[str]
    ) -> DetailedMetricResult:
        """
        Score the retrieved contexts against a reference answer and keep a trace.

        Every context is judged by its own LLM call, awaited one after another
        in the order given. The prompt and the raw LLM result of each call are
        printed to stdout.

        Args:
            user_input: The question being asked
            reference: The reference answer to compare against
            retrieved_contexts: The retrieved contexts to evaluate

        Returns:
            DetailedMetricResult where ``value`` is the float of
            ``self._calculate_average_precision`` applied to the verdicts,
            ``reason`` is the per-context reasons joined with ", ", and
            ``traces`` holds the contexts under "input" and the verdicts
            under "output".

        Raises:
            ValueError: If ``user_input``, ``reference`` or
                ``retrieved_contexts`` is empty (checked in that order).
        """
        # Guard clauses: the first falsy argument wins, in declaration order.
        required_arguments = (
            (user_input, "user_input cannot be empty"),
            (reference, "reference cannot be empty"),
            (retrieved_contexts, "retrieved_contexts cannot be empty"),
        )
        for argument, complaint in required_arguments:
            if not argument:
                raise ValueError(complaint)

        # One (verdict, reason) pair per context, in retrieval order.
        judgements = []
        for passage in retrieved_contexts:
            payload = ContextPrecisionInput(
                question=user_input, context=passage, answer=reference
            )
            prompt_string = self.prompt.to_string(payload)
            print("prompt", prompt_string)
            llm_reply = await self.llm.agenerate(prompt_string, ContextPrecisionOutput)
            print("result", llm_reply)
            judgements.append((llm_reply.verdict, llm_reply.reason))

        verdicts = [verdict for verdict, _ in judgements]
        reasons = [reason for _, reason in judgements]

        # The aggregation itself is inherited from the parent metric.
        average_precision = self._calculate_average_precision(verdicts)

        # Copies are stored so the trace does not alias the caller's list.
        trace_payload = {
            "input": {"contexts": list(retrieved_contexts)},
            "output": {"verdicts": list(verdicts)},
        }
        return DetailedMetricResult(
            value=float(average_precision),
            reason=", ".join(reasons),
            traces=trace_payload,
        )


In [8]:
# OpenAI-compatible endpoint served on localhost (presumably a local Ollama
# server, judging by the port and key -- confirm). The api_key is a fixed
# placeholder string, not a real credential.
LOCAL_ENDPOINT_URL = "http://localhost:11434/v1"
JUDGE_MODEL_NAME = "qwen2.5:3b"

client = AsyncOpenAI(
    api_key="ollama",
    base_url=LOCAL_ENDPOINT_URL,
)
llm = llm_factory(JUDGE_MODEL_NAME, provider="openai", client=client)

# The metric instance shared by the evaluation cell further down.
context_precision_metric = TraceableContextPrecision(llm=llm)

In [9]:
# A single hand-written record: the first context answers the question,
# the other two are distractors.
france_contexts = [
    "Paris is the capital and largest city of France.",  # relevant
    "Marseille is a city in southern France.",           # irrelevant
    "Lyon is known for its cuisine.",                    # irrelevant
]
france_sample = {
    "user_input": "What is the capital of France?",
    "reference": "The capital of France is Paris.",
    "retrieved_contexts": france_contexts,
}
samples = [france_sample]

# The records go through a DataFrame before becoming an EvaluationDataset.
samples_frame = pd.DataFrame(samples)
dataset = EvaluationDataset.from_pandas(samples_frame)

In [11]:
@experiment(
    name_prefix="ragas_context_precision_test_",
    backend=LocalCSVBackend(root_dir=".")
)
async def run_evaluation(row):
    """
    Score one dataset row with ``context_precision_metric`` and flatten the result.

    Args:
        row: A dataset row exposing ``user_input``, ``reference`` and
            ``retrieved_contexts`` attributes.

    Returns:
        dict with the question, the contexts, the metric value, the joined
        reasoning text, and the verdict list rendered as a string.
    """
    question = row.user_input
    contexts = row.retrieved_contexts

    outcome = await context_precision_metric.ascore(
        user_input=question,
        reference=row.reference,
        retrieved_contexts=contexts,
    )

    # The verdict list is stringified -- presumably so it fits a single CSV
    # cell in the LocalCSVBackend output; confirm before changing.
    verdict_text = str(outcome.traces["output"]["verdicts"])

    record = {
        "user_input": question,
        "contexts": contexts,
        "context_precision_score": outcome.value,
        "reasoning": outcome.reason,
        "verdicts": verdict_text,
    }
    return record

results = await run_evaluation.arun(dataset=dataset)
print(results.to_pandas())

Running experiment:   0%|          | 0/1 [00:00<?, ?it/s]

prompt Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
{"description": "Structured output for context precision evaluation.", "properties": {"reason": {"description": "Reason for verification", "title": "Reason", "type": "string"}, "verdict": {"description": "Binary (0/1) verdict of verification", "title": "Verdict", "type": "integer"}}, "required": ["reason", "verdict"], "title": "ContextPrecisionOutput", "type": "object"}Do not use single quotes in your response but double quotes,properly escaped with a backslash.

--------EXAMPLES-----------
Example 1
Input: {
    "question": "What can you tell me about Albert Einstein?",
    "context": "Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greate

Running experiment: 100%|██████████| 1/1 [00:21<00:00, 21.08s/it]

result reason="The provided context about Lyon and its cuisine does not provide any information relevant to determining the capital of France. The answer given ('Paris') aligns with common knowledge, but the context does not support this claim." verdict=0
                       user_input  \
0  What is the capital of France?   

                                            contexts  context_precision_score  \
0  [Paris is the capital and largest city of Fran...                      1.0   

                                           reasoning   verdicts  
0  the context provided directly states that Pari...  [1, 0, 0]  



