In [3]:
import json
import os
from typing import List, Any, Tuple, Dict, Optional
from dataclasses import dataclass
from pydantic import BaseModel, Field
from langchain.prompts import PromptTemplate

@dataclass
class RAGEvaluationResult:
    """Structure to hold evaluation results for a single question"""
    question: str
    context: str
    generated_answer: str
    relevance_score: float
    correctness_score: float
    faithfulness_score: float
    hallucination_score: float
    detailed_feedback: Dict[str, str]
    overall_score: float

In [7]:
import logging
logging.basicConfig(level=logging.INFO)

def score(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)

        # Handle JSON string
        if isinstance(result, str):
            result = json.loads(result)

        if "score" in result:
            raw_score = result["score"]
            normalized_score = (raw_score - 1) / 4.0
            result["score"] = normalized_score

            # Log the transformation
            logging.info(f"Raw score: {raw_score}, Normalized: {normalized_score}")
        else:
            logging.warning("Score key not found in result.")

        return result
    return wrapper


In [8]:
@dataclass
class OpenRouterModel:
    api_key: Optional[str] = os.getenv("OPENROUTER_API_KEY")
    model: str = "meta-llama/llama-3.3-70b-instruct"

    def __post_init__(self):
        if not self.api_key:
            raise ValueError("Please set a valid api key in the environment variables")
        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=self.api_key,
        )
        

    def invoke(self, prompt: str, temperature: float = 0.1, max_tokens: int = 2000) -> str:
        """Generate response using Llama 3.3 70B Instruct"""
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=max_tokens
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"Error calling OpenRouter API: {e}")
            return ""
        
    def invoke_with_json_output(self, prompt_template, input_variables):
        """Generate structured JSON output from the model"""
        
        formatted_prompt = prompt_template.format(**input_variables)
        
        # Get the raw response from the model
        raw_response = self.invoke(formatted_prompt)
        
        # Extract JSON from the response
        try:
            # Find JSON content within the response
            json_start = raw_response.find('{')
            json_end = raw_response.rfind('}') + 1
            
            if json_start >= 0 and json_end > json_start:
                json_str = raw_response[json_start:json_end]
                result = json.loads(json_str)
                return result
            else:
                # If no JSON found, try to parse the whole response
                return json.loads(raw_response)
        except json.JSONDecodeError:
            print("Failed to parse JSON from response. Raw response:")
            print(raw_response)
            # Return a default structure if parsing fails
            return {"score": 0, "reasoning": "Failed to parse response"}

In [9]:
from functools import wraps
@dataclass
class Evaluation:
    question:str
    context: Optional[str] = None
    generated_answer: str
    llm: OpenRouterModel

    @score
    def evaluate_relevance(self):
        prompt = PromptTemplate(
            input_variables=["question", "context", "generated_answer"],
            template = """
            Evaluate the RELEVANCE of the retrieved context to the given question.
            
            Question: {question}
            
            Retrieved Context: {context}

            Generated Answer: {generated_answer}
            
            Rate the relevance on a scale of 1-5 where:
            - 1: Completely irrelevant, no connection to the question
            - 2: Minimally relevant, tangential connection
            - 3: Somewhat relevant, partial connection
            - 4: Highly relevant, strong connection
            - 5: Perfectly relevant, directly addresses the question
            
            Provide your response in this format:
            {{
                "score": <number>,
                "reasoning": "<detailed explanation of why you gave this score>"
            }}
            
            Ensure your response is valid JSON.
            """
        )

        # Use the custom method for structured output
        return self.llm.invoke_with_json_output(
            prompt_template=prompt.template,
            input_variables={
                "question": self.question,
                "context": self.context,
                "generated_answer": self.generated_answer
            }
        )

    @score
    def evaluate_correctness(self):
        prompt_template = """
        Evaluate the FACTUAL CORRECTNESS of the given answer to the question.
        
        Question: {question}
        
        Answer: {generated_answer}
        """
        
        # Add context if provided
        if self.context:
            prompt_template += f"\nContext (for reference): {context}\n"
        else:
            prompt_template += "\n"
            
        prompt_template += """
        Rate the correctness on a scale of 1-5 where:
        - 1: Completely incorrect, contains major factual errors
        - 2: Mostly incorrect, some facts but significant errors
        - 3: Partially correct, mix of correct and incorrect information
        - 4: Mostly correct, minor errors or omissions
        - 5: Completely correct, factually accurate and comprehensive
        
        Focus on:
        - Factual accuracy of claims made
        - Logical consistency
        - Completeness of the answer
        
        Provide your response in this exact JSON format:
        {{
            "score": <number>,
            "reasoning": "<detailed explanation focusing on specific facts>"
        }}
        """
        
        # Create input variables dict based on whether context is provided
        input_variables = {
            "question": self.question,
            "generated_answer": self.generated_answer
        }
        
        # Only add context to input variables if it's provided
        if self.context:
            input_variables["context"] = self.context
        
        return self.llm.invoke_with_json_output(
            prompt_template=prompt_template,
            input_variables=input_variables
        )

    
    @score
    def evaluate_faithfulness(self):
        prompt = PromptTemplate(
            input_variables = ["question", "context", "generated_answer"],
            template = """
            Evaluate the FAITHFULNESS of the answer to the provided context. The answer should only contain information that can be supported by or reasonably inferred from the context.
            
            Question: {question}
            
            Context: {context}
            
            Answer: {generated_answer}
            
            Rate the faithfulness on a scale of 1-5 where:
            - 1: Completely unfaithful, answer contradicts or ignores context
            - 2: Mostly unfaithful, some alignment but significant deviations
            - 3: Partially faithful, generally aligned but some unsupported claims
            - 4: Mostly faithful, well-grounded with minor unsupported details
            - 5: Completely faithful, all claims supported by or inferable from context
            
            Check for:
            - Claims that go beyond what's stated in the context
            - Information that contradicts the context
            - Proper grounding of all assertions
            
            Provide your response in this exact JSON format:
            {{
                "score": <number>,
                "reasoning": "<detailed explanation with specific examples>"
            }}
            """
        )
        
        return self.llm.invoke_with_json_output(
            prompt_template=prompt.template,
            input_variables={
                "question": self.question,
                "context": self.context,
                "generated_answer": self.generated_answer
            }
        )
        
    
    @score
    def evaluate_hallucination(self):
        prompt = PromptTemplate(
            input_variables = ["question", "context", "generated_answer"],
            template = """
            Evaluate whether the answer contains HALLUCINATED information - facts, claims, or details that are NOT present in the provided context.
            
            Question: {question}
            
            Context: {context}
            
            Answer: {generated_answer}
            
            Rate hallucination on a scale of 1-5 where:
            - 1: Severe hallucination, answer contains mostly fabricated information
            - 2: Significant hallucination, multiple unsupported claims
            - 3: Moderate hallucination, some fabricated details
            - 4: Minor hallucination, mostly grounded with few unsupported claims
            - 5: No hallucination, all information comes from or is inferable from context
            
            Specifically look for:
            - Facts mentioned in answer but not in context
            - Specific numbers, dates, names not in context
            - Detailed explanations beyond what context provides
            - Claims that go beyond reasonable inference
            
            Provide your response in this exact JSON format:
            {{
                "score": <number>,
                "reasoning": "<detailed explanation identifying specific hallucinations if any>"
            }}
            """
        )
        
        return self.llm.invoke_with_json_output(
            prompt_template=prompt.template,
            input_variables={
                "question": self.question,
                "context": self.context,
                "generated_answer": self.generated_answer
            }
        )
    
    
    


TypeError: non-default argument 'generated_answer' follows default argument 'context'

In [11]:
@dataclass
class Evaluation:
    question: str
    generated_answer: str
    llm: OpenRouterModel
    context: Optional[str] = None

    def _evaluate_with_template(self, template: str, input_vars: Dict[str, str]) -> Dict:
        """Generic method to handle all evaluations with templates."""
        return self.llm.invoke_with_json_output(
            prompt_template=template,
            input_variables=input_vars
        )

    def _get_base_input_variables(self) -> Dict[str, str]:
        """Get the base input variables used by most evaluation methods."""
        return {
            "question": self.question,
            "context": self.context,
            "generated_answer": self.generated_answer
        }

    @score
    def evaluate_relevance(self) -> Dict:
        template = """
        Evaluate the RELEVANCE of the retrieved context to the given question.
        
        Question: {question}
        
        Retrieved Context: {context}

        Generated Answer: {generated_answer}
        
        Rate the relevance on a scale of 1-5 where:
        - 1: Completely irrelevant, no connection to the question
        - 2: Minimally relevant, tangential connection
        - 3: Somewhat relevant, partial connection
        - 4: Highly relevant, strong connection
        - 5: Perfectly relevant, directly addresses the question
        
        Provide your response in this format:
        {{
            "score": <number>,
            "reasoning": "<detailed explanation of why you gave this score>"
        }}
        
        Ensure your response is valid JSON.
        """
        
        return self._evaluate_with_template(template, self._get_base_input_variables())

    @score
    def evaluate_correctness(self) -> Dict:
        template = """
        Evaluate the FACTUAL CORRECTNESS of the given answer to the question.
        
        Question: {question}
        
        Answer: {generated_answer}
        """
        
        # Add context if provided
        if self.context:
            template += f"\nContext (for reference): {{context}}\n"
        else:
            template += "\n"
            
        template += """
        Rate the correctness on a scale of 1-5 where:
        - 1: Completely incorrect, contains major factual errors
        - 2: Mostly incorrect, some facts but significant errors
        - 3: Partially correct, mix of correct and incorrect information
        - 4: Mostly correct, minor errors or omissions
        - 5: Completely correct, factually accurate and comprehensive
        
        Focus on:
        - Factual accuracy of claims made
        - Logical consistency
        - Completeness of the answer
        
        Provide your response in this exact JSON format:
        {{
            "score": <number>,
            "reasoning": "<detailed explanation focusing on specific facts>"
        }}
        """
        
        # Create input variables dict based on whether context is provided
        input_variables = {
            "question": self.question,
            "generated_answer": self.generated_answer
        }
        
        # Only add context to input variables if it's provided
        if self.context:
            input_variables["context"] = self.context
        
        return self._evaluate_with_template(template, input_variables)

    @score
    def evaluate_faithfulness(self) -> Dict:
        template = """
        Evaluate the FAITHFULNESS of the answer to the provided context. The answer should only contain information that can be supported by or reasonably inferred from the context.
        
        Question: {question}
        
        Context: {context}
        
        Answer: {generated_answer}
        
        Rate the faithfulness on a scale of 1-5 where:
        - 1: Completely unfaithful, answer contradicts or ignores context
        - 2: Mostly unfaithful, some alignment but significant deviations
        - 3: Partially faithful, generally aligned but some unsupported claims
        - 4: Mostly faithful, well-grounded with minor unsupported details
        - 5: Completely faithful, all claims supported by or inferable from context
        
        Check for:
        - Claims that go beyond what's stated in the context
        - Information that contradicts the context
        - Proper grounding of all assertions
        
        Provide your response in this exact JSON format:
        {{
            "score": <number>,
            "reasoning": "<detailed explanation with specific examples>"
        }}
        """
        
        return self._evaluate_with_template(template, self._get_base_input_variables())

    @score
    def evaluate_hallucination(self) -> Dict:
        template = """
        Evaluate whether the answer contains HALLUCINATED information - facts, claims, or details that are NOT present in the provided context.
        
        Question: {question}
        
        Context: {context}
        
        Answer: {generated_answer}
        
        Rate hallucination on a scale of 1-5 where:
        - 1: Severe hallucination, answer contains mostly fabricated information
        - 2: Significant hallucination, multiple unsupported claims
        - 3: Moderate hallucination, some fabricated details
        - 4: Minor hallucination, mostly grounded with few unsupported claims
        - 5: No hallucination, all information comes from or is inferable from context
        
        Specifically look for:
        - Facts mentioned in answer but not in context
        - Specific numbers, dates, names not in context
        - Detailed explanations beyond what context provides
        - Claims that go beyond reasonable inference
        
        Provide your response in this exact JSON format:
        {{
            "score": <number>,
            "reasoning": "<detailed explanation identifying specific hallucinations if any>"
        }}
        """
        
        return self._evaluate_with_template(template, self._get_base_input_variables())

In [13]:
from openai import OpenAI
llm = OpenRouterModel()
question = "What causes climate change?"
context = "Climate change is primarily caused by human activities that increase greenhouse gas concentrations in the atmosphere. The main contributors include burning fossil fuels (coal, oil, gas) for electricity, heat, and transportation, which releases carbon dioxide. Deforestation reduces the Earth's capacity to absorb CO2. Industrial processes and agriculture also contribute significantly."
generated_answer = """
Climate change is caused by both natural and human factors. While volcanic eruptions and variations in solar radiation have long-term impacts, the current changes are largely due to human activities such as burning fossil fuels and deforestation. Increased greenhouse gases from industrial processes and transportation play a significant role."""


In [14]:
eval1 = Evaluation(question = question, context = context, generated_answer = generated_answer, llm = llm)

In [20]:
eval1.evaluate_correctness()

INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 5, Normalized: 1.0


{'score': 1.0,
 'reasoning': 'The answer provided is factually accurate, logically consistent, and comprehensive. It correctly identifies both natural and human factors as contributors to climate change, with a focus on the dominant role of human activities such as burning fossil fuels and deforestation. The mention of volcanic eruptions and variations in solar radiation as natural factors is also accurate, as these can influence climate patterns. The answer highlights the significant role of increased greenhouse gases from industrial processes and transportation, which aligns with the context provided. The context emphasizes that climate change is primarily caused by human activities, particularly the burning of fossil fuels, deforestation, and industrial processes, all of which are correctly noted in the answer. Therefore, the answer is completely correct, with no major factual errors, omissions, or inconsistencies.'}