In [1]:
import json
import os
from typing import List, Any, Tuple, Dict, Optional
from dataclasses import dataclass
from pydantic import BaseModel, Field
from langchain.prompts import PromptTemplate

# Your existing classes
@dataclass
class RAGEvaluationResult:
    """Record of all metrics computed for one evaluated question.

    Groups the evaluated inputs (question, context, generated answer) with
    the four per-metric scores, the per-metric feedback text and the
    aggregated overall score.
    """
    # --- evaluated inputs ---
    question: str
    context: str
    generated_answer: str

    # --- per-metric scores ---
    relevance_score: float
    correctness_score: float
    faithfulness_score: float
    hallucination_score: float

    # --- explanations and aggregate ---
    detailed_feedback: Dict[str, str]  # string keys mapped to feedback text
    overall_score: float

In [2]:
@dataclass
class OpenRouterModel:
    """Small client for OpenRouter's OpenAI-compatible chat-completions API.

    Attributes:
        api_key: OpenRouter API key. When left as ``None`` the key is read from
            the ``OPENROUTER_API_KEY`` environment variable at construction time.

    Raises:
        ValueError: if no API key is passed and none is set in the environment.
    """
    api_key: Optional[str] = None

    def __post_init__(self):
        # Resolve the key per instance rather than as a class-level default:
        # a default of os.getenv(...) is evaluated once, when the class is
        # defined, so a key exported afterwards would be ignored.
        if self.api_key is None:
            self.api_key = os.getenv("OPENROUTER_API_KEY")
        if not self.api_key:
            raise ValueError("Please set a valid api key in the environment variables")

        # Imported here so the class does not depend on another notebook cell
        # having imported OpenAI before an instance is created.
        from openai import OpenAI

        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=self.api_key,
        )

        self.model = "meta-llama/llama-3.3-70b-instruct"

    def invoke(self, prompt: str, temperature: float = 0.0, max_tokens: int = 2000) -> str:
        """Generate response using Llama 3.3 70B Instruct.

        Args:
            prompt: Text sent as a single user message.
            temperature: Sampling temperature forwarded to the API.
            max_tokens: Completion token limit forwarded to the API.

        Returns:
            The text of the first choice, or ``""`` when the call fails or the
            API returns no content. Errors are printed, not raised.
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=max_tokens
            )
            # The message content can be null; keep the declared ``str`` contract.
            return response.choices[0].message.content or ""
        except Exception as e:
            # Deliberate best-effort: report the failure and hand back an empty
            # string so callers fall through to their parse-failure handling.
            print(f"Error calling OpenRouter API: {e}")
            return ""

    def invoke_with_json_output(self, prompt_template, input_variables):
        """Format the prompt, call the model and parse a JSON object from the reply.

        Args:
            prompt_template: ``str.format``-style template string.
            input_variables: Mapping of placeholder names to values.

        Returns:
            The parsed JSON object as a dict. If the reply contains no parseable
            JSON object, ``{"score": 0, "reasoning": "Failed to parse response"}``
            is returned instead.
        """
        fallback = {"score": 0, "reasoning": "Failed to parse response"}

        formatted_prompt = prompt_template.format(**input_variables)

        # Guard against a None reply so the string searches below cannot fail.
        raw_response = self.invoke(formatted_prompt) or ""

        # Take the span from the first '{' to the last '}' when there is one;
        # otherwise try to parse the whole reply.
        json_start = raw_response.find('{')
        json_end = raw_response.rfind('}') + 1
        if json_start >= 0 and json_end > json_start:
            candidate = raw_response[json_start:json_end]
        else:
            candidate = raw_response

        try:
            result = json.loads(candidate)
        except json.JSONDecodeError:
            result = None

        # A bare scalar such as "4" is valid JSON but not the object callers
        # expect, so it is treated the same as a parse failure.
        if not isinstance(result, dict):
            print("Failed to parse JSON from response. Raw response:")
            print(raw_response)
            return fallback
        return result

In [3]:
class ResultScore(BaseModel):
    # Schema for a single metric verdict: a numeric score plus its justification.
    # Both fields are required (the `...` default marks them as mandatory).
    score: float = Field(
        ...,
        description="The score of the result, ranging from 0 to 1 where 1 is the best possible score.",
    )
    reasoning: str = Field(
        ...,
        description="An extensive explanation of the score.",
    )


In [None]:
def evaluate_relevance(question: str, context: str, generated_answer: str, llm: "OpenRouterModel") -> dict:
    """Ask the LLM judge how relevant the retrieved context is to the question.

    Args:
        question: The user question.
        context: The retrieved context being judged.
        generated_answer: The answer produced from that context (shown to the judge).
        llm: Object exposing ``invoke_with_json_output(prompt_template, input_variables)``.

    Returns:
        The judge's result dict. When it has a ``"score"`` key, the 1-5 rating is
        rescaled to a float in ``[0.0, 1.0]``; ``"reasoning"`` is passed through.
    """
    # A plain format string is enough here: only the template text is handed to
    # the LLM wrapper, which performs the substitution itself.
    prompt_template = """
        Evaluate the RELEVANCE of the retrieved context to the given question.
        
        Question: {question}
        
        Retrieved Context: {context}

        Generated Answer: {generated_answer}
        
        Rate the relevance on a scale of 1-5 where:
        - 1: Completely irrelevant, no connection to the question
        - 2: Minimally relevant, tangential connection
        - 3: Somewhat relevant, partial connection
        - 4: Highly relevant, strong connection
        - 5: Perfectly relevant, directly addresses the question
        
        Provide your response in this format:
        {{
            "score": <number>,
            "reasoning": "<detailed explanation of why you gave this score>"
        }}
        
        Ensure your response is valid JSON.
        """

    # Use the custom method for structured output
    result = llm.invoke_with_json_output(
        prompt_template=prompt_template,
        input_variables={
            "question": question,
            "context": context,
            "generated_answer": generated_answer
        }
    )

    # Maps 1-5 to 0.0-1.0. The judge may return the score as a string, and the
    # parse-failure fallback uses score 0, so coerce to float and clamp to keep
    # the result inside [0, 1] instead of going negative or raising.
    if "score" in result:
        try:
            raw_score = float(result["score"])
        except (TypeError, ValueError):
            raw_score = 1.0  # unusable score -> bottom of the scale
        result["score"] = min(1.0, max(0.0, (raw_score - 1) / 4.0))

    return result

In [None]:
def evaluate_correctness(question: str, generated_answer: str, llm: "OpenRouterModel", context: Optional[str] = None) -> dict:
    """Ask the LLM judge how factually correct the answer is.

    Args:
        question: The user question.
        generated_answer: The answer being judged.
        llm: Object exposing ``invoke_with_json_output(prompt_template, input_variables)``.
        context: Optional reference context; included in the prompt only when non-empty.

    Returns:
        The judge's result dict. When it has a ``"score"`` key, the 1-5 rating is
        rescaled to a float in ``[0.0, 1.0]``; ``"reasoning"`` is passed through.
    """
    prompt_template = """
    Evaluate the FACTUAL CORRECTNESS of the given answer to the question.
    
    Question: {question}
    
    Answer: {generated_answer}
    """

    # Create input variables dict based on whether context is provided
    input_variables = {
        "question": question,
        "generated_answer": generated_answer
    }

    # Add context if provided. It goes in as a placeholder, not as literal text:
    # the template is later passed through str.format, so braces inside the
    # context itself (JSON, code, ...) would otherwise be read as placeholders.
    if context:
        prompt_template += "\nContext (for reference): {context}\n"
        input_variables["context"] = context
    else:
        prompt_template += "\n"

    prompt_template += """
    Rate the correctness on a scale of 1-5 where:
    - 1: Completely incorrect, contains major factual errors
    - 2: Mostly incorrect, some facts but significant errors
    - 3: Partially correct, mix of correct and incorrect information
    - 4: Mostly correct, minor errors or omissions
    - 5: Completely correct, factually accurate and comprehensive
    
    Focus on:
    - Factual accuracy of claims made
    - Logical consistency
    - Completeness of the answer
    
    Provide your response in this exact JSON format:
    {{
        "score": <number>,
        "reasoning": "<detailed explanation focusing on specific facts>"
    }}
    """

    result = llm.invoke_with_json_output(
        prompt_template=prompt_template,
        input_variables=input_variables
    )

    # Maps 1-5 to 0.0-1.0. The judge may return the score as a string, and the
    # parse-failure fallback uses score 0, so coerce to float and clamp to keep
    # the result inside [0, 1] instead of going negative or raising.
    if "score" in result:
        try:
            raw_score = float(result["score"])
        except (TypeError, ValueError):
            raw_score = 1.0  # unusable score -> bottom of the scale
        result["score"] = min(1.0, max(0.0, (raw_score - 1) / 4.0))

    return result

In [23]:
def evaluate_faithfulness(question: str, generated_answer: str, llm: "OpenRouterModel", context: Optional[str] = None) -> dict:
    """Ask the LLM judge how faithful the answer is to the provided context.

    Args:
        question: The user question.
        generated_answer: The answer being judged.
        llm: Object exposing ``invoke_with_json_output(prompt_template, input_variables)``.
        context: The context the answer should be grounded in. The value is
            substituted into the prompt as-is, including when it is ``None``.

    Returns:
        The judge's result dict. When it has a ``"score"`` key, the 1-5 rating is
        rescaled to a float in ``[0.0, 1.0]``; ``"reasoning"`` is passed through.
    """
    # A plain format string is enough here: only the template text is handed to
    # the LLM wrapper, which performs the substitution itself.
    prompt_template = """
        Evaluate the FAITHFULNESS of the answer to the provided context. The answer should only contain information that can be supported by or reasonably inferred from the context.
        
        Question: {question}
        
        Context: {context}
        
        Answer: {generated_answer}
        
        Rate the faithfulness on a scale of 1-5 where:
        - 1: Completely unfaithful, answer contradicts or ignores context
        - 2: Mostly unfaithful, some alignment but significant deviations
        - 3: Partially faithful, generally aligned but some unsupported claims
        - 4: Mostly faithful, well-grounded with minor unsupported details
        - 5: Completely faithful, all claims supported by or inferable from context
        
        Check for:
        - Claims that go beyond what's stated in the context
        - Information that contradicts the context
        - Proper grounding of all assertions
        
        Provide your response in this exact JSON format:
        {{
            "score": <number>,
            "reasoning": "<detailed explanation with specific examples>"
        }}
        """

    # Use the custom method for structured output
    result = llm.invoke_with_json_output(
        prompt_template=prompt_template,
        input_variables={
            "question": question,
            "context": context,
            "generated_answer": generated_answer
        }
    )

    # Maps 1-5 to 0.0-1.0. The judge may return the score as a string, and the
    # parse-failure fallback uses score 0, so coerce to float and clamp to keep
    # the result inside [0, 1] instead of going negative or raising.
    if "score" in result:
        try:
            raw_score = float(result["score"])
        except (TypeError, ValueError):
            raw_score = 1.0  # unusable score -> bottom of the scale
        result["score"] = min(1.0, max(0.0, (raw_score - 1) / 4.0))

    return result
    

In [27]:
def evaluate_hallucination(question: str, generated_answer: str, llm: "OpenRouterModel", context: Optional[str] = None) -> dict:
    """Ask the LLM judge whether the answer contains information absent from the context.

    Note the direction of the scale: the prompt rates 5 as "no hallucination",
    so a HIGHER returned score means LESS hallucination.

    Args:
        question: The user question.
        generated_answer: The answer being judged.
        llm: Object exposing ``invoke_with_json_output(prompt_template, input_variables)``.
        context: The context the answer is checked against. The value is
            substituted into the prompt as-is, including when it is ``None``.

    Returns:
        The judge's result dict. When it has a ``"score"`` key, the 1-5 rating is
        rescaled to a float in ``[0.0, 1.0]``; ``"reasoning"`` is passed through.
    """
    # A plain format string is enough here: only the template text is handed to
    # the LLM wrapper, which performs the substitution itself.
    prompt_template = """
        Evaluate whether the answer contains HALLUCINATED information - facts, claims, or details that are NOT present in the provided context.
        
        Question: {question}
        
        Context: {context}
        
        Answer: {generated_answer}
        
        Rate hallucination on a scale of 1-5 where:
        - 1: Severe hallucination, answer contains mostly fabricated information
        - 2: Significant hallucination, multiple unsupported claims
        - 3: Moderate hallucination, some fabricated details
        - 4: Minor hallucination, mostly grounded with few unsupported claims
        - 5: No hallucination, all information comes from or is inferable from context
        
        Specifically look for:
        - Facts mentioned in answer but not in context
        - Specific numbers, dates, names not in context
        - Detailed explanations beyond what context provides
        - Claims that go beyond reasonable inference
        
        Provide your response in this exact JSON format:
        {{
            "score": <number>,
            "reasoning": "<detailed explanation identifying specific hallucinations if any>"
        }}
        """

    # Use the custom method for structured output
    result = llm.invoke_with_json_output(
        prompt_template=prompt_template,
        input_variables={
            "question": question,
            "context": context,
            "generated_answer": generated_answer
        }
    )

    # Maps 1-5 to 0.0-1.0. The judge may return the score as a string, and the
    # parse-failure fallback uses score 0, so coerce to float and clamp to keep
    # the result inside [0, 1] instead of going negative or raising.
    if "score" in result:
        try:
            raw_score = float(result["score"])
        except (TypeError, ValueError):
            raw_score = 1.0  # unusable score -> bottom of the scale
        result["score"] = min(1.0, max(0.0, (raw_score - 1) / 4.0))

    return result
    

In [28]:
from openai import OpenAI
# Build the judge client; construction raises ValueError when no API key is set.
llm = OpenRouterModel()

# Example data: one question, the retrieved context, and an answer to judge.
question = "What causes climate change?"
context = "Climate change is primarily caused by human activities that increase greenhouse gas concentrations in the atmosphere. The main contributors include burning fossil fuels (coal, oil, gas) for electricity, heat, and transportation, which releases carbon dioxide. Deforestation reduces the Earth's capacity to absorb CO2. Industrial processes and agriculture also contribute significantly."
generated_answer = """
Climate change is caused by both natural and human factors. While volcanic eruptions and variations in solar radiation have long-term impacts, the current changes are largely due to human activities such as burning fossil fuels and deforestation. Increased greenhouse gases from industrial processes and transportation play a significant role."""

# Each evaluate_* call below sends one prompt to the judge model and returns a
# dict with "score" and "reasoning"; the functions rescale the judge's 1-5
# rating before returning it.

# Evaluate relevance
relevance_result = evaluate_relevance(
    question=question, 
    context=context, 
    generated_answer=generated_answer, 
    llm=llm
)
print(f"Relevance Score: {relevance_result['score']}")
print(f"Reasoning: {relevance_result['reasoning']}")

# Evaluate correctness
correctness_result = evaluate_correctness(
    question=question, 
    generated_answer=generated_answer, 
    llm=llm, 
    context=context
)
print(f"Correctness Score: {correctness_result['score']}")
print(f"Reasoning: {correctness_result['reasoning']}")

# Evaluate faithfulness
faithfulness_result = evaluate_faithfulness(
    question=question, 
    generated_answer=generated_answer, 
    llm=llm, 
    context=context
)
print(f"Faithfulness Score: {faithfulness_result['score']}")
print(f"Reasoning: {faithfulness_result['reasoning']}")

# Evaluate hallucination (the prompt rates 5 as "no hallucination", so a
# higher score here means less hallucinated content)
hallucination_result = evaluate_hallucination(
    question=question, 
    generated_answer=generated_answer, 
    llm=llm, 
    context=context
)
print(f"Hallucination Score: {hallucination_result['score']}")
print(f"Reasoning: {hallucination_result['reasoning']}")

Relevance Score: 1.0
Reasoning: The retrieved context directly addresses the question by stating that climate change is primarily caused by human activities such as burning fossil fuels, deforestation, industrial processes, and agriculture, which increase greenhouse gas concentrations in the atmosphere. The generated answer also aligns closely with this context, mentioning both human factors (burning fossil fuels, deforestation, industrial processes, and transportation) and briefly acknowledging natural factors. The information provided in the context and the generated answer comprehensively covers the causes of climate change as requested by the question, making the retrieved context perfectly relevant to the given question.
Correctness Score: 1.0
Reasoning: The answer provided accurately identifies both natural and human factors contributing to climate change. It correctly highlights the significant role of human activities such as burning fossil fuels and deforestation in the curren