In [1]:
import json
import os
from dataclasses import dataclass, field
from typing import List, Any, Tuple, Dict, Optional

from langchain.prompts import PromptTemplate
from openai import OpenAI
from pydantic import BaseModel, Field

# Your existing classes
@dataclass
class RAGEvaluationResult:
    """Structure to hold evaluation results for a single question.

    Groups the evaluated example (question, context, generated answer) with
    one score per metric, free-text feedback, and an aggregate score.

    NOTE(review): the scale of the score fields is not defined in this class;
    presumably 0-1 to match ResultScore -- confirm against the code that
    populates it.
    """
    # The evaluated example.
    question: str
    context: str  # presumably the retrieved context passed to the generator -- confirm
    generated_answer: str
    # One score per evaluation metric.
    relevance_score: float
    correctness_score: float
    faithfulness_score: float
    hallucination_score: float
    # Textual explanations; presumably keyed by metric name -- verify against the producer.
    detailed_feedback: Dict[str, str]
    # Aggregate of the metric scores; the aggregation rule is not visible here.
    overall_score: float

In [8]:
@dataclass
class OpenRouterModel:
    """Minimal chat-completion client that talks to OpenRouter via the OpenAI SDK.

    Attributes:
        api_key: OpenRouter API key. When omitted (None), the
            OPENROUTER_API_KEY environment variable is read at instantiation
            time, so setting the variable after this class was defined still
            takes effect. Excluded from the generated repr so that displaying
            the object in a notebook cell does not leak the secret.
        model: Model identifier sent with every request.

    Raises:
        ValueError: if no API key is passed and none is set in the environment.
    """

    api_key: Optional[str] = field(default=None, repr=False)
    model: str = "meta-llama/llama-3.3-70b-instruct"

    def __post_init__(self):
        # Resolve the key lazily instead of freezing os.getenv() into the class
        # definition; an explicitly passed value (even an empty one) is never
        # overridden by the environment.
        if self.api_key is None:
            self.api_key = os.getenv("OPENROUTER_API_KEY")
        if not self.api_key:
            raise ValueError("Please set a valid api key in the environment variables")
        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=self.api_key,
        )

    def invoke(self, prompt: str, temperature: float = 0.0, max_tokens: int = 2000) -> str:
        """Send `prompt` as a single user message and return the reply text.

        Args:
            prompt: Text of the user message.
            temperature: Sampling temperature forwarded to the API.
            max_tokens: Completion token limit forwarded to the API.

        Returns:
            The content of the first choice, or "" when the call fails or the
            reply has no text content. Failures are printed, not raised
            (deliberate best-effort behaviour).
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=max_tokens
            )
            # `content` may be None; normalise so the `-> str` contract holds
            # and downstream string operations cannot fail on None.
            return response.choices[0].message.content or ""
        except Exception as e:
            print(f"Error calling OpenRouter API: {e}")
            return ""

    def invoke_with_json_output(self, prompt_template, input_variables):
        """Format the prompt, query the model and return the JSON object it produced.

        Args:
            prompt_template: Object with a `.format(**kwargs)` method (e.g. a
                `str` template with `{placeholders}`).
            input_variables: Mapping of placeholder names to values.

        Returns:
            The first JSON object (dict) found in the model reply. If the reply
            contains no parseable JSON object, the raw reply is printed and
            `{"score": 0, "reasoning": "Failed to parse response"}` is returned.
        """
        formatted_prompt = prompt_template.format(**input_variables)
        raw_response = self.invoke(formatted_prompt)

        parsed = self._extract_json_object(raw_response)
        if parsed is None:
            print("Failed to parse JSON from response. Raw response:")
            print(raw_response)
            # Return a default structure if parsing fails
            return {"score": 0, "reasoning": "Failed to parse response"}
        return parsed

    @staticmethod
    def _extract_json_object(text: str) -> Optional[dict]:
        """Return the first JSON object embedded in `text`, or None if there is none."""
        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                # raw_decode stops at the end of the object, so text (or stray
                # braces) after the JSON does not break parsing.
                candidate, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict):
                return candidate
            # This "{" did not open a valid object (e.g. a brace in the
            # surrounding prose); try the next one.
            start = text.find("{", start + 1)
        return None

In [9]:
class ResultScore(BaseModel):
    """Pydantic schema for one scored judgement: a numeric score and its explanation.

    Both fields are required (declared with `...`). The 0-1 range is stated
    only in the field description; no validator here enforces it.
    """
    score: float = Field(..., description="The score of the result, ranging from 0 to 1 where 1 is the best possible score.")
    reasoning: str = Field(..., description="An extensive explanation of the score.")


In [10]:
def _normalize_five_point_score(score: float) -> float:
    """Map a 1-5 rating onto 0-1 (5 -> 1.0, 1 -> 0.2) and clamp the result to [0, 1]."""
    # Values below 1 are off the requested 1-5 scale (this includes the 0 used
    # when the reply could not be parsed); they are passed through, not rescaled.
    if score >= 1:
        score = score / 5.0
    return min(max(score, 0.0), 1.0)


def evaluate_relevance(question: str, context: str, generated_answer: str, llm: OpenRouterModel) -> Dict[str, Any]:
    """Ask the LLM to rate how relevant the retrieved context is to the question.

    Args:
        question: The user question.
        context: The retrieved context to judge.
        generated_answer: The answer produced from the context (shown to the
            judge model alongside the question and context).
        llm: Model wrapper providing `invoke_with_json_output`.

    Returns:
        The dict parsed from the model reply. When it carries a numeric
        "score" (a number or a numeric string), the 1-5 rating is normalized
        to the 0-1 range; any other score value is left untouched.
    """
    prompt = PromptTemplate(
        input_variables=["question", "context", "generated_answer"],
        template = """
        Evaluate the RELEVANCE of the retrieved context to the given question.
        
        Question: {question}
        
        Retrieved Context: {context}

        Generated Answer: {generated_answer}
        
        Rate the relevance on a scale of 1-5 where:
        - 1: Completely irrelevant, no connection to the question
        - 2: Minimally relevant, tangential connection
        - 3: Somewhat relevant, partial connection
        - 4: Highly relevant, strong connection
        - 5: Perfectly relevant, directly addresses the question
        
        Provide your response in this format:
        {{
            "score": <number>,
            "reasoning": "<detailed explanation of why you gave this score>"
        }}
        
        Ensure your response is valid JSON.
        """
    )

    # Use the custom method for structured output
    result = llm.invoke_with_json_output(
        prompt_template=prompt.template,
        input_variables={
            "question": question,
            "context": context,
            "generated_answer": generated_answer
        }
    )

    score = result.get("score")
    # Models sometimes quote the number ("4"); accept numeric strings.
    if isinstance(score, str):
        try:
            score = float(score)
        except ValueError:
            score = None
    # A rating of exactly 1 must be rescaled too: left as 1 it would read as a
    # perfect normalized score instead of the lowest rating.
    if isinstance(score, (int, float)) and not isinstance(score, bool):
        result["score"] = _normalize_five_point_score(score)

    return result

In [11]:
# Smoke test: score one hand-written example with the relevance evaluator.
# Needs OPENROUTER_API_KEY in the environment and performs a live API call.
llm = OpenRouterModel()
question = "What causes climate change?"
context = "Climate change is primarily caused by human activities that increase greenhouse gas concentrations in the atmosphere. The main contributors include burning fossil fuels (coal, oil, gas) for electricity, heat, and transportation, which releases carbon dioxide. Deforestation reduces the Earth's capacity to absorb CO2. Industrial processes and agriculture also contribute significantly."
generated_answer = "Climate change is primarily caused by human activities, especially burning fossil fuels for energy and transportation, which releases carbon dioxide into the atmosphere. Deforestation and industrial processes also contribute by increasing greenhouse gas concentrations."

result = evaluate_relevance(question, context, generated_answer, llm)
print(f"Relevance Score: {result['score']}")
print(f"Reasoning: {result['reasoning']}")


Relevance Score: 1.0
Reasoning: The retrieved context directly addresses the question 'What causes climate change?' by providing a clear and concise explanation of the primary causes of climate change, including human activities such as burning fossil fuels, deforestation, and industrial processes. The generated answer accurately summarizes the main points from the retrieved context, demonstrating a perfect connection between the question, the context, and the answer. Therefore, the relevance of the retrieved context to the given question is perfectly relevant, warranting a score of 5.
