In [3]:
import asyncio
import json
import os
from dataclasses import dataclass, field
from functools import wraps
from typing import List, Any, Tuple, Dict, Optional

from pydantic import BaseModel, Field
from langchain.prompts import PromptTemplate

@dataclass
class RAGEvaluationResult:
    """Evaluation outcome for one question/answer pair.

    Bundles the evaluated inputs with one score per metric, the textual
    feedback that accompanied those scores, and a combined overall score.
    """
    # --- what was evaluated ---
    question: str
    context: str
    generated_answer: str

    # --- one score per metric ---
    relevance_score: float
    correctness_score: float
    faithfulness_score: float
    hallucination_score: float

    # --- explanations and aggregate ---
    detailed_feedback: Dict[str, str]  # feedback text keyed by a string label
    overall_score: float

In [7]:
import logging
logging.basicConfig(level=logging.INFO)

def score(func):
    """Decorator that rescales a 1-5 ``"score"`` in the wrapped result to 0-1.

    The wrapped callable must return a dict, or a JSON string that decodes to
    one. When the result contains a ``"score"`` key its value is mapped with
    ``(raw - 1) / 4`` and clamped to ``[0.0, 1.0]``, so out-of-scale values
    (such as a ``0`` used to signal a failed parse) cannot produce a negative
    or >1 score. A value that cannot be converted to a number is logged and
    treated as ``0.0``. Results without a ``"score"`` key are returned
    unchanged after a warning is logged.

    Args:
        func: Callable returning a dict or a JSON-encoded dict.

    Returns:
        A wrapper that returns the (decoded) result dict with ``"score"``
        replaced by its normalized float value.

    Raises:
        json.JSONDecodeError: If the wrapped callable returns a string that is
            not valid JSON.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)

        # Handle JSON string
        if isinstance(result, str):
            result = json.loads(result)

        if "score" in result:
            raw_score = result["score"]
            try:
                normalized_score = (float(raw_score) - 1) / 4.0
            except (TypeError, ValueError):
                # e.g. None or free text where a number was expected
                logging.warning(f"Non-numeric score {raw_score!r}; treating as 0.0.")
                normalized_score = 0.0

            # Keep the result inside the documented 0-1 range.
            normalized_score = min(1.0, max(0.0, normalized_score))
            result["score"] = normalized_score

            # Log the transformation
            logging.info(f"Raw score: {raw_score}, Normalized: {normalized_score}")
        else:
            logging.warning("Score key not found in result.")

        return result
    return wrapper


In [8]:
@dataclass
class OpenRouterModel:
    """Thin client for a chat model served through the OpenRouter API.

    Attributes:
        api_key: OpenRouter API key. Defaults to the ``OPENROUTER_API_KEY``
            environment variable, read when the instance is created (not when
            the class is defined). Excluded from ``repr`` so the key is not
            echoed into notebook output.
        model: Model identifier sent with every request.

    Raises:
        ValueError: On construction, if no API key is supplied or found.
    """
    api_key: Optional[str] = field(
        default_factory=lambda: os.getenv("OPENROUTER_API_KEY"),
        repr=False,
    )
    model: str = "meta-llama/llama-3.3-70b-instruct"

    def __post_init__(self):
        if not self.api_key:
            raise ValueError("Please set a valid api key in the environment variables")
        # NOTE(review): `OpenAI` is not imported in any visible cell; it must be
        # brought into scope (presumably `from openai import OpenAI`) - confirm.
        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=self.api_key,
        )

    def invoke(self, prompt: str, temperature: float = 0.1, max_tokens: int = 2000) -> str:
        """Send ``prompt`` as a single user message and return the reply text.

        Args:
            prompt: Text of the user message.
            temperature: Sampling temperature forwarded to the API.
            max_tokens: Completion token limit forwarded to the API.

        Returns:
            The content of the first choice, or ``""`` when the request fails
            or the reply carries no content.
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=max_tokens
            )
            # Content may be None; normalise so callers always get a str.
            return response.choices[0].message.content or ""
        except Exception as e:
            # Best-effort: report the failure and fall back to an empty reply.
            print(f"Error calling OpenRouter API: {e}")
            return ""

    def invoke_with_json_output(self, prompt_template, input_variables):
        """Format a prompt, call the model, and parse a JSON object from the reply.

        Args:
            prompt_template: Object with a ``format(**kwargs)`` method (for
                example a ``str``) that produces the final prompt.
            input_variables: Mapping of placeholder names to values.

        Returns:
            The decoded JSON object (a dict). If the reply contains no
            parseable JSON object, returns
            ``{"score": 0, "reasoning": "Failed to parse response"}``.
        """
        formatted_prompt = prompt_template.format(**input_variables)

        # Get the raw response from the model (guard against a None reply)
        raw_response = self.invoke(formatted_prompt) or ""

        try:
            # Take the span from the first '{' to the last '}' so surrounding
            # prose or code fences around the JSON are ignored.
            json_start = raw_response.find('{')
            json_end = raw_response.rfind('}') + 1

            if json_start >= 0 and json_end > json_start:
                result = json.loads(raw_response[json_start:json_end])
            else:
                # If no JSON found, try to parse the whole response
                result = json.loads(raw_response)

            # Callers index the result by key; anything but an object is a
            # parse failure for our purposes.
            if not isinstance(result, dict):
                raise json.JSONDecodeError("Expected a JSON object", raw_response, 0)
            return result
        except json.JSONDecodeError:
            print("Failed to parse JSON from response. Raw response:")
            print(raw_response)
            # Return a default structure if parsing fails
            return {"score": 0, "reasoning": "Failed to parse response"}

In [35]:
@dataclass
class Evaluation:
    """LLM-as-judge evaluation of one generated answer.

    Each ``evaluate_*`` method sends a rubric prompt to ``llm`` and returns the
    judge's ``{"score", "reasoning"}`` dict; the ``@score`` decorator applied to
    those methods post-processes the ``"score"`` entry.

    Attributes:
        question: The question that was asked.
        generated_answer: The answer being judged.
        llm: Judge model; must provide ``invoke_with_json_output``.
        context: Optional retrieved context. Only ``evaluate_correctness``
            adapts its prompt when this is missing.
    """
    question: str
    generated_answer: str
    llm: OpenRouterModel
    context: Optional[str] = None

    def _evaluate_with_template(self, template: str, input_vars: Dict[str, str]) -> Dict:
        """Generic method to handle all evaluations with templates."""
        return self.llm.invoke_with_json_output(
            prompt_template=template,
            input_variables=input_vars
        )

    def _get_base_input_variables(self) -> Dict[str, str]:
        """Get the base input variables used by most evaluation methods."""
        return {
            "question": self.question,
            "context": self.context,
            "generated_answer": self.generated_answer
        }

    @staticmethod
    def _score_and_reasoning(result: Dict) -> Tuple[float, str]:
        """Read ``(score, reasoning)`` from a metric result by key.

        Looking the values up by name keeps them correct when the judge emits
        the keys in a different order or adds extra keys; a missing key falls
        back to ``0.0`` / ``""``.
        """
        return result.get("score", 0.0), result.get("reasoning", "")

    @score
    def evaluate_relevance(self) -> Dict:
        """Ask the judge how relevant the retrieved context is to the question."""
        template = """
        Evaluate the RELEVANCE of the retrieved context to the given question.
        
        Question: {question}
        
        Retrieved Context: {context}

        Generated Answer: {generated_answer}
        
        Rate the relevance on a scale of 1-5 where:
        - 1: Completely irrelevant, no connection to the question
        - 2: Minimally relevant, tangential connection
        - 3: Somewhat relevant, partial connection
        - 4: Highly relevant, strong connection
        - 5: Perfectly relevant, directly addresses the question
        
        Provide your response in this format:
        {{
            "score": <number>,
            "reasoning": "<detailed explanation of why you gave this score>"
        }}
        
        Ensure your response is valid JSON.
        """
        
        return self._evaluate_with_template(template, self._get_base_input_variables())

    @score
    def evaluate_correctness(self) -> Dict:
        """Ask the judge how factually correct the answer is.

        The context is included in the prompt only when one was provided.
        """
        template = """
        Evaluate the FACTUAL CORRECTNESS of the given answer to the question.
        
        Question: {question}
        
        Answer: {generated_answer}
        """
        
        # Add context if provided
        if self.context:
            template += "\nContext (for reference): {context}\n"
        else:
            template += "\n"
            
        template += """
        Rate the correctness on a scale of 1-5 where:
        - 1: Completely incorrect, contains major factual errors
        - 2: Mostly incorrect, some facts but significant errors
        - 3: Partially correct, mix of correct and incorrect information
        - 4: Mostly correct, minor errors or omissions
        - 5: Completely correct, factually accurate and comprehensive
        
        Focus on:
        - Factual accuracy of claims made
        - Logical consistency
        - Completeness of the answer
        
        Provide your response in this exact JSON format:
        {{
            "score": <number>,
            "reasoning": "<detailed explanation focusing on specific facts>"
        }}
        """
        
        # Create input variables dict based on whether context is provided
        input_variables = {
            "question": self.question,
            "generated_answer": self.generated_answer
        }
        
        # Only add context to input variables if it's provided
        if self.context:
            input_variables["context"] = self.context
        
        return self._evaluate_with_template(template, input_variables)

    @score
    def evaluate_faithfulness(self) -> Dict:
        """Ask the judge whether the answer is supported by the context."""
        template = """
        Evaluate the FAITHFULNESS of the answer to the provided context. The answer should only contain information that can be supported by or reasonably inferred from the context.
        
        Question: {question}
        
        Context: {context}
        
        Answer: {generated_answer}
        
        Rate the faithfulness on a scale of 1-5 where:
        - 1: Completely unfaithful, answer contradicts or ignores context
        - 2: Mostly unfaithful, some alignment but significant deviations
        - 3: Partially faithful, generally aligned but some unsupported claims
        - 4: Mostly faithful, well-grounded with minor unsupported details
        - 5: Completely faithful, all claims supported by or inferable from context
        
        Check for:
        - Claims that go beyond what's stated in the context
        - Information that contradicts the context
        - Proper grounding of all assertions
        
        Provide your response in this exact JSON format:
        {{
            "score": <number>,
            "reasoning": "<detailed explanation with specific examples>"
        }}
        """
        
        return self._evaluate_with_template(template, self._get_base_input_variables())

    @score
    def evaluate_hallucination(self) -> Dict:
        """Ask the judge how free of hallucination the answer is (5 = none)."""
        template = """
        Evaluate whether the answer contains HALLUCINATED information - facts, claims, or details that are NOT present in the provided context.
        
        Question: {question}
        
        Context: {context}
        
        Answer: {generated_answer}
        
        Rate hallucination on a scale of 1-5 where:
        - 1: Severe hallucination, answer contains mostly fabricated information
        - 2: Significant hallucination, multiple unsupported claims
        - 3: Moderate hallucination, some fabricated details
        - 4: Minor hallucination, mostly grounded with few unsupported claims
        - 5: No hallucination, all information comes from or is inferable from context
        
        Specifically look for:
        - Facts mentioned in answer but not in context
        - Specific numbers, dates, names not in context
        - Detailed explanations beyond what context provides
        - Claims that go beyond reasonable inference
        
        Provide your response in this exact JSON format:
        {{
            "score": <number>,
            "reasoning": "<detailed explanation identifying specific hallucinations if any>"
        }}
        """
        
        return self._evaluate_with_template(template, self._get_base_input_variables())

    def comprehensive_evaluation(self):
        """Run all four metrics and combine them into one result.

        Returns:
            RAGEvaluationResult holding the four scores, the judge's reasoning
            per metric, and a weighted overall score (relevance 0.25,
            correctness 0.30, faithfulness 0.25, hallucination 0.20). The stored
            context is truncated to 500 characters plus ``"..."``.
        """
        print(f"Evaluating {self.question[:500]}...")
        # Extract each metric by key rather than by dict value order
        relevance_score, relevance_reasoning = self._score_and_reasoning(self.evaluate_relevance())
        correctness_score, correctness_reasoning = self._score_and_reasoning(self.evaluate_correctness())
        faithfulness_score, faithfulness_reasoning = self._score_and_reasoning(self.evaluate_faithfulness())
        hallucination_score, hallucination_reasoning = self._score_and_reasoning(self.evaluate_hallucination())

        # Perform a weighted average
        overall_score = (
            relevance_score * 0.25 + 
            correctness_score * 0.30 + 
            faithfulness_score * 0.25 + 
            hallucination_score * 0.20
        )

        # context is optional, so only measure/truncate it when present
        context_preview = self.context
        if context_preview is not None and len(context_preview) > 500:
            context_preview = context_preview[:500] + "..."

        return RAGEvaluationResult(
            question=self.question,
            context=context_preview,
            generated_answer=self.generated_answer,
            relevance_score=relevance_score,
            correctness_score=correctness_score,
            faithfulness_score=faithfulness_score,
            hallucination_score=hallucination_score,
            detailed_feedback={
                "relevance": relevance_reasoning,
                "correctness": correctness_reasoning,
                "faithfulness": faithfulness_reasoning,
                "hallucination": hallucination_reasoning
            },
            overall_score=overall_score
        )
        


In [None]:
@dataclass
class TestCase:
    """One question/context fixture used to exercise the evaluation pipeline."""
    question: str
    context: str
    # "high", "medium", "low" for evaluation testing
    expected_quality: str
    # Optional free-text remark about the case
    notes: str = ""

# Default fixture set for the evaluation suite: ten question/context pairs.
# `expected_quality` records the quality level each case is expected to show
# and `notes` says what the case is meant to probe.
TEST_CASES = [
    TestCase(
        question="What is the capital of France?",
        context="France is a country in Western Europe. Paris is the capital and largest city of France, located in the north-central part of the country along the Seine River. The city has a population of over 2 million people within its administrative limits.",
        expected_quality="high",
        notes="Simple factual question with directly relevant context"
    ),
    
    TestCase(
        question="How does photosynthesis work?",
        context="Photosynthesis is the process by which plants convert light energy into chemical energy. During photosynthesis, plants use sunlight, carbon dioxide from the air, and water from their roots to produce glucose and oxygen. The process occurs mainly in the chloroplasts of plant cells, specifically in structures called thylakoids. Chlorophyll, the green pigment in plants, captures light energy to drive this process.",
        expected_quality="high",
        notes="Scientific explanation with comprehensive context"
    ),
    
    TestCase(
        question="What are the side effects of aspirin?",
        context="Aspirin is a common pain reliever and anti-inflammatory medication. It belongs to a class of drugs called nonsteroidal anti-inflammatory drugs (NSAIDs). Common side effects include stomach upset, heartburn, and nausea. More serious side effects can include stomach bleeding, ulcers, and increased risk of bleeding disorders. Aspirin should not be given to children due to risk of Reye's syndrome.",
        expected_quality="high",
        notes="Medical information with relevant safety context"
    ),
    
    TestCase(
        question="What is the GDP of Brazil in 2024?",
        context="Brazil is the largest economy in South America. The country has experienced various economic challenges in recent years, including inflation and political instability. Brazil's main exports include soybeans, coffee, and iron ore. The Brazilian real is the country's currency.",
        expected_quality="low",
        notes="Specific data question with irrelevant context - should test hallucination"
    ),
    
    TestCase(
        question="How do you bake chocolate chip cookies?",
        context="Chocolate chip cookies were invented by Ruth Wakefield in 1938 at the Toll House Inn. The original recipe called for butter, sugar, eggs, flour, and chocolate chips. Baking typically requires preheating the oven and careful timing to achieve the right texture.",
        expected_quality="medium",
        notes="Recipe question with historical context but missing specific instructions"
    ),
    
    TestCase(
        question="What causes climate change?",
        context="The Earth's atmosphere contains greenhouse gases like carbon dioxide, methane, and water vapor. These gases trap heat from the sun, creating a greenhouse effect. Human activities such as burning fossil fuels, deforestation, and industrial processes have significantly increased the concentration of greenhouse gases in the atmosphere since the Industrial Revolution, leading to global warming.",
        expected_quality="high",
        notes="Environmental science with comprehensive explanation"
    ),
    
    TestCase(
        question="Who won the 2023 FIFA World Cup?",
        context="The FIFA World Cup is held every four years and is the most prestigious tournament in international football. The tournament features 32 national teams competing over several weeks. Previous winners include Brazil, Germany, Argentina, and France. The tournament generates billions in revenue and attracts viewers worldwide.",
        expected_quality="low",
        notes="Specific event question with general context - tests factual accuracy"
    ),
    
    TestCase(
        question="What are the benefits of exercise?",
        context="Regular physical activity has numerous health benefits. Exercise can improve cardiovascular health by strengthening the heart and improving circulation. It also helps maintain healthy weight, builds muscle strength and bone density, and can reduce the risk of chronic diseases like diabetes and hypertension. Additionally, exercise releases endorphins which can improve mood and reduce stress.",
        expected_quality="high",
        notes="Health question with comprehensive relevant context"
    ),
    
    TestCase(
        question="How does blockchain technology work?",
        context="Cryptocurrency markets have been volatile in recent years. Bitcoin was the first cryptocurrency, created by an anonymous person or group known as Satoshi Nakamoto. Many companies are now accepting cryptocurrency as payment. The value of cryptocurrencies can fluctuate dramatically based on market sentiment.",
        expected_quality="low",
        notes="Technical question with tangentially related context about crypto"
    ),
    
    TestCase(
        question="What is the treatment for type 2 diabetes?",
        context="Type 2 diabetes is a chronic condition that affects how the body processes blood sugar (glucose). Treatment typically involves lifestyle changes such as diet modification and regular exercise. Medications may include metformin, insulin, or other blood sugar-lowering drugs. Regular monitoring of blood glucose levels is important. Some patients may also benefit from weight loss surgery in severe cases.",
        expected_quality="high",
        notes="Medical treatment question with appropriate clinical context"
    ),
]

async def generate_answer_with_llm_async(question: str, context: str, llm_model) -> str:
    """
    Asynchronously generate an answer to ``question`` grounded in ``context``.

    Args:
        question: The question to answer
        context: The context/background information
        llm_model: Model instance exposing either an awaitable
            ``ainvoke(prompt)`` or a blocking ``invoke(prompt)``. The reply may
            be a plain string or an object carrying its text in ``.content``.

    Returns:
        The stripped answer text; ``""`` if the model returned nothing. If the
        call raises, the string ``"Error generating answer: <message>"`` is
        returned instead of propagating the exception.
    """
    prompt_template = """
    Based on the provided context, please answer the following question. 
    Use only the information provided in the context. If the context doesn't 
    contain enough information to fully answer the question, say so explicitly.
    
    Context: {context}
    
    Question: {question}
    
    Answer:
    """
    
    # Format the prompt
    formatted_prompt = prompt_template.format(
        context=context,
        question=question
    )
    
    try:
        # Prefer the model's native async entry point when it has one
        if hasattr(llm_model, 'ainvoke'):
            response = await llm_model.ainvoke(formatted_prompt)
        else:
            # Blocking model: run in a worker thread so the event loop stays free
            response = await asyncio.to_thread(llm_model.invoke, formatted_prompt)

        # Message-style replies keep their text in `.content`; plain strings
        # have no such attribute and pass through unchanged.
        text = getattr(response, "content", response)
        if text is None:
            return ""
        return str(text).strip()
    except Exception as e:
        # Best-effort: surface the failure as the answer text.
        return f"Error generating answer: {str(e)}"

async def run_evaluation_test_async(test_case: TestCase, llm_model, evaluator_llm) -> Dict:
    """
    Generate an answer for one test case and evaluate it.

    Args:
        test_case: TestCase instance with question and context
        llm_model: LLM for generating answers
        evaluator_llm: LLM for evaluation (can be the same as llm_model)

    Returns:
        Dictionary with the test case, the generated answer, the evaluation
        result and the expected quality label
    """
    banner = '=' * 80
    print(f"\n{banner}")
    print(f"Testing: {test_case.question}")
    print(f"Expected Quality: {test_case.expected_quality}")
    print(f"Notes: {test_case.notes}")
    print(f"{banner}")

    # Step 1: produce the answer to be judged
    answer = await generate_answer_with_llm_async(
        test_case.question,
        test_case.context,
        llm_model
    )
    print(f"\nGenerated Answer: {answer[:200]}...")

    # Step 2: judge it
    evaluator = Evaluation(
        question=test_case.question,
        generated_answer=answer,
        llm=evaluator_llm,
        context=test_case.context
    )

    # TODO: Evaluation defines no `comprehensive_evaluation_async` in this
    # notebook, so the worker-thread path below is the one that runs today.
    if not hasattr(evaluator, 'comprehensive_evaluation_async'):
        outcome = await asyncio.to_thread(evaluator.comprehensive_evaluation)
    else:
        outcome = await evaluator.comprehensive_evaluation_async()

    return {
        "test_case": test_case,
        "generated_answer": answer,
        "evaluation_result": outcome,
        "expected_quality": test_case.expected_quality
    }

async def run_full_evaluation_suite_async(
    llm_model, 
    evaluator_llm, 
    test_cases: Optional[List["TestCase"]] = None,
    max_concurrent: int = 5
) -> List[Dict]:
    """
    Run evaluation on all test cases concurrently, bounded by a semaphore.

    Args:
        llm_model: LLM for generating answers
        evaluator_llm: LLM for evaluation
        test_cases: Optional custom test cases (defaults to TEST_CASES)
        max_concurrent: Maximum number of concurrent requests; must be >= 1

    Returns:
        One dict per test case, in input order. Successful entries are the
        dicts returned by ``run_evaluation_test_async``; failed entries have
        ``"test_case"``, ``"error"`` and ``"evaluation_result": None``.

    Raises:
        ValueError: If ``max_concurrent`` is less than 1. A zero-slot
            semaphore would never admit a task and the suite would hang.
    """
    if max_concurrent < 1:
        raise ValueError(f"max_concurrent must be >= 1, got {max_concurrent}")

    if test_cases is None:
        test_cases = TEST_CASES
    
    # Create semaphore to limit concurrent requests
    semaphore = asyncio.Semaphore(max_concurrent)
    completed = 0
    total = len(test_cases)
    
    async def run_single_test_with_semaphore(i, test_case):
        """Run a single test with semaphore control"""
        nonlocal completed
        
        async with semaphore:  # This limits concurrent execution
            print(f"\nStarting Test {i+1}/{total}")
            
            try:
                result = await run_evaluation_test_async(test_case, llm_model, evaluator_llm)
                
                completed += 1
                # Print summary
                eval_result = result["evaluation_result"]
                print(f"\nTest {i+1}/{total} completed ({completed}/{total} done)")
                print(f"SCORES:")
                print(f"   Relevance: {eval_result.relevance_score}")
                print(f"   Correctness: {eval_result.correctness_score}")
                print(f"   Faithfulness: {eval_result.faithfulness_score}")
                print(f"   Hallucination: {eval_result.hallucination_score}")
                print(f"   Overall: {eval_result.overall_score:.2f}")
                
                return result
                
            except Exception as e:
                # One failing case must not abort the rest of the suite
                completed += 1
                error_result = {
                    "test_case": test_case,
                    "error": str(e),
                    "evaluation_result": None
                }
                print(f"Error in test {i+1}: {str(e)} ({completed}/{total} done)")
                return error_result
    
    # Create tasks for all test cases
    tasks = [
        run_single_test_with_semaphore(i, test_case) 
        for i, test_case in enumerate(test_cases)
    ]
    
    # Run all tasks concurrently
    print(f"Starting {total} evaluations with max {max_concurrent} concurrent requests...")
    results = await asyncio.gather(*tasks, return_exceptions=True)
    
    # Anything gather handed back as an exception object (including
    # cancellation, which is not an Exception subclass) becomes an error entry
    # so every element of the returned list is a dict.
    processed_results = []
    for i, result in enumerate(results):
        if isinstance(result, BaseException):
            processed_results.append({
                "test_case": test_cases[i],
                "error": str(result),
                "evaluation_result": None
            })
        else:
            processed_results.append(result)
    
    return processed_results

# Synchronous wrapper for easy use
def run_full_evaluation_suite_concurrent(
    llm_model, 
    evaluator_llm, 
    test_cases: List[TestCase] = None,
    max_concurrent: int = 5
) -> List[Dict]:
    """
    Blocking entry point that drives the async evaluation suite to completion.

    Args:
        llm_model: LLM for generating answers
        evaluator_llm: LLM for evaluation
        test_cases: Optional custom test cases (defaults to TEST_CASES)
        max_concurrent: Maximum number of concurrent requests

    Returns:
        List of evaluation results
    """
    # Build the coroutine first, then hand it to asyncio.run to execute.
    suite = run_full_evaluation_suite_async(
        llm_model, evaluator_llm, test_cases, max_concurrent
    )
    return asyncio.run(suite)

def print_evaluation_summary(results: List[Dict], low_score_threshold: float = 0.6):
    """Print average scores and the low-scoring cases for a suite run.

    Args:
        results: Result dicts as produced by the evaluation suite. Entries
            whose ``"evaluation_result"`` is missing or falsy (failed runs)
            are ignored.
        low_score_threshold: Cases whose overall score is strictly below this
            value are listed as low-scoring. Defaults to 0.6.
    """
    print(f"\n\n{'='*80}")
    print("EVALUATION SUMMARY")
    print(f"{'='*80}")
    
    successful_results = [r for r in results if r.get("evaluation_result")]
    
    if not successful_results:
        print("No successful evaluations to summarize")
        return

    def average(attribute: str) -> float:
        """Mean of one score attribute across the successful results."""
        total = sum(getattr(r["evaluation_result"], attribute) for r in successful_results)
        return total / len(successful_results)

    print(f"Average Scores (n={len(successful_results)}):")
    print(f"Relevance: {average('relevance_score'):.2f}")
    print(f"Correctness: {average('correctness_score'):.2f}")
    print(f"Faithfulness: {average('faithfulness_score'):.2f}")
    print(f"Hallucination: {average('hallucination_score'):.2f}")
    print(f"Overall: {average('overall_score'):.2f}")
    
    # Identify problematic cases
    low_scoring = [
        r for r in successful_results
        if r["evaluation_result"].overall_score < low_score_threshold
    ]
    if low_scoring:
        print(f"\nLow-scoring cases ({len(low_scoring)} cases with overall < {low_score_threshold}):")
        for result in low_scoring:
            question = result["test_case"].question
            # Only mark the question as truncated when it actually was
            if len(question) > 50:
                question = question[:50] + "..."
            overall = result["evaluation_result"].overall_score
            print(f"{question} (Score: {overall:.2f})")


In [50]:
# Driver cell: run the full evaluation suite on the default test cases and
# print the aggregate summary.
import nest_asyncio
# Patch the current event loop
# (must happen before the wrapper below calls asyncio.run)
nest_asyncio.apply()
# Initialize LLMs
# Both instances use the default model; one generates answers, one judges them.
llm_model = OpenRouterModel()  
evaluator_llm = OpenRouterModel()  

# test_cases is omitted, so the suite's default test cases are used
results = run_full_evaluation_suite_concurrent(
    llm_model, 
    evaluator_llm,
    max_concurrent=5  
)


print_evaluation_summary(results)

Starting 10 evaluations with max 5 concurrent requests...

Starting Test 1/10

Testing: What is the capital of France?
Expected Quality: high
Notes: Simple factual question with directly relevant context

Starting Test 2/10

Testing: How does photosynthesis work?
Expected Quality: high
Notes: Scientific explanation with comprehensive context

Starting Test 3/10

Testing: What are the side effects of aspirin?
Expected Quality: high
Notes: Medical information with relevant safety context

Starting Test 4/10

Testing: What is the GDP of Brazil in 2024?
Expected Quality: low
Notes: Specific data question with irrelevant context - should test hallucination

Starting Test 5/10

Testing: How do you bake chocolate chip cookies?
Expected Quality: medium
Notes: Recipe question with historical context but missing specific instructions


INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"



Generated Answer: The capital of France is Paris....
Evaluating What is the capital of France?...


INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:openai._base_client:Retrying request to /chat/completions in 0.489964 seconds
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 5, Normalized: 1.0
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"



Generated Answer: Photosynthesis works by plants using sunlight, carbon dioxide from the air, and water from their roots to produce glucose and oxygen. This process occurs mainly in the chloroplasts of plant cells, spe...
Evaluating How does photosynthesis work?...

Generated Answer: To bake chocolate chip cookies, you need to preheat the oven and use careful timing to achieve the right texture. The original recipe calls for ingredients such as butter, sugar, eggs, flour, and choc...
Evaluating How do you bake chocolate chip cookies?...


INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"



Generated Answer: The context does not contain enough information to answer the question about Brazil's GDP in 2024. The provided information discusses Brazil's economy, exports, and currency but does not include speci...
Evaluating What is the GDP of Brazil in 2024?...


INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"



Generated Answer: The side effects of aspirin include:

1. Stomach upset
2. Heartburn
3. Nausea
4. Stomach bleeding
5. Ulcers
6. Increased risk of bleeding disorders

Note that these side effects can be categorized int...
Evaluating What are the side effects of aspirin?...


INFO:root:Raw score: 5, Normalized: 1.0
INFO:root:Raw score: 5, Normalized: 1.0
INFO:root:Raw score: 3, Normalized: 0.5
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 2, Normalized: 0.25
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 5, Normalized: 1.0
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 5, Normalized: 1.0
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 5, Normalized: 1.0
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 5, Normalized: 1.0
INFO:root:Raw score: 


Test 1/10 completed (1/10 done)
SCORES:
   Relevance: 1.0
   Correctness: 1.0
   Faithfulness: 1.0
   Hallucination: 1.0
   Overall: 1.00

Starting Test 6/10

Testing: What causes climate change?
Expected Quality: high
Notes: Environmental science with comprehensive explanation


INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 5, Normalized: 1.0
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 5, Normalized: 1.0
INFO:root:Raw score: 5, Normalized: 1.0



Test 2/10 completed (2/10 done)
SCORES:
   Relevance: 1.0
   Correctness: 1.0
   Faithfulness: 1.0
   Hallucination: 1.0
   Overall: 1.00

Starting Test 7/10

Testing: Who won the 2023 FIFA World Cup?
Expected Quality: low
Notes: Specific event question with general context - tests factual accuracy


INFO:root:Raw score: 5, Normalized: 1.0
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 5, Normalized: 1.0
INFO:root:Raw score: 5, Normalized: 1.0



Test 4/10 completed (3/10 done)
SCORES:
   Relevance: 0.25
   Correctness: 1.0
   Faithfulness: 1.0
   Hallucination: 1.0
   Overall: 0.81

Starting Test 8/10

Testing: What are the benefits of exercise?
Expected Quality: high
Notes: Health question with comprehensive relevant context

Test 5/10 completed (4/10 done)
SCORES:
   Relevance: 0.5
   Correctness: 0.75
   Faithfulness: 1.0
   Hallucination: 1.0
   Overall: 0.80

Starting Test 9/10

Testing: How does blockchain technology work?
Expected Quality: low
Notes: Technical question with tangentially related context about crypto


INFO:root:Raw score: 5, Normalized: 1.0



Test 3/10 completed (5/10 done)
SCORES:
   Relevance: 1.0
   Correctness: 1.0
   Faithfulness: 1.0
   Hallucination: 1.0
   Overall: 1.00

Starting Test 10/10

Testing: What is the treatment for type 2 diabetes?
Expected Quality: high
Notes: Medical treatment question with appropriate clinical context


INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"



Generated Answer: The benefits of exercise include:

1. Improving cardiovascular health by strengthening the heart and improving circulation.
2. Maintaining a healthy weight.
3. Building muscle strength and bone densit...
Evaluating What are the benefits of exercise?...

Generated Answer: There is not enough information in the context to answer the question. The context does not mention the 2023 FIFA World Cup or its winner. It only provides general information about the tournament....
Evaluating Who won the 2023 FIFA World Cup?...

Generated Answer: The treatment for type 2 diabetes typically involves lifestyle changes, such as diet modification and regular exercise, and may also include medications like metformin, insulin, or other blood sugar-l...
Evaluating What is the treatment for type 2 diabetes?...


INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"



Generated Answer: The context doesn't contain enough information to fully answer the question. The provided context discusses the volatility of cryptocurrency markets, the creation of Bitcoin, and the acceptance of cry...
Evaluating How does blockchain technology work?...


INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 5, Normalized: 1.0
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 5, Normalized: 1.0
INFO:root:Raw score: 2, Normalized: 0.25
INFO:root:Raw score: 2, Normalized: 0.25
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 5, Normalized: 1.0
INFO:root:Raw score: 5, Normalized: 1.0
INFO:r


Test 7/10 completed (6/10 done)
SCORES:
   Relevance: 0.25
   Correctness: 1.0
   Faithfulness: 1.0
   Hallucination: 1.0
   Overall: 0.81


INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 5, Normalized: 1.0



Test 8/10 completed (7/10 done)
SCORES:
   Relevance: 1.0
   Correctness: 1.0
   Faithfulness: 1.0
   Hallucination: 1.0
   Overall: 1.00


INFO:root:Raw score: 5, Normalized: 1.0



Generated Answer: Based on the provided context, the cause of global warming (which is related to climate change) is the increase in greenhouse gases in the atmosphere, primarily due to human activities such as burning...
Evaluating What causes climate change?...


INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 5, Normalized: 1.0



Test 10/10 completed (8/10 done)
SCORES:
   Relevance: 1.0
   Correctness: 1.0
   Faithfulness: 1.0
   Hallucination: 1.0
   Overall: 1.00


INFO:root:Raw score: 5, Normalized: 1.0



Test 9/10 completed (9/10 done)
SCORES:
   Relevance: 0.25
   Correctness: 1.0
   Faithfulness: 1.0
   Hallucination: 1.0
   Overall: 0.81


INFO:root:Raw score: 4, Normalized: 0.75
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 4, Normalized: 0.75
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 4, Normalized: 0.75
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Raw score: 5, Normalized: 1.0



Test 6/10 completed (10/10 done)
SCORES:
   Relevance: 0.75
   Correctness: 0.75
   Faithfulness: 0.75
   Hallucination: 1.0
   Overall: 0.80


EVALUATION SUMMARY
Average Scores (n=10):
Relevance: 0.70
Correctness: 0.95
Faithfulness: 0.97
Hallucination: 1.00
Overall: 0.90
