In [None]:
import os
import time
import matplotlib.pyplot as plt
from dotenv import load_dotenv

load_dotenv()

from embedding import GeminiEmbeddings
from document_processor import DocumentProcessor, ContextualHeaderProcessor
from vector_store import VectorstoreManager
from retrieval import StandardRetriever, ContextualHeaderRetriever
from llm_interface import get_openrouter_llm, StandardRAGChain, ContextualHeaderRAGChain
from evaluation import RAGEvaluator

# Confirm every API credential this notebook relies on is present in the environment
required_env_vars = ["GEMINI_API_KEY", "PINECONE_API_KEY", "OPENROUTER_API_KEY"]
# A variable counts as missing when it is unset or set to an empty string
missing_vars = [name for name in required_env_vars if not os.environ.get(name)]

if not missing_vars:
    print("All required environment variables are set.")
else:
    print(f"Missing environment variables: {', '.join(missing_vars)}")
    print("Please set these variables in your .env file.")

ImportError: cannot import name 'ContextualHeaderRAGChain' from 'llm_interface' (/home/olande/Desktop/Rag_Techniques/Contextual Chunk Headers/llm_interface.py)

In [None]:
"""
Optimized evaluation metrics module for the RAG experiment.
Implements metrics for evaluating RAG performance using established libraries.
"""
import time
from typing import List, Dict, Any, Callable, Optional, Tuple, Union

import numpy as np
import matplotlib.pyplot as plt
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from tqdm.auto import tqdm

# Import specialized libraries for metrics
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score
import pandas as pd

# Optional dependency: RAGAS (recommended). Any ImportError raised while loading
# the package or one of the listed metrics is reported and leaves the flag False.
try:
    import ragas
    from ragas.metrics import (
        context_precision,
        context_relevancy,
        faithfulness,
        answer_relevancy,
    )
except ImportError:
    RAGAS_AVAILABLE = False
    print("RAGAS library not found. Some advanced metrics will not be available.")
    print("Install with: pip install ragas")
else:
    RAGAS_AVAILABLE = True


class RAGEvaluator:
    """
    Class for evaluating RAG system performance using optimized metrics.
    """
    
    def __init__(self, embeddings: Embeddings, use_ragas: bool = True):
        """
        Initialize the RAGEvaluator.
        
        Args:
            embeddings: Embedding model to use for semantic similarity
            use_ragas: Whether to use RAGAS library if available
        """
        self.embeddings = embeddings
        self.use_ragas = use_ragas and RAGAS_AVAILABLE
        self.results = {
            "standard": {},
            "contextual": {}
        }
    
    def evaluate_retrieval_accuracy(
        self,
        query: str,
        retrieved_docs: List[Document],
        relevant_docs: List[Document]
    ) -> float:
        """
        Evaluate retrieval accuracy by comparing retrieved documents to relevant documents.
        
        Args:
            query: Query string
            retrieved_docs: Documents retrieved by the system
            relevant_docs: Documents known to be relevant
            
        Returns:
            Precision score (0-1)
        """
        if self.use_ragas:
            # Convert to RAGAS format (would require implementation depending on exact API)
            # This is a placeholder for the actual implementation
            try:
                # Convert docs to RAGAS format and use context_precision
                precision = self._ragas_context_precision(query, retrieved_docs, relevant_docs)
                return precision
            except Exception as e:
                print(f"RAGAS precision calculation failed: {e}. Falling back to standard method.")
                # Fall back to standard method
                pass
        
        # Standard method (optimized from original)
        # Get document IDs for comparison
        retrieved_ids = [doc.metadata.get("id", doc.metadata.get("source", "")) for doc in retrieved_docs]
        relevant_ids = [doc.metadata.get("id", doc.metadata.get("source", "")) for doc in relevant_docs]
        
        # Calculate precision (proportion of retrieved documents that are relevant)
        if not retrieved_ids:
            return 0.0
        
        # Create binary arrays for precision calculation
        y_true = [1 if doc_id in relevant_ids else 0 for doc_id in retrieved_ids]
        y_pred = [1] * len(retrieved_ids)  # All retrieved docs are "predicted" relevant
        
        # Calculate precision using sklearn
        if sum(y_true) == 0:  # No relevant docs retrieved
            return 0.0
        
        return sum(y_true) / len(y_true)  # Average precision
    
    def _ragas_context_precision(self, query, retrieved_docs, relevant_docs):
        """
        Precision hook for the RAGAS code path.

        NOTE: the RAGAS ``context_precision`` metric is not wired in yet. Until
        it is, this computes ID-based precision itself and ``query`` is unused.
        Documents are matched on ``metadata["id"]``, falling back to
        ``metadata["source"]``; documents with no usable identifier never match.

        Args:
            query: Query string (reserved for the future RAGAS call)
            retrieved_docs: Documents retrieved by the system
            relevant_docs: Documents known to be relevant

        Returns:
            Precision score (0-1); 0.0 when nothing was retrieved

        Raises:
            ImportError: If RAGAS could not be imported when the module loaded
        """
        if not RAGAS_AVAILABLE:
            raise ImportError("RAGAS not available")

        # TODO: convert the inputs to a RAGAS dataset and score them with
        # ragas.metrics.context_precision (exact call depends on the RAGAS version).

        def identifier(doc):
            return doc.metadata.get("id", doc.metadata.get("source", ""))

        retrieved_ids = [identifier(doc) for doc in retrieved_docs]
        if not retrieved_ids:
            return 0.0

        # Drop the "no identifier" sentinels so unidentified documents cannot match
        relevant_ids = {identifier(doc) for doc in relevant_docs} - {"", None}

        hits = sum(1 for doc_id in retrieved_ids if doc_id in relevant_ids)
        return hits / len(retrieved_ids)
    
    def evaluate_semantic_relevance(
        self,
        query: str,
        retrieved_docs: List[Document]
    ) -> float:
        """
        Evaluate semantic relevance of retrieved documents to the query.
        
        Args:
            query: Query string
            retrieved_docs: Documents retrieved by the system
            
        Returns:
            Average cosine similarity score (0-1)
        """
        if not retrieved_docs:
            return 0.0
            
        # Get query embedding
        query_embedding = self.embeddings.embed_query(query)
        
        # Get document embeddings
        doc_contents = [doc.page_content for doc in retrieved_docs]
        doc_embeddings = self.embeddings.embed_documents(doc_contents)
        
        # Calculate cosine similarities using sklearn
        query_embedding_reshaped = np.array(query_embedding).reshape(1, -1)
        doc_embeddings_array = np.array(doc_embeddings)
        
        # Use sklearn's cosine_similarity for efficient calculation
        similarities = cosine_similarity(query_embedding_reshaped, doc_embeddings_array)[0]
        
        # Return average similarity
        return np.mean(similarities)
    
    def evaluate_response_quality(
        self,
        query: str,
        response: str,
        reference_answer: Optional[str] = None,
        llm_evaluator = None
    ) -> float:
        """
        Evaluate the quality of the response.
        If a reference answer is provided, compare the response to it.
        Otherwise, use an LLM to evaluate the response quality.
        
        Args:
            query: Query string
            response: Response from the RAG system
            reference_answer: Optional reference answer
            llm_evaluator: Optional LLM for evaluation
            
        Returns:
            Quality score (0-1)
        """
        if self.use_ragas and reference_answer:
            try:
                # Use RAGAS for evaluation if available
                # This would need implementation based on the RAGAS API
                score = self._ragas_answer_evaluation(query, response, reference_answer)
                return score
            except Exception as e:
                print(f"RAGAS answer evaluation failed: {e}. Falling back to standard method.")
                # Fall back to standard method
                pass
                
        if reference_answer and self.embeddings:
            # Compare response to reference answer using semantic similarity
            response_embedding = self.embeddings.embed_query(response)
            reference_embedding = self.embeddings.embed_query(reference_answer)
            
            # Use sklearn's cosine_similarity
            response_reshaped = np.array(response_embedding).reshape(1, -1)
            reference_reshaped = np.array(reference_embedding).reshape(1, -1)
            similarity = cosine_similarity(response_reshaped, reference_reshaped)[0][0]
            
            return similarity
        
        elif llm_evaluator:
            # Use LLM to evaluate response quality
            evaluation = llm_evaluator.evaluate_response(query, response)
            return evaluation
            
        else:
            # If no reference answer or LLM evaluator, return None
            return None
            
    def _ragas_answer_evaluation(self, query, response, reference_answer):
        """
        Answer-quality hook for the RAGAS code path.

        The RAGAS metrics are not wired in yet; for now this returns the cosine
        similarity between the embedded response and the embedded reference
        answer. ``query`` is currently unused.

        Raises:
            ImportError: If RAGAS could not be imported when the module loaded
        """
        if not RAGAS_AVAILABLE:
            raise ImportError("RAGAS not available")

        # Placeholder. Intended implementation (adjust to the actual RAGAS API):
        #   build a RAGAS dataset from (query, response, reference_answer), then
        #   average the answer_relevancy and faithfulness scores.

        # Interim scoring: embed both texts, response first
        response_vector, reference_vector = (
            self.embeddings.embed_query(text)
            for text in (response, reference_answer)
        )

        # sklearn's cosine_similarity needs 2-D (1, dim) inputs
        response_row = np.array(response_vector).reshape(1, -1)
        reference_row = np.array(reference_vector).reshape(1, -1)

        return cosine_similarity(response_row, reference_row)[0][0]
    
    def measure_query_time(
        self,
        query_func: Callable[[str], Any],
        query: str
    ) -> float:
        """
        Measure the time taken to process a query.
        
        Args:
            query_func: Function that processes the query
            query: Query string
            
        Returns:
            Time taken in seconds
        """
        start_time = time.time()
        query_func(query)
        end_time = time.time()
        
        return end_time - start_time
    
    def run_evaluation(
        self,
        queries: List[str],
        standard_rag,
        contextual_rag,
        relevant_docs: Optional[Dict[str, List[Document]]] = None,
        reference_answers: Optional[Dict[str, str]] = None,
        llm_evaluator = None
    ):
        """
        Run a comprehensive evaluation on both RAG systems.

        For every query, each system is invoked exactly once; that single
        invocation is both timed and used as the stored response. Results are
        written to ``self.results["standard"]`` / ``self.results["contextual"]``
        keyed by query.

        Args:
            queries: List of query strings
            standard_rag: Standard RAG system (needs ``invoke`` and ``retriever``)
            contextual_rag: Contextual header RAG system (same interface)
            relevant_docs: Optional dictionary mapping queries to relevant documents
            reference_answers: Optional dictionary mapping queries to reference answers
            llm_evaluator: Optional LLM for evaluation, used for queries that
                have no reference answer
        """
        print(f"Running evaluation on {len(queries)} queries...")
        systems = (("standard", standard_rag), ("contextual", contextual_rag))

        for query in tqdm(queries, desc="Evaluating queries"):
            for system_name, rag in systems:
                self.results[system_name][query] = self._evaluate_system(
                    rag, query, relevant_docs, reference_answers, llm_evaluator
                )

    def _evaluate_system(self, rag, query, relevant_docs, reference_answers, llm_evaluator):
        """Run one query through one RAG system and return its metrics record."""
        captured = {}

        def invoke_and_capture(text):
            captured["response"] = rag.invoke(text)

        # Time the same invocation whose response is kept, instead of invoking twice
        elapsed = self.measure_query_time(invoke_and_capture, query)
        response = captured["response"]
        docs = rag.retriever.get_relevant_documents(query)

        record = {
            "response": response,
            "retrieved_docs": docs,
            "query_time": elapsed,
            "semantic_relevance": self.evaluate_semantic_relevance(query, docs),
        }

        # Add retrieval accuracy if relevant docs are provided for this query
        if relevant_docs and query in relevant_docs:
            record["retrieval_accuracy"] = self.evaluate_retrieval_accuracy(
                query, docs, relevant_docs[query]
            )

        # Add response quality from a reference answer, else from the LLM evaluator
        quality = None
        if reference_answers and query in reference_answers:
            quality = self.evaluate_response_quality(
                query, response, reference_answers[query]
            )
        elif llm_evaluator:
            quality = self.evaluate_response_quality(
                query, response, llm_evaluator=llm_evaluator
            )
        # A None score means "could not be evaluated"; storing it would break
        # the numeric averaging in get_summary_metrics.
        if quality is not None:
            record["response_quality"] = quality

        return record
    
    def get_summary_metrics(self) -> Dict[str, Dict[str, float]]:
        """
        Get summary metrics for both RAG systems.
        
        Returns:
            Dictionary of summary metrics
        """
        summary = {
            "standard": {},
            "contextual": {}
        }
        
        for system in ["standard", "contextual"]:
            # Calculate average query time
            query_times = [result["query_time"] for result in self.results[system].values()]
            summary[system]["avg_query_time"] = np.mean(query_times)
            
            # Calculate average semantic relevance
            semantic_relevances = [result["semantic_relevance"] for result in self.results[system].values()]
            summary[system]["avg_semantic_relevance"] = np.mean(semantic_relevances)
            
            # Calculate average retrieval accuracy if available
            retrieval_accuracies = [
                result.get("retrieval_accuracy") 
                for result in self.results[system].values() 
                if "retrieval_accuracy" in result
            ]
            if retrieval_accuracies:
                summary[system]["avg_retrieval_accuracy"] = np.mean(retrieval_accuracies)
            
            # Calculate average response quality if available
            response_qualities = [
                result.get("response_quality") 
                for result in self.results[system].values() 
                if "response_quality" in result
            ]
            if response_qualities:
                summary[system]["avg_response_quality"] = np.mean(response_qualities)
        
        return summary
    
    def visualize_results(self, output_path: Optional[str] = None):
        """
        Draw a 2x2 grid of bar charts comparing the two systems, one panel per
        summary metric.

        Args:
            output_path: Optional path to save the visualization; when omitted
                the figure is shown instead
        """
        summary = self.get_summary_metrics()

        # seaborn is imported lazily so it is only required when plotting
        import seaborn as sns
        sns.set_style("whitegrid")

        # Long-format table: one row per (system, metric) pair
        rows = [
            {
                "Metric": metric,
                "Value": value,
                "System": "Standard RAG" if system == "standard" else "Contextual Header RAG",
            }
            for system in ("standard", "contextual")
            for metric, value in summary[system].items()
        ]
        df = pd.DataFrame(rows, columns=["Metric", "Value", "System"])

        fig, axs = plt.subplots(2, 2, figsize=(14, 10))
        fig.suptitle('RAG System Comparison: Standard vs. Contextual Header', fontsize=16)

        # Row-major panel order: (0, 0), (0, 1), (1, 0), (1, 1)
        panels = list(axs.flat)
        unique_metrics = df["Metric"].unique()

        # zip stops at four panels, so any further metrics are not plotted
        for panel, metric in zip(panels, unique_metrics):
            sns.barplot(x="System", y="Value", data=df[df["Metric"] == metric], ax=panel)

            # Strip the leading "avg" token from the metric name for the title
            panel.set_title(f'Average {" ".join(metric.split("_")[1:]).title()}')
            panel.set_ylabel(self._get_metric_unit(metric))

            # Print each bar's value just above it
            for bar in panel.patches:
                panel.annotate(f'{bar.get_height():.3f}',
                               (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                               ha='center', va='bottom')

        # Label any panels left without a metric
        for panel in panels[len(unique_metrics):]:
            panel.text(0.5, 0.5, 'Metric Not Available',
                       horizontalalignment='center', verticalalignment='center')

        # Leave room at the top for the figure title
        plt.tight_layout(rect=[0, 0, 1, 0.95])

        if output_path:
            plt.savefig(output_path, dpi=300, bbox_inches='tight')
        else:
            plt.show()
    
    def _get_metric_unit(self, metric: str) -> str:
        """Helper function to get appropriate y-axis label for metrics"""
        if "time" in metric:
            return "Time (seconds)"
        elif "relevance" in metric:
            return "Cosine Similarity"
        elif "accuracy" in metric:
            return "Precision"
        elif "quality" in metric:
            return "Quality Score"
        else:
            return "Value"
            
    def export_results(self, output_path: str):
        """
        Export evaluation results to CSV.
        
        Args:
            output_path: Path to save the CSV
        """
        # Prepare data for export
        data = []
        
        for system in ["standard", "contextual"]:
            for query, result in self.results[system].items():
                row = {
                    "system": system,
                    "query": query,
                    "query_time": result["query_time"],
                    "semantic_relevance": result["semantic_relevance"]
                }
                
                # Add optional metrics if available
                if "retrieval_accuracy" in result:
                    row["retrieval_accuracy"] = result["retrieval_accuracy"]
                
                if "response_quality" in result:
                    row["response_quality"] = result["response_quality"]
                
                data.append(row)
        
        # Create DataFrame and export
        df = pd.DataFrame(data)
        df.to_csv(output_path, index=False)
        print(f"Results exported to {output_path}")

1368000

In [None]:
ggggggggggggggggggggggggggggggggggggggggffffgghghghghghggggggggggggggggggggggggggggggggggggggggggggggghuhuhuhuh