**# SETUP AND INSTALLATION**

In [None]:
# Install required packages
!pip install ragas langchain openai sentence-transformers datasets
!pip install nltk rouge-score sacrebleu

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from datasets import Dataset
import json
from typing import List, Dict, Any
import os

# RAGAS imports
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    answer_correctness,
    answer_similarity
)

# LangChain imports
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

# Traditional metrics
import nltk
from rouge_score import rouge_scorer
from sacrebleu import BLEU
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK 'punkt' data (quiet=True suppresses the progress output)
nltk.download('punkt', quiet=True)

# Set your OpenAI API key.
# setdefault() keeps a key that is already exported in the environment; the
# placeholder is only installed when nothing is configured, so either export
# OPENAI_API_KEY beforehand or replace the placeholder below.
os.environ.setdefault("OPENAI_API_KEY", "your-openai-api-key-here")

# Initialize models for RAGAS
embeddings = OpenAIEmbeddings()
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

print("‚úÖ Setup complete! Ready for RAGAS evaluation.")

**# 14.2.2 ESSENTIAL RAGAS METRICS - SAMPLE DATA**

In [None]:
# Create sample RAG evaluation dataset.
# Each record carries the four fields used throughout this notebook:
#   question     - the user query
#   contexts     - list of retrieved passages
#   answer       - the generated response being evaluated
#   ground_truth - the reference answer
sample_data = [
    {
        "question": "What is the capital of France?",
        "contexts": [
            "Paris is the capital and most populous city of France. It is located in northern France.",
            "France is a country in Western Europe with several major cities including Lyon and Marseille."
        ],
        "answer": "The capital of France is Paris, which is also its most populous city.",
        "ground_truth": "Paris is the capital of France."
    },
    {
        "question": "How does photosynthesis work?",
        "contexts": [
            "Photosynthesis is the process by which plants convert sunlight, carbon dioxide, and water into glucose and oxygen.",
            "Chlorophyll in plant leaves captures light energy to drive the photosynthetic process.",
            # The reaction arrow was mis-encoded in the original text; restored
            # so the judge LLM receives a clean equation as context.
            "The chemical equation for photosynthesis is: 6CO2 + 6H2O + light energy → C6H12O6 + 6O2"
        ],
        "answer": "Photosynthesis is how plants make food using sunlight, CO2, and water to produce glucose and oxygen.",
        "ground_truth": "Photosynthesis converts light energy, carbon dioxide and water into glucose and oxygen."
    },
    {
        "question": "What causes earthquakes?",
        "contexts": [
            "Earthquakes are caused by the sudden release of energy stored in rocks beneath Earth's surface.",
            "Tectonic plates moving against each other create stress that builds up over time.",
            "When the stress exceeds the strength of rocks, they break suddenly, releasing energy as seismic waves."
        ],
        "answer": "Earthquakes happen when tectonic plates move and create stress underground. When rocks can't handle the stress anymore, they break and release energy as seismic waves.",
        "ground_truth": "Earthquakes are caused by tectonic plate movement and sudden rock fracture releasing seismic energy."
    },
    {
        "question": "What is machine learning?",
        "contexts": [
            "Machine learning is a subset of artificial intelligence that enables computers to learn without being explicitly programmed.",
            "Popular machine learning algorithms include neural networks, decision trees, and support vector machines.",
            "Deep learning is a specialized form of machine learning using neural networks with multiple layers."
        ],
        "answer": "Machine learning is part of AI where computers learn patterns from data automatically, using algorithms like neural networks to make predictions without explicit programming.",
        "ground_truth": "Machine learning is a branch of AI that allows computers to learn patterns from data."
    },
    {
        "question": "How do vaccines work?",
        "contexts": [
            "Vaccines contain weakened or inactive parts of a pathogen that trigger an immune response.",
            "The immune system creates antibodies and memory cells when exposed to vaccine antigens.",
            "If the real pathogen is encountered later, memory cells quickly produce antibodies for protection."
        ],
        "answer": "Vaccines work by training your immune system with safe versions of germs, so it can recognize and fight the real disease later.",
        "ground_truth": "Vaccines stimulate immune system to create antibodies and memory for future pathogen protection."
    }
]

# Pivot the row-oriented samples into the column-oriented mapping that
# Dataset.from_dict consumes (one list per field, in a fixed field order)
ragas_dataset = Dataset.from_dict({
    column: [record[column] for record in sample_data]
    for column in ("question", "contexts", "answer", "ground_truth")
})

print("üìä Sample RAG evaluation dataset created!")
print("Dataset size: {} examples".format(len(sample_data)))


**# 14.2.2 ESSENTIAL RAGAS METRICS - IMPLEMENTATION**

In [None]:
class RAGASEvaluator:
    """Complete RAGAS evaluation implementation with detailed explanations.

    Wraps ``evaluate`` for the four core metrics (faithfulness, answer
    relevancy, context precision, context recall), prints every score with a
    short interpretation, and can combine the four into one weighted score.

    Args:
        llm: Judge model passed to ``evaluate``; a ``gpt-3.5-turbo`` chat
            model with temperature 0 is created when omitted.
        embeddings: Embedding model passed to ``evaluate``; OpenAI embeddings
            are created when omitted.
    """

    # Lower bounds of the "excellent", "good" and "moderate" verdicts; a score
    # below the last bound is reported with the "poor" message.
    _VERDICT_THRESHOLDS = (0.9, 0.7, 0.5)

    # Contribution of each metric to the overall RAG score (sums to 1.0).
    _OVERALL_WEIGHTS = {
        'faithfulness': 0.3,
        'answer_relevancy': 0.3,
        'context_precision': 0.2,
        'context_recall': 0.2,
    }

    def __init__(self, llm=None, embeddings=None):
        self.llm = llm or ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
        self.embeddings = embeddings or OpenAIEmbeddings()

    @staticmethod
    def _to_scalar(value):
        """Collapse one result entry into a single float.

        The result entry may be one aggregate number or a sequence of per-row
        scores; sequences are averaged so every caller can format and compare
        the value the same way.
        """
        if isinstance(value, (list, tuple, np.ndarray)):
            return float(np.mean(value))
        return float(value)

    @staticmethod
    def _print_verdict(score, verdicts):
        """Print the verdict matching ``score``.

        Args:
            score: Numeric metric score.
            verdicts: Four messages ordered excellent, good, moderate, poor.
        """
        for threshold, message in zip(RAGASEvaluator._VERDICT_THRESHOLDS, verdicts):
            if score >= threshold:
                print(message)
                return
        # Below every threshold (this is also where a NaN score ends up).
        print(verdicts[-1])

    def _run_single_metric(self, dataset, metric, key, heading, description,
                           label, verdicts):
        """Evaluate one metric, print its score and verdict, return the score."""
        print(heading)
        print(description)

        result = evaluate(
            dataset,
            metrics=[metric],
            llm=self.llm,
            embeddings=self.embeddings
        )

        score = self._to_scalar(result[key])
        print(f"‚úÖ {label} Score: {score:.3f}")
        self._print_verdict(score, verdicts)
        return score

    def evaluate_faithfulness(self, dataset):
        """
        Faithfulness measures how factually accurate the answer is based on the given context.
        Score range: 0.0 to 1.0 (higher is better)

        How it works:
        1. LLM identifies claims in the generated answer
        2. LLM verifies each claim against the provided context
        3. Score = (verified claims) / (total claims)

        Returns:
            The faithfulness score as a float (per-row scores are averaged).
        """
        return self._run_single_metric(
            dataset,
            metric=faithfulness,
            key='faithfulness',
            heading="üîç Evaluating Faithfulness...",
            description="This measures if the answer is factually grounded in the provided context.",
            label="Faithfulness",
            verdicts=(
                "üåü Excellent: Answers are highly faithful to context",
                "üëç Good: Most answers are grounded in context",
                "‚ö†Ô∏è  Moderate: Some hallucination issues detected",
                "üö® Poor: Significant hallucination problems",
            ),
        )

    def evaluate_answer_relevancy(self, dataset):
        """
        Answer Relevancy measures how well the answer addresses the specific question.
        Score range: 0.0 to 1.0 (higher is better)

        How it works:
        1. LLM generates potential questions that the answer could address
        2. Computes similarity between original question and generated questions
        3. Higher similarity indicates better relevancy

        Returns:
            The answer relevancy score as a float (per-row scores are averaged).
        """
        return self._run_single_metric(
            dataset,
            metric=answer_relevancy,
            key='answer_relevancy',
            heading="\nüéØ Evaluating Answer Relevancy...",
            description="This measures how well answers address the specific questions asked.",
            label="Answer Relevancy",
            verdicts=(
                "üåü Excellent: Answers directly address questions",
                "üëç Good: Answers are mostly relevant",
                "‚ö†Ô∏è  Moderate: Some answers drift from the question",
                "üö® Poor: Answers frequently miss the point",
            ),
        )

    def evaluate_context_precision(self, dataset):
        """
        Context Precision measures the signal-to-noise ratio in retrieved contexts.
        Score range: 0.0 to 1.0 (higher is better)

        How it works:
        1. LLM determines which contexts are relevant to answering the question
        2. Evaluates if relevant contexts are ranked higher than irrelevant ones
        3. Higher precision means better retrieval ranking

        Returns:
            The context precision score as a float (per-row scores are averaged).
        """
        return self._run_single_metric(
            dataset,
            metric=context_precision,
            key='context_precision',
            heading="\nüìã Evaluating Context Precision...",
            description="This measures the quality of retrieved context ranking.",
            label="Context Precision",
            verdicts=(
                "üåü Excellent: Highly relevant contexts ranked first",
                "üëç Good: Mostly relevant contexts in top positions",
                "‚ö†Ô∏è  Moderate: Mixed relevant/irrelevant contexts",
                "üö® Poor: Retrieval ranking needs improvement",
            ),
        )

    def evaluate_context_recall(self, dataset):
        """
        Context Recall measures completeness of retrieved context.
        Score range: 0.0 to 1.0 (higher is better)

        How it works:
        1. LLM identifies information needed to answer the question (from ground truth)
        2. Checks if this information is present in retrieved contexts
        3. Score = (relevant info retrieved) / (total relevant info needed)

        Returns:
            The context recall score as a float (per-row scores are averaged).
        """
        return self._run_single_metric(
            dataset,
            metric=context_recall,
            key='context_recall',
            heading="\nüîÑ Evaluating Context Recall...",
            description="This measures how completely the retrieval captures relevant information.",
            label="Context Recall",
            verdicts=(
                "üåü Excellent: Retrieval captures nearly all relevant info",
                "üëç Good: Most relevant information is retrieved",
                "‚ö†Ô∏è  Moderate: Some relevant information is missing",
                "üö® Poor: Significant gaps in retrieved information",
            ),
        )

    def comprehensive_evaluation(self, dataset):
        """Run all RAGAS metrics and provide comprehensive analysis.

        Returns:
            A ``(scores, overall_score)`` tuple: ``scores`` maps each of the
            four metric names to a float, and ``overall_score`` is their
            weighted average (0.3 / 0.3 / 0.2 / 0.2).
        """
        print("üöÄ Running Comprehensive RAGAS Evaluation")
        print("=" * 60)

        # Run all metrics together (more efficient)
        result = evaluate(
            dataset,
            metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
            llm=self.llm,
            embeddings=self.embeddings
        )

        # Reduce every metric to one number (per-row lists are averaged)
        scores = {
            name: self._to_scalar(result[name]) for name in self._OVERALL_WEIGHTS
        }

        # Calculate overall RAG score (weighted average)
        overall_score = sum(
            scores[name] * weight for name, weight in self._OVERALL_WEIGHTS.items()
        )

        # Display results
        print("\nüìä RAGAS Evaluation Results:")
        print("-" * 40)
        for metric, score in scores.items():
            print(f"{metric:20}: {score:.3f}")
        print("-" * 40)
        print(f"{'Overall RAG Score':20}: {overall_score:.3f}")

        # Detailed analysis
        print("\nüîç Detailed Analysis:")
        self._analyze_scores(scores)

        return scores, overall_score

    def _analyze_scores(self, scores):
        """Provide detailed analysis of RAGAS scores.

        Prints metrics scoring at least 0.8 as strengths, metrics below 0.6 as
        areas for improvement, and one recommendation for each core metric
        below 0.7. ``scores`` must contain all four core metric names.
        """

        # Identify strengths and weaknesses
        strengths = [
            f"{metric} ({score:.3f})" for metric, score in scores.items() if score >= 0.8
        ]
        weaknesses = [
            f"{metric} ({score:.3f})" for metric, score in scores.items() if score < 0.6
        ]

        if strengths:
            print(f"‚úÖ Strengths: {', '.join(strengths)}")
        if weaknesses:
            print(f"‚ö†Ô∏è  Areas for improvement: {', '.join(weaknesses)}")

        # Specific recommendations
        print("\nüí° Recommendations:")

        if scores['faithfulness'] < 0.7:
            print("- Address hallucination: Improve prompt engineering or add fact-checking")

        if scores['answer_relevancy'] < 0.7:
            print("- Improve relevancy: Better query understanding or response filtering")

        if scores['context_precision'] < 0.7:
            print("- Enhance retrieval: Improve ranking algorithm or embedding quality")

        if scores['context_recall'] < 0.7:
            print("- Increase coverage: Expand knowledge base or improve search recall")

# Run RAGAS evaluation.
# Reuse the llm/embeddings configured in the setup cell so that changing the
# model there takes effect here, instead of silently building a second pair.
evaluator = RAGASEvaluator(llm=llm, embeddings=embeddings)
scores, overall_score = evaluator.comprehensive_evaluation(ragas_dataset)

# ================================================================
# 14.2.3 CUSTOM METRICS IMPLEMENTATION
# ================================================================

class CustomRAGASMetrics:
    """Implementation of custom domain-specific RAGAS metrics.

    Every metric is a callable taking one row (a mapping with ``question``,
    ``contexts`` and ``answer``) and returning a float in [0.0, 1.0] that is
    parsed from the judge LLM's ``Score: X.X`` reply.

    Args:
        llm: Judge model exposing ``predict(prompt) -> str``; a
            ``gpt-3.5-turbo`` chat model with temperature 0 is created when
            omitted.
    """

    # Neutral score returned when the judge's reply contains no parsable score.
    _DEFAULT_SCORE = 0.5

    def __init__(self, llm=None):
        self.llm = llm or ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

    def create_domain_specific_metric(self, domain="medical"):
        """
        Create a custom metric for domain-specific evaluation.

        Dedicated prompts exist for "medical", "legal" and "financial"; any
        other domain name gets a generic expert-review prompt.

        Returns:
            A callable ``metric(row) -> float``.
        """
        print(f"\nüè• Creating Custom {domain.title()} Domain Metric")

        if domain == "medical":
            return self._create_medical_accuracy_metric()
        elif domain == "legal":
            return self._create_legal_precision_metric()
        elif domain == "financial":
            return self._create_financial_compliance_metric()
        else:
            return self._create_generic_domain_metric(domain)

    @staticmethod
    def _parse_score(response, default=0.5):
        """Extract the number following ``Score:`` from a judge reply.

        Text after the number (an explanation, a "| ..." suffix, a new
        line) is ignored. The value is clamped to [0.0, 1.0] because that is
        the range the prompts ask for. ``default`` is returned when no score
        is found.
        """
        import re  # local import: re is not in the file's import block

        match = re.search(r"Score:\s*([-+]?\d*\.?\d+)", str(response), re.IGNORECASE)
        if match is None:
            return default
        return min(1.0, max(0.0, float(match.group(1))))

    def _build_llm_metric(self, name, prompt_template):
        """Return a row-scoring callable that asks the judge LLM for a score.

        ``prompt_template`` must contain the ``{question}``, ``{context}`` and
        ``{answer}`` placeholders.
        """

        def metric(row):
            prompt = prompt_template.format(
                question=row['question'],
                context='\n'.join(row['contexts']),
                answer=row['answer']
            )
            response = self.llm.predict(prompt)
            return self._parse_score(response, self._DEFAULT_SCORE)

        metric.__name__ = name
        return metric

    def _create_medical_accuracy_metric(self):
        """Custom metric for medical information accuracy."""

        medical_evaluation_prompt = """
        You are a medical expert evaluating the accuracy and safety of health information.

        Given:
        - Question: {question}
        - Context: {context}
        - Answer: {answer}

        Evaluate the answer on these medical criteria:
        1. Medical accuracy: Are the facts correct?
        2. Safety: Could this information harm someone if followed?
        3. Appropriate scope: Does it avoid diagnosing or prescribing?
        4. Disclaimers: Does it appropriately direct to healthcare professionals?

        Provide a score from 0.0 to 1.0 and brief explanation.
        Format: Score: X.X | Explanation: [brief explanation]
        """

        return self._build_llm_metric('medical_accuracy', medical_evaluation_prompt)

    def _create_legal_precision_metric(self):
        """Custom metric for legal information precision."""

        legal_evaluation_prompt = """
        You are a legal expert evaluating legal information quality.

        Given:
        - Question: {question}
        - Context: {context}
        - Answer: {answer}

        Evaluate on:
        1. Legal accuracy: Are legal concepts correctly explained?
        2. Jurisdiction awareness: Does it acknowledge legal variations?
        3. Disclaimers: Does it appropriately advise consulting lawyers?
        4. Clarity: Is complex legal language made accessible?

        Score from 0.0 to 1.0.
        Format: Score: X.X
        """

        return self._build_llm_metric('legal_precision', legal_evaluation_prompt)

    def _create_financial_compliance_metric(self):
        """Custom metric for financial information compliance."""

        financial_evaluation_prompt = """
        You are a financial compliance expert evaluating financial information quality.

        Given:
        - Question: {question}
        - Context: {context}
        - Answer: {answer}

        Evaluate on:
        1. Factual accuracy: Are figures and financial concepts correct?
        2. Risk disclosure: Are relevant risks acknowledged?
        3. Appropriate scope: Does it avoid personalized investment advice?
        4. Disclaimers: Does it appropriately advise consulting a licensed professional?

        Score from 0.0 to 1.0.
        Format: Score: X.X
        """

        return self._build_llm_metric('financial_compliance', financial_evaluation_prompt)

    def _create_generic_domain_metric(self, domain):
        """Fallback metric for a domain that has no dedicated prompt."""

        # Escape braces so an unusual domain name cannot break str.format().
        safe_domain = str(domain).replace("{", "{{").replace("}", "}}")

        generic_evaluation_prompt = """
        You are a __DOMAIN__ expert evaluating __DOMAIN__ information quality.

        Given:
        - Question: {question}
        - Context: {context}
        - Answer: {answer}

        Evaluate on:
        1. Accuracy: Are the domain facts and concepts correct?
        2. Grounding: Is the answer supported by the provided context?
        3. Appropriate scope: Does it avoid claims beyond what is known?
        4. Clarity: Is domain terminology explained accessibly?

        Score from 0.0 to 1.0.
        Format: Score: X.X
        """.replace("__DOMAIN__", safe_domain)

        return self._build_llm_metric(f"{domain}_quality", generic_evaluation_prompt)

    def evaluate_with_custom_metrics(self, dataset, custom_metrics):
        """Evaluate dataset using custom metrics.

        Args:
            dataset: Iterable of rows, each providing ``question``,
                ``contexts``, ``answer`` and ``ground_truth``.
            custom_metrics: Mapping of metric name to ``metric(row) -> float``.

        Returns:
            ``{metric_name: {'scores': [...], 'average': float}}``. The average
            of an empty dataset is NaN.
        """
        row_fields = ('question', 'contexts', 'answer', 'ground_truth')
        results = {}

        for metric_name, metric_func in custom_metrics.items():
            print(f"\nüîß Evaluating custom metric: {metric_name}")

            scores = []
            for record in dataset:  # each row is fetched once
                row = {field: record[field] for field in row_fields}
                scores.append(metric_func(row))

            # Explicit NaN avoids numpy's empty-mean warning.
            avg_score = float(np.mean(scores)) if scores else float('nan')
            results[metric_name] = {
                'scores': scores,
                'average': avg_score
            }

            print(f"‚úÖ {metric_name}: {avg_score:.3f}")

        return results

# Demonstrate custom metrics
print(f"\n{'=' * 60}")
print("üîß CUSTOM METRICS DEMONSTRATION")
print("=" * 60)

custom_evaluator = CustomRAGASMetrics()

# Build one scorer per domain of interest (medical first, then legal)
medical_metric, legal_metric = (
    custom_evaluator.create_domain_specific_metric(domain)
    for domain in ("medical", "legal")
)

# Example evaluation (commented out due to API costs)
# custom_metrics = {
#     'medical_accuracy': medical_metric,
#     'legal_precision': legal_metric
# }
# custom_results = custom_evaluator.evaluate_with_custom_metrics(ragas_dataset, custom_metrics)

print("üí° Custom metrics created! Uncomment evaluation code to run with your API key.")


**# 14.3.2 TRADITIONAL GENERATION QUALITY METRICS**

In [None]:
class TraditionalMetrics:
    """Implementation of traditional text generation quality metrics.

    Provides BLEU, ROUGE-1/2/L and embedding-based semantic similarity over
    parallel lists of predictions and references. Every public method raises
    ``ValueError`` when the two lists differ in length, because pairing them
    with ``zip`` would otherwise silently drop the unmatched tail.
    """

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

    @staticmethod
    def _paired(predictions, references):
        """Materialize both inputs as lists and check that they line up."""
        predictions = list(predictions)
        references = list(references)
        if len(predictions) != len(references):
            raise ValueError(
                "predictions and references must have the same length "
                f"(got {len(predictions)} and {len(references)})"
            )
        return predictions, references

    @staticmethod
    def _mean(values):
        """Average ``values``; NaN for an empty list (no numpy warning)."""
        return float(np.mean(values)) if len(values) else float("nan")

    def calculate_bleu_score(self, predictions, references):
        """Calculate BLEU scores for generated text.

        Returns:
            ``(scores, avg_bleu)`` with every score normalized to 0-1.
        """
        predictions, references = self._paired(predictions, references)

        print("\nüìä Calculating BLEU Scores...")
        print("BLEU measures n-gram overlap between prediction and reference")

        # effective_order=True is what sacrebleu recommends for sentence-level
        # BLEU: short sentences are not zeroed out by missing higher-order
        # n-grams. sacrebleu tokenizes the raw strings itself.
        bleu = BLEU(effective_order=True)
        scores = [
            bleu.sentence_score(pred, [ref]).score / 100.0  # Normalize to 0-1
            for pred, ref in zip(predictions, references)
        ]

        avg_bleu = self._mean(scores)
        print(f"‚úÖ Average BLEU Score: {avg_bleu:.3f}")

        # Interpretation
        if avg_bleu >= 0.4:
            print("üåü Excellent lexical similarity")
        elif avg_bleu >= 0.2:
            print("üëç Good lexical overlap")
        elif avg_bleu >= 0.1:
            print("‚ö†Ô∏è  Moderate similarity")
        else:
            print("üö® Low lexical similarity")

        return scores, avg_bleu

    def calculate_rouge_scores(self, predictions, references):
        """Calculate ROUGE scores for generated text.

        Returns:
            Dict mapping 'rouge1', 'rouge2' and 'rougeL' to
            ``(per_pair_f_measures, average)``.
        """
        predictions, references = self._paired(predictions, references)

        print("\nüìä Calculating ROUGE Scores...")
        print("ROUGE measures recall-oriented overlap with reference summaries")

        rouge1_scores = []
        rouge2_scores = []
        rougeL_scores = []

        for pred, ref in zip(predictions, references):
            # The scorer takes the reference (target) first, then the prediction.
            scores = self.rouge_scorer.score(ref, pred)
            rouge1_scores.append(scores['rouge1'].fmeasure)
            rouge2_scores.append(scores['rouge2'].fmeasure)
            rougeL_scores.append(scores['rougeL'].fmeasure)

        avg_rouge1 = self._mean(rouge1_scores)
        avg_rouge2 = self._mean(rouge2_scores)
        avg_rougeL = self._mean(rougeL_scores)

        print(f"‚úÖ ROUGE-1 (unigram): {avg_rouge1:.3f}")
        print(f"‚úÖ ROUGE-2 (bigram):  {avg_rouge2:.3f}")
        print(f"‚úÖ ROUGE-L (longest): {avg_rougeL:.3f}")

        return {
            'rouge1': (rouge1_scores, avg_rouge1),
            'rouge2': (rouge2_scores, avg_rouge2),
            'rougeL': (rougeL_scores, avg_rougeL)
        }

    def calculate_semantic_similarity(self, predictions, references):
        """Calculate semantic similarity using sentence embeddings.

        Returns:
            ``(similarities, avg_similarity)``: the cosine similarity of each
            prediction/reference pair and their average.
        """
        predictions, references = self._paired(predictions, references)

        print("\nüìä Calculating Semantic Similarity...")
        print("Measures semantic closeness using sentence embeddings")

        similarities = []
        if predictions:  # skip the embedding model entirely for empty input
            # Get embeddings
            pred_embeddings = self.sentence_model.encode(predictions)
            ref_embeddings = self.sentence_model.encode(references)

            # Calculate cosine similarities pair by pair
            similarities = [
                float(cosine_similarity([pred_emb], [ref_emb])[0][0])
                for pred_emb, ref_emb in zip(pred_embeddings, ref_embeddings)
            ]

        avg_similarity = self._mean(similarities)
        print(f"‚úÖ Average Semantic Similarity: {avg_similarity:.3f}")

        # Interpretation
        if avg_similarity >= 0.8:
            print("üåü Excellent semantic alignment")
        elif avg_similarity >= 0.6:
            print("üëç Good semantic similarity")
        elif avg_similarity >= 0.4:
            print("‚ö†Ô∏è  Moderate semantic overlap")
        else:
            print("üö® Low semantic similarity")

        return similarities, avg_similarity

    def comprehensive_traditional_evaluation(self, predictions, references):
        """Run all traditional metrics and compare with RAGAS.

        Returns:
            Dict with 'bleu' and 'semantic_similarity' as ``(scores, average)``
            tuples and 'rouge' as the dict from ``calculate_rouge_scores``.
        """
        # Validate once up front so nothing is printed for unusable input.
        predictions, references = self._paired(predictions, references)

        print("\nüîÑ COMPREHENSIVE TRADITIONAL METRICS EVALUATION")
        print("="*60)

        # Calculate all metrics
        bleu_scores, avg_bleu = self.calculate_bleu_score(predictions, references)
        rouge_results = self.calculate_rouge_scores(predictions, references)
        sem_scores, avg_sem = self.calculate_semantic_similarity(predictions, references)

        # Create comparison summary
        print("\nüìã Traditional Metrics Summary:")
        print("-" * 40)
        print(f"{'BLEU':20}: {avg_bleu:.3f}")
        print(f"{'ROUGE-1':20}: {rouge_results['rouge1'][1]:.3f}")
        print(f"{'ROUGE-2':20}: {rouge_results['rouge2'][1]:.3f}")
        print(f"{'ROUGE-L':20}: {rouge_results['rougeL'][1]:.3f}")
        print(f"{'Semantic Similarity':20}: {avg_sem:.3f}")

        # Analysis
        print("\nüîç Traditional vs RAGAS Analysis:")
        print("üìå Traditional metrics focus on surface-level similarity")
        print("üìå RAGAS metrics focus on semantic correctness and context usage")
        print("üìå Use both for comprehensive evaluation:")
        print("   - Traditional: Quick quality checks, benchmark comparisons")
        print("   - RAGAS: Deep quality assessment, hallucination detection")

        return {
            'bleu': (bleu_scores, avg_bleu),
            'rouge': rouge_results,
            'semantic_similarity': (sem_scores, avg_sem)
        }

# Run traditional metrics evaluation
print(f"\n{'=' * 60}")
print("üìä TRADITIONAL METRICS EVALUATION")
print("=" * 60)

traditional_evaluator = TraditionalMetrics()

# Split the samples into parallel lists: generated answers and reference answers
predictions = [sample["answer"] for sample in sample_data]
references = [sample["ground_truth"] for sample in sample_data]

traditional_results = traditional_evaluator.comprehensive_traditional_evaluation(
    predictions,
    references,
)


**# COMPARATIVE ANALYSIS**

In [None]:
def compare_evaluation_approaches():
    """Print a side-by-side table of RAGAS and traditional metrics, then key insights."""
    print("\nüî¨ COMPARATIVE ANALYSIS: RAGAS vs TRADITIONAL")
    print("=" * 60)

    # One (aspect, RAGAS, traditional) triple per table row
    table_rows = [
        ('Evaluation Focus', 'End-to-end RAG quality', 'Surface text similarity'),
        ('Context Awareness', 'Fully context-aware', 'Context-unaware'),
        ('Hallucination Detection', 'Excellent detection', 'Cannot detect'),
        ('Semantic Understanding', 'Deep semantic analysis', 'Limited to n-grams'),
        ('Reference Requirement', 'Optional (reference-free)', 'Required'),
        ('Computational Cost', 'Higher (LLM calls)', 'Lower (simple metrics)'),
        ('Human Alignment', 'High correlation', 'Moderate correlation'),
        ('RAG-Specific Design', 'Purpose-built for RAG', 'General-purpose'),
    ]

    comparison_df = pd.DataFrame(table_rows, columns=['Aspect', 'RAGAS', 'Traditional'])
    print(comparison_df.to_string(index=False))

    takeaways = (
        "RAGAS provides deeper, more meaningful evaluation for RAG systems",
        "Traditional metrics remain useful for quick checks and benchmarking",
        "Best practice: Use both approaches complementarily",
        "RAGAS better correlates with human judgment for RAG tasks",
        "Traditional metrics are faster but miss critical RAG-specific issues",
    )

    print("\nüí° Key Insights:")
    for position, takeaway in enumerate(takeaways, start=1):
        print(f"{position}. {takeaway}")

compare_evaluation_approaches()

**# PRACTICAL EVALUATION PIPELINE**

In [None]:
class ProductionEvaluationPipeline:
    """End-to-end evaluation pipeline for production RAG systems.

    Chains the RAGAS evaluator, the traditional metrics and the custom
    metrics, then prints a consolidated report.
    """

    def __init__(self):
        self.ragas_evaluator = RAGASEvaluator()
        self.traditional_evaluator = TraditionalMetrics()
        self.custom_evaluator = CustomRAGASMetrics()

    def evaluate_rag_system(self, dataset, include_traditional=True, custom_metrics=None):
        """Run the RAGAS stage plus the optional stages and return all results.

        Args:
            dataset: Indexable collection of rows with 'answer' and
                'ground_truth' fields (plus whatever the evaluators read).
            include_traditional: Whether to run the traditional metrics stage.
            custom_metrics: Optional mapping of metric name to scoring
                callable; the custom stage is skipped when it is empty or None.

        Returns:
            Dict with a 'ragas' entry and, when the stages ran, 'traditional'
            and 'custom' entries.
        """
        print("üöÄ PRODUCTION RAG EVALUATION PIPELINE")
        print("=" * 60)

        # Stage 1: RAGAS (always runs)
        print("\n1Ô∏è‚É£ Running RAGAS Evaluation...")
        component_scores, combined = self.ragas_evaluator.comprehensive_evaluation(dataset)
        results = {'ragas': {'scores': component_scores, 'overall': combined}}

        # Stage 2: traditional metrics (opt-out)
        if include_traditional:
            print("\n2Ô∏è‚É£ Running Traditional Metrics...")
            rows = [dataset[position] for position in range(len(dataset))]
            predictions = [row['answer'] for row in rows]
            references = [row['ground_truth'] for row in rows]
            results['traditional'] = (
                self.traditional_evaluator.comprehensive_traditional_evaluation(
                    predictions, references
                )
            )

        # Stage 3: custom metrics (only when some were supplied)
        if custom_metrics:
            print("\n3Ô∏è‚É£ Running Custom Metrics...")
            results['custom'] = self.custom_evaluator.evaluate_with_custom_metrics(
                dataset, custom_metrics
            )

        # Stage 4: printed report
        self._generate_evaluation_report(results)

        return results

    def _generate_evaluation_report(self, results):
        """Print a summary of every result section present, then recommendations."""

        print("\nüìã EVALUATION REPORT")
        print("=" * 50)

        # RAGAS section
        if 'ragas' in results:
            ragas_block = results['ragas']
            print(f"\nüéØ RAGAS Overall Score: {ragas_block['overall']:.3f}")
            print("   Component Scores:")
            for metric, score in ragas_block['scores'].items():
                print(f"   ‚Ä¢ {metric}: {score:.3f}")

        # Traditional section
        if 'traditional' in results:
            print("\nüìä Traditional Metrics:")
            trad = results['traditional']
            bleu_average = trad['bleu'][1]
            print(f"   ‚Ä¢ BLEU: {bleu_average:.3f}")
            rouge_l_average = trad['rouge']['rougeL'][1]
            print(f"   ‚Ä¢ ROUGE-L: {rouge_l_average:.3f}")
            semantic_average = trad['semantic_similarity'][1]
            print(f"   ‚Ä¢ Semantic Similarity: {semantic_average:.3f}")

        # Custom section
        if 'custom' in results:
            print("\nüîß Custom Metrics:")
            for metric_name, metric_data in results['custom'].items():
                print(f"   ‚Ä¢ {metric_name}: {metric_data['average']:.3f}")

        # Recommendations: one line per RAGAS metric below 0.7
        print("\nüí° Recommendations:")
        if 'ragas' not in results:
            return

        ragas_scores = results['ragas']['scores']
        advice_by_metric = (
            ('faithfulness', "   ‚Ä¢ Address hallucination issues"),
            ('context_precision', "   ‚Ä¢ Improve retrieval ranking"),
            ('context_recall', "   ‚Ä¢ Expand knowledge base coverage"),
            ('answer_relevancy', "   ‚Ä¢ Enhance query understanding"),
        )
        for metric, advice in advice_by_metric:
            if ragas_scores[metric] < 0.7:
                print(advice)

# Run complete evaluation pipeline
pipeline = ProductionEvaluationPipeline()

# Example: Run full evaluation (uncomment to execute with API key)
# full_results = pipeline.evaluate_rag_system(
#     ragas_dataset,
#     include_traditional=True,
#     custom_metrics=None  # Add custom metrics here if needed
# )

# Closing recap of what the notebook covered
print("\nüéâ Notebook Complete!")
print("You've learned how to:")
print("\n".join(
    f"‚úÖ {accomplishment}"
    for accomplishment in (
        "Implement all four core RAGAS metrics",
        "Create custom domain-specific evaluation metrics",
        "Apply traditional generation quality metrics",
        "Build comprehensive evaluation pipelines",
        "Compare different evaluation approaches",
    )
))


**# ADVANCED EVALUATION TECHNIQUES**

In [None]:
class AdvancedEvaluationTechniques:
    """Advanced techniques for sophisticated RAG evaluation.

    Every method currently *simulates* its scores with a seeded random
    generator so the demo runs without LLM calls; the prompt templates
    show what would be sent to the model in production.  Each method uses
    its own ``random.Random`` instance, so the process-wide RNG state is
    never modified.
    """

    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

    def aspect_based_evaluation(self, dataset, aspects=None):
        """
        Evaluate RAG responses across multiple specific aspects.
        Useful for nuanced quality assessment.

        Args:
            dataset: Indexable collection of rows; each evaluated row must
                provide 'question', 'contexts' (iterable of str) and
                'answer'.  Only the first three rows are evaluated.
            aspects: Names of the aspects to rate.  Defaults to a built-in
                list of seven aspects.

        Returns:
            Dict mapping each aspect name (plus 'overall') to its mean
            simulated score.  Values are NaN when no rows were evaluated.
        """
        import random

        if aspects is None:
            aspects = [
                "Accuracy", "Completeness", "Clarity", "Relevance",
                "Consistency", "Timeliness", "Neutrality"
            ]

        print("\nüé≠ Aspect-Based Evaluation")
        print(f"Evaluating across {len(aspects)} aspects: {', '.join(aspects)}")

        aspect_prompt = """
        Evaluate the following RAG response across these specific aspects:

        Question: {question}
        Context: {context}
        Answer: {answer}

        Rate each aspect from 1-5 (5=excellent, 1=poor):
        {aspect_list}

        Format your response as:
        Aspect1: X | Aspect2: Y | ... | Overall: Z
        Brief justification: [explain your ratings]
        """

        # The rating template is the same for every row, so build it once.
        aspect_list = " | ".join(f"{aspect}: [1-5]" for aspect in aspects)

        results = {aspect: [] for aspect in aspects}
        results['overall'] = []

        for i in range(min(3, len(dataset))):  # Limit for demo
            row = dataset[i]

            # Built to show (and validate the row fields for) the production
            # request; it is not sent anywhere while scores are simulated.
            prompt = aspect_prompt.format(
                question=row['question'],
                context='\n'.join(row['contexts']),
                answer=row['answer'],
                aspect_list=aspect_list
            )

            # In production, you would call the LLM here
            # response = self.llm.predict(prompt)

            # For demo, simulate scores.  A private generator seeded per row
            # keeps the output reproducible without reseeding the global RNG.
            rng = random.Random(42 + i)

            for aspect in aspects:
                results[aspect].append(rng.uniform(3.5, 4.8))  # Simulate good scores

            results['overall'].append(rng.uniform(3.8, 4.5))

        # Calculate averages; an empty score list yields NaN explicitly
        # instead of triggering NumPy's "mean of empty slice" warning.
        avg_results = {
            aspect: np.mean(scores) if scores else float('nan')
            for aspect, scores in results.items()
        }

        print("\nüìä Aspect-Based Results:")
        for aspect, avg_score in avg_results.items():
            print(f"   {aspect:12}: {avg_score:.2f}/5.0")

        return avg_results

    def confidence_calibrated_evaluation(self, dataset):
        """
        Evaluate both answer quality and model confidence.
        Helps identify when the model is uncertain.

        Args:
            dataset: Sized collection of responses; only its length is used
                and at most the first three entries are simulated.

        Returns:
            Dict with parallel lists under 'quality', 'confidence' and
            'factors' (one entry per simulated response).  The lists are
            empty when the dataset is empty.
        """
        import random

        print("\nüéØ Confidence-Calibrated Evaluation")
        print("Assessing both answer quality and model confidence")

        # Template for the production LLM call; unused while results are
        # simulated below.
        confidence_prompt = """
        Evaluate this RAG response and provide both quality and confidence assessments:

        Question: {question}
        Context: {context}
        Answer: {answer}

        Provide:
        1. Quality Score (0.0-1.0): How good is this answer?
        2. Confidence Score (0.0-1.0): How confident should we be in this assessment?
        3. Uncertainty Factors: What makes this evaluation uncertain?

        Format: Quality: X.X | Confidence: Y.Y | Factors: [list main uncertainty sources]
        """

        factor_options = [
            "Ambiguous context", "Multiple valid interpretations",
            "Limited context", "Complex technical topic", "Subjective question"
        ]

        results = {'quality': [], 'confidence': [], 'factors': []}

        # Simulate results for demo (in production, use LLM)
        for i in range(min(3, len(dataset))):
            rng = random.Random(42 + i)

            quality = rng.uniform(0.7, 0.95)
            confidence = rng.uniform(0.6, 0.9)

            # Simulate uncertainty factors
            factors = rng.sample(factor_options, rng.randint(1, 2))

            results['quality'].append(quality)
            results['confidence'].append(confidence)
            results['factors'].append(factors)

        print("\nüìä Confidence-Calibrated Results:")

        # Without samples the averages would be NaN and the gap check below
        # would wrongly report good calibration.
        if not results['quality']:
            print("   No responses to evaluate")
            return results

        avg_quality = np.mean(results['quality'])
        avg_confidence = np.mean(results['confidence'])
        calibration_gap = abs(avg_quality - avg_confidence)

        print(f"   Average Quality:    {avg_quality:.3f}")
        print(f"   Average Confidence: {avg_confidence:.3f}")
        print(f"   Quality-Confidence Gap: {calibration_gap:.3f}")

        if calibration_gap > 0.2:
            print("‚ö†Ô∏è  Large gap suggests calibration issues")
        else:
            print("‚úÖ Good calibration between quality and confidence")

        return results

    def error_categorization(self, dataset):
        """
        Categorize different types of errors in RAG responses.
        Helps identify systematic issues.

        Args:
            dataset: Sized collection of responses; only its length is used.
                An empty dataset is reported with a 0.00 error rate.

        Returns:
            Dict mapping each error category name to the number of
            simulated occurrences.
        """
        import random

        print("\nüîç Error Categorization Analysis")
        print("Identifying and categorizing response errors")

        error_categories = {
            'Factual Error': 'Answer contains incorrect factual information',
            'Hallucination': 'Answer includes information not in context',
            'Incomplete': 'Answer misses important information from context',
            'Irrelevant': 'Answer does not address the question',
            'Inconsistent': 'Answer contradicts itself or the context',
            'Unclear': 'Answer is confusing or poorly structured'
        }
        category_names = list(error_categories)

        # Simulate error analysis (in production, use LLM)
        error_counts = {category: 0 for category in error_categories}
        total_errors = 0
        num_responses = len(dataset)

        for i in range(num_responses):
            # Simulate finding 0-2 errors per response
            rng = random.Random(42 + i)

            num_errors = rng.choices([0, 1, 2], weights=[0.7, 0.25, 0.05])[0]

            # sample() with k == 0 returns an empty list, so no guard is needed.
            for error in rng.sample(category_names, num_errors):
                error_counts[error] += 1
            total_errors += num_errors

        # Guard the division so an empty dataset reports 0 instead of raising.
        error_rate = total_errors / num_responses if num_responses else 0.0

        print("\nüìä Error Analysis Results:")
        print(f"   Total Responses: {num_responses}")
        print(f"   Total Errors Found: {total_errors}")
        print(f"   Error Rate: {error_rate:.2f} errors per response")

        print("\nüè∑Ô∏è  Error Categories:")
        for category, count in error_counts.items():
            percentage = (count / total_errors * 100) if total_errors > 0 else 0
            print(f"   {category:15}: {count:2d} ({percentage:4.1f}%)")

        # Identify top issues
        if total_errors > 0:
            top_errors = sorted(error_counts.items(), key=lambda x: x[1], reverse=True)[:3]
            print("\nüéØ Top Issues to Address:")
            for rank, (category, count) in enumerate(top_errors, 1):
                if count > 0:
                    print(f"   {rank}. {category}: {error_categories[category]}")

        return error_counts

# Section banner for the advanced-techniques demo, printed in one call.
print(
    "\n" + "=" * 60,
    "üî¨ ADVANCED EVALUATION TECHNIQUES",
    "=" * 60,
    sep="\n",
)

advanced_evaluator = AdvancedEvaluationTechniques()

# Run the three techniques in order, keeping each result for later cells:
# aspect scores, then confidence calibration, then error categories.
aspect_results = advanced_evaluator.aspect_based_evaluation(ragas_dataset)
confidence_results = advanced_evaluator.confidence_calibrated_evaluation(ragas_dataset)
error_analysis = advanced_evaluator.error_categorization(ragas_dataset)


**# EVALUATION BEST PRACTICES AND TIPS**

In [None]:
def evaluation_best_practices():
    """Print a categorized guide to RAG evaluation best practices.

    Writes a title banner, five headed groups of four tips each, and a
    closing takeaway to standard output.  Returns None.
    """
    bullet = "   ‚Ä¢ "

    # (heading, tips) pairs, printed in this order.
    sections = [
        ("üéØ Metric Selection", (
            "Use RAGAS for comprehensive RAG-specific evaluation",
            "Include traditional metrics for benchmark comparison",
            "Add domain-specific metrics for specialized applications",
            "Balance automated and human evaluation",
        )),
        ("üìä Dataset Design", (
            "Include diverse question types and difficulty levels",
            "Ensure representative coverage of your domain",
            "Include edge cases and challenging scenarios",
            "Maintain balanced positive and negative examples",
        )),
        ("üîÑ Evaluation Frequency", (
            "Continuous evaluation in development cycles",
            "Regular production monitoring (daily/weekly)",
            "Deep evaluation before major releases",
            "A/B testing for system changes",
        )),
        ("‚öñÔ∏è Bias Mitigation", (
            "Use multiple evaluators (human and automated)",
            "Rotate evaluation datasets regularly",
            "Monitor for demographic and topical biases",
            "Validate automated metrics against human judgment",
        )),
        ("üöÄ Production Considerations", (
            "Set up automated alerting for quality drops",
            "Monitor user feedback and satisfaction",
            "Track performance across user segments",
            "Implement gradual rollouts with evaluation gates",
        )),
    ]

    print("\nüìö RAG EVALUATION BEST PRACTICES", "=" * 50, sep="\n")

    for heading, tips in sections:
        # Blank line, heading, then one bullet line per tip.
        print("\n" + heading)
        print("\n".join(bullet + tip for tip in tips))

    print(
        "\nüí° Key Takeaway:",
        "Effective RAG evaluation is ongoing, multi-faceted, and combines",
        "automated sophisticated metrics with human insight and domain expertise.",
        sep="\n",
    )


evaluation_best_practices()

**# EVALUATION CHECKLIST**

In [None]:
def create_evaluation_checklist():
    """Print a phased checklist for implementing RAG evaluation.

    Writes a title banner, five phases of five checkbox items each, and a
    closing line to standard output.  Returns None.
    """
    # (phase, items) pairs, printed in this order.
    phases = (
        ("Pre-Evaluation Setup", (
            "‚ñ° Define evaluation objectives and success criteria",
            "‚ñ° Prepare diverse, representative test dataset",
            "‚ñ° Set up RAGAS environment and API keys",
            "‚ñ° Establish baseline scores for comparison",
            "‚ñ° Define custom metrics for domain-specific needs",
        )),
        ("Core Evaluation", (
            "‚ñ° Run RAGAS faithfulness evaluation",
            "‚ñ° Assess answer relevancy scores",
            "‚ñ° Measure context precision and recall",
            "‚ñ° Calculate traditional metrics (BLEU, ROUGE)",
            "‚ñ° Perform semantic similarity analysis",
        )),
        ("Advanced Analysis", (
            "‚ñ° Conduct aspect-based evaluation",
            "‚ñ° Analyze confidence calibration",
            "‚ñ° Categorize and count error types",
            "‚ñ° Identify systematic failure patterns",
            "‚ñ° Validate with human evaluation sample",
        )),
        ("Production Readiness", (
            "‚ñ° Set up automated evaluation pipelines",
            "‚ñ° Configure monitoring and alerting",
            "‚ñ° Establish evaluation cadence and triggers",
            "‚ñ° Document evaluation procedures",
            "‚ñ° Train team on evaluation interpretation",
        )),
        ("Continuous Improvement", (
            "‚ñ° Regular metric review and updates",
            "‚ñ° User feedback integration",
            "‚ñ° Performance trend analysis",
            "‚ñ° Evaluation methodology refinement",
            "‚ñ° Benchmark against industry standards",
        )),
    )

    print("\n‚úÖ RAG EVALUATION CHECKLIST", "=" * 40, sep="\n")

    for phase, items in phases:
        # Blank line, "<phase>:" heading, then the indented items.
        print("\n" + phase + ":")
        print("\n".join("   " + item for item in items))

    print("\nüéØ Use this checklist to ensure comprehensive RAG evaluation!")


create_evaluation_checklist()

**# SUMMARY AND NEXT STEPS**

In [None]:
# Closing banner for the notebook.
print(
    "\n" + "=" * 60,
    "üéâ NOTEBOOK COMPLETE - RAGAS IMPLEMENTATION AND CORE METRICS",
    "=" * 60,
    sep="\n",
)

# Recap of what the notebook covered.
print(
    "\nüìã What You've Accomplished:",
    "‚úÖ Mastered RAGAS framework and core philosophy",
    "‚úÖ Implemented all four essential RAGAS metrics",
    "‚úÖ Created custom domain-specific evaluation metrics",
    "‚úÖ Applied traditional generation quality metrics",
    "‚úÖ Built comprehensive evaluation pipelines",
    "‚úÖ Learned advanced evaluation techniques",
    "‚úÖ Established evaluation best practices",
    sep="\n",
)

# Suggested follow-up actions.
print(
    "\nüöÄ Next Steps:",
    "1. Set up your own RAGAS evaluation with your RAG system",
    "2. Experiment with custom metrics for your domain",
    "3. Establish baseline scores for your application",
    "4. Move on to next notebook for production evaluation pipelines",
    "5. Integrate evaluation into your development workflow",
    sep="\n",
)

# Main points to remember.
print(
    "\nüí° Key Takeaways:",
    "‚Ä¢ RAGAS provides sophisticated, automated RAG-specific evaluation",
    "‚Ä¢ Combine multiple evaluation approaches for comprehensive assessment",
    "‚Ä¢ Custom metrics enable domain-specific quality measurement",
    "‚Ä¢ Evaluation should be continuous, not just one-time",
    "‚Ä¢ Good evaluation enables iterative improvement and production confidence",
    sep="\n",
)

print("\nüìñ Continue to the next notebook for production evaluation and monitoring!")

# ================================================================
# UTILITY FUNCTIONS FOR EASY REFERENCE
# ================================================================

def quick_ragas_evaluation(questions, contexts, answers, ground_truths=None):
    """
    Convenience wrapper: build a dataset from parallel columns and run
    RAGASEvaluator.comprehensive_evaluation on it.

    Args:
        questions: Values for the 'question' column.
        contexts: Values for the 'contexts' column.
        answers: Values for the 'answer' column.
        ground_truths: Optional values for the 'ground_truth' column; the
            column is added only when this argument is truthy.

    Returns:
        The two values produced by comprehensive_evaluation, as a
        (scores, overall) tuple.
    """
    from datasets import Dataset

    # Column name -> values; the reference column is optional.
    columns = dict(question=questions, contexts=contexts, answer=answers)
    if ground_truths:
        columns['ground_truth'] = ground_truths

    # Build the dataset and evaluate it with a fresh evaluator.
    metric_scores, overall_score = RAGASEvaluator().comprehensive_evaluation(
        Dataset.from_dict(columns)
    )
    return metric_scores, overall_score

def save_evaluation_results(results, filename="rag_evaluation_results.json"):
    """Save evaluation results to a JSON file for later analysis.

    Objects that the ``json`` module cannot encode natively but that expose
    a ``tolist()`` method (NumPy scalars and arrays, for example) are
    converted through that method.

    Args:
        results: JSON-compatible structure holding the evaluation results.
        filename: Destination path; written as UTF-8 with 2-space indent.

    Raises:
        TypeError: If ``results`` contains a value that cannot be
            serialized.  The destination file is not created or modified
            in that case.
    """
    import json

    def _to_builtin(value):
        # NumPy scalars/arrays expose tolist(), which yields plain Python
        # numbers and (nested) lists that json can encode.
        to_list = getattr(value, "tolist", None)
        if callable(to_list):
            return to_list()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    # Serialize before opening the file so a failure cannot leave a
    # truncated or half-written file behind.
    payload = json.dumps(results, indent=2, default=_to_builtin)

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(payload)
    print(f"‚úÖ Results saved to {filename}")

# List the helper functions defined above, printed in one call.
print(
    "\nüìö Utility Functions Available:",
    "‚Ä¢ quick_ragas_evaluation() - Fast RAGAS evaluation",
    "‚Ä¢ save_evaluation_results() - Save results to file",
    "‚Ä¢ All class methods can be used independently",
    sep="\n",
)

print("\nüîß Ready to evaluate your RAG systems with confidence!")