**Setup and Installation**

In [None]:
!pip install langchain langchain-openai tiktoken numpy scipy matplotlib pandas

import os
import re
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple, Callable

# Keep any key that is already exported in the environment. The placeholder is
# only a fallback and must be replaced with a real key before calling the API.
os.environ.setdefault("OPENAI_API_KEY", "your-api-key")

import tiktoken
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document
from langchain_openai import OpenAI, ChatOpenAI

**Basic Utility Functions**

In [2]:
def count_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
    """Count the number of tokens in a text string.

    Args:
        text: The text to tokenize.
        model: Model name used to pick the tiktoken encoding.

    Returns:
        The number of tokens ``text`` encodes to.
    """
    try:
        encoder = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it cannot map to an
        # encoding; use a general-purpose encoding rather than crashing.
        encoder = tiktoken.get_encoding("cl100k_base")
    return len(encoder.encode(text))

def print_separator():
    """Print a rule of 50 '=' characters with a blank line above and below."""
    rule = "=" * 50
    print(f"\n{rule}\n")

# Small in-memory corpus used as the retrieval context for every test question.
# Each entry is (text, source name, page number).
sample_docs = [
    Document(page_content=text, metadata={"source": source, "page": page})
    for text, source, page in (
        ("The Eiffel Tower is 330 meters (1,083 ft) tall and was the tallest man-made structure in the world from 1889 to 1930.",
         "travel_guide", 25),
        ("The Eiffel Tower was built by Gustave Eiffel for the 1889 World's Fair in Paris.",
         "history_book", 42),
        ("The Eiffel Tower is made of wrought iron and weighs approximately 10,100 tonnes.",
         "engineering_text", 89),
        ("Tokyo Tower, completed in 1958, was inspired by the Eiffel Tower but is slightly taller at 333 meters.",
         "global_landmarks", 118),
    )
]

# Evaluation set: one dict per question. Every item carries the same context
# (all sample documents, labelled "[Document 1]" .. "[Document N]"), the
# reference answer, and the 0-based index of the document holding that answer.
eval_set = [
    {
        "question": question,
        "context": "\n".join(
            f"[Document {number}] {doc.page_content}"
            for number, doc in enumerate(sample_docs, start=1)
        ),
        "ground_truth": answer,
        "expected_source": source_index,
    }
    for question, answer, source_index in (
        ("How tall is the Eiffel Tower?",
         "The Eiffel Tower is 330 meters (1,083 ft) tall.",
         0),
        ("Who built the Eiffel Tower?",
         "The Eiffel Tower was built by Gustave Eiffel.",
         1),
        ("What material is the Eiffel Tower made of?",
         "The Eiffel Tower is made of wrought iron.",
         2),
        ("When did the Eiffel Tower lose its status as the world's tallest structure?",
         "The Eiffel Tower lost its status as the world's tallest structure in 1930.",
         0),
        ("How does the Eiffel Tower compare to Tokyo Tower in height?",
         "The Eiffel Tower (330 meters) is slightly shorter than Tokyo Tower (333 meters).",
         3),
    )
]


**Section 1: Evaluation Metrics for Prompt Template Performance**

In [None]:
# Announce the start of Section 1 in the cell output.
print("Section 1: Evaluation Metrics for Prompt Template Performance")

def extract_facts(text: str) -> List[str]:
    """Split text into sentence-like statements and keep the declarative ones.

    The text is split on whitespace that follows '.', '!' or '?'. A piece is
    kept when, after trimming, it is longer than 10 characters and does not
    end with a question mark. This is a deliberately simple heuristic.
    """
    kept = []
    for piece in re.split(r'(?<=[.!?])\s+', text):
        candidate = piece.strip()
        if len(candidate) <= 10:
            continue  # too short to be a meaningful statement
        if candidate.endswith('?'):
            continue  # questions are not facts
        kept.append(candidate)
    return kept

def evaluate_factual_accuracy(responses: List[str], ground_truths: List[str],
                              min_term_overlap: float = 0.8) -> float:
    """Evaluate factual accuracy of responses against ground truth (simplified).

    Each ground-truth string is split into facts with ``extract_facts``. A fact
    counts as covered when at least ``min_term_overlap`` of its key terms occur
    in the response, in any order. Key terms are lower-cased words with
    surrounding punctuation removed that are longer than 3 characters or
    contain a digit (so values such as "330" are kept).

    Args:
        responses: Model answers, aligned with ``ground_truths``.
        ground_truths: Reference answers.
        min_term_overlap: Fraction of a fact's key terms that must appear in
            the response for the fact to count as covered.

    Returns:
        Mean, over the response/ground-truth pairs, of the fraction of covered
        facts; 0.0 when either input list is empty.
    """
    if not responses or not ground_truths:
        return 0.0

    strip_chars = ".,;:!?\"'()[]{}"

    def key_terms(text: str) -> set:
        terms = set()
        for raw in text.lower().split():
            token = raw.strip(strip_chars)
            if len(token) > 3 or any(ch.isdigit() for ch in token):
                terms.add(token)
        return terms

    scores = []
    for response, truth in zip(responses, ground_truths):
        response_terms = key_terms(response)
        key_facts = extract_facts(truth)
        matches = 0

        for fact in key_facts:
            fact_terms = key_terms(fact)
            if not fact_terms:
                # Nothing checkable in this fact; do not award credit for it.
                continue
            # Order-independent comparison: a contiguous-substring match would
            # fail as soon as the response words a fact slightly differently.
            overlap = len(fact_terms & response_terms) / len(fact_terms)
            if overlap >= min_term_overlap:
                matches += 1

        accuracy = matches / len(key_facts) if key_facts else 0.0
        scores.append(accuracy)

    return sum(scores) / len(scores)

def evaluate_citation_quality(responses: List[str], expected_sources: List[int]) -> Dict[str, float]:
    """Measure how often responses cite sources, and cite the expected one.

    A citation is a bracketed label of the form ``[Document N]``,
    ``[Source N]`` or ``[Doc N]``. ``expected_sources`` holds 0-based document
    indices; they are compared against the 1-based numbers used in citations.

    Returns:
        ``citation_rate``: share of responses containing at least one citation.
        ``citation_accuracy``: share of responses citing the expected document.
        Both are divided by ``len(responses)``; both are 0.0 for empty input.
    """
    if not (responses and expected_sources):
        return {"citation_rate": 0.0, "citation_accuracy": 0.0}

    citation_pattern = re.compile(r'\[(?:Document|Source|Doc) (\d+)\]')
    with_any_citation = 0
    with_expected_citation = 0

    for answer, source_index in zip(responses, expected_sources):
        cited_numbers = citation_pattern.findall(answer)
        if cited_numbers:
            with_any_citation += 1
        # Citations are 1-based while expected_sources is 0-based.
        if str(source_index + 1) in cited_numbers:
            with_expected_citation += 1

    total = len(responses)
    return {
        "citation_rate": with_any_citation / total,
        "citation_accuracy": with_expected_citation / total
    }

def estimate_hallucination_rate(responses: List[str], contexts: List[str]) -> float:
    """Estimate the rate of hallucinated content (simplified approach).

    Each response is split into statements with ``extract_facts``. A statement
    is flagged as potentially hallucinated when fewer than half of its key
    terms occur in the matching context. Key terms are lower-cased words with
    surrounding punctuation removed, longer than 4 characters, and not in a
    small list of filler words.

    Args:
        responses: Model answers, aligned with ``contexts``.
        contexts: The context string each answer was generated from.

    Returns:
        Mean, over responses, of the fraction of flagged statements; 0.0 when
        either input list is empty. A response with no statements scores 0.0.
    """
    if not responses or not contexts:
        return 0.0

    filler_words = {"about", "these", "there", "their", "which", "would"}
    strip_chars = ".,;:!?\"'()[]{}"
    hallucination_scores = []

    for response, context in zip(responses, contexts):
        statements = extract_facts(response)

        if not statements:
            hallucination_scores.append(0.0)
            continue

        context_lower = context.lower()
        hallucinated_count = 0

        for statement in statements:
            # Strip punctuation before matching: a raw token such as "tall."
            # or "1930," would otherwise be reported as absent from a context
            # that contains the word itself.
            cleaned = (word.strip(strip_chars) for word in statement.lower().split())
            key_terms = [word for word in cleaned
                         if len(word) > 4 and word not in filler_words]

            # If key terms are found in the context, it's less likely to be a hallucination
            term_found_count = sum(1 for term in key_terms if term in context_lower)
            if key_terms and term_found_count / len(key_terms) < 0.5:
                hallucinated_count += 1

        hallucination_scores.append(hallucinated_count / len(statements))

    return sum(hallucination_scores) / len(hallucination_scores)

def assess_template(template: str, eval_set: List[Dict], llm=None) -> Dict[str, Any]:
    """
    Assess a template against an evaluation set.

    If LLM is not provided, will only calculate token usage.

    Args:
        template: Prompt template containing ``{context}`` and ``{question}``.
        eval_set: Items with "context", "question", "ground_truth" and
            "expected_source" keys.
        llm: Optional model object exposing ``invoke(prompt)``. The result may
            be a plain string or an object with a ``content`` attribute.

    Returns:
        Dict with "token_stats" (avg/max/min prompt tokens), "responses" and
        "metrics". The last two are None when no LLM was used. For an empty
        ``eval_set`` the token stats are all zero.
    """
    token_counts = []
    responses = []

    for item in eval_set:
        # Format prompt
        formatted_prompt = template.format(context=item["context"], question=item["question"])
        token_counts.append(count_tokens(formatted_prompt))

        # Generate response if LLM is provided
        if llm:
            try:
                result = llm.invoke(formatted_prompt)
                # Chat-style models return a message object carrying the text in
                # `.content`; completion-style models return the text itself.
                response = result if isinstance(result, str) else result.content
                responses.append(response)
            except Exception as e:
                # Best effort: record an empty answer so responses stay aligned
                # with eval_set and the remaining items are still evaluated.
                print(f"Error generating response: {e}")
                responses.append("")

    # Basic stats about token usage (guarded so an empty eval_set does not
    # divide by zero or call max()/min() on an empty list)
    if token_counts:
        token_stats = {
            "avg_tokens": sum(token_counts) / len(token_counts),
            "max_tokens": max(token_counts),
            "min_tokens": min(token_counts)
        }
    else:
        token_stats = {"avg_tokens": 0.0, "max_tokens": 0, "min_tokens": 0}

    # Return basic results if no LLM is provided
    if not llm or not responses:
        return {
            "token_stats": token_stats,
            "responses": None,
            "metrics": None
        }

    # Calculate evaluation metrics
    ground_truths = [item["ground_truth"] for item in eval_set]
    expected_sources = [item["expected_source"] for item in eval_set]
    contexts = [item["context"] for item in eval_set]

    factual_accuracy = evaluate_factual_accuracy(responses, ground_truths)
    citation_metrics = evaluate_citation_quality(responses, expected_sources)
    hallucination_rate = estimate_hallucination_rate(responses, contexts)

    metrics = {
        "factual_accuracy": factual_accuracy,
        "citation_rate": citation_metrics["citation_rate"],
        "citation_accuracy": citation_metrics["citation_accuracy"],
        "hallucination_rate": hallucination_rate,
        # Composite score: weighted sum in which a lower hallucination rate
        # contributes a higher score
        "composite_score": (
            factual_accuracy * 0.4 +
            citation_metrics["citation_accuracy"] * 0.3 +
            (1 - hallucination_rate) * 0.3
        )
    }

    return {
        "token_stats": token_stats,
        "responses": responses,
        "metrics": metrics
    }

# Define templates to evaluate.
# Each template is filled in with str.format and therefore contains the
# {context} and {question} placeholders. The leading whitespace inside the
# triple-quoted strings is part of the prompt text that gets tokenized.
templates = {
    "basic": """
    Answer the question based on the context.

    CONTEXT:
    {context}

    QUESTION:
    {question}

    ANSWER:
    """,

    "anti_hallucination": """
    Answer the question based ONLY on the context below.
    If the context doesn't contain the answer, say "I don't have enough information."

    CONTEXT:
    {context}

    QUESTION:
    {question}

    ANSWER:
    """,

    "citation": """
    Answer the question based on the context below.
    Cite your sources using [Document X] notation.

    CONTEXT:
    {context}

    QUESTION:
    {question}

    ANSWER:
    """,

    "comprehensive": """
    You are an assistant for question-answering tasks.
    Answer based EXCLUSIVELY on the provided context.

    CONTEXT:
    {context}

    QUESTION:
    {question}

    INSTRUCTIONS:
    1. If the answer is in the context, provide it clearly and concisely
    2. Cite specific documents using [Document X] notation
    3. If the answer isn't in the context, say "I don't have enough information"
    4. Only use information present in the context

    ANSWER:
    """
}

# Evaluate templates for token usage only: no llm argument is passed to
# assess_template, so each result has "responses" and "metrics" set to None.
print("Evaluating template token usage:")
# Maps template name -> result dict returned by assess_template
template_results = {}

for name, template in templates.items():
    result = assess_template(template, eval_set)
    template_results[name] = result
    print(f"{name}: Avg tokens = {result['token_stats']['avg_tokens']:.1f}, Max tokens = {result['token_stats']['max_tokens']}")

# Evaluate with an LLM only when a real API key is configured. The
# "your-api-key" placeholder is treated as "not set"; otherwise this branch
# would always run and send requests with a dummy key.
if os.environ.get("OPENAI_API_KEY", "") not in ("", "your-api-key"):
    try:
        print("\nEvaluating templates with LLM:")
        llm = ChatOpenAI(temperature=0)

        for name, template in templates.items():
            print(f"Evaluating {name} template...")
            result = assess_template(template, eval_set[:2], llm)  # Use just 2 examples to save API calls
            # Replaces the token-only entry stored for this template name
            template_results[name] = result

            if result["metrics"]:
                print(f"  Factual accuracy: {result['metrics']['factual_accuracy']:.2f}")
                print(f"  Citation rate: {result['metrics']['citation_rate']:.2f}")
                print(f"  Hallucination rate: {result['metrics']['hallucination_rate']:.2f}")
                print(f"  Composite score: {result['metrics']['composite_score']:.2f}")
    except Exception as e:
        # Top-level notebook boundary: report the failure and carry on
        print(f"Error evaluating with LLM: {e}")
else:
    print("\nOpenAI API key not set - skipping LLM evaluation")

print_separator()


**Section 2: A/B Testing Different Template Structures**

In [None]:
# Announce the start of Section 2 in the cell output.
print("Section 2: A/B Testing Different Template Structures")

def run_ab_test(template_a: str, template_b: str, eval_set: List[Dict], llm=None):
    """Assess two templates on the same evaluation set and return both results.

    Template A is assessed first, then template B. The returned dict has the
    keys "template_a" and "template_b"; each maps to the template text plus the
    "token_stats", "metrics" and "responses" produced by ``assess_template``.
    """
    def assess_side(template: str) -> Dict:
        outcome = assess_template(template, eval_set, llm)
        side = {"template": template}
        for field in ("token_stats", "metrics", "responses"):
            side[field] = outcome[field]
        return side

    side_a = assess_side(template_a)
    side_b = assess_side(template_b)
    return {"template_a": side_a, "template_b": side_b}

def present_ab_test_results(results: Dict):
    """Print a side-by-side comparison of an A/B test result.

    Token usage is always printed. The per-metric comparison is printed only
    when both sides carry metrics. "Better" names the side with the higher
    value, except for hallucination_rate where the lower value wins.
    """
    print("Token Usage Comparison:")
    for label, key in (("A", "template_a"), ("B", "template_b")):
        usage = results[key]['token_stats']
        print(f"Template {label}: Avg = {usage['avg_tokens']:.1f}, Max = {usage['max_tokens']}")

    metrics_a = results['template_a']['metrics']
    if not (metrics_a and results['template_b']['metrics']):
        return
    metrics_b = results['template_b']['metrics']

    print("\nPerformance Metrics Comparison:")
    lower_is_better = ('hallucination_rate',)

    for metric in ('factual_accuracy', 'citation_rate', 'citation_accuracy',
                   'hallucination_rate', 'composite_score'):
        value_a = metrics_a[metric]
        value_b = metrics_b[metric]
        diff = value_b - value_a

        if diff > 0:
            better = "A" if metric in lower_is_better else "B"
        elif diff < 0:
            better = "B" if metric in lower_is_better else "A"
        else:
            better = "Neither"

        print(f"{metric}: A = {value_a:.2f}, B = {value_b:.2f}, Diff = {diff:.2f}, Better: {better}")

def check_statistical_significance(metrics_a: List[float], metrics_b: List[float],
                                   confidence_level=0.95, paired=False):
    """Check if the difference between metrics is statistically significant.

    Args:
        metrics_a: Scores for variant A.
        metrics_b: Scores for variant B.
        confidence_level: The result is significant when the p-value is below
            ``1 - confidence_level``.
        paired: When True, run a paired t-test (``scipy.stats.ttest_rel``),
            which fits scores measured on the same questions for both
            variants. The default keeps the independent two-sample t-test
            (``scipy.stats.ttest_ind``).

    Returns:
        Dict with "t_statistic" and "p_value" as Python floats and
        "significant" as a Python bool. If the test cannot be run, the error
        is printed and ``{"significant": False}`` is returned.
    """
    try:
        import scipy.stats as stats

        if paired:
            t_stat, p_value = stats.ttest_rel(metrics_a, metrics_b)
        else:
            t_stat, p_value = stats.ttest_ind(metrics_a, metrics_b)

        # SciPy returns numpy scalars; convert them so that `is True` checks
        # and json.dumps behave as callers expect. A NaN p-value compares
        # False, so it is reported as not significant.
        t_stat = float(t_stat)
        p_value = float(p_value)

        return {
            "t_statistic": t_stat,
            "p_value": p_value,
            "significant": bool(p_value < (1 - confidence_level))
        }
    except Exception as e:
        print(f"Error in statistical significance testing: {e}")
        return {"significant": False}

# Define template variants for testing.
# Both are str.format templates with {context} and {question} placeholders.
control_template = """
Answer the question based on the context.

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

# Variant with just one change - adding citation instruction
# (the instruction sits between the question and the ANSWER: marker)
variant_template = """
Answer the question based on the context.

CONTEXT:
{context}

QUESTION:
{question}

Cite your sources using [Document X] notation.

ANSWER:
"""

# Run A/B test.
# No llm is passed, so only token usage is compared and both "metrics" entries
# are None.
print("Running A/B test between two template variants:")
ab_results = run_ab_test(control_template, variant_template, eval_set)
present_ab_test_results(ab_results)

# Controlled Variable Testing (explanatory output only; nothing is executed)
print("\nControlled Variable Testing:")
print("We can isolate the impact of specific template elements by testing variants")
print("that differ in only one specific aspect.")
print("\nExample variant pairs:")
print("1. Basic vs. Citation instruction")
print("2. No constraint vs. Anti-hallucination constraint")
print("3. Single task vs. Step-by-step instructions")

# Demonstrate significance testing (with mock data)
print("\nStatistical Significance Testing Example:")
# Create mock per-question scores for demonstration (hard-coded, not measured)
mock_scores_a = [0.7, 0.8, 0.75, 0.65, 0.72]
mock_scores_b = [0.82, 0.88, 0.85, 0.8, 0.84]

sig_result = check_statistical_significance(mock_scores_a, mock_scores_b)
# .get() is used because the error path returns only the "significant" key
print(f"t-statistic: {sig_result.get('t_statistic', 'N/A')}")
print(f"p-value: {sig_result.get('p_value', 'N/A')}")
print(f"Statistically significant: {sig_result.get('significant', False)}")

print_separator()

**Section 3: Iterative Refinement Based on Output Quality**

In [None]:
# Announce the start of Section 3 in the cell output.
print("Section 3: Iterative Refinement Based on Output Quality")

def analyze_errors(responses: List[str], contexts: List[str], questions: List[str]) -> Dict[str, int]:
    """Analyze common error patterns in responses.

    All checks are simple heuristics applied per (response, context, question)
    triple:

    * hallucination: some statement has key terms (longer than 5 characters)
      and none of them occur in the context. Counted at most once per response.
    * missing_information: the response declines to answer although at least
      half of the question's key terms (longer than 3 characters) occur in the
      context.
    * incorrect_citation: the response cites a document number that has no
      matching "Document N" label in the context.
    * irrelevant_content: none of the question's key terms occur in the
      response.
    * contradiction: reserved; this function never increments it.

    Terms are lower-cased and stripped of surrounding punctuation before
    matching.

    Returns:
        Dict mapping each error type to the number of responses flagged.
    """
    error_types = {
        "hallucination": 0,
        "missing_information": 0,
        "incorrect_citation": 0,
        "irrelevant_content": 0,
        "contradiction": 0
    }

    strip_chars = ".,;:!?\"'()[]{}"
    refusal_marker = "i don't have enough information"

    def terms_longer_than(text: str, min_length: int) -> List[str]:
        cleaned = (word.strip(strip_chars) for word in text.lower().split())
        return [word for word in cleaned if len(word) > min_length]

    for response, context, question in zip(responses, contexts, questions):
        context_lower = context.lower()
        response_lower = response.lower()

        # Check for potential hallucinations (text not in context)
        for statement in extract_facts(response):
            key_terms = terms_longer_than(statement, 5)
            if key_terms and all(term not in context_lower for term in key_terms):
                error_types["hallucination"] += 1
                break

        question_terms = terms_longer_than(question, 3)

        # Check for missing information. Matching on key terms, because the
        # full question text practically never appears verbatim in a context.
        if refusal_marker in response_lower and question_terms:
            covered = sum(1 for term in question_terms if term in context_lower)
            if covered / len(question_terms) >= 0.5:
                error_types["missing_information"] += 1

        # Check for incorrect citations. The word boundary keeps "Document 1"
        # from being satisfied by "Document 10".
        citations = re.findall(r'\[(?:Document|Source|Doc) (\d+)\]', response)
        if any(re.search(rf"\bDocument {number}\b", context) is None for number in citations):
            error_types["incorrect_citation"] += 1

        # Very simplistic relevance check
        if not any(term in response_lower for term in question_terms):
            error_types["irrelevant_content"] += 1

    return error_types

class TemplateVersion:
    """One version of a prompt template with its change log and metrics.

    Attributes are public and set in ``__init__``: ``template``, ``name``,
    ``description``, ``changes`` (list of change notes), ``metrics`` (metric
    name -> value) and ``timestamp`` (creation time from ``datetime.now()``).
    """

    def __init__(self, template, name, description, changes=None):
        self.template = template
        self.name = name
        self.description = description
        # Copy so that later edits to the caller's list do not alter this version
        self.changes = list(changes) if changes else []
        self.metrics = {}
        self.timestamp = datetime.now()

    def __repr__(self):
        return (f"{type(self).__name__}(name={self.name!r}, "
                f"description={self.description!r}, metrics={self.metrics!r})")

    def add_metric(self, name, value):
        """Record (or overwrite) the metric ``name`` with ``value``."""
        self.metrics[name] = value

    def summary(self):
        """Return a dict snapshot of this version.

        ``changes`` and ``metrics`` are copies, so mutating the returned dict
        does not affect the version. ``timestamp`` is an ISO-8601 string.
        """
        return {
            "name": self.name,
            "description": self.description,
            "changes": list(self.changes),
            "metrics": dict(self.metrics),
            "timestamp": self.timestamp.isoformat()
        }

# Create template versions for demonstration.
# The metric values below are hard-coded illustrative numbers, not results
# computed by the evaluation functions in this notebook.
template_history = []

v1 = TemplateVersion(
    template=templates["basic"],
    name="v1_basic",
    description="Basic QA template"
)
v1.add_metric("factual_accuracy", 0.72)
v1.add_metric("hallucination_rate", 0.18)
v1.add_metric("composite_score", 0.65)
template_history.append(v1)

v2 = TemplateVersion(
    template=templates["anti_hallucination"],
    name="v2_anti_hallucination",
    description="Added anti-hallucination instruction",
    changes=["Added explicit instruction not to use external knowledge"]
)
v2.add_metric("factual_accuracy", 0.70)
v2.add_metric("hallucination_rate", 0.08)
v2.add_metric("composite_score", 0.71)
template_history.append(v2)

v3 = TemplateVersion(
    template=templates["citation"],
    name="v3_citation",
    description="Added citation requirement",
    changes=["Added instruction to cite sources using Document notation"]
)
v3.add_metric("factual_accuracy", 0.75)
v3.add_metric("hallucination_rate", 0.12)
v3.add_metric("citation_rate", 0.85)
v3.add_metric("composite_score", 0.76)
template_history.append(v3)

v4 = TemplateVersion(
    template=templates["comprehensive"],
    name="v4_comprehensive",
    description="Comprehensive template with multiple improvements",
    changes=[
        "Added system role context",
        "Added structured instructions",
        "Combined anti-hallucination and citation requirements",
        "Added explicit steps for answering"
    ]
)
v4.add_metric("factual_accuracy", 0.82)
v4.add_metric("hallucination_rate", 0.05)
v4.add_metric("citation_rate", 0.92)
v4.add_metric("composite_score", 0.85)
template_history.append(v4)

# Print the template improvement history, one version per paragraph
print("Template Improvement History:")
for i, version in enumerate(template_history):
    print(f"{version.name}: {version.description}")
    print(f"  Changes: {', '.join(version.changes) if version.changes else 'None'}")
    print(f"  Metrics: {', '.join([f'{k}={v:.2f}' for k, v in version.metrics.items()])}")
    print()

# Plot improvement over versions.
# The whole plotting section is wrapped in try/except so that a plotting
# failure prints a message instead of stopping the notebook.
try:
    plt.figure(figsize=(10, 6))

    # Extract version names and metrics (a missing metric is plotted as 0)
    names = [v.name for v in template_history]
    factual_scores = [v.metrics.get('factual_accuracy', 0) for v in template_history]
    hallucination_rates = [v.metrics.get('hallucination_rate', 0) for v in template_history]
    composite_scores = [v.metrics.get('composite_score', 0) for v in template_history]

    # Create plot
    plt.plot(names, factual_scores, 'o-', label='Factual Accuracy')
    # Plot 1 - rate so that, like the other series, higher means better
    plt.plot(names, [1-rate for rate in hallucination_rates], 'o-', label='Anti-Hallucination')
    plt.plot(names, composite_scores, 'o-', label='Composite Score')

    plt.title('Template Performance Improvement Over Versions')
    plt.xlabel('Template Version')
    plt.ylabel('Score (higher is better)')
    plt.ylim(0, 1)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()

    plt.tight_layout()
    plt.show()
except Exception as e:
    print(f"Error creating plot: {e}")
    print("To see the visualization, run this notebook in an environment that supports matplotlib.")

print_separator()

**Section 4: Automated Prompt Optimization Techniques**

In [None]:
# Announce the start of Section 4 in the cell output.
print("Section 4: Automated Prompt Optimization Techniques")

def generate_template_variants(base_template, components):
    """Build one TemplateVersion per non-empty combination of components.

    ``components`` maps a component name to the text appended to
    ``base_template``. Combinations are produced in order of increasing size,
    and within a combination the components keep the dict's iteration order.
    Each variant's name, description and change list are derived from the
    names of the components it contains.
    """
    import itertools

    variants = []
    for size in range(1, len(components) + 1):
        for selection in itertools.combinations(components.items(), size):
            selected_names = [name for name, _ in selection]
            appended_text = "".join(text for _, text in selection)
            summary = "Base template with: " + "".join(name + ", " for name in selected_names)

            variants.append(
                TemplateVersion(
                    template=base_template + appended_text,
                    name=f"variant_{'_'.join(selected_names)}",
                    description=summary.rstrip(", "),
                    changes=[f"Added {name}" for name in selected_names],
                )
            )

    return variants

# Define base template and optional components.
# The base ends after the question; each optional component is appended to it.
base_template = """
Answer the question based on the context.

CONTEXT:
{context}

QUESTION:
{question}

"""

# Component name -> instruction text appended to the base template
optional_components = {
    "anti_hallucination": "Only use information from the context. If the answer isn't in the context, say so.\n\n",
    "citation": "Cite your sources using [Document X] notation.\n\n",
    "structured_response": "Structure your answer with: (1) Direct answer (2) Supporting details (3) Source information.\n\n",
    "confidence": "Indicate your confidence in the answer (High/Medium/Low) based on how clearly it's stated in the context.\n\n"
}

# Generate template variants (one per non-empty combination of components)
print("Programmatic Template Generation:")
variants = generate_template_variants(base_template, optional_components)
print(f"Generated {len(variants)} template variants from {len(optional_components)} optional components")

for i, variant in enumerate(variants[:3]):  # Show first 3 as examples
    print(f"\nVariant {i+1}: {variant.name}")
    print(f"Description: {variant.description}")
    print(f"Changes: {', '.join(variant.changes)}")
    print("Template preview:")
    print(variant.template)

def _rescale_to_total(allocation, target_total):
    """Scale an allocation so its integer values sum to exactly target_total."""
    current_total = sum(allocation.values())
    if not current_total:
        return dict(allocation)

    exact = {name: amount * target_total / current_total
             for name, amount in allocation.items()}
    scaled = {name: int(share) for name, share in exact.items()}

    # Truncation loses up to one token per component; hand the lost tokens back
    # to the components with the largest truncated fraction.
    shortfall = int(round(target_total - sum(scaled.values())))
    by_lost_fraction = sorted(exact, key=lambda name: exact[name] - scaled[name], reverse=True)
    for name in by_lost_fraction[:max(shortfall, 0)]:
        scaled[name] += 1
    return scaled


def optimize_token_allocation(base_allocation, eval_function, steps=5, learning_rate=0.05,
                              step_tokens=50):
    """Optimize token allocation between different prompt components.

    Greedy hill climbing: in every step each component in turn receives
    ``step_tokens`` extra tokens, the allocation is rescaled back to the
    current total, and the best-scoring candidate of the step (if it beats the
    best score so far) becomes the starting point of the next step. The search
    stops early when a step brings no improvement. Progress is printed.

    Args:
        base_allocation: Dict mapping component name to token count. It is not
            modified.
        eval_function: Callable taking an allocation dict and returning a
            score; higher is better.
        steps: Maximum number of optimization steps.
        learning_rate: Accepted for backward compatibility; not used by the
            search.
        step_tokens: Number of tokens added to a component when testing it.

    Returns:
        Tuple ``(best_allocation, best_score, improvements)`` where
        ``improvements`` is a list of ``(step, component, "increase", score)``
        entries, one per improvement found.
    """
    print("\nToken Allocation Optimization:")
    print(f"Starting allocation: {base_allocation}")

    current_allocation = base_allocation.copy()
    best_allocation = base_allocation.copy()
    best_score = eval_function(best_allocation)
    print(f"Initial score: {best_score:.4f}")

    improvements = []

    for step in range(steps):
        print(f"\nStep {step+1}:")
        improved = False
        target_total = sum(current_allocation.values())

        # Try adjusting each component
        for component in current_allocation:
            # Try increasing this component's allocation
            test_allocation = current_allocation.copy()
            test_allocation[component] += step_tokens

            # Normalize to maintain total token count
            test_allocation = _rescale_to_total(test_allocation, target_total)

            # Evaluate
            score = eval_function(test_allocation)
            print(f"  Testing more tokens for {component}: score = {score:.4f}")

            if score > best_score:
                best_allocation = test_allocation
                best_score = score
                improved = True
                improvements.append((step, component, "increase", best_score))

        if not improved:
            print("  No improvement found in this step")
            break

        # Update current allocation
        current_allocation = best_allocation.copy()
        print(f"  New best allocation: {current_allocation}")
        print(f"  New best score: {best_score:.4f}")

    return best_allocation, best_score, improvements

# Mock scoring function used to drive the token allocation search
def evaluate_token_allocation(allocation):
    """
    Score a token allocation with a made-up formula (no model is involved).

    Each component contributes its weight multiplied by how close it is to a
    saturation point, capped at 1.0:
    context saturates at 1000 tokens (weight 0.6), instructions at 300 (0.3),
    question at 100 (0.1) and system at 200 (0.05).
    The context share is halved below 500 context tokens, and the instruction
    share is multiplied by 0.8 above 500 instruction tokens.
    """
    def capped_share(component, saturation, weight):
        return min(1.0, allocation[component] / saturation) * weight

    context_score = capped_share("context", 1000, 0.6)
    instruction_score = capped_share("instructions", 300, 0.3)
    question_score = capped_share("question", 100, 0.1)
    system_score = capped_share("system", 200, 0.05)

    # Too little context is penalized
    if allocation["context"] < 500:
        context_score *= 0.5

    # Overly long instructions are penalized
    if allocation["instructions"] > 500:
        instruction_score *= 0.8

    return context_score + instruction_score + question_score + system_score

# Initial token allocation (component name -> token count)
token_allocation = {
    "system": 200,
    "context": 800,
    "instructions": 200,
    "question": 100
}

# Run optimization with the default number of steps, scored by the mock
# evaluate_token_allocation function
optimized_allocation, optimized_score, improvement_history = optimize_token_allocation(
    token_allocation, evaluate_token_allocation
)

print("\nOptimization Results:")
print(f"Initial allocation: {token_allocation}")
print(f"Optimized allocation: {optimized_allocation}")
# The initial score is recomputed here to report the gain over the start
print(f"Improvement: {optimized_score - evaluate_token_allocation(token_allocation):.4f}")

# Evolution-based approach (conceptual overview; these lines only print text)
print("\nEvolution-Based Optimization (conceptual overview):")
print("1. Create an initial population of templates with different instructions/structures")
print("2. Evaluate each template's performance on a test set")
print("3. Select the best-performing templates to 'reproduce'")
print("4. Create 'offspring' by combining and mutating successful templates")
print("5. Repeat for multiple generations")
print("6. The highest-performing templates emerge through evolutionary pressure")

def create_initial_population(size=10):
    """Return ``size`` templates, each the base prompt plus random extras.

    For every template four optional instruction lines are considered in a
    fixed order. A line is included when a fresh ``random.random()`` draw
    exceeds that line's threshold. Every template ends with "\\nANSWER:".
    """
    base = "Answer the question based on the context.\n\nCONTEXT:\n{context}\n\nQUESTION:\n{question}\n\n"
    # (threshold the random draw must exceed, line to append)
    optional_lines = (
        (0.5, "Use ONLY information from the context.\n"),
        (0.5, "Cite sources using [Document X] notation.\n"),
        (0.7, "Structure your answer in a clear, concise manner.\n"),
        (0.8, "If information is missing from the context, say so clearly.\n"),
    )

    population = []
    for _ in range(size):
        parts = [base]
        for threshold, line in optional_lines:
            if random.random() > threshold:
                parts.append(line)
        parts.append("\nANSWER:")
        population.append("".join(parts))

    return population

def mutate_template(template, mutation_rate=0.3):
    """Randomly insert instruction lines into a template.

    Each candidate line gets its own ``random.random()`` draw and is applied
    when the draw is below ``mutation_rate`` and the line is not already in
    the template. The line goes right before the first "\\nANSWER:" marker,
    preceded by a newline, or at the very end when there is no marker.
    """
    marker = "\nANSWER:"
    # (label, instruction text) pairs; only the text is inserted
    candidates = (
        ("Add anti-hallucination", "Only use information from the context. Do not use external knowledge.\n"),
        ("Add citation", "Cite your sources using [Document X] notation.\n"),
        ("Add structure", "Structure your answer with: (1) Direct answer (2) Supporting details.\n"),
        ("Add confidence", "Indicate your confidence level in the answer.\n"),
    )

    for _, snippet in candidates:
        draw = random.random()  # always drawn, even if the snippet is already present
        if not (draw < mutation_rate):
            continue
        if snippet in template:
            continue

        head, found, tail = template.partition(marker)
        if found:
            template = head + "\n" + snippet + marker + tail
        else:
            template += snippet

    return template

def evolve_templates(population, fitness_function, generations=3):
    """Evolve a population of templates using a genetic algorithm approach.

    Per generation: score every template, record the best one in the history,
    carry the top quarter over unchanged (elitism), and fill the rest with
    children. A child is the first half of one tournament winner joined to the
    remainder of another (split at half the first parent's length), then passed
    through ``mutate_template``. Progress is printed.

    Args:
        population: List of template strings. The list itself is not modified.
        fitness_function: Callable mapping a template to a score; higher is
            better.
        generations: Number of generations to run.

    Returns:
        Tuple ``(best_template, history)``: the best template of the final
        population according to a fresh fitness evaluation, and a list of
        ``(generation, best_template, best_fitness)`` entries.
    """
    print(f"\nEvolving templates across {generations} generations:")

    history = []

    def tournament_winner(candidates, fitness_by_index):
        # Sample two distinct templates and keep the fitter one; on a tie the
        # second sample wins.
        first, second = random.sample(range(len(candidates)), 2)
        if fitness_by_index[first] > fitness_by_index[second]:
            return candidates[first]
        return candidates[second]

    for generation in range(generations):
        print(f"\nGeneration {generation+1}:")

        # Evaluate fitness; fitness_by_index[i] belongs to population[i]
        fitness_by_index = []
        for i, template in enumerate(population):
            fitness = fitness_function(template)
            fitness_by_index.append(fitness)
            print(f"  Template {i+1}: fitness = {fitness:.4f}")

        # Rank population indices by fitness. The fitness list stays in
        # population order so the tournament looks up the fitness of the
        # templates it actually sampled, not of whatever sits at that rank.
        ranked_indices = sorted(range(len(population)),
                                key=lambda idx: fitness_by_index[idx], reverse=True)

        # Keep track of best template in this generation
        best_idx = ranked_indices[0]
        history.append((generation, population[best_idx], fitness_by_index[best_idx]))

        # Elitism: keep the best templates
        elite_count = len(population) // 4
        next_gen = [population[idx] for idx in ranked_indices[:elite_count]]

        # Create offspring until we fill the population
        while len(next_gen) < len(population):
            parent1 = tournament_winner(population, fitness_by_index)
            parent2 = tournament_winner(population, fitness_by_index)

            # Crossover (simplified for text templates): first half of parent1
            # followed by parent2 from the same character offset onwards
            split_point = len(parent1) // 2
            child = parent1[:split_point] + parent2[split_point:]

            # Mutation
            child = mutate_template(child)

            next_gen.append(child)

        # Update population for next generation
        population = next_gen

    # Return best template from the final generation
    final_fitness_scores = [(i, fitness_function(template)) for i, template in enumerate(population)]
    final_fitness_scores.sort(key=lambda x: x[1], reverse=True)
    best_idx = final_fitness_scores[0][0]

    return population[best_idx], history

# Mock fitness function for template evolution
def mock_fitness_function(template):
    """
    Score a template heuristically, standing in for a real evaluation run.

    Starts from a base score of 0.5, adds a fixed bonus for each desirable
    marker found in the template text, perturbs the total with uniform noise
    in [-0.05, 0.05], and clamps the result to the range [0, 1].

    A real system would instead run the template against a test set and
    compute actual performance metrics.
    """
    # These two markers are matched against the raw text (case-sensitive).
    restricts_to_context = "ONLY" in template or "only" in template
    cites_documents = "[Document" in template

    # The remaining markers are matched case-insensitively.
    lowered = template.lower()
    mentions_structure = "structure" in lowered
    mentions_confidence = "confidence" in lowered
    handles_missing_info = "missing" in lowered and "information" in lowered

    # (marker present?, bonus) pairs; bonuses are accumulated in this order.
    feature_bonuses = (
        (restricts_to_context, 0.1),   # Anti-hallucination is good
        (cites_documents, 0.15),       # Citation is good
        (mentions_structure, 0.05),    # Structure guidance is good
        (mentions_confidence, 0.05),   # Confidence indication is good
        (handles_missing_info, 0.1),   # Handling missing information is good
    )

    score = 0.5  # Base score
    for present, bonus in feature_bonuses:
        if present:
            score += bonus

    # Random jitter simulates run-to-run variation of a real evaluation.
    score += random.uniform(-0.05, 0.05)

    # Clamp to [0, 1].
    return max(0, min(1, score))

# Run a small evolutionary search: 6 starting templates, 3 generations,
# scored by the mock fitness function.
initial_population = create_initial_population(size=6)
best_template, evolution_history = evolve_templates(
    initial_population, mock_fitness_function, generations=3
)

# Report the winner. The score printed here is a fresh call to the fitness
# function, so it carries its own random noise.
print("\nBest Template After Evolution:", best_template, sep="\n")
print(f"Final fitness score: {mock_fitness_function(best_template):.4f}")

# Each history entry is (generation index, best template, best fitness).
print("\nEvolution History:")
for generation, template, fitness in evolution_history:
    print(
        f"Generation {generation+1}: fitness = {fitness:.4f}",
        f"Template preview: {template[:100]}...",
        sep="\n",
    )

print_separator()

**Section 5: Case Study - Optimizing a RAG Template**

In [None]:
print("Section 5: Case Study - Optimizing a RAG Template")

# Baseline prompt: context and question only, with no grounding or
# citation instructions. Later iterations build on this.
initial_rag_template = """
Answer the question based on the context.

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

# Problem statement and optimization goal for the case study.
print(
    "Problem: Users report that our RAG system sometimes provides incorrect information",
    "or fails to cite sources, making it difficult to verify answers.",
    "\nGoal: Optimize the template to improve accuracy, reduce hallucinations,",
    "and increase source attribution.",
    sep="\n",
)

# Iteration 1: restrict the model to the supplied context and give it an
# explicit fallback reply for questions the context cannot answer.
iter1_template = """
Answer the question based ONLY on the information in the context.
If the context doesn't contain the answer, say "I don't have enough information."

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

# Reported (mock) outcome of this iteration.
print(
    "\nIteration 1: Added anti-hallucination instruction",
    "Results: Hallucination rate decreased from 18% to 9%,",
    "but factual accuracy also decreased from 75% to 72%",
    sep="\n",
)

# Iteration 2: same grounding rules as iteration 1, plus a requirement to
# cite sources in [Document X] form.
iter2_template = """
Answer the question based ONLY on the information in the context.
If the context doesn't contain the answer, say "I don't have enough information."
Cite your sources using [Document X] notation.

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

# Reported (mock) outcome of this iteration.
print(
    "\nIteration 2: Added citation requirement",
    "Results: Citation rate increased from 15% to 78%,",
    "factual accuracy increased to 74%,",
    "hallucination rate remained at 9%",
    sep="\n",
)

# Iteration 3: open with a role statement and move the grounding, citation
# and fallback rules into a numbered INSTRUCTIONS list after the question.
iter3_template = """
You are an assistant for question-answering tasks.
Answer based EXCLUSIVELY on the provided context.

CONTEXT:
{context}

QUESTION:
{question}

INSTRUCTIONS:
1. If the answer is in the context, provide it clearly and concisely
2. Cite specific documents using [Document X] notation
3. If the answer isn't in the context, say "I don't have enough information"
4. Only use information present in the context

ANSWER:
"""

# Reported (mock) outcome of this iteration.
print(
    "\nIteration 3: Added structured instructions and system context",
    "Results: Factual accuracy increased to 83%,",
    "hallucination rate decreased to 4%,",
    "citation rate increased to 92%",
    sep="\n",
)

# Iteration 4: the iteration-3 prompt plus a section describing how the
# response itself should be laid out.
final_template = """
You are an assistant for question-answering tasks.
Answer based EXCLUSIVELY on the provided context.

CONTEXT:
{context}

QUESTION:
{question}

INSTRUCTIONS:
1. If the answer is in the context, provide it clearly and concisely
2. Cite specific documents using [Document X] notation
3. If the answer isn't in the context, say "I don't have enough information"
4. Only use information present in the context

FORMAT YOUR RESPONSE AS FOLLOWS:
- Start with a direct answer to the question
- Provide supporting details from the context
- Include relevant citations for each fact
- If different documents contain conflicting information, acknowledge this

ANSWER:
"""

# Reported (mock) outcome of this iteration.
print(
    "\nIteration 4: Added response format guidance",
    "Results: Factual accuracy increased to 85%,",
    "hallucination rate decreased to 3%,",
    "citation rate remained at 92%,",
    "response coherence improved by 23%",
    sep="\n",
)

# Summary of improvements
print("\nSummary of template optimization process:")
# Template versions in chronological order (not referenced again in this cell).
templates = [initial_rag_template, iter1_template, iter2_template, iter3_template, final_template]
names = ["Initial", "Anti-hallucination", "Citation", "Structured", "Final"]

# Mock metrics across iterations: one value per template version, in the
# same order as `names`.
metrics = {
    "factual_accuracy": [0.75, 0.72, 0.74, 0.83, 0.85],
    "hallucination_rate": [0.18, 0.09, 0.09, 0.04, 0.03],
    "citation_rate": [0.15, 0.15, 0.78, 0.92, 0.92],
    "coherence": [0.65, 0.67, 0.70, 0.75, 0.80]
}

# Tabular view. This is handled separately from plotting so that a plotting
# failure can no longer cause the metrics to be printed a second time via the
# text fallback, and so the error message identifies the stage that failed.
try:
    metrics_df = pd.DataFrame(metrics, index=names)
except Exception as e:
    print(f"Error creating DataFrame: {e}")

    # Fallback text display
    print("\nMetrics across iterations:")
    for metric in metrics:
        print(f"{metric}: {', '.join([str(round(v, 2)) for v in metrics[metric]])}")
else:
    print("\nMetrics across iterations:")
    print(metrics_df)

# Line chart of every metric. It reads `names` and `metrics` directly, so it
# is attempted even when the DataFrame could not be built.
try:
    plt.figure(figsize=(12, 8))

    for metric in metrics:
        if metric == "hallucination_rate":
            # For hallucination, lower is better so we'll plot 1 - rate
            plt.plot(names, [1-v for v in metrics[metric]], 'o-', label=f"Anti-{metric}")
        else:
            plt.plot(names, metrics[metric], 'o-', label=metric)

    plt.title('RAG Template Optimization Progress')
    plt.xlabel('Template Version')
    plt.ylabel('Score (higher is better)')
    plt.ylim(0, 1)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()

    plt.tight_layout()
    plt.show()
except Exception as e:
    # Best-effort visualization: report the problem and carry on.
    print(f"Error creating plot: {e}")

# Takeaways from the four optimization iterations above.
print(
    "\nKey learnings from the optimization process:",
    "1. Anti-hallucination instructions are effective but can reduce coverage",
    "2. Citation requirements dramatically improve verification but need clear formatting",
    "3. Structured instructions improve both accuracy and citation quality",
    "4. Response format guidance improves coherence without sacrificing accuracy",
    "5. Each improvement should be evaluated independently before combining",
    sep="\n",
)

print_separator()

**Section 6: Best Practices for Template Testing and Optimization**

In [None]:
print("Section 6: Best Practices for Template Testing and Optimization")

# Each practice is emitted as one print call: a heading followed by its
# bullet points, one per output line.
print(
    "1. Start with a Diverse Test Set",
    "   - Include multiple question types (factual, comparative, explanatory)",
    "   - Cover edge cases (ambiguous questions, missing information)",
    "   - Include examples with contradictory or partial information",
    "   - Test with varying context lengths and complexities",
    sep="\n",
)

print(
    "\n2. Use Clear Evaluation Metrics",
    "   - Define objective metrics that align with application goals",
    "   - Balance between factual accuracy, citation quality, and coherence",
    "   - Consider human evaluation for subjective aspects",
    "   - Track multiple metrics to avoid optimization tradeoffs",
    sep="\n",
)

print(
    "\n3. Isolate Changes for Proper Attribution",
    "   - Test one change at a time to understand its impact",
    "   - Create controlled A/B tests with sufficient sample size",
    "   - Use statistical significance testing for reliable conclusions",
    "   - Document all changes and their measured effects",
    sep="\n",
)

print(
    "\n4. Follow an Iterative Cycle",
    "   - Start with a simple baseline template",
    "   - Analyze error patterns in detail before making changes",
    "   - Implement targeted improvements based on error analysis",
    "   - Measure impact of each change before proceeding",
    "   - Be willing to revert changes that don't improve performance",
    sep="\n",
)

print(
    "\n5. Consider Computational and Token Efficiency",
    "   - Monitor token usage as template complexity increases",
    "   - Balance instruction detail with token efficiency",
    "   - Optimize allocation of tokens between components",
    "   - Consider the impact on inference time and cost",
    sep="\n",
)

print(
    "\n6. Test in Real-World Conditions",
    "   - Move beyond synthetic test sets to real user queries",
    "   - Consider different document types and quality levels",
    "   - Test with various retrieval qualities (perfect vs. noisy)",
    "   - Evaluate performance degradation with larger context windows",
    sep="\n",
)

print(
    "\n7. Use Template Versioning",
    "   - Maintain a history of template versions and their performance",
    "   - Document the rationale behind each change",
    "   - Implement a systematic version naming convention",
    "   - Enable rollback to previous versions if needed",
    sep="\n",
)

print_separator()

print("Notebook completed!")

# Key Takeaways:
# 1. Systematic template evaluation requires clear metrics tailored to application goals
# 2. A/B testing with controlled variables helps identify effective template components
# 3. Iterative refinement based on error analysis leads to continuous improvement
# 4. Automated optimization techniques can efficiently explore the template design space
# 5. Balancing multiple performance metrics is essential for optimal template design