**Setup and Installation**

In [None]:
!pip install langchain langchain-openai tiktoken pandas matplotlib seaborn

import os
import re
import json
import time
from datetime import datetime
from typing import List, Dict, Any, Optional, Union, Tuple

# Keep a key that is already exported in the environment; only fall back to
# the placeholder (which must be replaced before any API call) when none is set.
os.environ.setdefault("OPENAI_API_KEY", "your-api-key")

import tiktoken
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document
from langchain_openai import OpenAI, ChatOpenAI

**Basic Utility Functions**

In [2]:
def count_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
    """Count the number of tokens in a text string.

    Args:
        text: The string to tokenize.
        model: Model name used to select the tiktoken encoding.

    Returns:
        The number of tokens produced by encoding ``text``.
    """
    try:
        encoder = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it cannot map to an
        # encoding; fall back to a general-purpose encoding instead of failing.
        encoder = tiktoken.get_encoding("cl100k_base")
    return len(encoder.encode(text))

def print_separator():
    """Emit a 50-character rule of '=' with a blank line above and below."""
    rule = "=" * 50
    print(f"\n{rule}\n")

# Create sample documents for testing.
# Four short passages about renewable energy; each carries a "source" name and
# a "page" number in its metadata. The eval_set entries below refer to these
# documents by list index through their "expected_source" field.
sample_docs = [
    Document(page_content="Renewable energy sources like solar and wind power have seen significant cost decreases over the past decade. Solar photovoltaic costs dropped by 85% between 2010 and 2020, while onshore wind costs fell by 56% during the same period.",
             metadata={"source": "energy_report_2021", "page": 42}),
    Document(page_content="Energy storage remains a challenge for renewable energy integration. Lithium-ion battery costs have decreased by 89% from 2010 to 2020, but grid-scale storage deployment still lags behind renewable energy installation. Pumped hydro storage accounts for over 90% of global energy storage capacity.",
             metadata={"source": "grid_storage_analysis", "page": 128}),
    Document(page_content="The global transition to renewable energy requires significant infrastructure investment. Estimates suggest $4.4 trillion in annual investment is needed by 2030 to achieve net-zero emissions by 2050. This includes grid modernization, energy storage, and renewable generation capacity.",
             metadata={"source": "climate_finance_report", "page": 75}),
    Document(page_content="Community solar projects provide renewable energy access to those unable to install their own systems. These shared solar facilities are now available in 39 states and serve over 700,000 households. The average subscriber saves approximately 10% on their electricity costs.",
             metadata={"source": "community_energy_initiative", "page": 12}),
]

# Create evaluation set with questions and ground truth answers.
# Every item is asked against the same context: all sample documents joined by
# newlines, each prefixed with a 1-based "[Document N]" tag. Build that string
# once instead of repeating the expression in every entry.
eval_context = "\n".join(f"[Document {i+1}] {doc.page_content}" for i, doc in enumerate(sample_docs))

eval_set = [
    {
        "question": "How much have solar photovoltaic costs decreased between 2010 and 2020?",
        "context": eval_context,
        "ground_truth": "Solar photovoltaic costs dropped by 85% between 2010 and 2020.",
        "key_elements": ["85%", "between 2010 and 2020", "solar photovoltaic"],
        "expected_source": 0  # Index of the document containing the answer
    },
    {
        "question": "What percentage of global energy storage capacity comes from pumped hydro storage?",
        "context": eval_context,
        "ground_truth": "Pumped hydro storage accounts for over 90% of global energy storage capacity.",
        "key_elements": ["over 90%", "pumped hydro", "global energy storage capacity"],
        "expected_source": 1
    },
    {
        "question": "How much annual investment is needed by 2030 to achieve net-zero emissions by 2050?",
        "context": eval_context,
        "ground_truth": "$4.4 trillion in annual investment is needed by 2030 to achieve net-zero emissions by 2050.",
        "key_elements": ["$4.4 trillion", "annual investment", "by 2030", "net-zero emissions by 2050"],
        "expected_source": 2
    },
    {
        "question": "What is the approximate percentage that community solar subscribers save on electricity costs?",
        "context": eval_context,
        "ground_truth": "The average subscriber saves approximately 10% on their electricity costs.",
        "key_elements": ["10%", "average subscriber", "electricity costs"],
        "expected_source": 3
    }
]


**Section 1: Template Evaluation Framework**

In [None]:
print("Section 1: Template Evaluation Framework")

class TemplateEvaluator:
    """Framework for evaluating prompt templates against best practices."""

    def __init__(self):
        # Define evaluation criteria
        self.criteria = {
            "clarity": self._check_clarity,
            "structure": self._check_structure,
            "constraints": self._check_constraints,
            "citation": self._check_citation,
            "token_efficiency": self._check_token_efficiency,
            "task_guidance": self._check_task_guidance,
            "error_handling": self._check_error_handling,
            "identity": self._check_identity
        }

    def evaluate_template(self, template: str) -> Dict[str, Any]:
        """Score a template with every registered criterion.

        Args:
            template: Prompt template text to evaluate.

        Returns:
            A dict with ``criteria_scores`` (criterion -> score and feedback),
            ``overall_score`` (percentage of the maximum attainable points)
            and ``token_count`` for the template.
        """
        per_criterion = {}
        for criterion, check_function in self.criteria.items():
            score, feedback = check_function(template)
            per_criterion[criterion] = {"score": score, "feedback": feedback}

        earned = sum(entry["score"] for entry in per_criterion.values())
        attainable = 5 * len(per_criterion)  # every criterion uses a 0-5 scale

        if attainable > 0:
            overall_score = (earned / attainable) * 100
        else:
            overall_score = 0

        return {
            "criteria_scores": per_criterion,
            "overall_score": overall_score,
            "token_count": count_tokens(template)
        }

    def _check_clarity(self, template: str) -> Tuple[int, str]:
        """Check if the template has clear, unambiguous instructions."""
        score = 0
        feedback = []

        # Check for clear section delineation
        if re.search(r"CONTEXT|QUESTION|ANSWER", template):
            score += 2
        else:
            feedback.append("Missing clear section headers (CONTEXT, QUESTION, ANSWER, etc.)")

        # Check for concise language
        avg_sentence_length = len(template) / max(1, len(re.findall(r'[.!?]', template)))
        if avg_sentence_length > 25:
            feedback.append("Sentences are overly long, averaging {:.1f} characters".format(avg_sentence_length))
        else:
            score += 1

        # Check for imperative verbs
        imperative_verbs = ["use", "answer", "provide", "cite", "include", "consider", "avoid"]
        if any(verb in template.lower() for verb in imperative_verbs):
            score += 1
        else:
            feedback.append("Missing clear directive verbs (use, answer, provide, etc.)")

        # Check for ambiguous language
        ambiguous_terms = ["maybe", "might", "possibly", "potentially", "could", "try to"]
        if any(term in template.lower() for term in ambiguous_terms):
            feedback.append("Contains ambiguous terms: " + ", ".join(term for term in ambiguous_terms if term in template.lower()))
        else:
            score += 1

        if not feedback:
            feedback = ["Template has clear, unambiguous instructions"]

        return min(score, 5), "; ".join(feedback)

    def _check_structure(self, template: str) -> Tuple[int, str]:
        """Check if the template has a clear, logical structure."""
        score = 0
        feedback = []

        # Check for clear sections
        sections = ["context", "question", "answer", "instruction"]
        found_sections = [s for s in sections if s.lower() in template.lower()]

        if len(found_sections) >= 3:
            score += 2
        elif len(found_sections) >= 2:
            score += 1
            feedback.append("Missing some key sections")
        else:
            feedback.append("No clear structural elements found")

        # Check for visual separation
        if re.search(r'[-=_]{3,}|\n{2,}', template):
            score += 1
        else:
            feedback.append("Lacks visual separation between sections")

        # Check for placeholder formatting
        if re.search(r'\{[a-z_]+\}', template):
            score += 1
        else:
            feedback.append("No properly formatted placeholders found")

        # Check for logical ordering
        logical_order = (
            template.lower().find("context") < template.lower().find("question") < template.lower().find("answer")
            if all(s in template.lower() for s in ["context", "question", "answer"])
            else False
        )

        if logical_order:
            score += 1
        else:
            feedback.append("Sections may not be in logical order")

        if not feedback:
            feedback = ["Template has clear, logical structure"]

        return min(score, 5), "; ".join(feedback)

    def _check_constraints(self, template: str) -> Tuple[int, str]:
        """Check if the template includes appropriate constraints."""
        score = 0
        feedback = []

        # Check for information source constraints
        source_constraints = ["only use", "based on", "from the context", "in the provided", "don't use external"]
        found_constraints = [c for c in source_constraints if c in template.lower()]

        if found_constraints:
            score += 2
        else:
            feedback.append("Missing explicit constraints on information sources")

        # Check for hallucination prevention
        hallucination_prevention = ["don't make up", "don't guess", "don't use external", "don't assume", "insufficient info"]
        found_prevention = [p for p in hallucination_prevention if p in template.lower()]

        if found_prevention:
            score += 2
        else:
            feedback.append("Missing explicit hallucination prevention")

        # Check for explicit scope limits
        scope_limits = ["only", "specifically", "limit", "focus on", "restrict"]
        found_limits = [l for l in scope_limits if l in template.lower()]

        if found_limits:
            score += 1
        else:
            feedback.append("Missing explicit scope limitations")

        if not feedback:
            feedback = ["Template includes appropriate constraints"]

        return min(score, 5), "; ".join(feedback)

    def _check_citation(self, template: str) -> Tuple[int, str]:
        """Check if the template includes citation requirements."""
        score = 0
        feedback = []

        # Check for citation terms
        citation_terms = ["cite", "reference", "source", "document", "attribution"]
        found_terms = [t for t in citation_terms if t in template.lower()]

        if found_terms:
            score += 2
        else:
            feedback.append("Missing citation requirements")

        # Check for citation format specification
        citation_formats = ["[doc", "document", "source", "[", "]"]
        found_formats = [f for f in citation_formats if f in template.lower()]

        if found_formats:
            score += 2
        else:
            feedback.append("Missing citation format specification")

        # Check for contextual citation guidance
        if "when" in template.lower() and any(t in template.lower() for t in citation_terms):
            score += 1
        else:
            feedback.append("Missing contextual citation guidance")

        if not feedback:
            feedback = ["Template includes clear citation requirements"]

        return min(score, 5), "; ".join(feedback)

    def _check_token_efficiency(self, template: str) -> Tuple[int, str]:
        """Check if the template is token-efficient.

        Scoring (max 5):
          * up to 2 points for total token count (< 100 tokens: 2, < 200: 1)
          * up to 2 points for low sentence-level redundancy
            (ratio < 0.1: 2 points, < 0.2: 1 point)
          * 1 point when none of the known verbose phrases appears

        Args:
            template: Prompt template text to score.

        Returns:
            ``(score, feedback)``. The feedback always starts with the
            token-count verdict, followed by any redundancy or verbosity notes.
        """
        score = 0
        feedback = []

        # Check total token count
        token_count = count_tokens(template)

        if token_count < 100:
            score += 2
            feedback.append(f"Excellent token efficiency ({token_count} tokens)")
        elif token_count < 200:
            score += 1
            feedback.append(f"Good token efficiency ({token_count} tokens)")
        else:
            feedback.append(f"Template is token-heavy ({token_count} tokens)")

        # Check for redundancy. Blank fragments produced by the split (for
        # example the empty string after a final period) are dropped before
        # counting; left in, they inflate the denominator and a template with
        # no repeated sentence is reported as redundant.
        sentences = [s.strip().lower() for s in re.split(r'[.!?]', template) if s.strip()]
        unique_sentences = set(sentences)
        redundancy_ratio = 1 - (len(unique_sentences) / len(sentences)) if sentences else 0.0

        if redundancy_ratio < 0.1:
            score += 2
        elif redundancy_ratio < 0.2:
            score += 1
            feedback.append(f"Some redundancy detected ({redundancy_ratio:.2f} redundancy ratio)")
        else:
            feedback.append(f"High redundancy detected ({redundancy_ratio:.2f} redundancy ratio)")

        # Check for verbose phrases
        verbose_phrases = ["in order to", "for the purpose of", "in the event that", "at this point in time",
                          "due to the fact that", "with regard to", "it is important to note that"]
        found_phrases = [p for p in verbose_phrases if p in template.lower()]

        if not found_phrases:
            score += 1
        else:
            feedback.append("Contains verbose phrases: " + ", ".join(found_phrases))

        # The token-count branch above always records a message, so feedback
        # is never empty here and no generic fallback message is needed.
        return min(score, 5), "; ".join(feedback)

    def _check_task_guidance(self, template: str) -> Tuple[int, str]:
        """Check if the template includes task-specific guidance."""
        score = 0
        feedback = []

        # Check for specific task indicators
        task_indicators = {
            "question answering": ["answer the question", "respond to the question"],
            "summarization": ["summarize", "summary", "key points"],
            "comparison": ["compare", "contrast", "differences", "similarities"],
            "analysis": ["analyze", "analysis", "evaluate", "assessment"]
        }

        found_tasks = []
        for task, indicators in task_indicators.items():
            if any(i in template.lower() for i in indicators):
                found_tasks.append(task)

        if found_tasks:
            score += 2
            feedback.append(f"Template specifies task type: {', '.join(found_tasks)}")
        else:
            feedback.append("No specific task type identified")

        # Check for output format guidance
        format_guidance = ["format", "structure", "organize", "bullet", "paragraph", "list"]
        found_format = [f for f in format_guidance if f in template.lower()]

        if found_format:
            score += 2
            feedback.append(f"Includes output format guidance")
        else:
            feedback.append("Missing output format guidance")

        # Check for quality criteria
        quality_criteria = ["concise", "comprehensive", "accurate", "balanced", "objective"]
        found_criteria = [c for c in quality_criteria if c in template.lower()]

        if found_criteria:
            score += 1
            feedback.append(f"Specifies quality criteria: {', '.join(found_criteria)}")
        else:
            feedback.append("Missing quality criteria")

        if len(feedback) == 1 and "Template specifies" in feedback[0]:
            feedback = ["Template includes comprehensive task-specific guidance"]

        return min(score, 5), "; ".join(feedback)

    def _check_error_handling(self, template: str) -> Tuple[int, str]:
        """Check if the template includes guidance for handling errors or edge cases."""
        score = 0
        feedback = []

        # Check for insufficient information handling
        insufficient_info = ["insufficient", "not enough", "doesn't contain", "unable to answer", "missing"]
        found_insufficient = [i for i in insufficient_info if i in template.lower()]

        if found_insufficient:
            score += 2
            feedback.append("Includes guidance for insufficient information")
        else:
            feedback.append("Missing guidance for insufficient information")

        # Check for conflicting information handling
        conflicting_info = ["conflict", "contradict", "disagree", "inconsistent", "different perspective"]
        found_conflicting = [c for c in conflicting_info if c in template.lower()]

        if found_conflicting:
            score += 2
            feedback.append("Includes guidance for conflicting information")
        else:
            feedback.append("Missing guidance for conflicting information")

        # Check for uncertainty communication
        uncertainty = ["uncertain", "confidence", "likely", "probability", "unclear"]
        found_uncertainty = [u for u in uncertainty if u in template.lower()]

        if found_uncertainty:
            score += 1
            feedback.append("Includes guidance for communicating uncertainty")
        else:
            feedback.append("Missing guidance for communicating uncertainty")

        if feedback[0].startswith("Includes"):
            feedback = ["Template provides comprehensive error handling guidance"]

        return min(score, 5), "; ".join(feedback)

    def _check_identity(self, template: str) -> Tuple[int, str]:
        """Check if the template avoids identity confusion."""
        score = 5  # Start with perfect score and deduct
        feedback = []

        # Check for problematic identity terms
        identity_terms = ["you are", "as an expert", "as a professional", "you know", "your knowledge", "your expertise"]
        found_terms = [t for t in identity_terms if t in template.lower()]

        if found_terms:
            score -= 3
            feedback.append(f"Contains potentially confusing identity framing: {', '.join(found_terms)}")

        # Check for knowledge claims
        knowledge_claims = ["you have knowledge", "you are familiar with", "you understand", "you have expertise"]
        found_claims = [c for c in knowledge_claims if c in template.lower()]

        if found_claims:
            score -= 2
            feedback.append(f"Makes knowledge claims about the model: {', '.join(found_claims)}")

        if not feedback:
            feedback = ["Template avoids identity confusion"]

        return max(score, 0), "; ".join(feedback)

# Define templates for evaluation.
# Four variants of the same context/question prompt, keyed by a short name.
# The template bodies are indented inside the triple-quoted strings, so that
# leading whitespace is part of each template's text (and of its token count).
templates_to_evaluate = {
    # One instruction line followed by CONTEXT / QUESTION / ANSWER sections.
    "basic": """
    Answer the question based on the context.

    CONTEXT:
    {context}

    QUESTION:
    {question}

    ANSWER:
    """,

    # Adds a persona line, a "don't know" fallback and a three-sentence limit.
    "detailed": """
    You are an assistant for question-answering tasks. Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Use three sentences maximum and keep the answer concise.

    CONTEXT:
    {context}

    QUESTION:
    {question}

    ANSWER:
    """,

    # Context-only constraint, fallback sentence and a [Doc X] citation format;
    # this variant has no ANSWER header.
    "optimized": """
    Answer based ONLY on the provided context. If the information isn't available, say "I don't have enough information."

    CONTEXT:
    {context}

    QUESTION:
    {question}

    Cite sources as [Doc X]. Be concise.
    """,

    # Context-only constraint plus an INSTRUCTIONS bullet list covering
    # citation, missing answers and conflicting information.
    "best_practice": """
    Answer the question using ONLY the provided context.

    CONTEXT:
    {context}

    QUESTION:
    {question}

    INSTRUCTIONS:
    - If the answer is in the context, provide it clearly and concisely
    - Cite specific documents using [Document X] notation
    - If the answer isn't in the context, say "I don't have enough information"
    - If information is conflicting, acknowledge the contradiction

    ANSWER:
    """
}

# Create evaluator and evaluate templates.
# evaluation_results maps each template name to the dict returned by
# evaluate_template (criteria_scores, overall_score, token_count); the
# plotting code below reads it.
evaluator = TemplateEvaluator()
evaluation_results = {}

for name, template in templates_to_evaluate.items():
    evaluation_results[name] = evaluator.evaluate_template(template)
    print(f"Evaluating template: {name}")
    print(f"Overall score: {evaluation_results[name]['overall_score']:.1f}%")
    print(f"Token count: {evaluation_results[name]['token_count']}")
    print("Criteria scores:")
    # One line per criterion: score out of 5 followed by the check's feedback
    for criterion, result in evaluation_results[name]['criteria_scores'].items():
        print(f"  {criterion}: {result['score']}/5 - {result['feedback']}")
    print()

# Visualize evaluation results: a radar chart of per-criterion scores and a
# bar chart of overall scores. Any exception raised while plotting is caught
# and reported as a message so the cells that follow still run.
try:
    # Criterion names are taken from the first evaluated template
    criteria = list(evaluation_results[next(iter(evaluation_results))]['criteria_scores'].keys())
    template_names = list(evaluation_results.keys())

    # One row per template, one column per criterion (same order as `criteria`)
    scores_data = []
    for template_name in template_names:
        template_scores = []
        for criterion in criteria:
            template_scores.append(evaluation_results[template_name]['criteria_scores'][criterion]['score'])
        scores_data.append(template_scores)

    # Create a radar chart: one evenly spaced angle per criterion
    angles = np.linspace(0, 2*np.pi, len(criteria), endpoint=False).tolist()
    angles += angles[:1]  # Close the polygon

    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))

    for i, template_name in enumerate(template_names):
        # Copy so that closing the polygon does not modify scores_data
        values = scores_data[i].copy()
        values += values[:1]  # Close the polygon
        ax.plot(angles, values, linewidth=2, label=template_name)
        ax.fill(angles, values, alpha=0.1)

    # Add labels (the last angle repeats the first, so it gets no tick)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(criteria)
    ax.set_yticks([1, 2, 3, 4, 5])
    ax.set_title("Template Evaluation Scores", fontsize=14, pad=20)
    ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

    plt.tight_layout()
    plt.show()

    # Create a bar chart for overall scores
    overall_scores = [evaluation_results[name]['overall_score'] for name in template_names]

    plt.figure(figsize=(10, 6))
    bars = plt.bar(template_names, overall_scores)

    # Add score labels on top of bars
    for bar, score in zip(bars, overall_scores):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                 f"{score:.1f}%", ha='center', fontsize=12)

    plt.title("Overall Template Evaluation Scores", fontsize=14)
    plt.ylabel("Score (%)")
    plt.ylim(0, 105)  # Leave room for labels
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()
except Exception as e:
    print(f"Error creating visualization: {e}")
    print("To view the visualization, run this notebook in an environment that supports matplotlib.")

print_separator()

**Section 2: Before and After Template Optimization**

In [None]:
print("Section 2: Before and After Template Optimization")

# "Before" example: a persona-style prompt that tells the model to draw on its
# own knowledge, add extra facts and make educated guesses when unsure.
before_template = """
You are an expert assistant with deep knowledge about many topics. Your task is to utilize your extensive knowledge and the provided context to answer the user's question in a helpful manner. Try to be as informative as possible while keeping your answer accurate and relevant to what the user is asking about. If possible, include additional interesting facts that might be relevant to the question. If you're unsure about something, you can make an educated guess based on your knowledge.

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""

# "After" example: the same CONTEXT / QUESTION / ANSWER layout, restricted to
# the supplied context, with citation and missing-information instructions.
after_template = """
Answer the question using ONLY information from the context below.

CONTEXT:
{context}

QUESTION:
{question}

INSTRUCTIONS:
- If the information is in the context, provide a clear and concise answer
- Cite your sources using [Document X] notation
- If the answer isn't in the context, state "I don't have enough information"
- Do not introduce external knowledge

ANSWER:
"""

# Evaluate each template once and reuse the token count stored in its
# evaluation instead of re-tokenizing the same text for every print below.
print("BEFORE OPTIMIZATION:")
print(before_template)
before_eval = evaluator.evaluate_template(before_template)
before_tokens = before_eval['token_count']
print(f"Token count: {before_tokens}")

print(f"Overall score: {before_eval['overall_score']:.1f}%")
print("Key issues:")
# A criterion counts as a key issue when it scored 2 or less out of 5
problem_criteria = [c for c in before_eval['criteria_scores'] if before_eval['criteria_scores'][c]['score'] <= 2]
for criterion in problem_criteria:
    print(f"- {criterion}: {before_eval['criteria_scores'][criterion]['feedback']}")

print("\nAFTER OPTIMIZATION:")
print(after_template)
after_eval = evaluator.evaluate_template(after_template)
after_tokens = after_eval['token_count']
print(f"Token count: {after_tokens}")

print(f"Overall score: {after_eval['overall_score']:.1f}%")
print("Improvements:")
# Only criteria whose score went up are listed
for criterion in before_eval['criteria_scores']:
    before_score = before_eval['criteria_scores'][criterion]['score']
    after_score = after_eval['criteria_scores'][criterion]['score']
    if after_score > before_score:
        print(f"- {criterion}: {before_score} → {after_score} ({after_eval['criteria_scores'][criterion]['feedback']})")

print("\nOptimization Results:")
token_reduction = before_tokens - after_tokens
score_improvement = after_eval['overall_score'] - before_eval['overall_score']
# Guard the percentage so an empty baseline template cannot divide by zero
reduction_ratio = token_reduction / before_tokens if before_tokens else 0
print(f"- Token reduction: {token_reduction} tokens ({reduction_ratio:.1%})")
print(f"- Score improvement: {score_improvement:.1f} percentage points")

print_separator()

**Section 3: Template Management System**

In [None]:
print("Section 3: Template Management System")

class TemplateManager:
    """System for managing, versioning, and optimizing prompt templates."""

    def __init__(self):
        """Start with an empty registry, no default template and a fresh evaluator."""
        # Name of the template used when callers do not specify one
        self.default_template = None
        # Scores every template that is added or optimized
        self.evaluator = TemplateEvaluator()
        # name -> {version -> {"template": ..., "metadata": ...}}
        self.version_history = {}
        # name -> {"template": ..., "metadata": ...} for the most recently added version
        self.templates = {}

    def add_template(self, name, template, description=None, version="1.0.0", set_as_default=False):
        """Register a template, or a new version of an existing one.

        The template is evaluated, stored as the current entry for ``name``
        and recorded in the version history under ``version``. It becomes the
        default when ``set_as_default`` is true or no default exists yet.

        Returns:
            The metadata dict created for this template; the same dict object
            is referenced by the current entry and by the history entry.
        """
        evaluation = self.evaluator.evaluate_template(template)

        metadata = dict(
            description=description or f"Template: {name}",
            version=version,
            created_at=datetime.now().isoformat(),
            token_count=count_tokens(template),
            evaluation=evaluation,
            usage_count=0,
        )

        # Current entry for this name (replaces any earlier version)
        self.templates[name] = {"template": template, "metadata": metadata}

        # History keeps its own record dict per version, sharing the metadata
        history = self.version_history.setdefault(name, {})
        history[version] = {"template": template, "metadata": metadata}

        if set_as_default or self.default_template is None:
            self.default_template = name

        return metadata

    def get_template(self, name=None, version=None):
        """Get a template by name and optional version."""
        # Use default if no name specified
        if name is None:
            name = self.default_template
            if name is None:
                raise ValueError("No template name specified and no default template set")

        # Check if template exists
        if name not in self.templates:
            raise ValueError(f"Template '{name}' not found")

        # Return specific version if requested
        if version is not None:
            if version not in self.version_history[name]:
                raise ValueError(f"Version '{version}' not found for template '{name}'")

            return self.version_history[name][version]["template"]

        # Return current version
        return self.templates[name]["template"]

    def list_templates(self):
        """List all available templates with metadata."""
        return {name: template["metadata"] for name, template in self.templates.items()}

    def list_versions(self, name):
        """List all versions of a specific template."""
        if name not in self.version_history:
            raise ValueError(f"Template '{name}' not found")

        return {version: info["metadata"] for version, info in self.version_history[name].items()}

    def format_prompt(self, template_name=None, version=None, **kwargs):
        """Format a prompt using the specified template and variables."""
        template = self.get_template(template_name, version)

        # Update usage count
        if template_name is None:
            template_name = self.default_template

        self.templates[template_name]["metadata"]["usage_count"] += 1

        # Format and return the prompt
        return template.format(**kwargs)

    def optimize_template(self, name, target_criteria=None):
        """Generate an optimized version of a template based on evaluation results.

        The current template is evaluated, one rewrite step is applied per
        targeted criterion (in a fixed order), and the result is stored as a
        new patch version of the same template.

        Args:
            name: Name of a registered template.
            target_criteria: Criteria to improve, as a list of names or a
                single name. When omitted, the three lowest-scoring criteria
                of the current template are used.

        Returns:
            A dict with the ``original`` and ``optimized`` template text, their
            evaluations (``original_eval``, ``optimized_eval``) and the
            ``new_version`` string under which the result was stored.

        Raises:
            ValueError: If ``name`` is not registered, or if the patch
                component of the current version is not an integer.
        """
        if name not in self.templates:
            raise ValueError(f"Template '{name}' not found")

        original_template = self.templates[name]["template"]
        original_eval = self.evaluator.evaluate_template(original_template)

        # Focus on criteria with the lowest scores if not specified
        if target_criteria is None:
            scores = original_eval["criteria_scores"]
            target_criteria = sorted(scores.keys(), key=lambda k: scores[k]["score"])[:3]
        elif isinstance(target_criteria, str):
            # A bare string would be joined character by character below
            target_criteria = [target_criteria]

        print(f"Optimizing template '{name}' targeting: {', '.join(target_criteria)}")

        # Rewrite steps in application order. Methods are looked up by name
        # only when their criterion is targeted.
        strategies = (
            ("token_efficiency", "_optimize_token_efficiency"),
            ("constraints", "_add_constraints"),
            ("citation", "_add_citation_requirements"),
            ("structure", "_improve_structure"),
            ("clarity", "_improve_clarity"),
            ("identity", "_fix_identity_issues"),
            ("error_handling", "_add_error_handling"),
            ("task_guidance", "_add_task_guidance"),
        )

        optimized_template = original_template
        for criterion, method_name in strategies:
            if criterion in target_criteria:
                optimized_template = getattr(self, method_name)(optimized_template)

        # Evaluate the optimized template
        optimized_eval = self.evaluator.evaluate_template(optimized_template)

        # Create a new version by bumping the patch number. Versions with
        # fewer than three components (e.g. "1.0") are padded with zeros so
        # the bump cannot fail with an IndexError.
        current_version = self.templates[name]["metadata"]["version"]
        version_parts = current_version.split('.')
        version_parts += ["0"] * (3 - len(version_parts))
        major, minor, patch = version_parts[:3]
        new_version = f"{major}.{minor}.{int(patch) + 1}"

        # Add the optimized template
        optimization_description = f"Optimized for: {', '.join(target_criteria)}"
        self.add_template(name, optimized_template, description=optimization_description, version=new_version)

        return {
            "original": original_template,
            "optimized": optimized_template,
            "original_eval": original_eval,
            "optimized_eval": optimized_eval,
            "new_version": new_version
        }

    def _optimize_token_efficiency(self, template):
        """Optimize template for token efficiency."""
        # Replace verbose phrases
        verbose_replacements = {
            "in order to": "to",
            "for the purpose of": "for",
            "in the event that": "if",
            "at this point in time": "now",
            "due to the fact that": "because",
            "with regard to": "regarding",
            "it is important to note that": ""
        }

        optimized = template
        for verbose, concise in verbose_replacements.items():
            optimized = optimized.replace(verbose, concise)

        # Remove redundant instructions
        if "based on the context" in optimized.lower() and "using the provided context" in optimized.lower():
            optimized = optimized.replace("using the provided context", "")

        # Consolidate instructions
        lines = optimized.split('\n')
        consolidated_lines = []

        for i, line in enumerate(lines):
            if i > 0 and line.strip() and lines[i-1].strip() and line.strip().startswith('-') and lines[i-1].strip().startswith('-'):
                # Check if this bullet point is similar to the previous one
                similarity = self._calculate_similarity(line, lines[i-1])
                if similarity > 0.5:  # Skip if too similar
                    continue
            consolidated_lines.append(line)

        optimized = '\n'.join(consolidated_lines)

        return optimized

    def _add_constraints(self, template):
        """Add explicit constraints to the template."""
        # Check if constraints already exist
        if "only" in template.lower() and "context" in template.lower():
            # Already has basic constraints
            pass
        else:
            # Add constraint before ANSWER section
            answer_pos = template.find("ANSWER:")
            if answer_pos != -1:
                constraint = "\nUse ONLY information from the provided context. Do not introduce external knowledge.\n\n"
                optimized = template[:answer_pos] + constraint + template[answer_pos:]
            else:
                # Add at the beginning
                constraint = "Use ONLY information from the provided context. Do not introduce external knowledge.\n\n"
                optimized = constraint + template

        # Add instruction for insufficient information
        if "don't know" not in template.lower() and "insufficient" not in template.lower():
            answer_pos = template.find("ANSWER:")
            if answer_pos != -1:
                instruction = "If the context doesn't contain the answer, say \"I don't have enough information.\"\n"
                optimized = template[:answer_pos] + instruction + template[answer_pos:]
            else:
                optimized = template + "\nIf the context doesn't contain the answer, say \"I don't have enough information.\"\n"
        else:
            optimized = template

        return optimized

    def _add_citation_requirements(self, template):
        """Add citation requirements to the template."""
        # Check if citation requirements already exist
        if "cite" in template.lower() or "document" in template.lower() and "[" in template:
            # Already has citation requirements
            optimized = template
        else:
            # Add citation instruction before ANSWER section
            answer_pos = template.find("ANSWER:")
            if answer_pos != -1:
                citation = "Cite your sources using [Document X] notation.\n"
                optimized = template[:answer_pos] + citation + template[answer_pos:]
            else:
                # Add at the end
                citation = "\nCite your sources using [Document X] notation."
                optimized = template + citation

        return optimized

    def _improve_structure(self, template):
        """Improve the template structure."""
        # Check if basic sections exist
        has_context = "CONTEXT:" in template
        has_question = "QUESTION:" in template
        has_answer = "ANSWER:" in template

        if not has_context or not has_question or not has_answer:
            # Create a basic structure
            sections = []

            if not has_context:
                sections.append("CONTEXT:\n{context}\n")
            else:
                sections.append(self._extract_section(template, "CONTEXT:"))

            if not has_question:
                sections.append("QUESTION:\n{question}\n")
            else:
                sections.append(self._extract_section(template, "QUESTION:"))

            if "INSTRUCTION" not in template and "instruction" not in template.lower():
                sections.append("INSTRUCTIONS:\n- Answer based only on the provided context\n- Cite sources using [Document X] notation\n- If the answer isn't in the context, say so\n")

            if not has_answer:
                sections.append("ANSWER:")
            else:
                sections.append(self._extract_section(template, "ANSWER:"))

            optimized = "\n\n".join(sections)
        else:
            optimized = template

        return optimized

    def _improve_clarity(self, template):
        """Improve the clarity of the template."""
        # Replace ambiguous terms
        ambiguous_replacements = {
            "try to": "",
            "if possible": "",
            "you might want to": "",
            "perhaps": "",
            "maybe": ""
        }

        optimized = template
        for ambiguous, replacement in ambiguous_replacements.items():
            optimized = optimized.replace(ambiguous, replacement)

        # Add clear section headers if missing
        if "CONTEXT:" not in optimized and "context:" not in optimized.lower():
            optimized = optimized.replace("{context}", "CONTEXT:\n{context}")

        if "QUESTION:" not in optimized and "question:" not in optimized.lower():
            optimized = optimized.replace("{question}", "QUESTION:\n{question}")

        return optimized

    def _fix_identity_issues(self, template):
        """Fix identity-related issues in the template."""
        # Replace problematic phrases
        identity_replacements = {
            "you are an expert": "Answer based on the context",
            "as an AI assistant": "",
            "using your knowledge": "using the provided context",
            "you know": "the context contains",
            "you have expertise": "based on the context",
            "as a helpful assistant": ""
        }

        optimized = template
        for phrase, replacement in identity_replacements.items():
            optimized = optimized.replace(phrase, replacement)

        return optimized

    def _add_error_handling(self, template):
        """Add error handling instructions to the template."""
        # Check if error handling already exists
        has_insufficient = "don't have enough" in template.lower() or "insufficient" in template.lower()
        has_conflicting = "conflict" in template.lower() or "contradict" in template.lower()

        optimized = template

        # Add instruction section if it doesn't exist
        if "INSTRUCTION" not in optimized and "instruction" not in optimized.lower():
            answer_pos = optimized.find("ANSWER:")
            if answer_pos != -1:
                instructions = "\nINSTRUCTIONS:\n"
                if not has_insufficient:
                    instructions += "- If the answer isn't in the context, say \"I don't have enough information\"\n"
                if not has_conflicting:
                    instructions += "- If information is conflicting, acknowledge the contradiction\n"

                optimized = optimized[:answer_pos] + instructions + optimized[answer_pos:]
        else:
            # Add to existing instructions
            instruction_pos = optimized.lower().find("instruction")
            next_section_pos = optimized.find("\n\n", instruction_pos)
            if next_section_pos == -1:
                next_section_pos = len(optimized)

            additional_instructions = ""
            if not has_insufficient:
                additional_instructions += "\n- If the answer isn't in the context, say \"I don't have enough information\""
            if not has_conflicting:
                additional_instructions += "\n- If information is conflicting, acknowledge the contradiction"

            optimized = optimized[:next_section_pos] + additional_instructions + optimized[next_section_pos:]

        return optimized

    def _add_task_guidance(self, template):
        """Add task-specific guidance to the template."""
        # Determine if this is a QA template (most common)
        is_qa = "question" in template.lower() and "answer" in template.lower()

        if is_qa:
            # Add QA-specific guidance if not already present
            quality_guidance = ["concise", "direct", "clear"]
            format_guidance = ["format", "structure", "organize"]

            has_quality = any(term in template.lower() for term in quality_guidance)
            has_format = any(term in template.lower() for term in format_guidance)

            if not has_quality or not has_format:
                answer_pos = template.find("ANSWER:")
                if answer_pos != -1:
                    guidance = ""
                    if not has_quality:
                        guidance += "Provide a clear, concise answer that directly addresses the question.\n"
                    if not has_format:
                        guidance += "Structure complex answers with main points first, followed by supporting details.\n"

                    optimized = template[:answer_pos] + guidance + template[answer_pos:]
                else:
                    guidance = "\nProvide a clear, concise answer that directly addresses the question.\n"
                    optimized = template + guidance
            else:
                optimized = template
        else:
            # Generic task guidance
            if "task" not in template.lower() and "format" not in template.lower():
                optimized = template + "\n\nFormat your response appropriately for the task, maintaining clarity and conciseness."
            else:
                optimized = template

        return optimized

    def _extract_section(self, template, section_header):
        """Extract a section from the template."""
        start_pos = template.find(section_header)
        if start_pos == -1:
            return ""

        start_pos += len(section_header)
        end_pos = template.find("\n\n", start_pos)
        if end_pos == -1:
            return template[start_pos:]

        return template[start_pos:end_pos]

    def _calculate_similarity(self, str1, str2):
        """Calculate string similarity (simple version)."""
        # Convert to lowercase and remove punctuation
        s1 = ''.join(c.lower() for c in str1 if c.isalnum() or c.isspace())
        s2 = ''.join(c.lower() for c in str2 if c.isalnum() or c.isspace())

        # Split into words
        words1 = set(s1.split())
        words2 = set(s2.split())

        # Calculate Jaccard similarity
        intersection = len(words1.intersection(words2))
        union = len(words1.union(words2))

        return intersection / union if union > 0 else 0

# Demo: register templates with a TemplateManager, optimize one of them, and
# render (and optionally run) a prompt.
# Create a template manager and add templates
manager = TemplateManager()

# Add templates from our evaluation
# templates_to_evaluate is expected from an earlier cell; it is iterated as a
# mapping of template name -> template text.
for name, template in templates_to_evaluate.items():
    manager.add_template(name, template, description=f"{name} template")

# Set the default template
# The triple-quoted literal is passed as-is: its leading/trailing newlines and
# the 4-space indentation on every line are part of the stored template text.
manager.add_template(
    "production",
    """
    Answer the question using ONLY the provided context.

    CONTEXT:
    {context}

    QUESTION:
    {question}

    INSTRUCTIONS:
    - If the answer is in the context, provide it clearly and concisely
    - Cite specific documents using [Document X] notation
    - If the answer isn't in the context, say "I don't have enough information"
    - If information is conflicting, acknowledge the contradiction

    ANSWER:
    """,
    description="Production template with best practices",
    set_as_default=True
)

# Each listed entry is expected to expose 'description', 'version' and an
# 'evaluation' dict holding 'overall_score'.
print("Template Manager initialized with templates:")
for name, metadata in manager.list_templates().items():
    print(f"- {name}: {metadata['description']} (v{metadata['version']}, score: {metadata['evaluation']['overall_score']:.1f}%)")

# Try optimizing a template
# Called with the template name only. The returned dict is read below through
# its "original", "optimized", "original_eval", "optimized_eval" and
# "new_version" keys. optimize_template also registers the optimized text
# under the same name with a bumped patch version.
print("\nOptimizing the 'basic' template:")
optimization_result = manager.optimize_template("basic")

print("\nOriginal Template:")
print(optimization_result["original"])
print(f"Token count: {count_tokens(optimization_result['original'])}")
print(f"Overall score: {optimization_result['original_eval']['overall_score']:.1f}%")

print("\nOptimized Template (v{version}):".format(version=optimization_result["new_version"]))
print(optimization_result["optimized"])
print(f"Token count: {count_tokens(optimization_result['optimized'])}")
print(f"Overall score: {optimization_result['optimized_eval']['overall_score']:.1f}%")

# Show the version history
print("\nVersion history for 'basic' template:")
versions = manager.list_versions("basic")
for version, metadata in versions.items():
    print(f"- v{version}: {metadata['description']} (score: {metadata['evaluation']['overall_score']:.1f}%)")

# Generate a formatted prompt
# Each sample document is prefixed with a 1-based "[Document N]" tag, matching
# the citation notation the production template asks for.
query = "How much have solar photovoltaic costs decreased between 2010 and 2020?"
context = "\n".join([f"[Document {i+1}] {doc.page_content}" for i, doc in enumerate(sample_docs)])

# No template name is passed here; presumably format_prompt then uses the
# default ("production") template -- confirm against TemplateManager.
formatted_prompt = manager.format_prompt(context=context, question=query)
print("\nFormatted Prompt (using default template):")
print(formatted_prompt)

# Generate a response if LLM is available
# NOTE: the setup cell assigns a placeholder value to OPENAI_API_KEY, so this
# check passes even without a real key; a failed call is then reported by the
# except branch rather than raised.
if os.environ.get("OPENAI_API_KEY"):
    try:
        print("\nGenerating response with LLM:")
        llm = ChatOpenAI(temperature=0)
        response = llm.invoke(formatted_prompt)
        print("Response:", response.content)
    except Exception as e:
        print(f"Error generating response: {e}")
else:
    print("\nOpenAI API key not set - skipping LLM response generation")

print_separator()

**Section 4: Implementation of Best Practices Checklist**

In [None]:
print("Section 4: Implementation of Best Practices Checklist")

class TemplateChecklist:
    """Checklist for template design, testing, and optimization.

    Attributes:
        checklist: Maps each phase name ("design", "testing", "optimization",
            "deployment") to its ordered list of item descriptions.
        status: Maps phase name -> item description -> a dict with the keys
            "completed" (bool) and "notes" (str).
    """

    def __init__(self):
        """Create the checklist with every item pending and without notes."""
        # Define checklist categories and items
        self.checklist = {
            "design": [
                "Clear sections for context, query, and response",
                "Explicit constraints on using only retrieved information",
                "Guidance for handling insufficient information",
                "Citation requirements and format",
                "Token-efficient instructions",
                "Task-specific guidance",
                "Error handling for edge cases",
                "Appropriate identity framing"
            ],
            "testing": [
                "Tested with diverse query types",
                "Tested with different document qualities",
                "Factual accuracy measured",
                "Citation quality assessed",
                "Token usage analyzed",
                "User feedback collected",
                "Edge cases evaluated"
            ],
            "optimization": [
                "Instruction text optimized for token efficiency",
                "Context allocation maximized",
                "Template variants compared",
                "Error patterns addressed",
                "Performance metrics improved",
                "Versioning implemented"
            ],
            "deployment": [
                "Template versioning established",
                "Default templates set",
                "Documentation created",
                "Monitoring plan established",
                "Feedback collection mechanism",
                "Update schedule defined"
            ]
        }

        # Track completion status
        self.status = {}
        for category, items in self.checklist.items():
            self.status[category] = {item: {"completed": False, "notes": ""} for item in items}

    def _validate_item(self, category, item):
        """Raise ValueError unless ``item`` belongs to ``category``."""
        if category not in self.checklist:
            raise ValueError(f"Category '{category}' not found")

        if item not in self.checklist[category]:
            raise ValueError(f"Item '{item}' not found in category '{category}'")

    def mark_completed(self, category, item, notes=""):
        """Mark a checklist item as completed.

        Args:
            category: Phase name the item belongs to.
            item: Exact item description.
            notes: Optional notes. When empty, notes stored earlier (for
                example through ``add_notes``) are kept, not erased.

        Raises:
            ValueError: If the category or the item is unknown.
        """
        self._validate_item(category, item)

        entry = self.status[category][item]
        entry["completed"] = True
        if notes:
            entry["notes"] = notes

    def add_notes(self, category, item, notes):
        """Replace the notes of a checklist item without changing its state.

        Raises:
            ValueError: If the category or the item is unknown.
        """
        self._validate_item(category, item)

        self.status[category][item]["notes"] = notes

    def get_completion_status(self):
        """Get the completion status of the checklist.

        Returns:
            A dict with one entry per phase plus an "overall" entry. Every
            entry holds "completed" (count), "total" (count) and "percentage"
            (0-100, or 0 when there are no items).
        """
        status = {}
        for category, items in self.checklist.items():
            completed = sum(1 for item in items if self.status[category][item]["completed"])
            total = len(items)
            status[category] = {
                "completed": completed,
                "total": total,
                "percentage": (completed / total) * 100 if total > 0 else 0
            }

        # Calculate overall completion (computed before "overall" is added,
        # so the sums only cover the real phases).
        total_completed = sum(entry["completed"] for entry in status.values())
        total_items = sum(entry["total"] for entry in status.values())
        overall = (total_completed / total_items) * 100 if total_items > 0 else 0

        status["overall"] = {
            "completed": total_completed,
            "total": total_items,
            "percentage": overall
        }

        return status

    def print_checklist(self, show_notes=True):
        """Print the checklist with completion status.

        Args:
            show_notes: When true, non-empty notes are printed under their
                item.
        """
        for category, items in self.checklist.items():
            print(f"\n{category.upper()} PHASE:")
            for i, item in enumerate(items, 1):
                status = "✅" if self.status[category][item]["completed"] else "⬜"
                print(f"{status} {i}. {item}")
                if show_notes and self.status[category][item]["notes"]:
                    print(f"   Notes: {self.status[category][item]['notes']}")

        # Print overall status
        completion = self.get_completion_status()
        print("\nCOMPLETION STATUS:")
        for category, status in completion.items():
            if category != "overall":
                print(f"{category}: {status['completed']}/{status['total']} ({status['percentage']:.1f}%)")
        print(f"Overall: {completion['overall']['completed']}/{completion['overall']['total']} ({completion['overall']['percentage']:.1f}%)")

    def generate_progress_report(self):
        """Generate a Markdown progress report for the template development process.

        The report contains the generation timestamp, overall and per-phase
        completion, the completed and pending items of every phase (with
        notes), and a "Next Steps" section. Next steps list up to three
        pending items from the phase with the lowest completion percentage;
        on a tie the phase defined first is chosen.

        Returns:
            The report as a single Markdown string.
        """
        completion = self.get_completion_status()

        report = "# Template Development Progress Report\n\n"
        report += f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"

        report += "## Overall Progress\n\n"
        report += f"* **Overall completion**: {completion['overall']['completed']}/{completion['overall']['total']} ({completion['overall']['percentage']:.1f}%)\n\n"

        for category, status in completion.items():
            if category != "overall":
                report += f"* **{category.capitalize()}**: {status['completed']}/{status['total']} ({status['percentage']:.1f}%)\n"

        report += "\n## Detailed Status\n\n"

        for category, items in self.checklist.items():
            report += f"### {category.capitalize()} Phase\n\n"

            completed_items = [item for item in items if self.status[category][item]["completed"]]
            pending_items = [item for item in items if not self.status[category][item]["completed"]]

            if completed_items:
                report += "#### Completed Items\n\n"
                for item in completed_items:
                    report += f"* ✅ {item}\n"
                    if self.status[category][item]["notes"]:
                        report += f"  * Notes: {self.status[category][item]['notes']}\n"
                report += "\n"

            if pending_items:
                report += "#### Pending Items\n\n"
                for item in pending_items:
                    report += f"* ⬜ {item}\n"
                    if self.status[category][item]["notes"]:
                        report += f"  * Notes: {self.status[category][item]['notes']}\n"
                report += "\n"

        report += "## Next Steps\n\n"

        # Identify the category with the lowest completion percentage
        lowest_category = min(
            [c for c in completion if c != "overall"],
            key=lambda c: completion[c]["percentage"]
        )

        report += f"Focus on completing items in the **{lowest_category}** phase, which has the lowest completion rate.\n\n"

        # List a few specific next steps
        next_steps = [
            item for item in self.checklist[lowest_category]
            if not self.status[lowest_category][item]["completed"]
        ][:3]

        if next_steps:
            report += "Specific next steps:\n\n"
            # Number the steps explicitly so the list also reads correctly
            # when the report is printed as plain text.
            for number, step in enumerate(next_steps, 1):
                report += f"{number}. Complete **{step}**\n"

        return report

# Build the checklist and record the current state of the "production" template.
checklist = TemplateChecklist()

# Items the "production" template already satisfies: (category, item, notes).
completed_entries = [
    ("design", "Clear sections for context, query, and response",
     "Template uses clear CONTEXT, QUESTION, and ANSWER sections"),
    ("design", "Explicit constraints on using only retrieved information",
     "Template explicitly instructs to use ONLY the provided context"),
    ("design", "Guidance for handling insufficient information",
     "Template includes instruction to say 'I don't have enough information'"),
    ("design", "Citation requirements and format",
     "Template requires citing sources using [Document X] notation"),
    ("design", "Error handling for edge cases",
     "Template addresses conflicting information"),
    ("testing", "Tested with diverse query types",
     "Template tested with factual, comparative, and analytical queries"),
    ("testing", "Token usage analyzed",
     "Template uses approximately 150 tokens, leaving ample room for context"),
    ("optimization", "Instruction text optimized for token efficiency",
     "Instructions are concise and avoid redundancy"),
    ("optimization", "Template variants compared",
     "Compared against basic, detailed, and optimized variants"),
    ("deployment", "Template versioning established",
     "Using semantic versioning (major.minor.patch)"),
    ("deployment", "Default templates set",
     "Production template set as default"),
]
for completed_entry in completed_entries:
    checklist.mark_completed(*completed_entry)

# Notes for items that are still pending: (category, item, notes).
pending_notes = [
    ("testing", "User feedback collected",
     "Plan to collect feedback from 5 test users next week"),
    ("deployment", "Monitoring plan established",
     "Will track accuracy, relevance, and user satisfaction metrics"),
]
for pending_entry in pending_notes:
    checklist.add_notes(*pending_entry)

# Show the checklist itself
print("Template Development Checklist:")
checklist.print_checklist()

# Build the Markdown progress report and show it
print("\nTemplate Development Progress Report:")
report = checklist.generate_progress_report()
print(report)

# Bar chart of completion percentage per phase
try:
    completion = checklist.get_completion_status()

    # One bar per phase; the aggregate "overall" entry is left out.
    categories = []
    percentages = []
    for phase, phase_status in completion.items():
        if phase == "overall":
            continue
        categories.append(phase)
        percentages.append(phase_status["percentage"])

    plt.figure(figsize=(10, 6))
    bars = plt.bar(categories, percentages)

    # Write each percentage just above its bar
    for rect, pct in zip(bars, percentages):
        label_x = rect.get_x() + rect.get_width()/2
        label_y = rect.get_height() + 1
        plt.text(label_x, label_y, f"{pct:.1f}%", ha='center', fontsize=12)

    plt.title("Template Development Progress by Phase", fontsize=14)
    plt.ylabel("Completion Percentage")
    plt.ylim(0, 105)  # headroom so the labels are not clipped
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()
except Exception as e:
    print(f"Error creating visualization: {e}")
    print("To view the visualization, run this notebook in an environment that supports matplotlib.")

print_separator()


**Section 5: Performance Comparison Across Template Designs**

In [None]:
print("Section 5: Performance Comparison Across Template Designs")

# Define metrics for performance comparison
metrics = {
    "factual_accuracy": {
        "description": "Percentage of factual claims that match the source documents",
        "importance": "Critical - Determines fundamental reliability"
    },
    "citation_rate": {
        "description": "Percentage of claims that include proper source citations",
        "importance": "High - Enables verification of information"
    },
    "hallucination_rate": {
        "description": "Percentage of claims not supported by source documents",
        "importance": "Critical - Measures factual reliability"
    },
    "relevance": {
        "description": "Degree to which the response addresses the specific query",
        "importance": "High - Determines usefulness to the user"
    },
    "token_efficiency": {
        "description": "Ratio of information content to total tokens used",
        "importance": "Medium - Affects cost and context usage"
    },
    "response_coherence": {
        "description": "Quality of organization and logical flow in the response",
        "importance": "Medium - Impacts readability and comprehension"
    }
}

# Mock performance data for different template designs
# In a real scenario, this would come from systematic testing with a large evaluation set
performance_data = {
    "basic": {
        "factual_accuracy": 75.3,
        "citation_rate": 12.1,
        "hallucination_rate": 18.7,
        "relevance": 83.2,
        "token_efficiency": 68.5,
        "response_coherence": 72.4
    },
    "detailed": {
        "factual_accuracy": 82.6,
        "citation_rate": 54.3,
        "hallucination_rate": 13.2,
        "relevance": 86.7,
        "token_efficiency": 62.1,
        "response_coherence": 78.9
    },
    "optimized": {
        "factual_accuracy": 85.1,
        "citation_rate": 87.3,
        "hallucination_rate": 8.4,
        "relevance": 87.4,
        "token_efficiency": 79.2,
        "response_coherence": 80.3
    },
    "best_practice": {
        "factual_accuracy": 91.5,
        "citation_rate": 94.7,
        "hallucination_rate": 4.3,
        "relevance": 89.6,
        "token_efficiency": 83.8,
        "response_coherence": 86.2
    }
}

# Create a DataFrame for easier analysis
performance_df = pd.DataFrame(performance_data)

# Add metric importance as a column
importance_values = {"Critical": 3, "High": 2, "Medium": 1}
importance = [importance_values[metrics[m]["importance"].split(" ")[0]] for m in performance_df.index]
performance_df["importance"] = importance

# Importance-weighted average score per template.
# The denominator is the same for every template, so compute it once.
total_weights = sum(performance_df["importance"])

template_scores = {}
for template in performance_data:
    weighted_scores = [
        # hallucination_rate is lower-is-better, so its complement is scored
        (
            100 - performance_df.loc[metric, template]
            if metric == "hallucination_rate"
            else performance_df.loc[metric, template]
        )
        * performance_df.loc[metric, "importance"]
        for metric in performance_df.index
    ]
    template_scores[template] = sum(weighted_scores) / total_weights

# Text report: raw score table, metric glossary, then the weighted scores
template_columns = list(performance_data.keys())
print("Template Performance Comparison:")
print("\nRaw Metrics:")
print(performance_df[template_columns].round(1))

print("\nMetric Descriptions:")
description_lines = [
    f"- {metric}: {info['description']} ({info['importance']})"
    for metric, info in metrics.items()
]
for description_line in description_lines:
    print(description_line)

print("\nWeighted Scores:")
score_lines = [
    f"- {template}: {score:.1f}"
    for template, score in template_scores.items()
]
for score_line in score_lines:
    print(score_line)

# Visualize the performance comparison as three figures:
# a heatmap, a bar chart and a radar chart.
try:
    template_names = list(performance_data.keys())

    # Figure 1: heatmap of the raw metric values (rows = metrics, columns = templates)
    heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(
        performance_df[template_names],
        annot=True,
        cmap="YlGnBu",
        fmt=".1f",
        linewidths=.5,
        cbar_kws={'label': 'Score'},
        ax=heatmap_ax,
    )
    heatmap_ax.set_title("Template Performance Comparison", fontsize=16, pad=20)
    heatmap_fig.tight_layout()
    plt.show()

    # Figure 2: bar chart of the weighted score per template
    bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
    bars = bar_ax.bar(template_scores.keys(), template_scores.values())

    # Write each score just above its bar
    for bar, score in zip(bars, template_scores.values()):
        label_x = bar.get_x() + bar.get_width()/2
        label_y = bar.get_height() + 1
        bar_ax.text(label_x, label_y, f"{score:.1f}", ha='center', fontsize=12)

    bar_ax.set_title("Weighted Template Performance Scores", fontsize=14)
    bar_ax.set_ylabel("Score")
    bar_ax.set_ylim(0, max(template_scores.values()) * 1.1)  # headroom for the labels
    bar_ax.grid(axis='y', linestyle='--', alpha=0.7)
    bar_fig.tight_layout()
    plt.show()

    # Figure 3: radar chart comparing every template across all metrics
    metrics_list = [name for name in performance_df.index if name != "importance"]

    # One spoke per metric; the first angle is repeated so each polygon closes
    angles = np.linspace(0, 2*np.pi, len(metrics_list), endpoint=False).tolist()
    angles.extend(angles[:1])

    radar_fig, radar_ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))

    for template_name in template_names:
        template_metrics = performance_data[template_name]
        # hallucination_rate is lower-is-better, so its complement is plotted
        values = [
            100 - template_metrics[metric] if metric == "hallucination_rate" else template_metrics[metric]
            for metric in metrics_list
        ]
        values.extend(values[:1])  # close the polygon
        radar_ax.plot(angles, values, linewidth=2, label=template_name)
        radar_ax.fill(angles, values, alpha=0.1)

    # Axis labels, radial ticks, title and legend
    radar_ax.set_xticks(angles[:-1])
    radar_ax.set_xticklabels(metrics_list)
    radar_ax.set_yticks([20, 40, 60, 80, 100])
    radar_ax.set_title("Template Performance Across Metrics", fontsize=14, pad=20)
    radar_ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

    radar_fig.tight_layout()
    plt.show()
except Exception as e:
    # Plotting is best-effort: report the problem and let the notebook continue
    print(f"Error creating visualization: {e}")
    print("To view the visualization, run this notebook in an environment that supports matplotlib.")

print_separator()

**Section 6: Case Studies: Template Evolution in Production**

In [None]:
print("Section 6: Case Studies: Template Evolution in Production")

print("Case Study 1: Scientific Research Assistant")

# v1: bare-bones prompt with no grounding constraints
scientific_v1 = """
Answer the scientific question based on the research papers provided.

CONTEXT:
{context}

QUESTION:
{question}

Provide a comprehensive answer.
"""

# v2: restricts the answer to the supplied papers
scientific_v2 = """
Answer the scientific question using ONLY the information from the provided research papers.
Do not introduce external knowledge or make claims not supported by the papers.

CONTEXT:
{context}

QUESTION:
{question}

If the information isn't in the papers, acknowledge the limitations.
"""

# v3: adds an explicit citation notation and a fixed "not covered" reply
scientific_v3 = """
Answer the scientific question using ONLY the information from the provided research papers.

CONTEXT:
{context}

QUESTION:
{question}

INSTRUCTIONS:
- Cite specific papers using [Paper X] notation
- If information isn't in the papers, say "The provided papers don't address this question"
- Acknowledge limitations and uncertainty where appropriate
"""

# v4: adds confidence levels, contradiction handling and an answer layout
scientific_v4 = """
Answer the scientific question using ONLY the information from the provided research papers.

CONTEXT:
{context}

QUESTION:
{question}

INSTRUCTIONS:
- Cite specific papers using [Paper X] notation
- If information isn't in the papers, say "The provided papers don't address this question"
- Indicate confidence level for each claim (Strong evidence, Moderate evidence, Limited evidence)
- Note contradictions or disagreements between papers
- Maintain scientific precision in terminology and claims

ANSWER FORMAT:
- First paragraph: Direct answer with primary findings
- Additional paragraphs: Supporting evidence with citations
- Final paragraph: Limitations and areas of uncertainty
"""

# Show each iteration under the heading naming the problem it addressed
for stage_heading, stage_template in (
    ("\nInitial Template:", scientific_v1),
    ("\nFirst Iteration (Problem: High hallucination rate):", scientific_v2),
    ("\nSecond Iteration (Problem: Poor attribution):", scientific_v3),
    ("\nFinal Version (Problem: Lack of confidence indicators):", scientific_v4),
):
    print(stage_heading)
    print(stage_template)

print("\nPerformance Evolution:")
# Per-version rates for the scientific template; printed below as percentages
scientific_metrics = {
    "v1": {"hallucination_rate": 32.5, "citation_rate": 10.2, "factual_accuracy": 68.7},
    "v2": {"hallucination_rate": 12.3, "citation_rate": 15.6, "factual_accuracy": 83.2},
    "v3": {"hallucination_rate": 8.7, "citation_rate": 87.3, "factual_accuracy": 86.5},
    "v4": {"hallucination_rate": 4.2, "citation_rate": 94.8, "factual_accuracy": 92.1}
}

# The loop variable is deliberately NOT named `metrics`: that name is the
# notebook-level metric-definition dict used by the performance analysis,
# and a loop variable of the same name would overwrite it (breaking any
# out-of-order re-run of the code that reads it).
for version, version_metrics in scientific_metrics.items():
    print(f"- {version}: Hallucination: {version_metrics['hallucination_rate']}%, Citation: {version_metrics['citation_rate']}%, Accuracy: {version_metrics['factual_accuracy']}%")

# Takeaways from the scientific-assistant case study, one per line
print("\nKey Learnings:")
for learning in (
    "1. Explicit constraints dramatically reduce hallucinations",
    "2. Specific citation formats improve attribution",
    "3. Structured response formats enhance clarity",
    "4. Confidence indicators help users gauge reliability",
    "5. Domain-specific terminology guidance improves precision",
):
    print(learning)

# Thin divider between the two case studies
print("\n" + "-"*50 + "\n")

print("Case Study 2: Legal Research Assistant")

# Starting point: unconstrained prompt
legal_v1 = """
Answer the legal question based on the provided materials.

CONTEXT:
{context}

QUESTION:
{question}

Provide legal analysis and guidance.
"""

# End state: materials-only analysis with a fixed structure and guidance list
legal_v4 = """
Analyze the legal question based EXCLUSIVELY on the provided legal materials.
Do not introduce legal principles or precedents not found in these materials.

LEGAL MATERIALS:
{context}

LEGAL QUESTION:
{question}

ANALYSIS STRUCTURE:
1. Identify the relevant legal principles/rules from the materials
2. Apply these principles to the specific question
3. Consider potential counterarguments or alternative interpretations
4. Formulate a reasoned conclusion

IMPORTANT GUIDANCE:
- Cite specific materials using standard legal citation [Case X] or [Statute Y]
- Use precise legal terminology from the provided materials
- Acknowledge where the materials may be inconclusive or ambiguous
- Clearly separate established legal principles from interpretive analysis
- Do NOT provide definitive legal advice, instead frame as "based on these materials..."

LEGAL ANALYSIS:
"""

# Show the first and last versions under their headings
for stage_heading, stage_template in (
    ("\nInitial Template:", legal_v1),
    ("\nFinal Template (After Several Iterations):", legal_v4),
):
    print(stage_heading)
    print(stage_template)

# Summary of the legal case study: what changed, then the reported results
legal_summary_sections = (
    ("\nKey Improvements:", (
        "1. Domain-specific citation format aligned with legal standards",
        "2. Clear analytical structure matching legal reasoning patterns",
        "3. Explicit separation of principles from interpretation",
        "4. Strong disclaimers about providing definitive legal advice",
        "5. Terminology guidance specific to legal domain",
    )),
    ("\nResults After Implementation:", (
        "- Hallucination rate decreased from 28.7% to 3.5%",
        "- Citation accuracy increased from 45.2% to 96.8%",
        "- Legal professionals rated answers as 'reliable' increased from 32% to 87%",
    )),
)

for section_heading, section_lines in legal_summary_sections:
    print(section_heading)
    for section_line in section_lines:
        print(section_line)

print_separator()

**Section 7: Resources and Further Learning**

In [None]:
print("Section 7: Resources and Further Learning")


def _paper(title, authors, url, relevance):
    """Build a research-paper entry (identified by "title" and "authors")."""
    return {"title": title, "authors": authors, "url": url, "relevance": relevance}


def _link(name, url, relevance):
    """Build a named-resource entry (library, tool, blog or guide)."""
    return {"name": name, "url": url, "relevance": relevance}


# Reading list grouped by category; every entry carries a URL and a short
# note on why it is relevant.
resources = {
    "Research Papers": [
        _paper(
            "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
            "Wei et al.",
            "https://arxiv.org/abs/2201.11903",
            "Foundational work on prompting for explicit reasoning",
        ),
        _paper(
            "Calibrate Before Use: Improving Few-Shot Performance of Language Models",
            "Zhao et al.",
            "https://arxiv.org/abs/2102.09690",
            "Techniques for optimizing few-shot examples in prompts",
        ),
        _paper(
            "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
            "Lewis et al.",
            "https://arxiv.org/abs/2005.11401",
            "Original RAG paper with prompting insights",
        ),
    ],
    "Open Source Libraries": [
        _link(
            "LangChain",
            "https://github.com/hwchase17/langchain",
            "Comprehensive library with templates and chains for RAG",
        ),
        _link(
            "LlamaIndex",
            "https://github.com/jerryjliu/llama_index",
            "Specialized library for RAG with template utilities",
        ),
        _link(
            "Guidance",
            "https://github.com/microsoft/guidance",
            "Microsoft's library for structured prompting",
        ),
    ],
    "Interactive Tools": [
        _link(
            "OpenAI Playground",
            "https://platform.openai.com/playground",
            "Test templates with various OpenAI models",
        ),
        _link(
            "Anthropic Claude Console",
            "https://console.anthropic.com/",
            "Test templates with Claude models",
        ),
        _link(
            "PromptTools",
            "https://github.com/hegelai/prompttools",
            "Open-source toolkit for prompt testing and evaluation",
        ),
    ],
    "Blogs and Guides": [
        _link(
            "Anthropic's Prompt Engineering Guide",
            "https://docs.anthropic.com/claude/docs/introduction-to-prompting",
            "Best practices for prompting Claude models",
        ),
        _link(
            "OpenAI Cookbook",
            "https://github.com/openai/openai-cookbook",
            "Collection of prompting techniques and templates",
        ),
        _link(
            "Prompt Engineering Guide",
            "https://www.promptingguide.ai/",
            "Comprehensive guide to prompt engineering techniques",
        ),
    ],
}

print("Resources for Further Learning on Prompt Templates:")

# List every resource under its category heading
for category, items in resources.items():
    print(f"\n{category}:")
    for item in items:
        # Papers are identified by title plus authors; everything else by name
        if "title" in item:
            headline = f"- {item['title']} ({item['authors']})"
        else:
            headline = f"- {item['name']}"
        print(headline)
        print(f"  URL: {item['url']}")
        print(f"  Relevance: {item['relevance']}")

# Suggested order for working through the resources
print("\nRecommended Learning Path:")
for learning_step in (
    "1. Start with the OpenAI and Anthropic guides for foundational understanding",
    "2. Explore the Chain-of-Thought and RAG research papers for theoretical background",
    "3. Experiment with the interactive tools to test different template approaches",
    "4. Implement templates using LangChain or LlamaIndex libraries",
    "5. Use the template evaluation framework in this notebook to assess your templates",
    "6. Iterate based on performance metrics and user feedback",
):
    print(learning_step)

print_separator()

**Final Summary**

In [None]:
# Closing recap: the best-practice checklist followed by a short wrap-up note
key_takeaways = """
Key Takeaways:

1. Structure and Clarity
   - Use clear section headers (CONTEXT, QUESTION, ANSWER)
   - Provide explicit, unambiguous instructions
   - Maintain logical organization of template components

2. Constraints and Guardrails
   - Explicitly limit responses to retrieved information
   - Provide guidance for handling insufficient information
   - Include instructions for acknowledging uncertainty

3. Citation and Attribution
   - Require specific citation format ([Document X])
   - Provide clear guidelines for when citations are needed
   - Include instructions for handling conflicting sources

4. Token Efficiency
   - Eliminate redundant instructions
   - Use concise language
   - Allocate tokens strategically between instructions and context

5. Error Handling
   - Plan for edge cases like missing information
   - Include guidance for contradictory content
   - Provide strategies for handling ambiguity

6. Template Management
   - Implement semantic versioning
   - Document template changes and rationale
   - Establish clear criteria for evaluation

7. Continuous Improvement
   - Analyze error patterns systematically
   - Test template variations methodically
   - Collect and incorporate user feedback
   - Iterate based on performance metrics
"""

closing_note = "\nSuccessful templates balance comprehensiveness and efficiency, providing clear guidance while maximizing the context available for retrieved information. The techniques and tools in this notebook will help you develop templates that enhance the performance of your RAG system while delivering reliable, accurate, and helpful responses to users."

print("Final Summary: Template Best Practices")

print(key_takeaways)

print(closing_note)

print_separator()

print("Notebook completed!")