**Setup and Installation**

In [None]:
!pip install langchain langchain-openai tiktoken faiss-cpu sentence-transformers

import os
import re
import json
import numpy as np
from typing import List, Dict, Any, Optional, Tuple

# Do not hard-code credentials in the notebook: export OPENAI_API_KEY in the
# environment before starting the kernel. Assigning a placeholder here would
# overwrite a real key and make the key check in Section 8 always succeed.
if not os.environ.get("OPENAI_API_KEY"):
    print("OPENAI_API_KEY is not set; the OpenAI call in Section 8 will be skipped.")

import tiktoken
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from langchain.chains import LLMChain
from langchain_core.documents import Document

**Basic Utility Functions**

In [3]:
def count_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
    """Count the number of tokens in a text string.

    Args:
        text: The text to tokenize.
        model: Model name used to look up the matching tiktoken encoding.

    Returns:
        The number of tokens ``text`` encodes to.
    """
    try:
        encoder = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it cannot map to an
        # encoding. Fall back to cl100k_base so an unrecognised model name
        # still yields a usable count instead of aborting the notebook.
        encoder = tiktoken.get_encoding("cl100k_base")
    return len(encoder.encode(text))

def print_separator():
    """Print a rule of 50 ``=`` characters framed by empty lines."""
    rule = "=" * 50
    print(f"\n{rule}\n")

**Section 1: Basic Few-Shot Templates**

In [None]:
print("Section 1: Basic Few-Shot Templates")

# Few-shot QA prompt: one answerable example, one unanswerable example, then
# the {context}/{question} slots that are filled in per request.
basic_few_shot_template = """
Answer the question based on the context. If the answer isn't in the context, say "I don't have enough information."

Example 1:
Context: The Golden Gate Bridge was completed in 1937. It has a total length of 8,981 feet.
Question: When was the Golden Gate Bridge completed?
Answer: The Golden Gate Bridge was completed in 1937.

Example 2:
Context: The Eiffel Tower is 330 meters tall and was completed in 1889.
Question: How deep is the foundation of the Eiffel Tower?
Answer: I don't have enough information.

Now, answer the following:
Context: {context}
Question: {question}
Answer:
"""

# Show the raw template together with its token cost.
print("Basic few-shot template:", basic_few_shot_template, sep="\n")
print(f"Token count: {count_tokens(basic_few_shot_template)}")

# Fill the template with a sample context/question pair.
sample_context = "The Python programming language was created by Guido van Rossum and first released in 1991."
sample_question = "Who created the Python programming language?"
formatted_prompt = basic_few_shot_template.format(context=sample_context, question=sample_question)

print("\nFormatted prompt:", formatted_prompt, sep="\n")

print_separator()

**Section 2: Example Categories for RAG Applications**

In [None]:
print("Section 2: Example Categories for RAG Applications")

# Every example below is a dict with the same three string keys:
#   'context'  - the document text, each document prefixed with a [DocN] tag
#   'question' - the user question
#   'answer'   - the reference answer, citing documents by their [DocN] tag

# Source attribution examples
# (answers cite the document each fact came from)
attribution_examples = [
    {
        "context": "[Doc1] The Python programming language was created by Guido van Rossum in 1991.",
        "question": "Who created Python?",
        "answer": "Python was created by Guido van Rossum in 1991 [Doc1]."
    },
    {
        "context": "[Doc1] The first Harry Potter book was published in 1997.\n[Doc2] J.K. Rowling is the author of the Harry Potter series.",
        "question": "When was the first Harry Potter book published and who wrote it?",
        "answer": "The first Harry Potter book was published in 1997 [Doc1] and was written by J.K. Rowling [Doc2]."
    }
]

# Incomplete information examples
# (the context does not contain the answer, and the reference answer says so)
incomplete_info_examples = [
    {
        "context": "[Doc1] JavaScript was created by Brendan Eich in 1995.",
        "question": "When was TypeScript released?",
        "answer": "I don't have enough information in the provided context to answer when TypeScript was released."
    },
    {
        "context": "[Doc1] The Great Pyramid of Giza is one of the Seven Wonders of the Ancient World.",
        "question": "How tall is the Great Pyramid of Giza?",
        "answer": "The context doesn't provide information about the height of the Great Pyramid of Giza."
    }
]

# Contradiction resolution examples
# (two documents disagree; the reference answer reports both sides)
# NOTE(review): the final sentence of the first answer ("The scientific
# consensus appears to be evolving on this topic.") is not stated in either
# document - confirm that is intended for a context-grounded example.
contradiction_examples = [
    {
        "context": "[Doc1] Some studies suggest coffee may increase heart disease risk.\n[Doc2] Recent research indicates moderate coffee consumption may reduce heart disease risk.",
        "question": "Does coffee increase heart disease risk?",
        "answer": "The information in the context presents conflicting views. According to [Doc1], some studies suggest coffee may increase heart disease risk, while [Doc2] indicates recent research shows moderate coffee consumption may actually reduce heart disease risk. The scientific consensus appears to be evolving on this topic."
    },
    {
        "context": "[Doc1] The film 'Inception' was released in 2010.\n[Doc2] The movie 'Inception' came out in 2009.",
        "question": "When was Inception released?",
        "answer": "There is conflicting information in the provided context. [Doc1] states that 'Inception' was released in 2010, while [Doc2] states it came out in 2009. Without additional information, I cannot determine which date is correct."
    }
]

# Synthesis examples
# (the reference answer combines facts from several documents)
synthesis_examples = [
    {
        "context": "[Doc1] Machine learning is a subset of artificial intelligence.\n[Doc2] Deep learning is a type of machine learning that uses neural networks with multiple layers.\n[Doc3] Neural networks are computational models inspired by the human brain.",
        "question": "Explain the relationship between AI, machine learning, and deep learning.",
        "answer": "Based on the provided context, artificial intelligence is a broad field that includes machine learning as a subset [Doc1]. Deep learning is a specific type of machine learning that utilizes neural networks with multiple layers [Doc2]. These neural networks are computational models inspired by the human brain [Doc3]."
    }
]

# Function to demonstrate example usage
def print_example_category(category_name, examples):
    """Print every example of one category as numbered Context/Question/Answer lines.

    Args:
        category_name: Label shown in the header line.
        examples: Iterable of dicts with 'context', 'question' and 'answer' keys.
    """
    divider = "-" * 30
    fields = (("Context", "context"), ("Question", "question"), ("Answer", "answer"))

    print(f"\n{category_name} Examples:")
    for number, item in enumerate(examples, start=1):
        print(f"Example {number}:")
        for label, key in fields:
            print(f"{label}: {item[key]}")
        print(divider)

# Print each example category under its display label
for _label, _group in (
    ("Source Attribution", attribution_examples),
    ("Incomplete Information", incomplete_info_examples),
    ("Contradiction Resolution", contradiction_examples),
    ("Synthesis", synthesis_examples),
):
    print_example_category(_label, _group)

print_separator()

**Section 3: Creating Templates with Diverse Examples**

In [None]:
print("Section 3: Creating Templates with Diverse Examples")

# Prompt whose three examples each demonstrate a different behaviour: a direct
# cited answer, a synthesis across documents, and a "missing information" reply.
diverse_examples_template = """
Answer based on the provided information.

Example 1: [Direct answer with citation]
Context: [Doc1] Einstein published the theory of relativity in 1905.
Question: When did Einstein publish the theory of relativity?
Answer: Einstein published the theory of relativity in 1905 [Doc1].

Example 2: [Synthesis across documents]
Context: [Doc1] The Great Depression began with the stock market crash in 1929.
[Doc2] The Great Depression lasted until about 1939.
Question: What was the Great Depression?
Answer: The Great Depression was an economic downturn that began with the stock market crash in 1929 [Doc1] and lasted until about 1939 [Doc2].

Example 3: [Missing information]
Context: [Doc1] The Python programming language was created by Guido van Rossum.
Question: When was Python 3.0 released?
Answer: I don't have enough information to answer when Python 3.0 was released.

Context: {context}
Question: {question}
Answer:
"""

# Show the template and what it costs in tokens.
print("Template with diverse examples:", diverse_examples_template, sep="\n")
print(f"Token count: {count_tokens(diverse_examples_template)}")

# Function to format examples into a template string
def format_examples_into_template(examples, max_examples=2):
    """Render the first ``max_examples`` examples as numbered prompt text.

    Args:
        examples: Sequence of dicts with 'context', 'question' and 'answer' keys.
        max_examples: Upper bound on how many leading examples are rendered.

    Returns:
        One string holding an "Example N:" section per rendered example
        (empty when nothing is rendered).
    """
    sections = [
        f"\nExample {position}:\n"
        f"Context: {entry['context']}\n"
        f"Question: {entry['question']}\n"
        f"Answer: {entry['answer']}\n"
        for position, entry in enumerate(examples[:max_examples], start=1)
    ]
    return "".join(sections)

# Pair the first attribution example with the first incomplete-info example
custom_examples = [*attribution_examples[:1], *incomplete_info_examples[:1]]
formatted_examples = format_examples_into_template(custom_examples)

# The examples are spliced in now; {context} and {question} stay as literal
# placeholders to be filled later.
custom_template = (
    "\nAnswer the question based on the provided context. If the information isn't available, say so clearly.\n"
    + formatted_examples
    + "\nContext: {context}\nQuestion: {question}\nAnswer:\n"
)

print("\nCustom template with selected examples:", custom_template, sep="\n")

print_separator()

**Section 4: Dynamic Example Selection**

In [None]:
print("Section 4: Dynamic Example Selection")

# Create an example library organized by query types
# Maps a query-type key to a list of example dicts ('context', 'question',
# 'answer'). "factual", "not_found", "contradictory" and "synthesis" reuse the
# lists defined in Section 2 (the same list objects, not copies); "comparative"
# and "cause_effect" are defined inline here.
example_library = {
    "factual": attribution_examples,
    "comparative": [
        {
            "context": "[Doc1] Python is dynamically typed. [Doc2] Java is statically typed.",
            "question": "Compare Python and Java typing systems.",
            "answer": "Based on the context, Python is dynamically typed [Doc1] while Java is statically typed [Doc2]."
        },
        {
            "context": "[Doc1] Electric cars run on batteries and electric motors. [Doc2] Gasoline cars use internal combustion engines.",
            "question": "What's the difference between electric and gasoline cars?",
            "answer": "According to the context, electric cars run on batteries and electric motors [Doc1], while gasoline cars use internal combustion engines [Doc2]."
        }
    ],
    "cause_effect": [
        {
            "context": "[Doc1] Greenhouse gases trap heat in the Earth's atmosphere. [Doc2] Increased greenhouse gas emissions have led to global temperature rise.",
            "question": "Why is the Earth's temperature rising?",
            "answer": "Based on the context, the Earth's temperature is rising because increased greenhouse gas emissions [Doc2] trap heat in the Earth's atmosphere [Doc1]."
        }
    ],
    "not_found": incomplete_info_examples,
    "contradictory": contradiction_examples,
    "synthesis": synthesis_examples
}

# Function to detect query type
# Keyword patterns per query type, listed in priority order. Every keyword must
# begin at a word boundary so it cannot match inside an unrelated word (e.g.
# "cause" inside "because"). "versus"/"vs" must additionally be whole words so
# plurals such as "TVs" are not read as comparisons. The other keywords may
# still be followed by more letters, so inflected forms ("compared", "causes",
# "explains", "differences") keep matching.
_QUERY_TYPE_PATTERNS = (
    ("comparative", re.compile(r"\b(?:compare|difference|similarities|(?:versus|vs)\b)")),
    ("cause_effect", re.compile(r"\b(?:why|cause|effect|result|impact|lead to)")),
    ("synthesis", re.compile(r"\b(?:explain|describe|elaborate|summarize)")),
)


def detect_query_type(query: str) -> str:
    """Detect the type of query based on keywords.

    The query is lower-cased and tested against the keyword groups in priority
    order: comparative, then cause_effect, then synthesis.

    Args:
        query: The user's question.

    Returns:
        "comparative", "cause_effect" or "synthesis" for the first group with
        a matching keyword, otherwise "factual".
    """
    normalized = query.lower()

    for query_type, pattern in _QUERY_TYPE_PATTERNS:
        if pattern.search(normalized):
            return query_type

    # Default to factual
    return "factual"

# Function to select examples based on query type
def select_examples(query: str, max_examples: int = 2) -> List[Dict]:
    """Pick few-shot examples that match the detected type of ``query``.

    Args:
        query: The user's question.
        max_examples: Upper bound on the number of examples returned.

    Returns:
        The leading ``max_examples`` entries of the library list for the
        detected query type, or of the "factual" list when that type has no
        entry in ``example_library``.
    """
    fallback_pool = example_library["factual"]
    category = detect_query_type(query)
    pool = example_library.get(category, fallback_pool)
    return pool[:max_examples]

# Exercise detection and selection with one query per expected type
test_queries = [
    "Who invented the light bulb?",
    "Compare renewable and non-renewable energy sources.",
    "Why does climate change happen?",
    "Explain the water cycle and its importance."
]

for query in test_queries:
    query_type = detect_query_type(query)
    examples = select_examples(query)

    print(
        f"\nQuery: {query}",
        f"Detected type: {query_type}",
        f"Selected {len(examples)} examples:",
        sep="\n",
    )

    for i, example in enumerate(examples, start=1):
        print(f"  Example {i}: Question: {example['question']}")

print_separator()

**Section 5: Advanced Example Selection with Semantic Similarity**

In [None]:
print("Section 5: Advanced Example Selection with Semantic Similarity")

try:
    from sentence_transformers import SentenceTransformer

    # This is optional and will only run if sentence-transformers is installed
    print("Demonstrating semantic similarity-based example selection...")

    # Initialize model for semantic similarity (lightweight model)
    try:
        model = SentenceTransformer('paraphrase-MiniLM-L3-v2')

        # Flatten all examples for similarity comparison. Each entry is a
        # *copy* tagged with its category, so the dicts shared with
        # example_library (and the Section 2 lists) are not mutated.
        all_examples = [
            {**example, 'category': category}
            for category, examples in example_library.items()
            for example in examples
        ]

        def select_examples_by_similarity(query: str, examples: List[Dict], top_k: int = 2):
            """Select the examples whose questions are most similar to the query.

            Similarity is the cosine similarity between the model's embedding
            of ``query`` and its embedding of each example's 'question'.

            Args:
                query: The user's question.
                examples: Candidate example dicts; each must have a 'question' key.
                top_k: Maximum number of examples to return.

            Returns:
                Up to ``top_k`` examples ordered from most to least similar;
                an empty list when ``examples`` is empty or ``top_k`` <= 0.
            """
            # Guard top_k <= 0 explicitly: the slice [-0:] used below would
            # otherwise return *every* example instead of none.
            if not examples or top_k <= 0:
                return []

            # Get embeddings
            query_embedding = model.encode(query)
            question_embeddings = model.encode([ex['question'] for ex in examples])

            # Calculate cosine similarities (the query norm is loop-invariant)
            query_norm = np.linalg.norm(query_embedding)
            similarities = [np.dot(query_embedding, q_emb) / (query_norm * np.linalg.norm(q_emb))
                            for q_emb in question_embeddings]

            # argsort is ascending: take the last top_k indices and reverse
            # them so the most similar example comes first.
            top_indices = np.argsort(similarities)[-top_k:][::-1]

            return [examples[i] for i in top_indices]

        # Test semantic similarity selection
        test_query = "What are the environmental impacts of deforestation?"
        similar_examples = select_examples_by_similarity(test_query, all_examples, top_k=2)

        print(f"\nQuery: {test_query}")
        print("Selected examples by semantic similarity:")

        for i, example in enumerate(similar_examples):
            print(f"Example {i+1} (Category: {example['category']}):")
            print(f"Question: {example['question']}")
            print(f"Context: {example['context']}")
            print("-" * 30)

    except Exception as e:
        # Best-effort demo: any failure in this section is reported and the
        # notebook carries on.
        print(f"Couldn't initialize semantic model: {e}")
        print("Continuing with keyword-based example selection...")

except ImportError:
    print("Sentence Transformers not installed. Skipping semantic similarity example.")
    print("To enable, install with: pip install sentence-transformers")

print_separator()

**Section 6: Balancing Example Count with Context Limitations**

In [None]:
print("Section 6: Balancing Example Count with Context Limitations")

def build_prompt_with_examples(query: str, context: str, max_tokens: int = 3000) -> Tuple[str, int]:
    """
    Build a prompt with dynamically selected examples while respecting token limits.

    Examples are added in order until the next one would exceed the token
    budget left after the instructions, context and question are counted.

    Args:
        query: The user's question
        context: The retrieved context to include
        max_tokens: Maximum tokens allowed for the prompt

    Returns:
        A ``(prompt, token_count)`` tuple: the formatted prompt and the number
        of tokens it contains according to ``count_tokens``.
    """
    # Base template without examples.
    # NOTE: the template lines keep this function's 4-space indentation, so
    # that whitespace is part of the prompt (and of its token count).
    base_template = """
    Answer the question based on the provided context. If the information isn't in the context, say so clearly.

    {examples}

    Context: {context}
    Question: {question}
    Answer:
    """

    # Tokens used by everything except the examples
    fixed_template = base_template.format(examples="", context=context, question=query)
    fixed_tokens = count_tokens(fixed_template)

    # Budget left for examples; may be negative, in which case none are added
    available_for_examples = max_tokens - fixed_tokens

    # Select examples
    selected_examples = select_examples(query)

    # Format examples into text
    examples_text = ""
    current_example_tokens = 0

    for i, example in enumerate(selected_examples):
        example_text = f"\nExample {i+1}:\n"
        example_text += f"Context: {example['context']}\n"
        example_text += f"Question: {example['question']}\n"
        example_text += f"Answer: {example['answer']}\n"

        example_tokens = count_tokens(example_text)

        # Stop at the first example that does not fit, which keeps the
        # "Example N" numbering contiguous.
        if current_example_tokens + example_tokens > available_for_examples:
            break

        examples_text += example_text
        current_example_tokens += example_tokens

    # Format final prompt
    final_prompt = base_template.format(
        examples=examples_text,
        context=context,
        question=query
    )

    return final_prompt, count_tokens(final_prompt)

# Compare prompts built from a short and a long retrieved context
short_context = "The Amazon rainforest is located in South America. It spans across Brazil, Peru, Colombia, and several other countries."
long_context = "The Amazon rainforest, also known as Amazonia, is a moist broadleaf tropical rainforest in the Amazon biome that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 km2 (2,700,000 sq mi), of which 5,500,000 km2 (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations and 3,344 formally acknowledged indigenous territories. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Bolivia, Ecuador, French Guiana, Guyana, Suriname, and Venezuela. Four nations have 'Amazonas' as the name of a political-administrative region. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species."

test_query = "Where is the Amazon rainforest located?"

short_prompt, short_tokens = build_prompt_with_examples(test_query, short_context)
long_prompt, long_tokens = build_prompt_with_examples(test_query, long_context)

# Show only the first 300 characters of each prompt
print(f"Short context prompt (tokens: {short_tokens}):", short_prompt[:300] + "...", sep="\n")
print(f"\nLong context prompt (tokens: {long_tokens}):", long_prompt[:300] + "...", sep="\n")

# Count how many "Example " headers made it into each prompt
print("\nExample count comparison:")
short_example_count, long_example_count = (
    prompt.count("Example ") for prompt in (short_prompt, long_prompt)
)

print(f"Short context allows for {short_example_count} examples")
print(f"Long context allows for {long_example_count} examples")

print_separator()

**Section 7: Example Rotation Strategies**

In [None]:
print("Section 7: Example Rotation Strategies")

class ExampleRotator:
    """Select few-shot examples while tracking how often each one was used.

    ``usage_counts`` mirrors ``example_library``: for every category it holds
    one counter per example, incremented each time a rotation strategy
    selects that example.
    """

    def __init__(self, example_library):
        """Store the library and start every example's usage counter at zero.

        Args:
            example_library: Dict mapping a category name to a list of
                example dicts.
        """
        self.example_library = example_library
        self.usage_counts = {category: [0] * len(examples)
                             for category, examples in example_library.items()}

    def rotate_examples(self, query, strategy="round_robin", max_examples=2):
        """
        Select examples using various rotation strategies.

        Strategies:
        - round_robin: Rotate through examples evenly
        - query_type: Select examples matching query type
        - diversity: Select examples from different categories

        Any other strategy name returns the first ``max_examples`` examples
        of the query's category without touching the usage counters.

        Args:
            query: The user's question (used by type-based selection).
            strategy: Name of the rotation strategy.
            max_examples: Maximum number of examples to return; values below
                zero are treated as zero.

        Returns:
            A list of at most ``max_examples`` example dicts.
        """
        # A negative bound used as a slice end would select "all but the last
        # N" items, so clamp it to zero.
        max_examples = max(0, max_examples)

        if strategy == "round_robin":
            return self._select_round_robin(max_examples)
        if strategy == "query_type":
            return self._select_by_query_type(query, max_examples)
        if strategy == "diversity":
            return self._select_diverse(max_examples)

        # Default to query type matching against this rotator's own library.
        return self.example_library[self._resolve_category(query)][:max_examples]

    def _resolve_category(self, query):
        """Return the query's detected category, or "factual" if the library lacks it."""
        query_type = detect_query_type(query)
        return query_type if query_type in self.example_library else "factual"

    def _select_round_robin(self, max_examples):
        """Pick the least-used examples across all categories and count the use."""
        candidates = [
            (category, idx, example)
            for category, examples in self.example_library.items()
            for idx, example in enumerate(examples)
        ]

        # Stable sort: ties keep library order, so unused examples are served
        # in the order they appear in the library.
        candidates.sort(key=lambda item: self.usage_counts[item[0]][item[1]])
        chosen = candidates[:max_examples]

        for category, idx, _ in chosen:
            self.usage_counts[category][idx] += 1

        return [example for _, _, example in chosen]

    def _select_by_query_type(self, query, max_examples):
        """Pick the least-used examples of the query's category and count the use."""
        # Resolve the fallback *before* indexing usage_counts, so the counters
        # consulted always belong to the list the examples come from.
        category = self._resolve_category(query)
        examples = self.example_library[category]
        counts = self.usage_counts[category]

        indices = sorted(range(len(examples)), key=counts.__getitem__)[:max_examples]
        for i in indices:
            counts[i] += 1

        return [examples[i] for i in indices]

    def _select_diverse(self, max_examples):
        """Pick the least-used example from each of the first non-empty categories."""
        selected = []

        for category, examples in self.example_library.items():
            if len(selected) >= max_examples:
                break
            # Empty categories are skipped without consuming a slot, so later
            # categories can still fill the request.
            if not examples:
                continue

            counts = self.usage_counts[category]
            idx = min(range(len(examples)), key=counts.__getitem__)

            selected.append(examples[idx])
            counts[idx] += 1

        return selected

# Build a rotator over the shared example library
rotator = ExampleRotator(example_library)

# Run every strategy against the same set of queries
strategies = ["round_robin", "query_type", "diversity"]
test_queries = [
    "What is photosynthesis?",
    "Compare renewable and fossil fuels.",
    "Why do seasons change?",
    "Explain artificial intelligence."
]

for strategy in strategies:
    print(f"\nStrategy: {strategy}")

    for query in test_queries:
        examples = rotator.rotate_examples(query, strategy=strategy)

        print(f"\nQuery: {query}")
        print("Selected examples:")

        for i, example in enumerate(examples, start=1):
            question = example['question']
            # Report the first library category whose list contains this example
            owning_categories = (cat for cat, exs in example_library.items() if example in exs)
            category = next(owning_categories, "unknown")
            print(f"  Example {i}: [{category}] {question}")

print_separator()

**Section 8: Putting It All Together - Complete RAG Prompt with Examples**

In [None]:
print("Section 8: Putting It All Together - Complete RAG Prompt with Examples")

def create_complete_rag_prompt(query, context_docs, max_tokens=3500):
    """
    Create a complete RAG prompt with dynamically selected examples.

    Examples are added only while they fit in the token budget left after the
    instructions, question and context. If the prompt is still over budget,
    the context is truncated as a last resort.

    Args:
        query: The user question
        context_docs: List of Document objects with retrieved information
        max_tokens: Maximum tokens for the prompt

    Returns:
        Formatted prompt with examples and context
    """
    # 1. Process retrieved documents into context string
    # NOTE(review): documents are labelled "[Doc 1]" here, while the few-shot
    # examples in example_library cite "[Doc1]" (no space) - confirm which
    # citation format the model is expected to follow.
    context = "\n\n".join(f"[Doc {i+1}] {doc.page_content}"
                          for i, doc in enumerate(context_docs))

    # 2. Select examples for the query (select_examples detects the query type)
    examples = select_examples(query)

    # 3. Format template with placeholders
    template = """
    You are an AI assistant answering questions based on the provided context.

    {examples}

    CONTEXT:
    {context}

    QUESTION:
    {question}

    INSTRUCTIONS:
    - Answer based ONLY on the information in the CONTEXT
    - If the CONTEXT doesn't contain the answer, say "I don't have enough information to answer this question"
    - Cite sources using [Doc X] notation
    - For contradictory information, acknowledge the conflict and present both perspectives

    ANSWER:
    """

    # 4. Calculate token budgets
    template_without_context_examples = template.format(
        examples="", context="", question=query
    )
    fixed_tokens = count_tokens(template_without_context_examples)
    context_tokens = count_tokens(context)

    # Remaining tokens for examples; may be negative, in which case none fit
    available_for_examples = max_tokens - fixed_tokens - context_tokens

    # 5. Format examples to fit token limit
    examples_text = ""
    current_example_tokens = 0

    for i, example in enumerate(examples):
        example_text = f"\nExample {i+1}:\n"
        example_text += f"Context: {example['context']}\n"
        example_text += f"Question: {example['question']}\n"
        example_text += f"Answer: {example['answer']}\n"

        example_tokens = count_tokens(example_text)

        # Stop at the first example that would exceed the budget
        if current_example_tokens + example_tokens > available_for_examples:
            break

        examples_text += example_text
        current_example_tokens += example_tokens

    # 6. Construct final prompt
    final_prompt = template.format(
        examples=examples_text,
        context=context,
        question=query
    )

    # 7. Check if we need to truncate context (emergency backup)
    total_tokens = count_tokens(final_prompt)
    if total_tokens > max_tokens:
        # Tokens consumed by everything except the context
        non_context_tokens = total_tokens - context_tokens

        # Clamp at zero: a negative budget used as a slice bound would only
        # drop the last few context tokens instead of removing the context.
        available_for_context = max(0, max_tokens - non_context_tokens)

        # Simple truncation (in practice, you'd want smarter truncation)
        encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
        context_tokens_list = encoder.encode(context)
        truncated_context_tokens = context_tokens_list[:available_for_context]
        truncated_context = encoder.decode(truncated_context_tokens)

        # Rebuild prompt
        final_prompt = template.format(
            examples=examples_text,
            context=truncated_context,
            question=query
        )

    return final_prompt

# Build the sample documents from (text, source) pairs
sample_docs = [
    Document(page_content=text, metadata={"source": source})
    for text, source in (
        ("Climate change is the long-term alteration of temperature and typical weather patterns. It is primarily caused by human activities, especially the burning of fossil fuels.",
         "climate_science_journal"),
        ("The effects of climate change include rising sea levels, more frequent extreme weather events, and shifts in plant and animal ranges.",
         "environmental_report"),
        ("Renewable energy sources like solar and wind power can help reduce greenhouse gas emissions that contribute to climate change.",
         "energy_policy_paper"),
    )
]

# Assemble the full prompt for a two-part question
test_query = "What causes climate change and what are its effects?"
complete_prompt = create_complete_rag_prompt(test_query, sample_docs)

print("Complete RAG prompt with examples:", complete_prompt, sep="\n")
print(f"\nTotal token count: {count_tokens(complete_prompt)}")

# Try with OpenAI only when a real API key is configured. The setup cell's
# "your-api-key" placeholder is treated as "no key"; otherwise this guard
# would always pass and a request with an invalid key would be attempted.
api_key = os.environ.get("OPENAI_API_KEY", "")
if api_key and api_key != "your-api-key":
    try:
        llm = OpenAI(temperature=0)
        response = llm.invoke(complete_prompt)

        print("\nGenerated response:")
        print(response)
    except Exception as e:
        # Best-effort demo: report the failure and let the notebook finish
        print(f"Error generating response: {e}")
else:
    print("\nNo OpenAI API key found. Skipping response generation.")

print_separator()

print("Notebook completed!")