**Setup and Installation**

In [None]:
!pip install -q langchain langchain-openai chromadb pydantic sentence-transformers datasets numpy scikit-learn pandas langchain-community

import getpass
import json
import os
import random
import re
from typing import List, Dict, Any, Optional, Union

import numpy as np
from pydantic import BaseModel, Field

# Configure the OpenAI API key without hardcoding it in the notebook.
# A key that is already present in the environment is left untouched;
# otherwise it is requested interactively so the secret is never saved
# in the notebook file.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser, CommaSeparatedListOutputParser

# Shared chat model used by the sections below. temperature=0 requests the
# least random sampling so the demo outputs vary as little as possible.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

print("🚀 Notebook environment setup complete!")

**Creating a sample document collection for our examples**

In [None]:
def create_sample_kb(collection_name="programming_kb"):
    """Create a sample knowledge base with documents about programming.

    Args:
        collection_name: Name of the Chroma collection to store the documents
            in. A dedicated name keeps this store separate from any other
            Chroma store created in the same process with the default name.

    Returns:
        A ``(vectorstore, docs, embeddings)`` tuple: the Chroma vector store,
        the list of source ``Document`` objects, and the ``OpenAIEmbeddings``
        instance used to embed them.
    """
    docs = [
        Document(page_content="Python is a high-level, interpreted programming language known for its readability and simplicity. It supports multiple programming paradigms including procedural, object-oriented, and functional programming.",
                 metadata={"source": "programming/python.txt", "language": "python", "category": "language_overview"}),
        Document(page_content="JavaScript is a scripting language that enables interactive web pages. It is an essential part of web applications and all modern browsers have a dedicated JavaScript engine to execute it.",
                 metadata={"source": "programming/javascript.txt", "language": "javascript", "category": "language_overview"}),
        Document(page_content="Python lists are mutable sequences, typically used to store collections of homogeneous items. Lists can be indexed, sliced, and modified. Common operations include append(), extend(), insert(), remove(), and pop().",
                 metadata={"source": "programming/python_lists.txt", "language": "python", "category": "data_structures"}),
        Document(page_content="JavaScript arrays are high-level, list-like objects with additional features. They can be manipulated using methods like push(), pop(), shift(), and unshift(). Array methods like map(), filter(), and reduce() enable functional programming patterns.",
                 metadata={"source": "programming/javascript_arrays.txt", "language": "javascript", "category": "data_structures"}),
        Document(page_content="Python functions are defined using the def keyword, followed by a function name and parameters. They can include optional type hints, default parameters, variable-length arguments, and return values.",
                 metadata={"source": "programming/python_functions.txt", "language": "python", "category": "functions"}),
        Document(page_content="JavaScript functions are first-class objects, meaning they can be passed as arguments, returned from other functions, and assigned to variables. They can be declared using function declarations, function expressions, or arrow functions.",
                 metadata={"source": "programming/javascript_functions.txt", "language": "javascript", "category": "functions"}),
        Document(page_content="Python's exception handling uses try, except, else, and finally blocks. Specific exception types can be caught and handled separately. Custom exceptions can be created by subclassing the Exception class.",
                 metadata={"source": "programming/python_exceptions.txt", "language": "python", "category": "error_handling"}),
        Document(page_content="JavaScript error handling utilizes try-catch-finally statements. The Error object and its subtypes (SyntaxError, TypeError, etc.) provide information about the error. Custom errors can be created by extending the Error class.",
                 metadata={"source": "programming/javascript_errors.txt", "language": "javascript", "category": "error_handling"}),
    ]

    # Each document's "source" path is unique, so it doubles as a stable id.
    # Stable ids mean re-running this cell writes the same records again
    # instead of appending a second copy of every document.
    doc_ids = [doc.metadata["source"] for doc in docs]

    # Create embeddings and vectorstore
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        ids=doc_ids,
        collection_name=collection_name,
    )
    return vectorstore, docs, embeddings

# Build the sample knowledge base once and expose a default retriever that
# returns the top 2 matches for each query.
vectorstore, sample_docs, embeddings = create_sample_kb()
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

print("📚 Sample knowledge base created with programming language documents")

**10.5.1 Prompt Engineering with Examples**

In [None]:
# Few-shot demonstrations, written as (raw user query, retrieval-friendly rewrite).
_FEW_SHOT_PAIRS = (
    (
        "How do I create a list in Python?",
        "What are Python lists, their syntax, and common operations for creating and manipulating them?",
    ),
    (
        "Tell me about JavaScript functions",
        "What are JavaScript functions, their types (declarations, expressions, arrow functions), and how are they used as first-class objects?",
    ),
    (
        "Error handling",
        "What are the methods and syntax for error handling and exception management in programming languages?",
    ),
)

# Same demonstrations in the dict shape consumed by the prompt-filling code.
examples = [
    {"query": raw_query, "improved_query": rewritten}
    for raw_query, rewritten in _FEW_SHOT_PAIRS
]

# Prompt with three fixed example slots followed by the query to improve.
few_shot_template = """You are an expert query optimizer for a retrieval system focused on programming topics.
Your task is to improve user queries to maximize the relevance of retrieved information.

Here are some examples of how to improve queries:

Query: {example1_query}
Improved Query: {example1_improved}

Query: {example2_query}
Improved Query: {example2_improved}

Query: {example3_query}
Improved Query: {example3_improved}

Now, please improve the following query:
Query: {query}

Improved Query:"""

def improve_query_with_examples(query):
    """Rewrite a user query with the LLM, guided by the few-shot examples.

    The first three entries of the module-level ``examples`` list fill the
    ``example1`` .. ``example3`` slots of ``few_shot_template``; the finished
    prompt is sent to the shared ``llm`` and the text of its reply is returned.
    """
    # Build the template arguments slot by slot instead of spelling out each one.
    slot_values = {}
    for position in range(3):
        shot = examples[position]
        slot_values[f"example{position + 1}_query"] = shot["query"]
        slot_values[f"example{position + 1}_improved"] = shot["improved_query"]

    filled_prompt = few_shot_template.format(query=query, **slot_values)
    return llm.invoke(filled_prompt).content

# Test the simple few-shot approach: print each query next to its rewrite.
print("\n📝 Testing few-shot query improvement:")
test_queries = [
    "how to use arrays in JavaScript",
    "what is exception handling"
]

for query in test_queries:
    improved = improve_query_with_examples(query)
    print(f"Original: {query}")
    print(f"Improved: {improved}\n")

**10.5.2 Handling Edge Cases in Query Generation**

In [None]:
def query_generation_with_fallbacks(user_input, max_retries=3):
    """Generate a search query with fallback mechanisms.

    The LLM is asked up to ``max_retries`` times for a search query. A reply
    is accepted once, after trimming whitespace and surrounding quotes, it
    contains at least three words. If every attempt fails, keywords extracted
    from ``user_input`` are used instead; if that yields nothing either, the
    raw ``user_input`` is returned so the caller never gets an empty query.

    Args:
        user_input: The user's free-form text.
        max_retries: Maximum number of LLM attempts.

    Returns:
        A ``(query, attempt, used_fallback)`` tuple. ``attempt`` is the
        zero-based index of the last attempt made (``max_retries`` when no
        attempt was made at all), and ``used_fallback`` is True when the
        query did not come from the LLM.
    """

    # Initial prompt for query generation
    prompt_template = """Based on the user's input, generate a clear search query that would help retrieve relevant information.
The query should be concise and focused on the core information need.

User Input: {input}

Search Query:"""

    for attempt in range(max_retries):
        try:
            # Raise the temperature on each retry (0.2, 0.4, 0.6, ...) to get a
            # different answer, but cap it at 1.0 so a large max_retries cannot
            # push it outside the range the API accepts.
            temperature = min(round(0.2 * (attempt + 1), 2), 1.0)
            current_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=temperature)

            # Generate query
            raw_query = current_llm.invoke(prompt_template.format(input=user_input)).content

            # Models often wrap the answer in quotes or pad it with whitespace;
            # remove both before validating and returning it.
            query = (raw_query or "").strip().strip("\"'").strip()

            # Validate the query is substantive (at least 3 words)
            if len(query.split()) >= 3:
                return query, attempt, False

            # Route validation failures through the same retry path as API errors
            raise ValueError("Generated query too short or empty")

        except Exception as e:
            # Best-effort: report the failure and move on to the next attempt.
            print(f"Attempt {attempt+1} failed: {str(e)}")

    if max_retries > 0:
        # Final fallback: extract keywords from the user input. Inputs such as
        # "???" contain no keywords, so fall back to the raw input rather than
        # returning an empty query.
        fallback_query = extract_keywords_fallback(user_input) or user_input
        return fallback_query, max_retries - 1, True

    # No attempt was allowed (max_retries <= 0): pass the input through.
    return user_input, max_retries, True

def extract_keywords_fallback(text):
    """Extract up to five distinct keywords from ``text`` as a fallback query.

    The text is lowercased and split into word tokens; stopwords and tokens
    shorter than three characters are dropped. The remaining words are ranked
    by frequency (ties keep first-seen order) and the top five are joined
    with spaces. Each keyword appears once, however often it occurs.

    Args:
        text: Free-form user input. ``None`` or an empty string is accepted.

    Returns:
        A space-separated keyword string, or ``""`` when no keyword is found.
    """
    # For a production system, you might use NLP libraries like spaCy
    # This is a simplified version
    from collections import Counter

    if not text:
        return ""

    # Common filler words to ignore (a set gives constant-time membership tests)
    stopwords = frozenset([
        "the", "a", "an", "is", "are", "was", "were", "be", "been",
        "being", "to", "of", "and", "or", "not", "for", "with", "by", "about",
    ])

    # Lowercase and tokenize
    tokens = re.findall(r'\b\w+\b', text.lower())

    # Remove stopwords and keep words with 3+ chars
    keywords = [word for word in tokens if len(word) >= 3 and word not in stopwords]

    # Always rank through the counter so repeated words are collapsed for
    # short inputs too, not only when more than five keywords are present.
    ranked = Counter(keywords).most_common(5)

    return " ".join(word for word, _ in ranked)

# Test with problematic inputs: one with no usable words and one that is vague.
print("\n⚠️ Testing query generation with problematic inputs:")
problematic_inputs = [
    "???",
    "I'm not sure what I'm looking for, maybe something about programming?"
]

for input_text in problematic_inputs:
    query, attempts, used_fallback = query_generation_with_fallbacks(input_text)
    print(f"Input: {input_text}")
    print(f"Generated Query: {query}")
    # `attempts` is a zero-based index, so add one for a human-readable count.
    print(f"Attempts: {attempts + 1}, Used Fallback: {used_fallback}\n")

**10.5.3 Multi-Query and Multi-Retriever Architectures**

In [None]:
# Multi-query generation
def _parse_numbered_variations(response):
    """Split an LLM reply into the texts of its numbered list items."""
    # A list marker is digits followed by "." or ")" at the start of a line.
    # The (?!\d) guard keeps a leading decimal such as "3.11" from being read
    # as item number 3.
    item_pattern = re.compile(r"^\s*(\d+)[.)](?!\d)\s*(.*?)\s*$")

    items = []    # (item number, text) pairs in reply order
    leading = []  # non-empty lines seen before the first numbered item

    for line in response.splitlines():
        match = item_pattern.match(line)
        if match:
            items.append((int(match.group(1)), match.group(2)))
        elif line.strip():
            if items:
                # Wrapped line: it continues the current item.
                number, text = items[-1]
                items[-1] = (number, f"{text} {line.strip()}".strip())
            else:
                leading.append(line.strip())

    # The prompt already ends with "1.", so the model usually continues item 1
    # without repeating the number. Leading text is treated as item 1 unless
    # the reply contains its own "1." entry (then it is just a preamble).
    if leading and (not items or items[0][0] != 1):
        items.insert(0, (1, " ".join(leading)))

    return [text for _, text in items if text]


def generate_query_variations(query, n=3):
    """Generate multiple variations of a query to improve retrieval coverage.

    Args:
        query: The original search query.
        n: Number of variations to return.

    Returns:
        A list of exactly ``n`` strings. Variations parsed from the LLM reply
        come first; if fewer than ``n`` could be parsed, the list is padded
        with the original ``query``.
    """

    prompt = f"""Generate {n} different versions of the following search query.
Each version should focus on a different aspect or use different terminology,
but all should aim to retrieve similar information.

Original query: {query}

Generate exactly {n} alternative queries, numbered 1-{n}.
1."""

    response = llm.invoke(prompt).content

    # Parse line by line rather than with one regex over the whole reply, so
    # numbers inside a variation (e.g. "Python 3.11") do not split it.
    variations = _parse_numbered_variations(response)[:n]

    # Fall back to the original query for any slot that could not be parsed
    variations.extend([query] * (n - len(variations)))

    return variations

# Simple Multi-Query Retriever implementation
def retrieve_with_variations(query, base_retriever, n_variations=2):
    """Retrieve documents using the original query and its variations.

    Args:
        query: The original search query.
        base_retriever: Retriever to query. Its ``invoke`` method is used when
            present; otherwise the older ``get_relevant_documents`` is called.
        n_variations: Number of alternative phrasings to generate.

    Returns:
        A list of documents with duplicates removed, ordered by first
        appearance: results for the original query first, then results for
        each variation in turn.
    """
    # Generate variations
    variations = generate_query_variations(query, n=n_variations)

    print(f"Original: {query}")
    for i, var in enumerate(variations, 1):
        print(f"Variation {i}: {var}")

    # `invoke` replaces the deprecated `get_relevant_documents`; keep the old
    # method as a fallback for retriever objects that only provide that one.
    fetch = getattr(base_retriever, "invoke", None)
    if fetch is None:
        fetch = base_retriever.get_relevant_documents

    # Variations can repeat the original query (it is used as padding when
    # parsing fails), so collapse identical query strings before retrieving.
    queries_to_run = list(dict.fromkeys([query, *variations]))

    seen_contents = set()
    unique_docs = []

    for search_text in queries_to_run:
        for doc in fetch(search_text):
            # Deduplicate on the full text: a short prefix would merge distinct
            # documents that merely start with the same sentence.
            content_key = doc.page_content
            if content_key not in seen_contents:
                seen_contents.add(content_key)
                unique_docs.append(doc)

    return unique_docs

# Create specialized retrievers
def create_specialized_retrievers():
    """Build the named retrievers used for query routing.

    Returns a dict keyed by route name. Four retrievers return the top 2
    matches restricted by one metadata value (two by language, two by
    category); "generic" is unfiltered and returns the top 3. All of them
    are views over the module-level ``vectorstore``.
    """
    # (route name, metadata field, required value) for each filtered retriever
    filtered_routes = (
        ("python", "language", "python"),
        ("javascript", "language", "javascript"),
        ("functions", "category", "functions"),
        ("data_structures", "category", "data_structures"),
    )

    retrievers = {
        route: vectorstore.as_retriever(
            search_kwargs={"k": 2, "filter": {field: value}}
        )
        for route, field, value in filtered_routes
    }

    # Catch-all with a higher k for broader queries
    retrievers["generic"] = vectorstore.as_retriever(search_kwargs={"k": 3})

    return retrievers

# Simple query router
def route_query(query, retrievers):
    """Route a query to the appropriate specialized retriever.

    Rules are checked in priority order: Python, JavaScript, functions, data
    structures, then the generic fallback. Matching is case-insensitive.

    Args:
        query: The user's query text.
        retrievers: Mapping with the keys "python", "javascript", "functions",
            "data_structures" and "generic".

    Returns:
        A ``(route_name, retriever)`` tuple.
    """
    query_lower = query.lower()

    # "js" must end a word ("js", "node.js", "nodejs") or be the word "jsx".
    # A plain substring test would also fire on "json" and send those queries
    # to the JavaScript retriever.
    mentions_js = re.search(r"js\b|\bjsx\b", query_lower) is not None

    if "python" in query_lower:
        return "python", retrievers["python"]
    elif "javascript" in query_lower or mentions_js:
        return "javascript", retrievers["javascript"]
    elif "function" in query_lower or "method" in query_lower:
        return "functions", retrievers["functions"]
    elif "list" in query_lower or "array" in query_lower or "data structure" in query_lower:
        return "data_structures", retrievers["data_structures"]
    else:
        return "generic", retrievers["generic"]

# Test multi-query retrieval: run the original query plus two variations.
print("\n🔄 Testing multi-query retrieval:")
test_query = "How do Python lists work?"
results = retrieve_with_variations(test_query, retriever, n_variations=2)
print(f"Retrieved {len(results)} unique documents\n")

# Test specialized retriever routing: show which retriever each query maps to.
print("\n🚦 Testing specialized retriever routing:")
specialized_retrievers = create_specialized_retrievers()

test_queries = [
    "How do Python lists work?",
    "JavaScript function syntax",
    "Best data structures for searching"
]

for query in test_queries:
    retriever_name, specialized_retriever = route_query(query, specialized_retrievers)
    print(f"Query: {query}")
    print(f"Routed to: {retriever_name} retriever\n")

**10.5.4 Advanced Filtering and Query Construction**

In [None]:
# Create a schema for filter specifications
# FilterSpec describes one metadata condition: which field, which comparison,
# and which value to compare against. The Field descriptions are part of the
# model definition, so they are kept as-is.
class FilterSpec(BaseModel):
    # Name of the metadata key the condition applies to.
    field: str = Field(description="The metadata field to filter on")
    # Comparison value; annotated as Any, so its type is not constrained here.
    value: Any = Field(description="The value to filter for")
    # Operator name. Annotated as plain str, so the names listed in the
    # description are a convention rather than a constraint of this model.
    operator: str = Field(description="The operation to perform (equals, contains, greater_than, less_than, in_list)")

# A set of FilterSpec conditions plus the boolean logic meant to combine them.
class FilterGroup(BaseModel):
    # Individual conditions; an empty list means "no filtering".
    filters: List[FilterSpec] = Field(description="List of filter specifications")
    # Combination logic. Annotated as plain str, so "AND"/"OR" are a
    # convention from the description rather than a constraint of this model.
    logic: str = Field(description="Logic to combine filters (AND, OR)")

# Function to generate filters based on natural language query
def generate_dynamic_filters(query, available_metadata_fields):
    """Generate filter specifications based on a natural language query.

    The LLM is shown the available metadata fields and asked for a JSON
    filter specification. The first ``{...}`` span of its reply is parsed
    into a ``FilterGroup``. Filters that name a field outside
    ``available_metadata_fields`` are discarded.

    Args:
        query: The user's natural language query.
        available_metadata_fields: Iterable of metadata field names that may
            be filtered on.

    Returns:
        A ``FilterGroup``. On any failure (no JSON in the reply, invalid JSON,
        schema mismatch, API error) an empty group with logic "AND" is
        returned so callers can fall back to an unfiltered search.
    """
    allowed_fields = list(available_metadata_fields)

    # Create a prompt that explains available metadata fields
    fields_description = "\n".join(f"- {field}" for field in allowed_fields)

    # The JSON example contains no inline comments: models tend to copy the
    # example verbatim, and "#" comments would make the reply invalid JSON.
    # The allowed values are listed below the example instead.
    prompt = f"""Based on this query, create filter specifications for a document retrieval system.

Available metadata fields:
{fields_description}

Query: {query}

Generate JSON for filters that would help retrieve the most relevant documents.
Respond with valid JSON only (no comments), using this format:
{{
  "filters": [
    {{
      "field": "field_name",
      "value": "value to match",
      "operator": "equals"
    }}
  ],
  "logic": "AND"
}}

Allowed values for "operator": equals, contains, greater_than, less_than, in_list
Allowed values for "logic": AND, OR

If no filters are needed, return an empty filters list.
"""

    # Create parser for the filter specification
    parser = PydanticOutputParser(pydantic_object=FilterGroup)

    try:
        # Generate filter specification
        response = llm.invoke(prompt).content

        # Extract JSON from response (in case the model includes explanation
        # text). Greedy on purpose: the span must reach the outermost "}".
        json_match = re.search(r"\{.*\}", response, re.DOTALL)
        if not json_match:
            # Fallback: return empty filter group
            return FilterGroup(filters=[], logic="AND")

        filter_group = parser.parse(json_match.group(0))

        # Drop conditions on fields that do not exist in the metadata; they
        # could only make the search return nothing.
        known_filters = [f for f in filter_group.filters if f.field in allowed_fields]
        if len(known_filters) != len(filter_group.filters):
            filter_group = FilterGroup(filters=known_filters, logic=filter_group.logic)

        return filter_group

    except Exception as e:
        # Deliberate best-effort: report the problem and search unfiltered.
        print(f"Error generating filters: {str(e)}")
        return FilterGroup(filters=[], logic="AND")

# Convert filter specifications to retriever search_kwargs
def convert_filters_to_search_kwargs(filter_group):
    """Convert a FilterGroup to search_kwargs for a retriever.

    Each supported filter becomes one single-field clause. One clause is
    returned as-is; several clauses are wrapped in ``$and`` or ``$or``
    according to ``filter_group.logic``, because a Chroma ``where`` filter
    must have exactly one top-level key. Filters with an unrecognised
    operator are skipped.

    Args:
        filter_group: Object with a ``filters`` sequence (items exposing
            ``field``, ``operator`` and ``value``) and a ``logic`` attribute.

    Returns:
        ``{"filter": <where-clause>}``, or ``{}`` when there is nothing to
        filter on.
    """
    # If no filters, return empty dict
    if not filter_group.filters:
        return {}

    clauses = []
    for f in filter_group.filters:
        if f.operator == "equals":
            condition = f.value
        elif f.operator == "contains":
            # This depends on your vector store's capabilities.
            # Metadata filters here have no substring operator, so this is
            # approximated by exact membership in a one-element list.
            condition = {"$in": [f.value]}
        elif f.operator == "greater_than":
            condition = {"$gt": f.value}
        elif f.operator == "less_than":
            condition = {"$lt": f.value}
        elif f.operator == "in_list":
            condition = {"$in": f.value if isinstance(f.value, list) else [f.value]}
        else:
            # Unknown operator: skip it rather than guess its meaning.
            continue
        # One clause per filter, so two conditions on the same field are both
        # kept instead of the later one overwriting the earlier one.
        clauses.append({f.field: condition})

    if not clauses:
        return {}
    if len(clauses) == 1:
        return {"filter": clauses[0]}

    # Anything other than an explicit OR is combined with AND.
    combinator = "$or" if str(filter_group.logic).strip().upper() == "OR" else "$and"
    return {"filter": {combinator: clauses}}

# Test dynamic filter generation: show the generated filters and the
# search_kwargs they translate to.
print("\n🔍 Testing dynamic filter generation:")
available_metadata_fields = ["language", "category", "source"]

test_queries = [
    "Tell me about Python functions",
    "How do arrays work in JavaScript?"
]

for query in test_queries:
    filters = generate_dynamic_filters(query, available_metadata_fields)
    search_kwargs = convert_filters_to_search_kwargs(filters)

    # pydantic v2 names this method model_dump(); .dict() is the v1 name and
    # is deprecated under v2. Use whichever the installed version provides.
    filters_as_dict = filters.model_dump() if hasattr(filters, "model_dump") else filters.dict()

    print(f"Query: {query}")
    print(f"Generated Filters: {json.dumps(filters_as_dict, indent=2)}")
    print(f"Search kwargs: {search_kwargs}\n")

**10.5.5 Managing High Cardinality Variables**

In [None]:
# Simulate a high-cardinality scenario with programming languages
def create_high_cardinality_kb(seed=None, collection_name="high_cardinality_languages"):
    """Create a knowledge base with high cardinality in the language field.

    Every language gets one overview document; roughly 30% of the languages
    additionally get one "feature" document with a randomly chosen trait.

    Args:
        seed: Optional seed for the random choices. When given, a private
            random generator is used so the result is reproducible. When
            None, the global ``random`` module is used.
        collection_name: Name of the Chroma collection to store the documents
            in. A dedicated name keeps this store separate from any other
            Chroma store created in the same process with the default name.

    Returns:
        A ``(vectorstore, languages, embeddings)`` tuple: the Chroma store,
        the list of language names, and the ``OpenAIEmbeddings`` instance.
    """

    languages = [
        "Python", "JavaScript", "Java", "C++", "C#", "Ruby", "Go", "Swift",
        "Kotlin", "Rust", "PHP", "TypeScript", "Scala", "Perl", "Haskell"
    ]

    # Both the random module and a random.Random instance provide random() and choice().
    rng = random.Random(seed) if seed is not None else random

    docs = []
    for lang in languages:
        doc = Document(
            page_content=f"{lang} is a programming language used for various types of software development.",
            metadata={"language": lang.lower(), "type": "language_overview"}
        )
        docs.append(doc)

        # Add one extra "feature" document for about 30% of the languages
        if rng.random() < 0.3:
            trait = rng.choice(['performance', 'simplicity', 'flexibility', 'strong type system'])
            doc = Document(
                page_content=f"{lang} is known for its {trait}.",
                metadata={"language": lang.lower(), "type": "language_feature"}
            )
            docs.append(doc)

    # Create embeddings and vectorstore
    embeddings = OpenAIEmbeddings()
    high_cardinality_store = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        collection_name=collection_name,
    )
    return high_cardinality_store, languages, embeddings

# Build the high-cardinality store; `languages` is reused below as the list
# of valid category names.
high_cardinality_store, languages, hc_embeddings = create_high_cardinality_kb()
print(f"📚 Created high-cardinality knowledge base with {len(languages)} languages")

# Approach 1: LLM-based category selection
def expand_query_with_categories(query, category_field, all_categories, llm):
    """Pick the categories of a high-cardinality field that fit a query.

    The LLM is shown up to 20 category names and asked for the 1-3 most
    relevant ones. Its reply is split on commas, semicolons and newlines;
    list markers, quotes and trailing periods are stripped; and only names
    that match ``all_categories`` (case-insensitively) are kept.

    Args:
        query: The user's query.
        category_field: Name of the metadata field, used in the prompt only.
        all_categories: Sequence of valid category names.
        llm: Object whose ``invoke(prompt)`` returns a reply with a
            ``content`` string.

    Returns:
        Lowercased valid category names in reply order, without duplicates.
        An empty list is returned if the LLM call or parsing fails.
    """

    # Create a prompt that includes available categories
    categories_str = ", ".join(all_categories[:20])  # Limit to 20 for prompt size
    if len(all_categories) > 20:
        categories_str += f", and {len(all_categories) - 20} more"

    prompt = f"""Based on this query, identify which categories from the "{category_field}" field
would be most relevant. These will be used to filter search results.

Available categories include: {categories_str}

Query: {query}

List the 1-3 most relevant categories, separated by commas. Only include categories that are
definitely relevant, and stick to the exact category names provided.
"""

    try:
        response = llm.invoke(prompt).content

        valid_categories = {c.lower() for c in all_categories}
        validated = []

        # Models do not reliably stick to "a, b, c": they also answer with one
        # item per line, bullets, numbering, quotes or a trailing period.
        for piece in re.split(r"[,;\n]+", response):
            candidate = piece.strip().lower()
            if candidate not in valid_categories:
                # Only strip decoration when the raw piece is not already a
                # valid name, so names such as "c++" or "c#" are left intact.
                candidate = re.sub(r"^(?:\d+[.)]|[-*•])\s*", "", candidate)
                candidate = candidate.strip(" \t\"'`.")
            if candidate in valid_categories and candidate not in validated:
                validated.append(candidate)

        return validated

    except Exception as e:
        # Deliberate best-effort: report the problem and select no category.
        print(f"Error in query expansion: {str(e)}")
        return []

# Test LLM-based category selection: show which languages the model picks
# for each query.
print("\n🤖 Testing LLM-based category selection for high-cardinality fields:")
test_queries = [
    "I need help with web development",
    "What's good for data analysis?",
    "Mobile app development languages"
]

for query in test_queries:
    selected_langs = expand_query_with_categories(query, "language", languages, llm)

    print(f"Query: {query}")
    print(f"Selected languages: {', '.join(selected_langs)}\n")

print("\n✅ Advanced Query Analysis and Optimization notebook complete!")