**Setup and Installation**

In [None]:
# Install necessary packages
!pip install langchain langchain-openai langchain-community chromadb

import os
import time
from typing import List
import warnings
warnings.filterwarnings('ignore')

# Configure the OpenAI API key.
# A key that is already present in the environment (for example injected from
# Colab secrets) is reused so that "Restart & Run All" does not block on a
# prompt; otherwise it is requested interactively without being echoed.
from getpass import getpass
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Do not paste the key into the notebook as a string literal: it would be
# saved with the file and leaked wherever the notebook is shared or committed.

# Import necessary components
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.memory import ConversationBufferWindowMemory
from langchain.memory import ConversationSummaryMemory
from langchain_core.documents import Document
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.runnables import RunnablePassthrough

print("Setup complete!")

**Create a Sample Knowledge Base**

In [None]:
# Sample knowledge base: eight short passages about RAG. Each Document carries
# a "source" file name and a "page" number in its metadata, which the
# source-attribution cells later read back.
documents = [
    Document(
        page_content="Retrieval Augmented Generation (RAG) is a technique that combines retrieval-based and generation-based approaches in natural language processing. It enhances large language models by retrieving relevant information from external knowledge sources before generating a response.",
        metadata={"source": "introduction_to_rag.pdf", "page": 1}
    ),
    Document(
        page_content="The key components of a RAG system include document loaders, text splitters, embedding models, vector stores, retrievers, and prompt templates. Document loaders ingest content from various sources. Text splitters segment documents into manageable chunks for embedding and retrieval.",
        metadata={"source": "rag_components.pdf", "page": 5}
    ),
    Document(
        page_content="Embedding models in RAG convert text into mathematical vectors that capture semantic meaning. These models transform words and documents into numerical representations that enable semantic search and retrieval based on meaning rather than keywords.",
        metadata={"source": "embedding_models.pdf", "page": 12}
    ),
    Document(
        page_content="Vector stores organize embedded documents for efficient retrieval. They index vector representations and support different search algorithms, from exact nearest neighbors to approximate methods that trade some accuracy for faster query speed.",
        metadata={"source": "vector_stores.pdf", "page": 18}
    ),
    Document(
        page_content="Retrievers in RAG systems are responsible for finding the most relevant information from a knowledge base. They can use various strategies from simple vector similarity to hybrid approaches combining semantic and keyword search.",
        metadata={"source": "retrievers.pdf", "page": 24}
    ),
    Document(
        page_content="Prompt templates structure interactions with language models in RAG systems. They provide a consistent format for combining user queries with retrieved context to generate accurate and contextually appropriate responses.",
        metadata={"source": "prompt_templates.pdf", "page": 31}
    ),
    Document(
        page_content="RAG offers several advantages over traditional LLMs. It provides more accurate and up-to-date information, reduces hallucinations, enables source attribution, and allows for domain-specific knowledge without retraining the base model.",
        metadata={"source": "rag_advantages.pdf", "page": 7}
    ),
    Document(
        page_content="Challenges in RAG implementation include retrieval quality, context window limitations, maintaining freshness of knowledge, and balancing retrieval with generation. Effective RAG systems require careful tuning of both retrieval and generation components.",
        metadata={"source": "rag_challenges.pdf", "page": 9}
    )
]

**Create embeddings and vector store**

In [None]:
# Embed the sample documents and index them in an in-memory Chroma store.
# The explicit collection name keeps this store separate from the other Chroma
# stores this notebook builds: in-memory Chroma instances created in the same
# process share one backend, so stores left on the default collection name end
# up holding each other's documents.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    collection_name="rag_knowledge_base",
)

print(f"Created vector store with {len(documents)} documents")

Created vector store with 8 documents


**10.2.1 Incorporating Chat History for Contextual Understanding**

---


**Basic Implementation with ConversationBufferMemory**



In [None]:
# Buffer memory: the conversation is stored under the "chat_history" key and
# handed back as message objects.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

# Conversational retrieval chain wired to that memory
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    retriever=vectorstore.as_retriever(),
    memory=memory,
)

# Opening question: the memory is still empty at this point
first_response = qa_chain.invoke({"question": "What is RAG?"})
print(
    "First Question: What is RAG?",
    f"Response: {first_response['answer']}\n",
    sep="\n",
)

# Follow-up whose "its" can only be resolved through the stored history
follow_up_response = qa_chain.invoke({"question": "What are its advantages?"})
print(
    "Follow-up Question: What are its advantages?",
    f"Response: {follow_up_response['answer']}\n",
    sep="\n",
)

**Fixed-size Window for Chat History**

In [None]:
# Window memory: configured with k=3 exchanges
windowed_memory = ConversationBufferWindowMemory(
    k=3,
    memory_key="chat_history",
    return_messages=True,
)

# Conversational retrieval chain backed by the windowed memory
windowed_qa_chain = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    retriever=vectorstore.as_retriever(),
    memory=windowed_memory,
)

print("Using ConversationBufferWindowMemory (keeping last 3 exchanges):")

# Six questions, i.e. more turns than the window holds
questions = [
    "What are the key components of RAG?",
    "Can you tell me more about embedding models?",
    "How do vector stores work?",
    "What about retrievers?",
    "And what role do prompt templates play?",
    "What challenges exist with these systems?",
]

for turn, question in enumerate(questions, start=1):
    response = windowed_qa_chain.invoke({"question": question})
    answer_preview = response["answer"][:150]
    print(f"Q{turn}: {question}")
    print(f"A{turn}: {answer_preview}...\n")

    # The history is a flat list of messages; halving its length gives the
    # number of question/answer exchanges currently held.
    memory_messages = windowed_memory.load_memory_variables({})["chat_history"]
    exchanges_kept = len(memory_messages) // 2
    print(f"Memory now contains {exchanges_kept} exchanges\n")

**Summarization Approach for Memory**

In [None]:
# Memory that uses an LLM to keep a running summary of the conversation
summary_memory = ConversationSummaryMemory(
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    memory_key="chat_history",
    return_messages=True
)

# Create a conversation chain with summary memory
summary_qa_chain = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    retriever=vectorstore.as_retriever(),
    memory=summary_memory
)

# Demonstrate a conversation with summarized memory
print("Using ConversationSummaryMemory:")
summary_questions = [
    "What is RAG and what are its main components?",
    "How do these components work together in a complete system?",
    "What are the advantages of RAG over traditional LLMs?",
    "What challenges do RAG systems face?"
]

for i, question in enumerate(summary_questions):
    response = summary_qa_chain.invoke({"question": question})
    print(f"Q{i+1}: {question}")
    print(f"A{i+1}: {response['answer'][:150]}...\n")

    # The memory refreshes its running summary when the chain saves each
    # exchange, so read it from `buffer`. This shows the summary the chain
    # actually uses and avoids an extra LLM call per turn to re-summarize the
    # whole transcript, so it can be printed after every question.
    current_summary = summary_memory.buffer
    print(f"Current conversation summary: {current_summary[:200]}...\n")

**10.2.2 Streaming Responses for Better User Experience**

---

**Basic Streaming Implementation**


In [None]:
# Initialize a streaming-enabled LLM
# (streaming=True together with the stdout callback handler is what makes the
# answer print while it is being generated)
streaming_llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()]
)

# Create a template for RAG
# It has two placeholders: {context} for the retrieved text and {question}
# for the user's query.
template = """Answer the question based on the following context:
Context: {context}

Question: {question}
"""
# Chat prompt built from the template; reused by the streaming chain below
prompt = ChatPromptTemplate.from_template(template)

# Define function to format documents into context string
def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# Streaming RAG pipeline, assembled step by step:
#   1. build the prompt inputs (retrieved + formatted context, untouched question)
#   2. fill the prompt
#   3. send it to the streaming LLM
rag_inputs = {
    "context": vectorstore.as_retriever() | format_docs,
    "question": RunnablePassthrough(),
}
streaming_chain = rag_inputs | prompt | streaming_llm

print("Streaming response (you'll see the answer appear word by word):")
# The callback handler prints the output while it is generated, so the
# returned message is deliberately discarded.
_ = streaming_chain.invoke("What are the advantages of RAG systems?")
print("\nStreaming complete!")

**Phased Streaming with Retrieval Feedback**

In [None]:
def stream_retrieval_and_generation(query: str, retriever, llm, prompt):
    """
    Run a RAG query in visible phases: announce the search, list the retrieved
    documents with the retrieval time, then generate the answer.

    Args:
        query: The user's question; used for retrieval and to fill the prompt.
        retriever: Object whose ``invoke(query)`` returns documents exposing
            ``page_content``.
        llm: Model piped after ``prompt``. Whether its output streams to the
            console depends on how the caller configured it.
        prompt: Prompt taking ``context`` and ``question`` inputs that can be
            composed with ``llm`` using ``|``.

    Returns:
        The fixed status string "Response generation complete!". The model's
        output itself is not returned.
    """
    # Phase 1: Notify user about search
    print("Searching knowledge base...\n")

    # Phase 2: Retrieve documents with visual feedback.
    # perf_counter is a monotonic clock, so the measured interval cannot be
    # skewed by system clock adjustments. `invoke` replaces the deprecated
    # `get_relevant_documents` retriever method.
    start_time = time.perf_counter()
    retrieved_docs = retriever.invoke(query)
    retrieval_time = time.perf_counter() - start_time

    print(f"Found {len(retrieved_docs)} relevant documents in {retrieval_time:.2f} seconds:")
    for i, doc in enumerate(retrieved_docs, 1):
        # Show at most the first 100 characters, marking truncation with "..."
        preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
        print(f"Document {i}: {preview}\n")

    # Phase 3: Generate and stream the response
    print("\nGenerating response:")

    # Prepare context from retrieved documents
    context = "\n\n".join(doc.page_content for doc in retrieved_docs)

    # Create and invoke the chain; the result is discarded on purpose
    chain = prompt | llm
    _ = chain.invoke({"context": context, "question": query})

    return "Response generation complete!"

# Try the phased flow end to end, using a prompt built from the shared template
prompt_template = ChatPromptTemplate.from_template(template)
query = "How do embedding models and vector stores work together in RAG?"

print("Demonstrating phased streaming with retrieval feedback:")
stream_retrieval_and_generation(
    query=query,
    retriever=vectorstore.as_retriever(),
    llm=streaming_llm,
    prompt=prompt_template,
)

**10.2.3 Source Attribution and Transparency**

---

**Basic Source Attribution**

In [None]:
# Create a QA chain that returns source documents alongside the answer
qa_with_sources = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True
)

# Query the system. `.invoke` is used instead of calling the chain object
# directly: the direct-call form is deprecated, and `.invoke` matches how every
# other chain in this notebook is run.
result = qa_with_sources.invoke({"query": "What are the components of a RAG system?"})

# Access both the answer and the sources
answer = result["result"]
source_documents = result["source_documents"]

# Display answer and sources
print(f"Answer: {answer}\n")
print("Sources:")
for i, doc in enumerate(source_documents, 1):
    print(f"Source {i}: {doc.metadata.get('source', 'Unknown')}, Page {doc.metadata.get('page', 'Unknown')}")
    print(f"Content snippet: {doc.page_content[:150]}...\n")

**In-text Citations**

In [None]:
# Prompt text that instructs the model to mark each part of its answer with
# [doc1], [doc2], ... markers. {context} and {question} are placeholders that
# the citation chain fills in.
citation_prompt = """
Answer the question based solely on the following context.
Use [doc1], [doc2], etc. to indicate which document supports each part of your answer.

Context:
{context}

Question: {question}

Answer with citations:
"""

# Modify the documents to include their index
def add_index_to_docs(docs):
    """Render documents as ``[docN]: <content>`` entries (N starts at 1), each followed by a blank line."""
    labelled_entries = (
        f"[doc{position}]: {document.page_content}\n\n"
        for position, document in enumerate(docs, start=1)
    )
    return "".join(labelled_entries)

# Citation chain, assembled from named stages:
#   numbered context + untouched question -> citation prompt -> chat model
citation_inputs = {
    "context": vectorstore.as_retriever() | add_index_to_docs,
    "question": RunnablePassthrough(),
}
citation_chat_prompt = ChatPromptTemplate.from_template(citation_prompt)
citation_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
citation_chain = citation_inputs | citation_chat_prompt | citation_llm

# Ask a question and show the answer with its [docN] markers
print("Response with in-text citations:")
response = citation_chain.invoke("What are the advantages of RAG?")
print(response.content)

**Building a Citation Lookup System**

In [None]:
def create_response_with_clickable_citations(query, response, source_docs):
    """
    Map the ``[docN]`` citations found in a response to their source documents.
    This simulates what a web interface would need to make citations clickable.

    Args:
        query: The original question. Currently unused; kept so existing
            callers keep working.
        response: Answer text that may contain ``[doc1]``, ``[doc2]``, ... markers.
        source_docs: Documents in the order they were numbered for the model;
            each needs ``page_content`` and a ``metadata`` dict.

    Returns:
        A dict with "response" (the unchanged text) and "citations", which maps
        each cited key such as "doc2" to that document's content, source
        (default "Unknown") and page (default ""). Citations whose number does
        not correspond to a document are left out.
    """
    import re

    # Create a mapping of citation numbers to documents
    citation_map = {}
    for citation in re.findall(r'\[doc(\d+)\]', response):
        doc_num = int(citation)
        # Citations are 1-based. Without the lower bound, a stray "[doc0]"
        # would index source_docs[-1] and silently point at the last document.
        if 1 <= doc_num <= len(source_docs):
            doc = source_docs[doc_num - 1]
            citation_map[f"doc{doc_num}"] = {
                "content": doc.page_content,
                "source": doc.metadata.get("source", "Unknown"),
                "page": doc.metadata.get("page", "")
            }

    # In a real application, you'd return this for the frontend to use
    return {
        "response": response,
        "citations": citation_map
    }

# One query string drives retrieval, generation and the lookup, so the three
# cannot drift apart.
citation_query = "What are the key components and advantages of RAG?"

# Get documents from the retriever. It is created with the same default
# settings the citation chain uses, so the [docN] numbers in the answer are
# expected to line up with this list. `invoke` replaces the deprecated
# `get_relevant_documents` method.
docs = vectorstore.as_retriever().invoke(citation_query)

# Get a response with citations
raw_response = citation_chain.invoke(citation_query)

# Create clickable citations
result = create_response_with_clickable_citations(
    citation_query,
    raw_response.content,
    docs
)

# Simulate how this would work in a web interface
print("RESPONSE WITH CLICKABLE CITATIONS:")
print(result["response"])
print("\nWhen a citation is clicked, it would show:")
for citation, source in result["citations"].items():
    print(f"\n--- {citation} ---")
    print(f"Source: {source['source']}, Page: {source['page']}")
    print(f"Content: {source['content']}")

**10.2.4 User-Specific Retrieval Patterns**

---

**Query Augmentation with User Context**

In [None]:
def create_personalized_query(query: str, user_profile: dict) -> str:
    """
    Append profile details that match the query's topic.

    Financial queries (containing "investment" or "financial", case-insensitive)
    pick up the profile's age and risk tolerance when present. Otherwise, health
    queries (containing "health" or "medical") pick up the medical conditions.
    The financial topic takes precedence when both match. If nothing applies,
    the query is returned unchanged.
    """
    lowered = query.lower()
    is_financial = "investment" in lowered or "financial" in lowered
    is_health = "health" in lowered or "medical" in lowered

    details = []
    if is_financial:
        if "age" in user_profile:
            details.append("age {}".format(user_profile["age"]))
        if "risk_tolerance" in user_profile:
            details.append("risk tolerance: {}".format(user_profile["risk_tolerance"]))
    elif is_health and "medical_conditions" in user_profile:
        details.append("medical conditions: " + ", ".join(user_profile["medical_conditions"]))

    # Nothing relevant found: hand the query back untouched
    if not details:
        return query

    return f"{query} [For a person with {'; '.join(details)}]"

# Sample profile used by the personalization examples
user_profile = dict(
    age=65,
    risk_tolerance="conservative",
    medical_conditions=["hypertension", "type 2 diabetes"],
)

# Financial example: the query mentions "investment"
original_query = "What investment strategies should I consider?"
personalized_query = create_personalized_query(original_query, user_profile)
print(
    f"Original: {original_query}",
    f"Personalized: {personalized_query}",
    sep="\n",
)

# Health example: the query mentions "health"
health_query = "What exercises are recommended for maintaining health?"
personalized_health_query = create_personalized_query(health_query, user_profile)
print(
    f"\nOriginal: {health_query}",
    f"Personalized: {personalized_health_query}",
    sep="\n",
)

**Metadata Filtering by User Attributes**

In [None]:
# In a real application, documents would have metadata like age_group, risk_profile, etc.
# For demonstration purposes, we'll add this metadata

# Add more detailed metadata to our documents
# Every entry carries "age_group" (young_adult / middle_aged / senior / any)
# and "risk_profile" (aggressive / moderate / conservative) next to the usual
# source and page, so the retrievers below can filter on them.
documents_with_metadata = [
    Document(
        page_content="For young investors (20-35), a growth-focused strategy with higher equity allocation is often recommended. This typically involves 80-90% stocks with an emphasis on emerging markets and innovative sectors.",
        metadata={"source": "investment_strategies.pdf", "page": 15, "age_group": "young_adult", "risk_profile": "aggressive"}
    ),
    Document(
        page_content="Middle-aged investors (35-60) should consider a balanced approach with a moderate allocation to both stocks and bonds. A typical allocation might be 60% stocks and 40% bonds, adjusting based on personal risk tolerance.",
        metadata={"source": "investment_strategies.pdf", "page": 18, "age_group": "middle_aged", "risk_profile": "moderate"}
    ),
    Document(
        page_content="Senior investors (60+) with a conservative risk profile should focus on capital preservation and income generation. A portfolio with 30-40% stocks and 60-70% bonds, with an emphasis on dividend stocks and high-quality bonds is often appropriate.",
        metadata={"source": "investment_strategies.pdf", "page": 22, "age_group": "senior", "risk_profile": "conservative"}
    ),
    Document(
        page_content="For those with high risk tolerance regardless of age, a higher allocation to small cap stocks and emerging markets can potentially yield greater returns, though with increased volatility.",
        metadata={"source": "investment_strategies.pdf", "page": 25, "age_group": "any", "risk_profile": "aggressive"}
    ),
    Document(
        page_content="Conservative investors should prioritize high-quality corporate bonds, Treasury securities, and blue-chip dividend stocks to minimize volatility while generating stable income.",
        metadata={"source": "investment_strategies.pdf", "page": 27, "age_group": "any", "risk_profile": "conservative"}
    ),
]

# Create a new vector store with the metadata-rich documents.
# It gets its own collection name: in-memory Chroma stores created in the same
# process share one backend, so leaving the default name would mix these
# documents with those of the other stores and distort the filtered and
# unfiltered retrieval comparison below.
metadata_vectorstore = Chroma.from_documents(
    documents=documents_with_metadata,
    embedding=embeddings,
    collection_name="investment_strategies",
)

def create_user_specific_filters(user_profile: dict) -> dict:
    """
    Build a single-key metadata filter from a user profile.

    Only one filter is produced, to stay compatible with a basic Chroma setup.
    Age wins when present: under 35 maps to "young_adult", under 60 to
    "middle_aged", anything else to "senior". Without an age, the risk
    tolerance is used as the "risk_profile" filter. An empty dict means no
    filter applies.
    """
    if "age" in user_profile:
        age = user_profile["age"]
        age_group = "young_adult" if age < 35 else "middle_aged" if age < 60 else "senior"
        return {"age_group": age_group}

    if "risk_tolerance" not in user_profile:
        return {}

    return {"risk_profile": user_profile["risk_tolerance"]}

def print_retrieved_documents(docs):
    """Print content and metadata of each document, numbered from 1."""
    for i, doc in enumerate(docs, 1):
        print(f"Document {i}:")
        print(f"Content: {doc.page_content}")
        print(f"Metadata: {doc.metadata}\n")


# The same query is used for all three retrievals so the results are comparable
FILTER_DEMO_QUERY = "investment strategies"

# Create a filtered retriever based on age (senior)
age_filter = {"age_group": {"$eq": "senior"}}
print(f"Using age filter: {age_filter}")

age_filtered_retriever = metadata_vectorstore.as_retriever(
    search_kwargs={"k": 2, "filter": age_filter}
)

# Create a filtered retriever based on risk profile
risk_filter = {"risk_profile": {"$eq": "conservative"}}
print(f"Using risk profile filter: {risk_filter}")

risk_filtered_retriever = metadata_vectorstore.as_retriever(
    search_kwargs={"k": 2, "filter": risk_filter}
)

# Retrieve documents matching the age filter.
# `invoke` replaces the deprecated `get_relevant_documents` retriever method.
print("\nRetrieving documents with AGE filtering (senior):")
age_results = age_filtered_retriever.invoke(FILTER_DEMO_QUERY)
print_retrieved_documents(age_results)

# Retrieve documents matching the risk filter
print("\nRetrieving documents with RISK PROFILE filtering (conservative):")
risk_results = risk_filtered_retriever.invoke(FILTER_DEMO_QUERY)
print_retrieved_documents(risk_results)

# For comparison, retrieve without filtering
print("Retrieving without filtering:")
unfiltered_results = metadata_vectorstore.as_retriever(search_kwargs={"k": 2}).invoke(FILTER_DEMO_QUERY)
print_retrieved_documents(unfiltered_results)

**Personalized Vector Stores (Simulated)**

In [None]:
# Create some additional personal documents for the user
# Both entries are tagged type "personal" and user_id "user_123"; the user_id
# is what the personalized store uses to pick a user's own documents.
personal_documents = [
    Document(
        page_content="Based on our last portfolio review, we agreed to allocate 40% to bonds, 30% to dividend stocks, 20% to REITs, and 10% to cash reserves, given your conservative approach and income needs in retirement.",
        metadata={"source": "personal_financial_plan.pdf", "type": "personal", "user_id": "user_123"}
    ),
    Document(
        page_content="Your recent health check showed improved blood pressure (130/82) following medication adjustment, but continued monitoring of blood glucose levels is recommended for your type 2 diabetes management.",
        metadata={"source": "health_summary.pdf", "type": "personal", "user_id": "user_123"}
    )
]

# Simulate a user-specific vector store by combining general and personal documents
def create_personalized_vectorstore(user_id, general_docs, personal_docs):
    """
    Create a vector store holding the general documents plus one user's own.

    Args:
        user_id: Identifier matched against each personal document's
            "user_id" metadata. It is also embedded in the Chroma collection
            name, so it must consist of characters Chroma accepts there.
        general_docs: List of documents every user may see.
        personal_docs: Candidate personal documents; only those whose
            "user_id" metadata equals ``user_id`` are included.

    Returns:
        A Chroma vector store built from the combined documents.
    """
    print(f"Creating vector store for user {user_id}")

    # Combine general documents with user-specific ones
    all_docs = general_docs + [
        doc for doc in personal_docs
        if doc.metadata.get("user_id") == user_id
    ]

    # Create a new vector store in a collection dedicated to this user.
    # In-memory Chroma stores created in one process share a backend, so the
    # default collection name would pull in every other store's documents
    # (including other users') and duplicate the general ones.
    user_vectorstore = Chroma.from_documents(
        documents=all_docs,
        embedding=embeddings,
        collection_name=f"user_{user_id}_docs",
    )

    print(f"Created personalized vector store with {len(all_docs)} documents")
    return user_vectorstore

# Create a personalized vector store
user_id = "user_123"
user_vectorstore = create_personalized_vectorstore(
    user_id,
    documents_with_metadata,
    personal_documents
)

# Create a retriever from the user's store
user_retriever = user_vectorstore.as_retriever(search_kwargs={"k": 3})

# Test personalized retrieval.
# `invoke` replaces the deprecated `get_relevant_documents` retriever method.
print("\nPersonalized retrieval results:")
personal_results = user_retriever.invoke("What investment strategy should I follow based on my profile?")
for i, doc in enumerate(personal_results, 1):
    print(f"Document {i}:")
    print(f"Content: {doc.page_content}")
    print(f"Metadata: {doc.metadata}\n")

# Demonstrate how this affects RAG responses
personalized_qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    chain_type="stuff",
    retriever=user_retriever
)

print("Using a personalized RAG system:")
personalized_response = personalized_qa.invoke({"query": "What investment strategy should I follow?"})
print(personalized_response["result"])

**Best Practices and Considerations**

In [None]:
# Print the best-practices summary. The text uses Markdown syntax, but print()
# emits it as plain text, so the headings appear literally in the cell output.
print("""
## Best Practices for Interactive Q&A Systems with RAG

### Chat History Management
1. Limit history size to fit within context windows (typically keep 5-10 exchanges)
2. Consider summary approaches for longer conversations
3. Store timestamps for multi-session interactions
4. Prioritize recent exchanges over older ones
5. Consider user privacy when storing conversation history

### Streaming Implementation
1. Always provide visual feedback during document retrieval
2. Monitor connection status to detect disconnections
3. Implement timeouts for long-running queries
4. Include progress indicators for multi-step processes
5. Consider chunking very long responses

### Source Attribution
1. Make citations clickable or expandable in user interfaces
2. Include complete metadata for verification
3. Balance citation density with readability
4. Handle conflicting information transparently
5. Consider confidence levels for different sources

### Personalization
1. Start with simple query augmentation before complex personalization
2. Always get explicit consent for using personal data
3. Implement strict access controls for user-specific data
4. Be transparent about how personalization affects results
5. Provide options to disable personalization

### Performance Considerations
1. Cache common query results to improve response time
2. Use async operations for resource-intensive processes
3. Implement fallback mechanisms for when retrieval or generation fails
4. Consider rate limiting to prevent resource exhaustion
5. Monitor token usage to stay within API limits
""")


## Best Practices for Interactive Q&A Systems with RAG

### Chat History Management
1. Limit history size to fit within context windows (typically keep 5-10 exchanges)
2. Consider summary approaches for longer conversations
3. Store timestamps for multi-session interactions
4. Prioritize recent exchanges over older ones
5. Consider user privacy when storing conversation history

### Streaming Implementation
1. Always provide visual feedback during document retrieval
2. Monitor connection status to detect disconnections
3. Implement timeouts for long-running queries
4. Include progress indicators for multi-step processes
5. Consider chunking very long responses

### Source Attribution
1. Make citations clickable or expandable in user interfaces
2. Include complete metadata for verification
3. Balance citation density with readability
4. Handle conflicting information transparently
5. Consider confidence levels for different sources

### Personalization
1. Start with simple query augmen