**Setup and Installation**

In [None]:
# Install necessary packages
!pip install langchain langchain-openai langchain-community chromadb faiss-cpu tiktoken

# For tool integration
!pip install wikipedia requests duckduckgo-search

import os
import time
import json
import re
from typing import List, Optional, Dict, Any, Callable, Union
import warnings
warnings.filterwarnings('ignore')

# Set your OpenAI API key (in Colab, you should use secrets or environment variables)
from getpass import getpass
# Reuse a key that is already exported so "Restart & Run All" does not block on
# an interactive prompt; only ask when the environment variable is missing/empty.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Import necessary components
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain_community.vectorstores import Chroma, FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import (
    ConversationBufferMemory,
    ConversationBufferWindowMemory,
    ConversationSummaryBufferMemory,
    ConversationTokenBufferMemory,
    ConversationSummaryMemory
)
from langchain.prompts import ChatPromptTemplate, PromptTemplate, MessagesPlaceholder
from langchain_core.messages import get_buffer_string
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools import WikipediaQueryRun, DuckDuckGoSearchRun
from langchain_community.utilities import WikipediaAPIWrapper, DuckDuckGoSearchAPIWrapper
from langchain.agents import AgentType, Tool, initialize_agent, AgentExecutor
from langchain.tools.retriever import create_retriever_tool

# Initialize the LLM
# Shared chat model; later cells reference it through the module-level name `llm`.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# For streaming responses
# Same model and temperature as `llm`, with streaming=True and a stdout callback
# handler attached.
streaming_llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()]
)

print("Setup complete!")

**Create a Sample Knowledge Base**

In [None]:
# Ten short in-memory documents forming the demo knowledge base. Every entry
# carries "source", "topic" and "date" metadata alongside its text.
documents = [
    Document(
        page_content="Retrieval Augmented Generation (RAG) combines retrieval-based and generation-based approaches in natural language processing. It enhances large language models by retrieving relevant information from external knowledge sources before generating a response.",
        metadata={"source": "introduction_to_rag.pdf", "topic": "rag", "date": "2023-01-15"}
    ),
    Document(
        page_content="LLMs like GPT-4 have significant knowledge cutoff dates, after which they don't have access to information. RAG systems can overcome this limitation by retrieving up-to-date information from external sources.",
        metadata={"source": "llm_limitations.pdf", "topic": "rag", "date": "2023-02-20"}
    ),
    Document(
        page_content="Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language. It enables computers to understand, interpret, and generate human language in a valuable way.",
        metadata={"source": "nlp_basics.pdf", "topic": "ai", "date": "2022-12-10"}
    ),
    Document(
        page_content="Transformer architecture revolutionized NLP with its attention mechanism, allowing models to weigh the importance of different words in a sentence. This breakthrough enabled the development of powerful language models like BERT and GPT.",
        metadata={"source": "transformer_architecture.pdf", "topic": "ai", "date": "2023-01-05"}
    ),
    Document(
        page_content="Semantic search uses embeddings to understand the meaning behind search queries rather than just matching keywords. It captures the intent and contextual meaning of queries to provide more relevant results.",
        metadata={"source": "semantic_search.pdf", "topic": "search", "date": "2023-03-12"}
    ),
    Document(
        page_content="Vector databases like Pinecone, Weaviate, and Chroma are specialized for storing and searching vector embeddings. They enable efficient similarity search based on semantic meaning rather than exact keyword matches.",
        metadata={"source": "vector_databases.pdf", "topic": "search", "date": "2023-02-28"}
    ),
    Document(
        page_content="Machine learning models can exhibit bias based on the data they're trained on. This can lead to unfair or discriminatory outcomes. Techniques like adversarial training and data augmentation can help mitigate these biases.",
        metadata={"source": "ai_ethics.pdf", "topic": "ethics", "date": "2023-04-05"}
    ),
    Document(
        page_content="Reinforcement Learning from Human Feedback (RLHF) is a technique used to align AI systems with human preferences. It involves training models using feedback from human evaluators, helping to make AI outputs more helpful, harmless, and honest.",
        metadata={"source": "rlhf_explained.pdf", "topic": "ethics", "date": "2023-03-18"}
    ),
    Document(
        page_content="Edge AI involves running machine learning models directly on edge devices like smartphones or IoT devices, rather than in the cloud. This approach reduces latency, enhances privacy, and enables AI applications in environments with limited connectivity.",
        metadata={"source": "edge_computing.pdf", "topic": "tech", "date": "2023-02-10"}
    ),
    Document(
        page_content="Quantum computing leverages quantum mechanics principles to process information in ways classical computers cannot. While still in early stages, it promises breakthroughs in areas like cryptography, material science, and complex system modeling.",
        metadata={"source": "quantum_computing.pdf", "topic": "tech", "date": "2023-04-22"}
    ),
]

# Create embeddings and vector store
# The FAISS index is built directly from the `documents` list above.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)

print(f"Created vector store with {len(documents)} documents")

# Create a simple retriever
# search_kwargs={"k": 3} requests the top 3 matches per query. Later cells use
# this object through the module-level name `retriever`.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

**10.4.1 Memory Management Strategies**

---

**Short-term vs. Long-term Memory Architectures**


In [None]:
# All five memories below share memory_key="chat_history" and
# return_messages=True; they differ only in class and in the size/LLM
# arguments passed to each constructor.

# 1. Buffer Memory (Short-term, stores everything verbatim)
buffer_memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

# 2. Window Memory (Short-term, keeps last k exchanges)
window_memory = ConversationBufferWindowMemory(
    memory_key="chat_history",
    k=3,  # Number of exchanges to keep
    return_messages=True
)

# 3. Token-limited Memory (Short-term with token limits)
token_memory = ConversationTokenBufferMemory(
    llm=llm,
    memory_key="chat_history",
    max_token_limit=1000,
    return_messages=True
)

# 4. Summary Memory (Long-term, keeps a running summary)
summary_memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True
)

# 5. Summary Buffer Memory (Hybrid approach)
summary_buffer_memory = ConversationSummaryBufferMemory(
    llm=llm,
    memory_key="chat_history",
    max_token_limit=1000,
    return_messages=True
)

**Demonstration of Different Memory Types**

In [None]:
def test_memory_type(memory, questions):
    """Test a memory type with a series of questions"""
    retrieval_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
        verbose=False
    )

    responses = []
    for i, question in enumerate(questions):
        print(f"Question {i+1}: {question}")
        response = retrieval_chain.invoke({"question": question})
        print(f"Response: {response['answer']}\n")
        responses.append(response['answer'])

        # Display current memory state
        mem_variables = memory.load_memory_variables({})

        if isinstance(mem_variables["chat_history"], list):
            chat_length = len(mem_variables["chat_history"])
            print(f"Memory contains {chat_length//2} exchanges")
        else:
            # For summary memory
            print(f"Memory contains summary of conversation")

        print("=" * 50)

    return responses

# Sample conversation flow
# Questions 2-4 refer back to earlier turns ("its", "it"), so they only make
# sense together with the chat history.
questions = [
    "What is RAG?",
    "What are its advantages?",
    "How does it relate to semantic search?",
    "Can it help with LLM knowledge cutoff issues?",
    "What other technology trends are happening in AI?"
]

# Test different memory types
# Only buffer_memory is exercised in this cell.
print("Testing Buffer Memory:")
buffer_responses = test_memory_type(buffer_memory, questions[:2])  # Just test with first two questions

**Implementing a Custom Memory Architecture**

In [None]:
class PrioritizedMemory:
    """Memory that prioritizes entries based on relevance to current query.

    Messages and their embeddings are kept in two parallel lists
    (`messages[i]` corresponds to `embeddings[i]`). At most `max_entries`
    messages are retained; the oldest one is dropped first.

    Args:
        embedding_function: Object exposing `embed_query(text)` that returns a
            sequence of numbers.
        max_entries: Maximum number of messages kept in memory.
    """

    def __init__(self, embedding_function, max_entries=10):
        self.embedding_function = embedding_function
        self.max_entries = max_entries
        self.messages = []
        self.embeddings = []

    def add_message(self, message):
        """Add a message to memory.

        Args:
            message: Mapping with a 'content' key holding the message text.

        Raises:
            Whatever `embed_query` raises. In that case memory is left
            untouched.
        """
        # Embed BEFORE appending: if embedding fails, neither list has been
        # modified, so messages and embeddings cannot get out of step.
        embedding = self.embedding_function.embed_query(message['content'])
        self.messages.append(message)
        self.embeddings.append(embedding)

        # Trim if exceeding max entries (oldest first)
        if len(self.messages) > self.max_entries:
            self.messages.pop(0)
            self.embeddings.pop(0)

    def get_relevant_messages(self, query, k=3):
        """Retrieve the messages most relevant to the current query.

        Args:
            query: Text to compare the stored messages against.
            k: Number of messages to return.

        Returns:
            Up to `k` stored messages ordered by descending cosine similarity
            to the query; ties keep insertion order. Empty list when nothing
            is stored.
        """
        if not self.messages:
            return []

        query_embedding = self.embedding_function.embed_query(query)

        # One similarity score per stored embedding, in storage order.
        scores = [
            self._cosine_similarity(query_embedding, emb)
            for emb in self.embeddings
        ]

        # sorted() is stable, so equal scores keep their original order.
        ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

        return [self.messages[i] for i in ranked[:k]]

    def _cosine_similarity(self, a, b):
        """Compute cosine similarity between two vectors.

        Returns 0.0 when either vector has zero magnitude, instead of raising
        ZeroDivisionError.
        """
        dot_product = sum(x * y for x, y in zip(a, b))
        magnitude_a = sum(x * x for x in a) ** 0.5
        magnitude_b = sum(x * x for x in b) ** 0.5
        denominator = magnitude_a * magnitude_b
        if denominator == 0:
            return 0.0
        return dot_product / denominator

# Example of using the prioritized memory
# Reuses the module-level `embeddings` object as the embedding function;
# max_entries keeps its default of 10.
prioritized_memory = PrioritizedMemory(embedding_function=embeddings)

# We won't run a full test here as it would require deeper integration
print("Prioritized memory initialized")

**10.4.2 Contextual Retrieval for Natural Conversations**

---

**Query Formulation from Chat Context**

In [None]:
def rewrite_query_with_context(query, chat_history):
    """Turn a follow-up query into a self-contained one using chat history.

    Args:
        query: The latest user query.
        chat_history: Sequence of message objects. Only HumanMessage and
            AIMessage instances contribute to the prompt; anything else is
            ignored.

    Returns:
        `query` unchanged when `chat_history` is empty; otherwise the
        stripped text returned by the module-level `llm` for the rewriting
        prompt.
    """
    # Nothing to resolve against: hand the query back untouched.
    if not chat_history:
        return query

    def _render(message):
        """Render one message as a transcript line ('' for unknown types)."""
        if isinstance(message, HumanMessage):
            return f"Human: {message.content}\n"
        if isinstance(message, AIMessage):
            return f"AI: {message.content}\n"
        return ""

    formatted_history = "".join(_render(message) for message in chat_history)

    # Prompt asking the model for a standalone version of the query.
    rewrite_prompt = f"""
    Given the chat history and the latest query, rewrite the query to be more specific and self-contained,
    incorporating relevant context from the chat history.

    Chat History:
    {formatted_history}

    Latest Query: {query}

    Rewritten Query:
    """

    return llm.invoke(rewrite_prompt).content.strip()

# Example of query rewriting
# Note: the history below already ends with the same question that is passed
# as `original_query`, so that question appears in the rewrite prompt both
# inside the history and as the latest query.
chat_history_example = [
    HumanMessage(content="What is a Retrieval Augmented Generation system?"),
    AIMessage(content="Retrieval Augmented Generation (RAG) is a technique that combines retrieval-based and generation-based approaches in natural language processing. It enhances large language models by retrieving relevant information from external knowledge sources before generating a response."),
    HumanMessage(content="What are its main advantages?")
]

original_query = "What are its main advantages?"
rewritten_query = rewrite_query_with_context(original_query, chat_history_example)

print(f"Original query: {original_query}")
print(f"Rewritten query: {rewritten_query}\n")

**Building a Contextual RAG Chain**

In [None]:
def contextual_rag_chain():
    """Create a RAG chain with contextual query understanding.

    The chain keeps its own ConversationBufferMemory. On every turn after the
    first, the question is rewritten with `rewrite_query_with_context` before
    retrieval, and both the original and rewritten queries are printed.

    Returns:
        A callable that accepts ``{"question": <str>}``, returns the answer
        text, and records the question/answer pair in the chain's memory.
    """

    # Create conversation memory
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True
    )

    # Create a template for context-aware retrieval
    prompt_template = """
    Answer the question based on the following context and the chat history.
    If you don't know the answer, just say you don't know.

    Chat History:
    {chat_history}

    Context:
    {context}

    Question: {question}

    Answer:
    """

    qa_prompt = ChatPromptTemplate.from_template(prompt_template)

    def load_history():
        """Return the messages currently held in memory ([] if the key is absent)."""
        return memory.load_memory_variables({}).get("chat_history", [])

    # Define a function to get a string representation of chat history
    def get_chat_history(inputs):
        """Format stored Human/AI messages as 'Human: ...' / 'AI: ...' lines."""
        formatted_history = ""
        for message in load_history():
            if isinstance(message, HumanMessage):
                formatted_history += f"Human: {message.content}\n"
            elif isinstance(message, AIMessage):
                formatted_history += f"AI: {message.content}\n"

        return formatted_history

    # Define the retriever to get relevant documents
    def get_context(inputs):
        """Retrieve documents for the (possibly rewritten) question and join their text."""
        query = inputs["question"]
        chat_history = load_history()

        # For the first question, use it directly; afterwards rewrite it so
        # references to earlier turns are resolved before retrieval.
        search_query = query
        if chat_history:
            search_query = rewrite_query_with_context(query, chat_history)
            print(f"Original query: {query}")
            print(f"Rewritten query: {search_query}")

        # `invoke` is the supported retriever entry point;
        # `get_relevant_documents` is deprecated in recent LangChain releases.
        docs = retriever.invoke(search_query)

        # Format the documents into a string
        return "\n\n".join(doc.page_content for doc in docs)

    # Build the contextual RAG chain
    contextual_chain = (
        {
            "question": lambda x: x["question"],
            "context": get_context,
            "chat_history": get_chat_history
        }
        | qa_prompt
        | llm
    )

    # Add memory updating
    def invoke_with_memory(inputs):
        """Run the chain, then save the question/answer pair to memory."""
        result = contextual_chain.invoke(inputs)
        # Save interaction to memory
        memory.save_context(
            {"input": inputs["question"]},
            {"output": result.content}
        )
        return result.content

    return invoke_with_memory

# Create a contextual RAG chain
# `contextual_rag` is the callable returned by the factory; it takes a dict
# with a "question" key and returns the answer text.
contextual_rag = contextual_rag_chain()

# Test with a conversation flow
# The second and third questions refer back to earlier turns ("they",
# "this technology").
print("Testing Contextual RAG Chain:")
test_conversation = [
    "What are Transformer models in AI?",
    "How did they change NLP?",
    "What are some applications of this technology?"
]

for question in test_conversation:
    print(f"\nQuestion: {question}")
    answer = contextual_rag({"question": question})
    print(f"Answer: {answer}")
    print("=" * 50)

**Balancing Retrieval with Conversational Flow**

In [None]:
def balanced_conversation_chain():
    """Create a chain that balances retrieval with natural conversation.

    The chain keeps its own ConversationBufferMemory and decides per turn,
    using regex heuristics, whether to query the module-level `retriever` or
    to answer without retrieved context.

    Returns:
        A callable that accepts ``{"question": <str>}``, returns the answer
        text, and records the question/answer pair in the chain's memory.
    """
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True
    )

    # Question patterns that likely need retrieval. Defined once here rather
    # than on every call; they are matched against the lower-cased query.
    info_seeking_patterns = (
        r"what (is|are|was|were)",
        r"how (do|does|did)",
        r"why (is|are|was|were)",
        r"when (is|are|was|were)",
        r"where (is|are|was|were)",
        r"who (is|are|was|were)",
        r"define",
        r"explain",
        r"describe"
    )

    # Conversational patterns that likely don't need retrieval
    conversational_patterns = (
        r"^(yes|no|maybe|sure|thanks|thank you|ok|okay|great)$",
        r"can you",
        r"please",
        r"i (think|believe|feel)",
        r"(and|but|or) (what|how) about"
    )

    # Define a function to check if retrieval is needed
    def should_retrieve(query, chat_history):
        """Determine if the query needs knowledge retrieval.

        Decision order:
          1. No chat history yet -> retrieve.
          2. Query matches any information-seeking pattern -> retrieve.
          3. Query has fewer than 4 whitespace-separated words AND matches a
             conversational pattern -> do not retrieve.
          4. Otherwise -> retrieve.

        Retrieval is therefore skipped only for very short queries that match
        a conversational pattern.
        """
        if not chat_history:
            return True  # Always retrieve for first question

        lowered = query.lower()

        if any(re.search(pattern, lowered) for pattern in info_seeking_patterns):
            return True

        only_conversational = any(
            re.search(pattern, lowered) for pattern in conversational_patterns
        )

        # If query is very short, likely conversational
        if len(query.split()) < 4 and only_conversational:
            return False

        # Default to retrieving
        return True

    # Define retrieval logic
    def conditional_retrieval(inputs):
        """Return retrieved context text, or a fixed notice when retrieval is skipped."""
        query = inputs["question"]
        memory_variables = memory.load_memory_variables({})
        chat_history = memory_variables.get("chat_history", [])

        # Determine if retrieval is needed
        if should_retrieve(query, chat_history):
            print("Retrieval activated for this query")
            # Rewrite query with context (returns the query unchanged when
            # there is no history yet)
            rewritten_query = rewrite_query_with_context(query, chat_history)
            # `invoke` is the supported retriever entry point;
            # `get_relevant_documents` is deprecated in recent LangChain releases.
            docs = retriever.invoke(rewritten_query)
            context = "\n\n".join(doc.page_content for doc in docs)
        else:
            print("Conversational mode (no retrieval) for this query")
            context = "No retrieval performed - use your general knowledge for this conversational query."

        return context

    # Single prompt template used for both modes; only the {context} value
    # differs between retrieval and conversational turns.
    standard_prompt = """
    Answer the following question based on the context and chat history.

    Chat History:
    {chat_history}

    Context from knowledge base:
    {context}

    Question: {question}

    Answer:
    """

    # Build the chain
    balanced_chain = (
        {
            "question": lambda x: x["question"],
            "context": conditional_retrieval,
            "chat_history": lambda x: get_buffer_string(memory.chat_memory.messages)
        }
        | ChatPromptTemplate.from_template(standard_prompt)
        | llm
    )

    # Add memory updating
    def invoke_with_memory(inputs):
        """Run the chain, then save the question/answer pair to memory."""
        result = balanced_chain.invoke(inputs)
        # Save interaction to memory
        memory.save_context(
            {"input": inputs["question"]},
            {"output": result.content}
        )
        return result.content

    return invoke_with_memory

# Create a balanced conversation chain
# `balanced_convo` is the callable returned by the factory; it takes a dict
# with a "question" key and returns the answer text.
balanced_convo = balanced_conversation_chain()

# Test with mixed retrieval and conversational questions
# Each turn prints which mode (retrieval vs. conversational) the chain chose.
print("Testing Balanced Conversation Chain:")
mixed_conversation = [
    "What is semantic search technology?",
    "That sounds interesting!",
    "How does it relate to vector databases?",
    "Thank you for explaining that",
    "What ethical concerns should I be aware of?"
]

for question in mixed_conversation:
    print(f"\nQuestion: {question}")
    answer = balanced_convo({"question": question})
    print(f"Answer: {answer}")
    print("=" * 50)

**10.4.3 Tool Integration for Enhanced Capabilities**

---

**Combining RAG with External Tools**

In [10]:
# Define some external tools
# Each Tool wraps the bound `.run` method of a freshly constructed
# query-runner instance.
wikipedia_tool = Tool(
    name="Wikipedia",
    func=WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper()).run,
    description="Useful for searching for information on Wikipedia. Input should be a search query."
)

search_tool = Tool(
    name="DuckDuckGo Search",
    func=DuckDuckGoSearchRun(api_wrapper=DuckDuckGoSearchAPIWrapper()).run,
    description="Useful for searching for current information on the web. Input should be a search query."
)

# Create a retriever tool from our knowledge base
# Wraps the module-level `retriever` built over the sample documents.
retriever_tool = create_retriever_tool(
    retriever,
    "Knowledge Base",
    "Useful for searching our internal knowledge base for information on AI, RAG, and technology."
)

# List of tools
# Passed to initialize_agent by create_rag_with_tools_chain later in the notebook.
tools = [retriever_tool, wikipedia_tool, search_tool]

**Building a Tool-Selection Framework**

In [None]:
def create_tool_selection_chain():
    """Build a prompt-to-LLM chain that recommends tools for a question.

    The prompt describes the three available tools in plain text and asks for
    a comma-separated list of tool names (or "none"). The chain only produces
    that recommendation; it does not run any tool.

    Returns:
        The runnable formed by piping the prompt template into the
        module-level `llm`. It expects an input with a "question" key.
    """
    selection_template = """
    You need to decide which tool(s) to use for answering the user's question.

    User Question: {question}

    Available tools:
    1. Knowledge Base: Our internal knowledge base with information on AI, RAG, and technology.
    2. Wikipedia: Access to Wikipedia for general knowledge and concepts.
    3. DuckDuckGo Search: Web search for current or specific information not in our knowledge base.

    For this question, which tool(s) would be most appropriate? Respond with a comma-separated list of tool names or "none" if no tools are needed.

    Selected tool(s):
    """

    # Pipe the prompt straight into the shared chat model.
    return PromptTemplate.from_template(selection_template) | llm

# Create the tool selection chain
tool_selector = create_tool_selection_chain()

# Example queries
# A mix of an in-domain question, two general-knowledge questions and a
# purely conversational remark.
tool_test_queries = [
    "What is RAG technology?",
    "What happened in the 2024 Olympics?",
    "Who invented the transistor?",
    "I'm feeling tired today"
]

# Test tool selection
# Only the model's recommendation is printed; no tool is executed here.
for query in tool_test_queries:
    print(f"Query: {query}")
    selection = tool_selector.invoke({"question": query})
    print(f"Recommended tools: {selection.content}\n")

**Implementing a RAG Chain with Tool Integration**

In [None]:
def create_rag_with_tools_chain():
    """Build a conversational agent over the module-level `tools` list.

    The agent is created with `initialize_agent` using the shared `llm`, the
    chat-conversational ReAct agent type, and a window memory of the last
    five exchanges. Verbose output is enabled for demonstration.

    Returns:
        A callable that accepts ``{"question": <str>}`` and returns the
        agent's "output" value.
    """
    # Window memory: keep only the most recent 5 exchanges.
    recent_turns = ConversationBufferWindowMemory(
        memory_key="chat_history",
        k=5,
        return_messages=True,
    )

    agent_executor = initialize_agent(
        tools,
        llm,
        agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
        memory=recent_turns,
        verbose=True,  # For demonstration, we'll keep this verbose
    )

    def invoke_agent(inputs):
        """Map the notebook's {"question": ...} convention onto the agent's {"input": ...}."""
        return agent_executor.invoke({"input": inputs["question"]})["output"]

    return invoke_agent

# Create the RAG with tools chain
# This only constructs the agent; nothing is sent to it in this cell.
rag_with_tools = create_rag_with_tools_chain()

# We won't run this agent right now as it requires actual API calls,
# but below is how you would invoke it
print("RAG with Tools chain created (won't run API calls in this demo)")
# result = rag_with_tools({"question": "What is RAG and how does it relate to current AI trends?"})
# print(result)

**10.4.4 Optimizing for Extended Conversations**

---

**Managing Growing Chat Histories**


In [None]:
def token_counter(text):
    """Estimate the token count of `text` from its word count.

    This is a rough heuristic, not a tokenizer: the number of
    whitespace-separated words is scaled by 1.3.

    Returns:
        The estimate as a float.
    """
    words = text.split()
    # 1.3 is the assumed average tokens-per-word ratio.
    return len(words) * 1.3

def create_token_optimized_memory(max_tokens=2000):
    """Build a token-limited conversation memory.

    Args:
        max_tokens: Value passed as `max_token_limit` to the memory.

    Returns:
        A ConversationTokenBufferMemory bound to the module-level `llm`,
        keyed as "chat_history" and returning message objects.
    """
    return ConversationTokenBufferMemory(
        llm=llm,
        max_token_limit=max_tokens,
        memory_key="chat_history",
        return_messages=True,
    )

# Create token-optimized memory
# Uses the factory's default limit (max_tokens=2000).
token_optimized_memory = create_token_optimized_memory()
print("Token-optimized memory created")

**Summarization and Compression Techniques**

In [None]:
def create_hybrid_memory():
    """Build a memory that combines recent messages with a summary.

    Returns:
        A ConversationSummaryBufferMemory bound to the module-level `llm`,
        with a 1000-token limit, keyed as "chat_history" and returning
        message objects.
    """
    return ConversationSummaryBufferMemory(
        llm=llm,
        max_token_limit=1000,
        memory_key="chat_history",
        return_messages=True,
    )

def compress_history(messages, max_tokens=1000):
    """Compress a long conversation history.

    If the estimated token count (via `token_counter`) is within
    `max_tokens`, or there are at most 6 messages, the input list is returned
    unchanged. Otherwise everything except the last 6 messages is summarised
    by the module-level `llm` and replaced by a single SystemMessage.

    Args:
        messages: List of message objects exposing `.content` (and, for
            speaker labelling, a `.type` of "human", "ai" or "system").
        max_tokens: Estimated-token budget below which no compression occurs.

    Returns:
        Either `messages` itself, or a new list
        ``[SystemMessage(summary)] + messages[-6:]``.
    """
    # If messages are short enough, return as is
    estimated_tokens = sum(token_counter(msg.content) for msg in messages)
    if estimated_tokens <= max_tokens:
        return messages

    # Keep the most recent 3 exchanges (6 messages) as is. With 6 or fewer
    # messages there is nothing older to summarise.
    if len(messages) <= 6:
        return messages
    recent_messages = messages[-6:]
    old_messages = messages[:-6]

    # Label each turn by the message's own type rather than by its position.
    # Position-based labelling mislabels every turn as soon as the history
    # starts with a system summary (e.g. when an already-compressed history
    # is compressed again) or does not strictly alternate.
    role_labels = {"human": "Human", "ai": "AI", "system": "System"}

    def _speaker(msg):
        """Return the transcript label for `msg`, falling back to its class name."""
        return role_labels.get(getattr(msg, "type", None)) or type(msg).__name__

    # Create a summary of old messages
    old_messages_text = "\n".join(
        f"{_speaker(msg)}: {msg.content}" for msg in old_messages
    )

    summary_prompt = f"""
    Summarize the following conversation in a concise way that preserves the key information,
    especially topics, entities, and important details that might be referenced later.
    Keep your summary under 200 words.

    Conversation:
    {old_messages_text}

    Summary:
    """

    summary = llm.invoke(summary_prompt).content

    # Create a system message with the summary
    system_summary = SystemMessage(content=f"Previous conversation summary: {summary}")

    # Return the summary and recent messages
    return [system_summary] + recent_messages

# Example of conversation compression
# Four alternating Human/AI exchanges (8 messages).
sample_history = [
    HumanMessage(content="What is RAG technology?"),
    AIMessage(content="Retrieval Augmented Generation (RAG) is a technique that combines retrieval-based and generation-based approaches in natural language processing. It enhances large language models by retrieving relevant information from external knowledge sources before generating a response."),
    HumanMessage(content="What are the main components of a RAG system?"),
    AIMessage(content="A RAG system typically consists of: 1) A document store or knowledge base 2) An embedding model to convert documents into vector representations 3) A vector database for efficient similarity search 4) A retriever that finds relevant documents for a query 5) A large language model that generates responses based on the retrieved information and 6) A prompt template that structures the interaction between these components."),
    HumanMessage(content="How does the retrieval process work?"),
    AIMessage(content="The retrieval process in RAG works by first converting the user's query into a vector embedding using the same embedding model used for the documents. Then, a similarity search is performed in the vector database to find documents whose embeddings are closest to the query embedding (using metrics like cosine similarity). The most similar documents are retrieved and provided as context to the language model along with the original query to generate an informed response."),
    HumanMessage(content="What are some challenges with RAG systems?"),
    AIMessage(content="Some key challenges with RAG systems include: 1) Retrieval quality - ensuring the most relevant documents are found 2) Context window limitations - balancing comprehensive context with token limits 3) Hallucination mitigation - even with retrieval, LLMs can still generate incorrect information 4) Latency - the additional retrieval step adds processing time 5) Handling ambiguous queries that may not clearly map to specific documents 6) Keeping the knowledge base up-to-date with fresh information.")
]

# Compress the history
# NOTE(review): by token_counter's word-based estimate this sample appears to
# be well under compress_history's default max_tokens=1000, in which case the
# list is returned unchanged and the summary branch below is skipped. Confirm,
# or pass a smaller max_tokens to see compression happen.
compressed_history = compress_history(sample_history)

print(f"Original message count: {len(sample_history)}")
print(f"Compressed message count: {len(compressed_history)}")

# Show the system message summary if compression happened
# A shorter result means the first element is the summary message.
if len(compressed_history) < len(sample_history):
    system_msg = compressed_history[0]
    print("\nCompressed Summary:")
    print(system_msg.content)

**Maintaining Coherence Across Sessions**

In [None]:
class PersistentConversationManager:
    """Manages persistent conversations across sessions.

    The conversation is stored as JSON in
    ``<storage_path>/<user_id>_conversation.json`` with the shape
    ``{"history": [{"type": "human" | "ai" | "system", "content": str}, ...]}``.

    Note: `user_id` is interpolated into the file name as-is, so callers
    should pass a trusted identifier.

    Args:
        user_id: Identifier used to build the conversation file name.
        storage_path: Directory for conversation files; created if missing.
    """

    def __init__(self, user_id, storage_path="./conversations"):
        self.user_id = user_id
        self.storage_path = storage_path
        self.current_session = []
        self.llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

        # Create storage directory if it doesn't exist
        os.makedirs(storage_path, exist_ok=True)

    def get_conversation_file(self):
        """Get the path to the conversation file for this user."""
        return os.path.join(self.storage_path, f"{self.user_id}_conversation.json")

    def load_conversation_history(self):
        """Load conversation history from storage.

        Returns:
            List of message objects. Empty when no file exists or when the
            file cannot be read/parsed (the error is printed, not raised).
            Entries with an unknown 'type' are skipped.
        """
        file_path = self.get_conversation_file()
        if not os.path.exists(file_path):
            return []

        # Built inside the method so the message classes are resolved at call time.
        message_classes = {
            'human': HumanMessage,
            'ai': AIMessage,
            'system': SystemMessage,
        }

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Convert to message objects
            history = []
            for msg in data.get('history', []):
                message_class = message_classes.get(msg['type'])
                if message_class is not None:
                    history.append(message_class(content=msg['content']))

            return history
        except Exception as e:
            # Deliberate best-effort: a damaged file starts a fresh history.
            print(f"Error loading conversation: {e}")
            return []

    def save_conversation_history(self, history):
        """Save conversation history to storage.

        Args:
            history: Iterable of message objects; anything that is not a
                Human/AI/System message is left out of the file.
        """
        file_path = self.get_conversation_file()

        # Checked in this order, mirroring the type names used when loading.
        type_names = (
            (HumanMessage, 'human'),
            (AIMessage, 'ai'),
            (SystemMessage, 'system'),
        )

        # Convert to serializable format
        serializable_history = []
        for msg in history:
            for message_class, type_name in type_names:
                if isinstance(msg, message_class):
                    serializable_history.append({'type': type_name, 'content': msg.content})
                    break

        # Save to file
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump({'history': serializable_history}, f)

    def start_new_session(self):
        """Start a new conversation session.

        Loads any stored history and passes it through `compress_history`.

        Returns:
            The list now used as `current_session`.
        """
        # Load previous history
        previous_history = self.load_conversation_history()

        # If history exists, create a summary to maintain context
        if previous_history:
            # Compress the history to create a summary
            self.current_session = compress_history(previous_history)
            print(f"Loaded previous conversation with {len(previous_history)} messages")
            print(f"Compressed to {len(self.current_session)} messages")
        else:
            # Start fresh
            self.current_session = []
            print("Starting new conversation with no previous history")

        return self.current_session

    def add_message(self, message, is_human=True):
        """Add a message to the current session and persist the session.

        Args:
            message: Message text.
            is_human: True to record a HumanMessage, False for an AIMessage.
        """
        if is_human:
            self.current_session.append(HumanMessage(content=message))
        else:
            self.current_session.append(AIMessage(content=message))

        # Save after each message
        self.save_conversation_history(self.current_session)

    def get_response(self, query):
        """Get a response to a query using the conversation history.

        The query is recorded (and saved) first, context is fetched from the
        module-level `retriever`, and the model is called with a system
        prompt followed by the whole current session.

        Returns:
            The model's response text, which is also appended to the session.
        """
        # Add the query to the session
        self.add_message(query, is_human=True)

        # Create context for the query using RAG
        context = ""
        if retriever:
            # `invoke` is the supported retriever entry point;
            # `get_relevant_documents` is deprecated in recent LangChain releases.
            docs = retriever.invoke(query)
            if docs:
                context = "\n\n".join(doc.page_content for doc in docs)

        # Create prompt with history and context
        prompt = f"""
        You are a helpful AI assistant. Respond to the user's query based on the conversation history and provided context.

        Context: {context}

        Respond to the latest message from the user.
        """

        # Generate response
        messages = [SystemMessage(content=prompt)] + self.current_session
        response = self.llm.invoke(messages)

        # Add response to session
        self.add_message(response.content, is_human=False)

        return response.content

    def end_session(self):
        """End the current session and save."""
        self.save_conversation_history(self.current_session)
        print("Session ended and saved")

# Create a conversation manager for demonstration
# Note: In a real application, each user would have their own ID
# Uses the default storage_path ("./conversations"), which the constructor
# creates if it does not exist.
demo_manager = PersistentConversationManager(user_id="demo_user")

# Start a new session
# `history` is the same list object the manager keeps as current_session.
history = demo_manager.start_new_session()

# Simulate a short conversation
print("\nSimulating conversation with persistent manager:")
queries = [
    "What are the key benefits of RAG systems?",
    "How do they handle up-to-date information?"
]

# Each get_response call records the query, fetches context, calls the model
# and saves the updated session to disk.
for query in queries:
    print(f"\nHuman: {query}")
    response = demo_manager.get_response(query)
    print(f"AI: {response}")

# End the session
demo_manager.end_session()

**Implementing a Complete RAG Chatbot**

In [None]:
class ComprehensiveRAGChatbot:
    """A complete RAG chatbot with memory, tools, and contextual understanding.

    Queries that look information-seeking are routed to a conversational
    ReAct agent equipped with a retriever tool; all other queries are answered
    by a direct RAG call (retrieve context, prepend history, invoke the LLM).
    Both paths read and write the same memory object through its public
    ``load_memory_variables`` / ``save_context`` interface.

    Args:
        retriever: Retriever exposing ``get_relevant_documents``. May be
            ``None``, in which case no tools/agent are created and the direct
            path runs with an empty context.
        memory_type: One of ``"buffer"``, ``"window"``, ``"token"`` or
            ``"summary"``; any other value selects summary-buffer memory.
        max_tokens: Token limit passed to the token and summary-buffer memories.
    """

    # Precompiled once at class creation. Leading word boundaries prevent
    # matches inside longer words (e.g. "somewhat is"); "explain"/"describe"
    # are bounded on both sides so that a remark such as
    # "Thanks for explaining that!" is not mistaken for a request.
    _INFO_SEEKING_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r"\bwhat (is|are|was|were)",
            r"\bhow (do|does|did)",
            r"\bwhy (is|are|was|were)",
            r"\bwhen (is|are|was|were)",
            r"\bwhere (is|are|was|were)",
            r"\bwho (is|are|was|were)",
            r"\btell me about",
            r"\bexplain\b",
            r"\bdescribe\b",
        )
    )

    def __init__(self, retriever, memory_type="summary_buffer", max_tokens=2000):
        self.retriever = retriever
        self.memory_type = memory_type
        self.max_tokens = max_tokens

        # Initialize LLM
        self.llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.3)

        # Initialize memory based on type (needs self.llm for some memory types)
        self.memory = self._initialize_memory()

        # Initialize tools
        self.tools = self._initialize_tools()

        # Initialize the agent only when there is at least one tool to give it
        if self.tools:
            self.agent = initialize_agent(
                self.tools,
                self.llm,
                agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
                memory=self.memory,
                verbose=False
            )
        else:
            self.agent = None

    def _initialize_memory(self):
        """Build the memory object selected by ``self.memory_type``.

        Every variant uses ``memory_key="chat_history"`` and
        ``return_messages=True``. Unrecognised types fall back to
        summary-buffer memory.
        """
        if self.memory_type == "buffer":
            return ConversationBufferMemory(
                memory_key="chat_history",
                return_messages=True
            )
        elif self.memory_type == "window":
            return ConversationBufferWindowMemory(
                memory_key="chat_history",
                k=5,
                return_messages=True
            )
        elif self.memory_type == "token":
            return ConversationTokenBufferMemory(
                llm=self.llm,
                memory_key="chat_history",
                max_token_limit=self.max_tokens,
                return_messages=True
            )
        elif self.memory_type == "summary":
            return ConversationSummaryMemory(
                llm=self.llm,
                memory_key="chat_history",
                return_messages=True
            )
        else:  # Default to summary buffer
            return ConversationSummaryBufferMemory(
                llm=self.llm,
                memory_key="chat_history",
                max_token_limit=self.max_tokens,
                return_messages=True
            )

    def _initialize_tools(self):
        """Return the list of tools for the agent.

        Returns an empty list when no retriever was supplied, which makes
        ``__init__`` skip agent creation.
        """
        if self.retriever is None:
            return []

        # Create retriever tool
        retriever_tool = create_retriever_tool(
            self.retriever,
            "Knowledge Base",
            "Useful for searching our internal knowledge base for information on AI and technology."
        )

        # For simplicity, we'll just use the retriever tool
        # In a real implementation, you might add more tools
        return [retriever_tool]

    def _should_use_tools(self, query):
        """Return True when ``query`` looks like an information-seeking request.

        This is a simplified keyword heuristic: the lower-cased query is
        checked against ``_INFO_SEEKING_PATTERNS``.
        """
        lowered = query.lower()
        return any(pattern.search(lowered) for pattern in self._INFO_SEEKING_PATTERNS)

    def _get_context_directly(self, query):
        """Get context directly from the retriever (no tools).

        Returns the retrieved documents' ``page_content`` joined by blank
        lines, or an empty string when there is no retriever.
        """
        if self.retriever is None:
            return ""
        docs = self.retriever.get_relevant_documents(query)
        return "\n\n".join(doc.page_content for doc in docs)

    def chat(self, query):
        """Process a user query and return the response text.

        Information-seeking queries go through the agent (when one exists);
        everything else is answered with a single LLM call that receives the
        retrieved context, the conversation history, and the query.
        """
        # Check if we should use tools
        use_tools = self._should_use_tools(query) and self.agent is not None

        if use_tools:
            # Use the agent with tools; the agent was constructed with
            # self.memory, so history handling happens inside the executor
            result = self.agent.invoke({"input": query})
            return result["output"]
        else:
            # Use direct RAG (no tools)
            # Get context from retriever
            context = self._get_context_directly(query)

            # Get chat history through the memory's own interface so that
            # windowing, token limits and summaries are respected, rather
            # than reading the raw, unpruned chat_memory message list.
            chat_history = self.memory.load_memory_variables({}).get(
                self.memory.memory_key, []
            )

            # Create messages for the LLM
            messages = [
                SystemMessage(content=f"""
                You are a helpful AI assistant. Answer the user's question based on the
                provided context and conversation history. If the context doesn't contain
                relevant information, use your general knowledge but be clear about what
                you know vs. what you're inferring.

                Context from knowledge base:
                {context}
                """)
            ]

            # Add chat history
            messages.extend(chat_history)

            # Add the current query
            messages.append(HumanMessage(content=query))

            # Generate response
            response = self.llm.invoke(messages)

            # Update memory via save_context so the memory can prune or
            # summarise, keeping it consistent with what the agent path sees
            self.memory.save_context({"input": query}, {"output": response.content})

            return response.content

# Create a comprehensive RAG chatbot with summary-buffer memory.
# Expects `retriever` to already exist in the notebook namespace.
comprehensive_chatbot = ComprehensiveRAGChatbot(retriever, memory_type="summary_buffer")

# Test the chatbot with a multi-turn conversation: a follow-up that relies on
# the previous turn ("it"), a change of topic, and a conversational remark
print("\nTesting Comprehensive RAG Chatbot:")
conversation = [
    "What is Retrieval Augmented Generation?",
    "How does it compare to traditional question answering?",
    "Can you tell me about transformer architecture?",
    "Thanks for explaining that!",
    "What about ethical considerations in AI development?"
]

# Run each turn through chat(), which decides per query whether to answer
# via the agent or via direct retrieval, and print both sides of the exchange
for query in conversation:
    print(f"\nHuman: {query}")
    response = comprehensive_chatbot.chat(query)
    print(f"AI: {response}")

**Conclusion**

In this notebook, we've implemented and demonstrated the key components of building intelligent RAG-powered chatbots:

1. **Memory Management Strategies** - Different approaches to managing conversation history
2. **Contextual Retrieval** - Techniques for improved query understanding and relevant document retrieval
3. **Tool Integration** - Methods for combining RAG with external tools
4. **Optimizing for Extended Conversations** - Strategies for handling long interactions

These building blocks can be combined and customized to create sophisticated conversational AI systems tailored to specific use cases and domains.