# EcoHome RAG Setup
## Set up Retrieval-Augmented Generation (RAG) with ChromaDB

This notebook:
1. Loads energy-saving documents from the knowledge base
2. Splits documents into chunks for embedding
3. Creates embeddings using OpenAI
4. Stores embeddings in ChromaDB vector database
5. Tests retrieval with sample queries

In [None]:
# Standard-library modules plus python-dotenv for environment loading
import sys
import os
from dotenv import load_dotenv

# Load environment variables via python-dotenv
load_dotenv()

# Report whether the OpenAI credential is present; the cell keeps running either way
if os.getenv("OPENAI_API_KEY"):
    print("✓ OpenAI API key found")
else:
    print("✗ Error: OPENAI_API_KEY not found in environment variables")
    print("Please set your OpenAI API key in .env file")

# Make the parent of the current working directory importable
sys.path.append(os.path.dirname(os.getcwd()))

# LangChain components used by the remaining cells
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

print("✓ Imports successful")

## Step 1: Load Knowledge Base Documents

Load every `.txt` file found (recursively) under the `data/documents/` directory.

In [None]:
# Directory holding the knowledge-base text files.
# The path is relative, so it is resolved against the current working directory.
documents_path = "./data/documents/"

print(f"Loading documents from: {os.path.abspath(documents_path)}")

# Match every .txt file under documents_path (the "**" glob also covers
# subdirectories) and read each one with TextLoader as UTF-8.
loader = DirectoryLoader(
    documents_path,
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True
)

# Load documents
documents = loader.load()

if documents:
    print(f"\n✓ Loaded {len(documents)} documents")
    print("\nDocuments:")
    for doc in documents:
        source = doc.metadata.get("source", "Unknown")
        # Keep only the last path component, whichever separator the path uses
        filename = source.split("\\")[-1] if "\\" in source else source.split("/")[-1]
        print(f"  - {filename}: {len(doc.page_content)} characters")
else:
    # An empty result used to be reported with a success tick, and the problem
    # only surfaced later as a division by zero. Report it here instead.
    print(f"\n✗ Error: No .txt documents found under {os.path.abspath(documents_path)}")
    print("Please check the documents path and the current working directory")

## Step 2: Split Documents into Chunks

Split long documents into smaller chunks for better retrieval and embedding.

In [None]:
# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,        # Maximum chunk size, measured with length_function (characters)
    chunk_overlap=200,      # Overlap between chunks
    length_function=len,
    # Separators from coarsest to finest: blank line, newline, sentence end,
    # space, and finally the empty string (any character)
    separators=["\n\n", "\n", ". ", " ", ""]
)

# Split documents
print("Splitting documents into chunks...")
chunks = text_splitter.split_documents(documents)

print(f"✓ Created {len(chunks)} chunks from {len(documents)} documents")

if chunks:
    print(f"\nAverage chunk size: {sum(len(chunk.page_content) for chunk in chunks) / len(chunks):.0f} characters")
    print("\nSample chunk:")
    print("-" * 50)
    print(chunks[0].page_content[:500] + "...")
    print("-" * 50)
else:
    # With no chunks the average would divide by zero and chunks[0] would
    # raise IndexError, so report the empty result explicitly instead.
    print("\n✗ Error: No chunks were produced - check that Step 1 loaded at least one non-empty document")

## Step 3: Create Embeddings and Vector Store

Generate embeddings for all chunks and store them in ChromaDB for fast retrieval.

In [None]:
# Initialize embeddings model
print("Initializing OpenAI embeddings...")
embeddings = OpenAIEmbeddings()
print("✓ Embeddings model initialized")

# Set up ChromaDB directory
chroma_db_path = "./chroma_db"
print(f"\nCreating vector store at: {os.path.abspath(chroma_db_path)}")

# Create vector store
print("\nGenerating embeddings and building vector store...")
print("(This may take a minute for large knowledge bases)")

# Give every chunk a stable, position-based ID. Without explicit IDs each run
# of this cell stores the chunks under fresh random IDs, so re-running the
# notebook against an existing ./chroma_db piles up duplicate entries and
# retrieval starts returning the same passage several times. With stable IDs
# a re-run overwrites the same entries instead of adding new ones.
# NOTE(review): entries written by earlier runs without these IDs, or left
# over when the chunk count shrinks, are not removed - delete the directory
# for a completely clean rebuild.
chunk_ids = [f"chunk-{index}" for index in range(len(chunks))]

vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory=chroma_db_path
)

print(f"\n✓ Vector store created with {len(chunks)} embedded chunks")
print(f"✓ Persisted to: {os.path.abspath(chroma_db_path)}")

## Step 4: Test Retrieval

Test the RAG system with sample queries to verify it's working correctly.

In [None]:
# Sample questions used to smoke-test retrieval
test_queries = [
    "How can I reduce my HVAC energy costs?",
    "What are the best practices for EV charging?",
    "How do I optimize my solar battery storage?",
    "What are good energy-saving tips for summer?"
]

print("Testing retrieval with sample queries...\n")
print("=" * 80)

for query in test_queries:
    print(f"\nQuery: {query}")
    print("-" * 80)

    # Fetch the two closest chunks for this question
    results = vector_store.similarity_search(query, k=2)

    print(f"Found {len(results)} relevant results:\n")

    for rank, hit in enumerate(results, 1):
        source = hit.metadata.get("source", "Unknown")
        # Split on a backslash when the path has one, otherwise on a forward
        # slash, and keep the last component
        separator = "\\" if "\\" in source else "/"
        filename = source.split(separator)[-1]

        print(f"{rank}. From: {filename}")
        print(f"   Content: {hit.page_content[:300]}...")
        print()

    print("=" * 80)

print("\n✓ Retrieval tests complete!")

## Step 5: Test with Similarity Search and Scores

Test retrieval with similarity scores to understand relevance. Note that lower scores indicate closer (more relevant) matches.

In [None]:
# Single query used to inspect the score returned with each result
test_query = "How to save energy with smart home automation?"

print(f"Query: {test_query}\n")
print("=" * 80)

# Retrieve the three closest chunks as (document, score) pairs
results_with_scores = vector_store.similarity_search_with_score(test_query, k=3)

print("\nTop 3 Results with Similarity Scores:\n")

for rank, (hit, score) in enumerate(results_with_scores, 1):
    source = hit.metadata.get("source", "Unknown")
    # Split on a backslash when the path has one, otherwise on a forward
    # slash, and keep the last component
    separator = "\\" if "\\" in source else "/"
    filename = source.split(separator)[-1]

    print(f"{rank}. Relevance Score: {score:.4f}")
    print(f"   Source: {filename}")
    print("   Content Preview:")
    print(f"   {hit.page_content[:400]}...")
    print("\n" + "-" * 80 + "\n")

print("\n✓ Note: Lower scores indicate higher similarity/relevance")

## Step 6: Verify Vector Store Persistence

Verify that the vector store was saved and can be reloaded.

In [None]:
# Confirm the Chroma directory is on disk and can be opened again
print("Verifying vector store persistence...\n")

if not os.path.exists(chroma_db_path):
    print(f"✗ Error: Vector store directory not found at {os.path.abspath(chroma_db_path)}")
else:
    print(f"✓ Vector store directory exists: {os.path.abspath(chroma_db_path)}")

    # Report the size of each regular file directly inside the directory;
    # subdirectories are skipped
    entries = os.listdir(chroma_db_path)
    print("\nFiles in vector store:")
    for entry in entries:
        entry_path = os.path.join(chroma_db_path, entry)
        if not os.path.isfile(entry_path):
            continue
        print(f"  - {entry}: {os.path.getsize(entry_path):,} bytes")

    # Open the persisted directory through a second Chroma handle
    print("\nTesting vector store reload...")
    reloaded_store = Chroma(
        embedding_function=embeddings,
        persist_directory=chroma_db_path
    )

    # A one-result query shows the reloaded store answers searches
    test_results = reloaded_store.similarity_search("solar panels", k=1)
    print("✓ Successfully reloaded vector store")
    print(f"✓ Verified with test query - found {len(test_results)} result(s)")

## Step 7: RAG System Statistics

Display summary statistics about the RAG system.

In [None]:
# Calculate statistics
total_chars = sum(len(doc.page_content) for doc in documents)
total_words = sum(len(doc.page_content.split()) for doc in documents)

# Guard every average so an empty knowledge base prints zeros instead of
# raising ZeroDivisionError part-way through the report.
avg_doc_chars = total_chars // len(documents) if documents else 0
avg_chunk_chars = sum(len(chunk.page_content) for chunk in chunks) / len(chunks) if chunks else 0.0
avg_chunk_words = sum(len(chunk.page_content.split()) for chunk in chunks) / len(chunks) if chunks else 0.0

# Read the model name from the embeddings object so the report stays accurate
# if a different model is configured; fall back to the previously hard-coded
# name when the attribute is missing.
embedding_model = getattr(embeddings, "model", "text-embedding-ada-002")

print("RAG System Statistics")
print("=" * 80)
print("\nKnowledge Base:")
print(f"  Documents: {len(documents)}")
print(f"  Total characters: {total_chars:,}")
print(f"  Total words: {total_words:,}")
print(f"  Average document size: {avg_doc_chars:,} characters")

print("\nChunking:")
print(f"  Total chunks: {len(chunks)}")
print(f"  Average chunk size: {avg_chunk_chars:.0f} characters ({avg_chunk_words:.0f} words)")
# NOTE: mirrors the chunk_overlap value passed to the text splitter in Step 2
print("  Chunk overlap: 200 characters")

print("\nVector Store:")
print("  Database: ChromaDB")
print(f"  Location: {os.path.abspath(chroma_db_path)}")
print(f"  Embeddings: OpenAI ({embedding_model})")
print(f"  Embedded chunks: {len(chunks)}")

print("\nUsage:")
print("  The RAG system is now ready for use by the EcoHome agent")
print(f"  It will automatically search these {len(chunks)} chunks to answer questions")
print("  Average retrieval: 2-3 most relevant chunks per query")

print("\n" + "=" * 80)
print("✓ RAG setup complete!")

## Summary

The RAG system has been successfully set up:
- All knowledge base documents loaded and embedded
- ChromaDB vector store created and persisted
- Retrieval tested and verified working
- System ready for use by the EcoHome agent

The vector store is now available for:
- The `search_energy_tips` tool in tools.py
- The EcoHome agent to provide informed recommendations
- Direct queries for energy-saving information

Next steps:
1. Run `03_run_and_evaluate.ipynb` to test the complete agent system