# RAG Document Assistant Demo

This notebook demonstrates the core functionality of the RAG Document Assistant.

In [1]:
# Import necessary libraries
import os
import sys
from dotenv import load_dotenv
import pandas as pd
import matplotlib.pyplot as plt

# Add the project directory to the path
sys.path.append(os.path.abspath('.'))

# Load environment variables
load_dotenv()

# Import custom modules
from v1.src.config import Config
from v1.src.embedding import DocumentEmbedder
from v1.src.retriever import RAGRetriever
from v1.src.query_optimization import QueryOptimizer

## 1. Initialize Components

First, let's initialize the core components of our RAG system.

In [2]:
# Build the four core RAG components; each one is given the same YAML config path.
config_path = "v1/config/config.yaml"

# Report whether the config file is present. This check is informational only:
# the initialization below is attempted in either case.
if not os.path.exists(config_path):
    print(f"Config file not found at {config_path}")
else:
    print(f"Config file found at {config_path}")

# Construct the components one after another, announcing each success so a
# failure can be attributed to the first component whose message is missing.
# NOTE(review): the broad except only prints the error. Any of config, embedder,
# retriever or optimizer that was not assigned before the failure stays
# undefined (or keeps the object from an earlier run of this cell), so later
# cells that use it will fail or silently work with stale state.
try:
    config = Config(config_path)
    print("Config loaded successfully")
    
    # The components below receive the config *path*, not the Config object.
    embedder = DocumentEmbedder(config_path)
    print("Document embedder initialized")
    
    retriever = RAGRetriever(config_path)
    print("RAG retriever initialized")
    
    optimizer = QueryOptimizer(config_path)
    print("Query optimizer initialized")
except Exception as e:
    print(f"Error initializing components: {e}")

Config file found at v1/config/config.yaml
Config loaded successfully
Document embedder initialized


  self.embedding_model = HuggingFaceEmbeddings(


No vector store available. QA chain cannot be initialized.
RAG retriever initialized
Query optimizer initialized




## 2. Document Processing

Let's process a sample document and add it to our vector store.

In [3]:
import os
import tkinter as tk
from tkinter import filedialog
import PyPDF2
import docx
import pandas as pd
import csv

def upload_file():
    """
    Open a file dialog and let the user pick a document.

    A hidden Tk root window is created only to host the dialog. It is
    destroyed again before returning, so repeated calls (for example when a
    notebook cell is re-run) do not accumulate invisible root windows.

    Returns:
        str or None: Path to the selected file, or None when the dialog was
        cancelled.
    """
    root = tk.Tk()
    try:
        root.withdraw()  # Hide the empty main window; only the dialog shows

        file_path = filedialog.askopenfilename(
            title="Select a Document",
            filetypes=[
                ("Supported Files", "*.pdf *.docx *.csv *.txt"),
                ("PDF Files", "*.pdf"),
                ("Word Documents", "*.docx"),
                ("CSV Files", "*.csv"),
                ("Text Files", "*.txt")
            ]
        )
    finally:
        # Tear the hidden root down even if the dialog itself raised
        root.destroy()

    # A cancelled dialog yields an empty value; normalize it to None
    return file_path if file_path else None

def preview_pdf(file_path):
    """
    Preview PDF file content

    Text is taken from at most the first 3 pages and the result is cut to
    1000 characters. Extraction stops as soon as enough text for the preview
    has been collected, so later pages are not parsed needlessly.

    Args:
        file_path (str): Path to the PDF file

    Returns:
        str: Preview text of the PDF, or None if the file could not be read
        (the error is printed).
    """
    max_pages = 3
    max_chars = 1000

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            page_texts = []
            collected = 0
            for page_num in range(min(max_pages, len(pdf_reader.pages))):
                # Defensive: a page that yields no text counts as empty
                # instead of breaking the concatenation.
                text = pdf_reader.pages[page_num].extract_text() or ""
                page_texts.append(text)
                collected += len(text)
                if collected >= max_chars:
                    break  # Preview is already full

            return "".join(page_texts)[:max_chars]
    except Exception as e:
        print(f"Error previewing PDF: {e}")
        return None

def preview_docx(file_path):
    """
    Build a short text preview of a Word document.

    The text of the first five paragraphs is joined with newlines and the
    result is capped at 1000 characters.

    Args:
        file_path (str): Path to the Word document

    Returns:
        str: Preview text of the document, or None if it could not be opened
        (the error is printed).
    """
    try:
        document = docx.Document(file_path)

        # Gather the text of the leading paragraphs only
        lines = []
        for paragraph in document.paragraphs[:5]:
            lines.append(paragraph.text)
        joined = "\n".join(lines)
    except Exception as e:
        print(f"Error previewing DOCX: {e}")
        return None
    else:
        return joined[:1000]  # Cap the preview length

def preview_csv(file_path):
    """
    Preview CSV file content

    Only the rows that are shown are parsed, so previewing a large file does
    not load it completely, and a malformed row further down the file does not
    prevent the preview. Column dtypes are therefore inferred from the preview
    rows alone.

    Args:
        file_path (str): Path to the CSV file

    Returns:
        pandas.DataFrame: First few (up to 5) rows of the CSV, or None if the
        file could not be read (the error is printed).
    """
    preview_rows = 5

    try:
        return pd.read_csv(file_path, nrows=preview_rows)
    except Exception as e:
        print(f"Error previewing CSV: {e}")
        return None

def preview_txt(file_path):
    """
    Preview TXT file content

    Only the characters needed for the preview are read and decoded, so a
    large file is not loaded into memory just to show its beginning.

    Args:
        file_path (str): Path to the text file (read as UTF-8)

    Returns:
        str: Preview text of the file (at most 1000 characters), or None if
        the file could not be read (the error is printed).
    """
    max_chars = 1000

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            # In text mode the size argument counts characters, not bytes
            return file.read(max_chars)
    except Exception as e:
        print(f"Error previewing TXT: {e}")
        return None

def process_document(file_path=None, embedder=None):
    """
    Process a document for embedding and vector store

    The document is previewed (for .pdf, .docx, .csv and .txt files) and, when
    an embedder is supplied, loaded and added to the vector store.

    Args:
        file_path (str, optional): Path to the document.
                                   If None, opens file upload dialog
        embedder (object, optional): Embedding and vector store handler. It
                                     must provide load_document(path) and
                                     add_documents(documents).

    Returns:
        The value returned by embedder.add_documents (document chunk IDs) when
        an embedder is given; the file path itself when no embedder is given
        (preview only); None if no file was selected, the path is not an
        existing file, or processing fails.
    """
    # If no file path provided, open file upload dialog
    if file_path is None:
        file_path = upload_file()

        # Exit if no file selected
        if file_path is None:
            print("No file selected.")
            return None

    # A document must be a regular file; a directory is rejected as well
    if not os.path.isfile(file_path):
        print(f"File not found: {file_path}")
        return None

    print(f"Processing document: {file_path}")

    try:
        # Determine file type for preview
        file_extension = os.path.splitext(file_path)[1].lower()

        # Pick the previewer for this type; other extensions get no preview
        if file_extension == '.pdf':
            label, preview = "PDF", preview_pdf(file_path)
        elif file_extension == '.docx':
            label, preview = "DOCX", preview_docx(file_path)
        elif file_extension == '.csv':
            label, preview = "CSV", preview_csv(file_path)
        elif file_extension == '.txt':
            label, preview = "TXT", preview_txt(file_path)
        else:
            label, preview = None, None

        # A previewer that failed has already printed its error and returned
        # None; do not follow that with a misleading "Preview: None" line.
        if preview is not None:
            print(f"{label} Preview:\n", preview)

        # Only proceed with embedding if embedder is provided
        if embedder is not None:
            # Load and process document
            documents = embedder.load_document(file_path)
            print(f"Loaded {len(documents)} document(s)")

            # Add to vector store
            doc_ids = embedder.add_documents(documents)
            print(f"Added {len(doc_ids)} document chunks to vector store")

            return doc_ids

        # Preview-only mode: hand the path back to the caller
        return file_path

    except Exception as e:
        print(f"Error processing document: {e}")
        return None

# Example usage (commented out)
# def example_usage():
#     # Scenario 1: Provide a file path
#     # process_document('/path/to/your/document.pdf', embedder)
#     
#     # Scenario 2: Open file upload dialog
#     # process_document(embedder=embedder)
#     pass

In [None]:
# Process a sample document (replace with your own document path)
#sample_doc_path = "path/to/your/sample/document.pdf"  # Update this path

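# Uncomment to process the document. Pass embedder=embedder as well to add its
# chunks to the vector store; without an embedder the call only previews the
# file and returns its path.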
# doc_ids = process_document(sample_doc_path)

## 3. Query Optimization

Let's test the query optimization functionality.

In [None]:
# Test query optimization
def test_query_optimization(query):
    print(f"Original query: {query}")
    
    try:
        # Optimize query
        optimization_result = optimizer.optimize_query(query)
        
        # Display results
        print(f"\nOptimized query: {optimization_result['optimized_query']}")
        
        print("\nQuery variations:")
        for i, variation in enumerate(optimization_result['variations'], 1):
            print(f"  {i}. {variation}")
        
        print("\nExtracted keywords:")
        print(f"  {', '.join(optimization_result['keywords'])}")
        
        return optimization_result
    except Exception as e:
        print(f"Error optimizing query: {e}")
        return None

In [None]:
# Run the optimizer demo on one sample question. The outcome is kept in
# `optimization_result`; it is None if test_query_optimization hit an error.
sample_query = "What are the main benefits of RAG systems?"
optimization_result = test_query_optimization(sample_query)

## 4. Document Retrieval and Question Answering

Now let's test the retrieval and question answering functionality.

In [None]:
# Test retrieval and question answering
def test_retrieval(query, use_optimization=True):
    print(f"Query: {query}")
    
    try:
        # Process query
        if use_optimization:
            print("Using query optimization...")
            optimization_result = optimizer.optimize_query(query)
            optimized_query = optimization_result["optimized_query"]
            print(f"Optimized query: {optimized_query}")
            result = retriever.answer_query(optimized_query)
        else:
            result = retriever.answer_query(query)
        
        # Display results
        print("\nAnswer:")
        print(result["answer"])
        
        print("\nRetrieved documents:")
        for i, doc in enumerate(result["documents"], 1):
            print(f"\nDocument {i}:")
            print(f"Content: {doc.page_content[:200]}...")
            print(f"Source: {doc.metadata.get('source', 'Unknown')}")
        
        return result
    except Exception as e:
        print(f"Error retrieving answer: {e}")
        return None

In [None]:
# Ask the sample question end to end: optimize the query, then retrieve and
# answer. `sample_query` is assigned again here so this cell does not rely on
# the query-optimization cell having run. `retrieval_result` is None if
# test_retrieval hit an error.
sample_query = "What are the main benefits of RAG systems?"
retrieval_result = test_retrieval(sample_query, use_optimization=True)

## 5. Performance Evaluation

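Let's measure how long our RAG system takes to answer a set of queries, and how many documents it retrieves for each, with and without query optimization.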

In [None]:
# Simple performance evaluation
import time

def evaluate_performance(queries, use_optimization=True):
    """
    Time the question-answering pipeline for each query.

    Uses the notebook-level `optimizer` and `retriever`. When optimization is
    enabled, the time spent optimizing the query is included in the
    measurement.

    Args:
        queries (iterable of str): Queries to run, in order
        use_optimization (bool): Whether to rewrite each query with the
            optimizer before sending it to the retriever

    Returns:
        pandas.DataFrame: One row per query with the columns "query", "time"
        (elapsed seconds) and "num_docs" (number of retrieved documents, 0
        when the result has no "documents" entry). The columns are present
        even when `queries` is empty.
    """
    results = []

    for query in queries:
        # perf_counter is monotonic and high resolution; the wall clock can
        # jump (e.g. clock adjustments) and distort short measurements.
        start_time = time.perf_counter()

        if use_optimization:
            optimization_result = optimizer.optimize_query(query)
            optimized_query = optimization_result["optimized_query"]
            result = retriever.answer_query(optimized_query)
        else:
            result = retriever.answer_query(query)

        elapsed_time = time.perf_counter() - start_time

        results.append({
            "query": query,
            "time": elapsed_time,
            "num_docs": len(result["documents"]) if "documents" in result else 0
        })

    # Fix the column set so an empty run still yields the expected schema
    return pd.DataFrame(results, columns=["query", "time", "num_docs"])

In [None]:
# Questions used to time the pipeline
sample_queries = [
    "What are the main benefits of RAG systems?",
    "How does document chunking affect retrieval quality?",
    "What embedding models work best for RAG?",
    "Explain the difference between sparse and dense retrievers"
]

# Time the same questions with the optimizer switched off, then on
print("Evaluating without optimization...")
results_without_opt = evaluate_performance(sample_queries, use_optimization=False)

print("\nEvaluating with optimization...")
results_with_opt = evaluate_performance(sample_queries, use_optimization=True)

# Show both result tables, each under its own heading
for heading, frame in (
    ("\nResults without optimization:", results_without_opt),
    ("\nResults with optimization:", results_with_opt),
):
    print(heading)
    display(frame)

# Grouped bar chart: one pair of bars per query, tick centred between them
bar_width = 0.4
positions = list(range(len(sample_queries)))

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(positions, results_without_opt['time'], width=bar_width, label='Without Optimization')
ax.bar([p + bar_width for p in positions], results_with_opt['time'], width=bar_width, label='With Optimization')
ax.set_xlabel('Query')
ax.set_ylabel('Time (seconds)')
ax.set_title('Query Processing Time Comparison')
ax.set_xticks([p + bar_width / 2 for p in positions])
ax.set_xticklabels([f'Query {p + 1}' for p in positions])
ax.legend()
fig.tight_layout()
plt.show()

## 6. Conclusion

This notebook demonstrates the core functionality of our RAG Document Assistant. We've tested:

1. Component initialization
2. Document processing
3. Query optimization
4. Document retrieval and question answering
5. Performance evaluation

Next steps:
- Fine-tune the embedding model
- Optimize chunking parameters
- Improve query optimization
- Enhance the evaluation metrics