# 🗄️ Strategy 1: Basic ChromaDB RAG

**Philosophy**: Build a solid foundation RAG system using ChromaDB for vector storage and semantic search. Focus on clean implementation and effective retrieval.

## Core Components:
- Document loading and chunking
- Embedding generation with sentence-transformers
- ChromaDB vector storage
- Semantic similarity search
- LLM-powered answer generation

## Optimization Areas:
- Chunk size and overlap strategies
- Embedding model selection
- Retrieval parameters (k, similarity threshold)
- Prompt engineering for Q&A
- Citation and source attribution

## Input Requirements:
You **must** have completed Phase 1 (parsing challenge) and have parsed markdown files in `../data/input_papers/`

In [1]:
# Install required packages for ChromaDB RAG with multi-LLM support
!pip3 install chromadb sentence-transformers
!pip3 install openai google-generativeai python-dotenv
!pip3 install langchain langchain-openai langchain-google-genai
!pip3 install numpy pandas matplotlib

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting google-ai-generativelanguage==0.6.15
  Using cached google_ai_generativelanguage-0.6.15-py3-none-any.whl (1.3 MB)
Installing collected packages: google-ai-generativelanguage
  Attempting uninstall: google-ai-generativelanguage
    Found existing installation: google-ai-generativelanguage 0.6.18
    Uninstalling google-ai-generativelanguage-0.6.18:
      Successfully uninstalled google-ai-generativelanguage-0.6.18
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-google-genai 2.1.8 requires google-ai-generativelanguage<0.7.0,>=0.6.18, but you have

In [2]:
import os
import json
import time
from pathlib import Path
from typing import List, Dict, Any, Optional

import chromadb
from chromadb.config import Settings
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

# LLM imports - Multi-provider support
import openai
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema import HumanMessage, SystemMessage

# Load environment variables
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [3]:
import os

# ⚙️ CONFIGURATION
class RAGConfig:
    # Paths
    INPUT_DIR = "../data/input_papers/"
    OUTPUT_DIR = "../data/vector_store/"
    
    # Chunking parameters
    CHUNK_SIZE = 500  # 🔧 TRY: 200, 500, 1000, 1500
    CHUNK_OVERLAP = 50  # 🔧 TRY: 0, 25, 50, 100
    
    # Embedding model
    EMBEDDING_MODEL = "all-MiniLM-L6-v2"  # 🔧 TRY: "all-mpnet-base-v2", "all-distilroberta-v1"
    
    # Retrieval parameters
    RETRIEVAL_K = 5  # 🔧 TRY: 3, 5, 7, 10
    SIMILARITY_THRESHOLD = 0.3  # 🔧 TRY: 0.2, 0.3, 0.4, 0.5
    
    # LLM settings - Multi-provider support
    LLM_PROVIDER = os.getenv('LLM_PROVIDER', 'openai')  # 🔧 OPTIONS: "openai", "gemini"
    LLM_MODEL = os.getenv('LLM_MODEL', 'gpt-3.5-turbo')  # 🔧 OpenAI: "gpt-4", "gpt-3.5-turbo" | Gemini: "gemini-pro", "gemini-1.5-flash"
    LLM_TEMPERATURE = float(os.getenv('LLM_TEMPERATURE', '0.1'))  # 🔧 TRY: 0.0, 0.1, 0.3
    MAX_TOKENS = int(os.getenv('LLM_MAX_TOKENS', '1000'))  # 🔧 TRY: 500, 1000, 2000

config = RAGConfig()

# Ensure output directory exists
os.makedirs(config.OUTPUT_DIR, exist_ok=True)

# Display current configuration
print("🔧 RAG CONFIGURATION:")
print(f"Provider: {config.LLM_PROVIDER}")
print(f"Model: {config.LLM_MODEL}")
print(f"Embedding: {config.EMBEDDING_MODEL}")
print(f"Chunk size: {config.CHUNK_SIZE}")
print(f"Retrieval K: {config.RETRIEVAL_K}")

🔧 RAG CONFIGURATION:
Provider: gemini
Model: gemini-1.5-flash
Embedding: all-MiniLM-L6-v2
Chunk size: 500
Retrieval K: 5


In [4]:
# 📄 DOCUMENT LOADING AND PROCESSING
class DocumentProcessor:
    def __init__(self, config: RAGConfig):
        self.config = config
        
    def load_parsed_documents(self) -> Dict[str, str]:
        """Load all parsed markdown files from Phase 1"""
        documents = {}
        input_path = Path(self.config.INPUT_DIR)
        
        print(f"Loading documents from: {input_path}")
        
        # Look for all markdown files
        md_files = list(input_path.glob("*.md"))
        print(f"Found {len(md_files)} markdown files")
        
        for file_path in md_files:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                    documents[file_path.stem] = content
                    print(f"Loaded: {file_path.name} ({len(content):,} chars)")
            except Exception as e:
                print(f"Error loading {file_path}: {e}")
        
        return documents
    
    def chunk_text(self, text: str, filename: str) -> List[Dict[str, Any]]:
        """Split text into overlapping chunks with metadata"""
        chunks = []
        
        # Simple character-based chunking
        # 🔧 OPTIMIZATION AREA: Implement smarter chunking strategies
        
        start = 0
        chunk_id = 0
        
        while start < len(text):
            end = start + self.config.CHUNK_SIZE
            chunk_text = text[start:end]
            
            # Try to break at sentence boundary
            if end < len(text):
                last_period = chunk_text.rfind('.')
                last_newline = chunk_text.rfind('\n')
                break_point = max(last_period, last_newline)
                
                if break_point > start + self.config.CHUNK_SIZE * 0.7:
                    chunk_text = text[start:start + break_point + 1]
                    end = start + break_point + 1
            
            # Create chunk with metadata
            chunk = {
                'id': f"{filename}_chunk_{chunk_id}",
                'text': chunk_text.strip(),
                'source_file': filename,
                'chunk_index': chunk_id,
                'start_char': start,
                'end_char': end
            }
            
            chunks.append(chunk)
            
            # Move to next chunk with overlap
            start = end - self.config.CHUNK_OVERLAP
            chunk_id += 1
        
        return chunks
    
    def process_all_documents(self) -> List[Dict[str, Any]]:
        """Process all documents into chunks"""
        documents = self.load_parsed_documents()
        all_chunks = []
        
        for filename, content in documents.items():
            print(f"\nProcessing: {filename}")
            chunks = self.chunk_text(content, filename)
            all_chunks.extend(chunks)
            print(f"Generated {len(chunks)} chunks")
        
        print(f"\nTotal chunks created: {len(all_chunks)}")
        return all_chunks

# Test document processing
processor = DocumentProcessor(config)
chunks = processor.process_all_documents()

Loading documents from: ../data/input_papers
Found 3 markdown files
Loaded: strategy1_examining_the_awareness_of_mobile_money_users_on_s_llamaparse.md (28,458 chars)
Loaded: strategy1_practical_machine_learning_25_05_04_14_32_34_llamaparse.md (543,588 chars)
Loaded: strategy1_mobile_based_deep_learning_models_for_banana_disease_llamaparse.md (18,055 chars)

Processing: strategy1_examining_the_awareness_of_mobile_money_users_on_s_llamaparse
Generated 64 chunks

Processing: strategy1_practical_machine_learning_25_05_04_14_32_34_llamaparse
Generated 1208 chunks

Processing: strategy1_mobile_based_deep_learning_models_for_banana_disease_llamaparse
Generated 41 chunks

Total chunks created: 1313


In [5]:
# 🔍 EMBEDDING AND VECTOR STORAGE
class ChromaDBRAG:
    def __init__(self, config: RAGConfig):
        self.config = config
        self.embedding_model = SentenceTransformer(config.EMBEDDING_MODEL)
        self.client = None
        self.collection = None
        
    def initialize_chromadb(self):
        """Initialize ChromaDB client and collection"""
        print("Initializing ChromaDB...")
        
        # Ensure the chromadb directory exists and is writable
        chroma_dir = os.path.join(self.config.OUTPUT_DIR, "chromadb")
        os.makedirs(chroma_dir, exist_ok=True)
        if not os.access(chroma_dir, os.W_OK):
            raise PermissionError(f"ChromaDB directory is not writable: {chroma_dir}")
        print(f"ChromaDB storage path: {chroma_dir}")
        
        # Create persistent client
        self.client = chromadb.PersistentClient(
            path=chroma_dir
        )
        
        # Create or get collection
        collection_name = "research_papers"
        try:
            self.collection = self.client.get_collection(collection_name)
            print(f"Found existing collection: {collection_name}")
        except Exception:
            self.collection = self.client.create_collection(
                name=collection_name,
                metadata={"hnsw:space": "cosine"}  # Use cosine similarity
            )
            print(f"Created new collection: {collection_name}")
    
    def embed_and_store_chunks(self, chunks: List[Dict[str, Any]]):
        """Generate embeddings and store in ChromaDB"""
        if not self.collection:
            self.initialize_chromadb()
        
        print(f"Generating embeddings for {len(chunks)} chunks...")
        
        # Extract text for embedding
        texts = [chunk['text'] for chunk in chunks]
        ids = [chunk['id'] for chunk in chunks]
        
        # Generate embeddings
        embeddings = self.embedding_model.encode(
            texts, 
            show_progress_bar=True,
            convert_to_numpy=True
        )
        
        # Prepare metadata
        metadatas = []
        for chunk in chunks:
            metadata = {
                'source_file': chunk['source_file'],
                'chunk_index': chunk['chunk_index'],
                'start_char': chunk['start_char'],
                'end_char': chunk['end_char']
            }
            metadatas.append(metadata)
        
        # Store in ChromaDB
        print("Storing in ChromaDB...")
        self.collection.add(
            ids=ids,
            documents=texts,
            embeddings=embeddings.tolist(),
            metadatas=metadatas
        )
        
        print(f"Successfully stored {len(chunks)} chunks in ChromaDB")
    
    def search_similar_chunks(self, query: str, k: Optional[int] = None) -> Dict[str, Any]:
        """Search for similar chunks using semantic similarity"""
        if not self.collection:
            raise ValueError("ChromaDB not initialized. Call initialize_chromadb() first.")
        
        k = k or self.config.RETRIEVAL_K
        
        # Generate query embedding
        query_embedding = self.embedding_model.encode([query])
        
        # Search in ChromaDB
        results = self.collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=k
        )
        
        # Format results
        retrieved_chunks = []
        for i in range(len(results['ids'][0])):
            chunk = {
                'id': results['ids'][0][i],
                'text': results['documents'][0][i],
                'metadata': results['metadatas'][0][i],
                'distance': results['distances'][0][i]
            }
            retrieved_chunks.append(chunk)
        
        return {
            'query': query,
            'retrieved_chunks': retrieved_chunks,
            'total_found': len(retrieved_chunks)
        }

# Initialize RAG system
rag_system = ChromaDBRAG(config)
rag_system.embed_and_store_chunks(chunks)

Initializing ChromaDB...
ChromaDB storage path: ../data/vector_store/chromadb
Created new collection: research_papers
Generating embeddings for 1313 chunks...


Batches: 100%|██████████| 42/42 [00:05<00:00,  8.31it/s]


Storing in ChromaDB...
Successfully stored 1313 chunks in ChromaDB


In [6]:
# 🤖 ANSWER GENERATION
from pydantic import SecretStr

class AnswerGenerator:
    def __init__(self, config: RAGConfig):
        self.config = config
        self.llm = None
        self.setup_llm()
    
    def setup_llm(self):
        """Initialize the language model"""
        if self.config.LLM_PROVIDER == "openai":
            api_key = os.getenv('OPENAI_API_KEY')
            if not api_key:
                print("Warning: OPENAI_API_KEY not found. Please set it in your .env file")
                return
            self.llm = ChatOpenAI(
                model=self.config.LLM_MODEL,
                temperature=self.config.LLM_TEMPERATURE,
                api_key=SecretStr(api_key)
            )
            print(f"Initialized OpenAI LLM: {self.config.LLM_MODEL}")
        elif self.config.LLM_PROVIDER == "gemini":
            api_key = os.getenv('GEMINI_API_KEY')
            if not api_key:
                print("Warning: GEMINI_API_KEY not found. Please set it in your .env file")
                return
            self.llm = ChatGoogleGenerativeAI(
                model=self.config.LLM_MODEL,
                temperature=self.config.LLM_TEMPERATURE,
                max_tokens=self.config.MAX_TOKENS,
                google_api_key=api_key
            )
            print(f"Initialized Gemini LLM: {self.config.LLM_MODEL}")
        elif self.config.LLM_PROVIDER == "anthropic":
            api_key = os.getenv('ANTHROPIC_API_KEY')
            if not api_key:
                print("Warning: ANTHROPIC_API_KEY not found. Please set it in your .env file")
                return
            try:
                from langchain_anthropic import ChatAnthropic
            except ImportError:
                print("langchain-anthropic is not installed. Please install it to use Anthropic provider.")
                return
            self.llm = ChatAnthropic(
                model_name=self.config.LLM_MODEL,
                temperature=self.config.LLM_TEMPERATURE,
                api_key=SecretStr(api_key),
                timeout=30,  # Optional timeout setting
                stop=None    # You can set this to a list of stop sequences if needed
            )
            print(f"Initialized Anthropic LLM: {self.config.LLM_MODEL}")
        else:
            print(f"Warning: Unknown LLM_PROVIDER '{self.config.LLM_PROVIDER}'. Supported: 'openai', 'gemini', 'anthropic'.")
            return
    
    def create_system_prompt(self) -> str:
        """Create system prompt for academic Q&A"""
        return """You are an expert academic research assistant. Your task is to answer questions about research papers based on the provided context.

INSTRUCTIONS:
1. Answer questions accurately based ONLY on the provided context
2. If the context doesn't contain enough information, say so clearly
3. Always cite your sources by mentioning the source file name
4. Maintain academic tone and precision
5. Include relevant quotes when appropriate
6. If asked about specific details (numbers, dates, names), be precise

RESPONSE FORMAT:
- Provide a clear, comprehensive answer
- Include citations: [Source: filename]
- Quote relevant passages when helpful
- End with a confidence assessment if uncertain"""
    
    def generate_answer(self, query: str, retrieved_chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate answer using retrieved context"""
        if not self.llm:
            return {
                'answer': 'Error: LLM not properly initialized. Please check your API key.',
                'sources': [],
                'error': 'LLM initialization failed'
            }
        
        # Prepare context from retrieved chunks
        context_parts = []
        sources = set()
        
        for i, chunk in enumerate(retrieved_chunks):
            source_file = chunk['metadata']['source_file']
            sources.add(source_file)
            context_parts.append(f"Context {i+1} [Source: {source_file}]:\n{chunk['text']}\n")
        
        context = "\n".join(context_parts)
        
        # Create prompt
        user_prompt = f"""CONTEXT:
{context}

QUESTION: {query}

Please answer the question based on the provided context."""
        
        try:
            # Generate response
            messages = [
                SystemMessage(content=self.create_system_prompt()),
                HumanMessage(content=user_prompt)
            ]
            
            response = self.llm.invoke(messages)
            
            return {
                'answer': response.content,
                'sources': list(sources),
                'context_used': len(retrieved_chunks),
                'query': query
            }
            
        except Exception as e:
            return {
                'answer': f'Error generating answer: {str(e)}',
                'sources': [],
                'error': str(e)
            }

# Initialize answer generator
answer_generator = AnswerGenerator(config)

Initialized Gemini LLM: gemini-1.5-flash


In [7]:
# 🎯 COMPLETE RAG PIPELINE
class RAGPipeline:
    def __init__(self, config: RAGConfig):
        self.config = config
        self.rag_system = ChromaDBRAG(config)
        self.answer_generator = AnswerGenerator(config)
        
        # Initialize ChromaDB
        self.rag_system.initialize_chromadb()
    
    def ask_question(self, query: str, k: Optional[int] = None) -> Dict[str, Any]:
        """Complete RAG pipeline: retrieve + generate"""
        print(f"\n🔍 Query: {query}")
        
        # Step 1: Retrieve relevant chunks
        start_time = time.time()
        search_results = self.rag_system.search_similar_chunks(query, k)
        retrieval_time = time.time() - start_time
        
        print(f"Retrieved {len(search_results['retrieved_chunks'])} chunks in {retrieval_time:.2f}s")
        
        # Step 2: Generate answer
        start_time = time.time()
        answer_result = self.answer_generator.generate_answer(
            query, 
            search_results['retrieved_chunks']
        )
        generation_time = time.time() - start_time
        
        print(f"Generated answer in {generation_time:.2f}s")
        
        # Combine results
        result = {
            'query': query,
            'answer': answer_result['answer'],
            'sources': answer_result['sources'],
            'retrieved_chunks': search_results['retrieved_chunks'],
            'performance': {
                'retrieval_time': retrieval_time,
                'generation_time': generation_time,
                'total_time': retrieval_time + generation_time,
                'chunks_retrieved': len(search_results['retrieved_chunks'])
            }
        }
        
        return result
    
    def display_result(self, result: Dict[str, Any]):
        """Display RAG result in a nice format"""
        print("\n" + "="*60)
        print(f"🤖 ANSWER:")
        print("-" * 30)
        print(result['answer'])
        
        print(f"\n📚 SOURCES: {', '.join(result['sources'])}")
        
        print(f"\n⚡ PERFORMANCE:")
        perf = result['performance']
        print(f"  Retrieval: {perf['retrieval_time']:.2f}s")
        print(f"  Generation: {perf['generation_time']:.2f}s") 
        print(f"  Total: {perf['total_time']:.2f}s")
        print(f"  Chunks used: {perf['chunks_retrieved']}")
        
        print("\n🔍 RETRIEVED CONTEXT:")
        for i, chunk in enumerate(result['retrieved_chunks'][:3]):  # Show top 3
            print(f"\n  Chunk {i+1} [Distance: {chunk['distance']:.3f}]:")
            print(f"  Source: {chunk['metadata']['source_file']}")
            print(f"  Text: {chunk['text'][:200]}...")

# Initialize complete RAG pipeline
rag_pipeline = RAGPipeline(config)

Initialized Gemini LLM: gemini-1.5-flash
Initializing ChromaDB...
ChromaDB storage path: ../data/vector_store/chromadb
Found existing collection: research_papers


In [8]:
# 🧪 TESTING WITH SAMPLE QUESTIONS

# Sample academic questions to test the system (specific to the banana disease paper)
test_questions = [
    "What are the main banana diseases addressed in the paper?",
    "What deep learning models were used for banana disease detection?",
    "How was the dataset for training and testing prepared?",
    "What were the key results and accuracy metrics for each model?",
    "Why was InceptionV3 chosen for mobile deployment?",
    "What are the main conclusions of the study?",
    "How does agroforestry influence disease suppressiveness in bananas?",
    "What are the main recommendations for smallholder farmers?",
    "List the main references cited in the paper.",
    "What are the limitations and future research directions discussed?"
]

print("🧪 TESTING RAG SYSTEM WITH SAMPLE QUESTIONS")
print("="*60)

# Test each question
test_results = []

for question in test_questions[:2]:  # Test first 2 questions
    try:
        result = rag_pipeline.ask_question(question)
        rag_pipeline.display_result(result)
        test_results.append(result)
        
    except Exception as e:
        print(f"Error with question '{question}': {e}")
    
    print("\n" + "-"*60 + "\n")

print(f"✅ Completed testing {len(test_results)} questions")

🧪 TESTING RAG SYSTEM WITH SAMPLE QUESTIONS

🔍 Query: What are the main banana diseases addressed in the paper?
Retrieved 5 chunks in 0.12s
Generated answer in 91.75s

🤖 ANSWER:
------------------------------
The provided text mentions Fusarium wilt race 1 and black Sigatoka as major fungal diseases affecting banana yield [Source: strategy1_mobile_based_deep_learning_models_for_banana_disease_llamaparse].  The research specifically focuses on Fusarium wilt in the context of agroforestry systems [Source: strategy1_mobile_based_deep_learning_models_for_banana_disease_llamaparse].

📚 SOURCES: strategy1_mobile_based_deep_learning_models_for_banana_disease_llamaparse

⚡ PERFORMANCE:
  Retrieval: 0.12s
  Generation: 91.75s
  Total: 91.87s
  Chunks used: 5

🔍 RETRIEVED CONTEXT:

  Chunk 1 [Distance: 0.379]:
  Source: strategy1_mobile_based_deep_learning_models_for_banana_disease_llamaparse
  Text: ing, and disease incidence was recorded in banana plants.

### 2.3 Statistical Analysis
Data were

In [9]:
# ⚡ OPTIMIZATION WORKSPACE - Strategy 1
print("🚀 Strategy 1 ChromaDB RAG Optimization Workspace")
print("="*60)

print("\n🔧 OPTIMIZATION AREAS:")
print("☐ Chunk size and overlap tuning")
print("☐ Embedding model comparison") 
print("☐ Retrieval parameter optimization")
print("☐ Prompt engineering for academic Q&A")
print("☐ Post-processing and citation enhancement")
print("☐ Query expansion and rephrasing")

print("\n💡 QUICK OPTIMIZATION IDEAS:")
print("\n1. CHUNKING STRATEGIES:")
print("   - Semantic chunking (sentence boundaries)")
print("   - Section-based chunking (headers)")
print("   - Hierarchical chunking (multi-level)")

print("\n2. EMBEDDING IMPROVEMENTS:")
print("   - Try: all-mpnet-base-v2 (better accuracy)")
print("   - Try: all-distilroberta-v1 (speed)")
print("   - Domain-specific embedding models")

print("\n3. RETRIEVAL ENHANCEMENTS:")
print("   - Increase k for more context")
print("   - Add metadata filtering")
print("   - Implement re-ranking")

print("\n4. GENERATION IMPROVEMENTS:")
print("   - Better system prompts")
print("   - Chain-of-thought reasoning")
print("   - Citation format standardization")

# TODO: Implement your optimizations here
'''
# Your optimization implementation space:

# Option 1: Better chunking
def semantic_chunk_text(text: str, max_chunk_size: int = 500):
    """Implement semantic chunking based on sentences/paragraphs"""
    # Your implementation here
    pass

# Option 2: Enhanced retrieval
def enhanced_search(query: str, k: int = 5):
    """Add query expansion and better search"""
    # Your implementation here
    pass

# Option 3: Better prompts
def create_enhanced_prompt():
    """Improve system prompt for academic Q&A"""
    # Your implementation here
    pass

# Test your optimizations:
# optimized_result = your_enhanced_pipeline.ask_question("test question")
'''

🚀 Strategy 1 ChromaDB RAG Optimization Workspace

🔧 OPTIMIZATION AREAS:
☐ Chunk size and overlap tuning
☐ Embedding model comparison
☐ Retrieval parameter optimization
☐ Prompt engineering for academic Q&A
☐ Post-processing and citation enhancement
☐ Query expansion and rephrasing

💡 QUICK OPTIMIZATION IDEAS:

1. CHUNKING STRATEGIES:
   - Semantic chunking (sentence boundaries)
   - Section-based chunking (headers)
   - Hierarchical chunking (multi-level)

2. EMBEDDING IMPROVEMENTS:
   - Try: all-mpnet-base-v2 (better accuracy)
   - Try: all-distilroberta-v1 (speed)
   - Domain-specific embedding models

3. RETRIEVAL ENHANCEMENTS:
   - Increase k for more context
   - Add metadata filtering
   - Implement re-ranking

4. GENERATION IMPROVEMENTS:
   - Better system prompts
   - Chain-of-thought reasoning
   - Citation format standardization


'\n# Your optimization implementation space:\n\n# Option 1: Better chunking\ndef semantic_chunk_text(text: str, max_chunk_size: int = 500):\n    """Implement semantic chunking based on sentences/paragraphs"""\n    # Your implementation here\n    pass\n\n# Option 2: Enhanced retrieval\ndef enhanced_search(query: str, k: int = 5):\n    """Add query expansion and better search"""\n    # Your implementation here\n    pass\n\n# Option 3: Better prompts\ndef create_enhanced_prompt():\n    """Improve system prompt for academic Q&A"""\n    # Your implementation here\n    pass\n\n# Test your optimizations:\n# optimized_result = your_enhanced_pipeline.ask_question("test question")\n'

In [10]:
# 📊 PERFORMANCE ANALYSIS
def analyze_rag_performance(test_results: List[Dict[str, Any]]):
    """Analyze RAG system performance"""
    
    if not test_results:
        print("No test results to analyze")
        return
    
    print("📊 RAG PERFORMANCE ANALYSIS")
    print("="*50)
    
    # Timing analysis
    retrieval_times = [r['performance']['retrieval_time'] for r in test_results]
    generation_times = [r['performance']['generation_time'] for r in test_results]
    total_times = [r['performance']['total_time'] for r in test_results]
    
    print(f"\n⏱️ TIMING METRICS:")
    print(f"Average retrieval time: {np.mean(retrieval_times):.2f}s")
    print(f"Average generation time: {np.mean(generation_times):.2f}s")
    print(f"Average total time: {np.mean(total_times):.2f}s")
    
    # Source diversity
    all_sources = set()
    for result in test_results:
        all_sources.update(result['sources'])
    
    print(f"\n📚 SOURCE COVERAGE:")
    print(f"Unique sources used: {len(all_sources)}")
    print(f"Sources: {', '.join(all_sources)}")
    
    # Chunk analysis
    chunks_per_query = [r['performance']['chunks_retrieved'] for r in test_results]
    print(f"\n🔍 RETRIEVAL METRICS:")
    print(f"Average chunks per query: {np.mean(chunks_per_query):.1f}")
    print(f"Retrieval consistency: {np.std(chunks_per_query):.1f} std dev")
    
    # Answer length analysis
    answer_lengths = [len(r['answer']) for r in test_results]
    print(f"\n📝 ANSWER METRICS:")
    print(f"Average answer length: {np.mean(answer_lengths):.0f} chars")
    print(f"Answer length range: {min(answer_lengths)}-{max(answer_lengths)} chars")

# Analyze the test results
analyze_rag_performance(test_results)

📊 RAG PERFORMANCE ANALYSIS

⏱️ TIMING METRICS:
Average retrieval time: 0.25s
Average generation time: 51.90s
Average total time: 52.16s

📚 SOURCE COVERAGE:
Unique sources used: 1
Sources: strategy1_mobile_based_deep_learning_models_for_banana_disease_llamaparse

🔍 RETRIEVAL METRICS:
Average chunks per query: 5.0
Retrieval consistency: 0.0 std dev

📝 ANSWER METRICS:
Average answer length: 292 chars
Answer length range: 208-375 chars


In [11]:
# 💾 SAVE RESULTS AND CONFIGURATION
def save_rag_results(test_results: List[Dict[str, Any]], config: RAGConfig):
    """Save test results and configuration for comparison"""
    
    output_data = {
        'strategy': 'ChromaDB Basic RAG',
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        'configuration': {
            'chunk_size': config.CHUNK_SIZE,
            'chunk_overlap': config.CHUNK_OVERLAP,
            'embedding_model': config.EMBEDDING_MODEL,
            'retrieval_k': config.RETRIEVAL_K,
            'llm_model': config.LLM_MODEL,
            'llm_temperature': config.LLM_TEMPERATURE
        },
        'test_results': test_results,
        'performance_summary': {
            'total_questions': len(test_results),
            'avg_retrieval_time': np.mean([r['performance']['retrieval_time'] for r in test_results]) if test_results else 0,
            'avg_generation_time': np.mean([r['performance']['generation_time'] for r in test_results]) if test_results else 0,
            'unique_sources': len(set().union(*[r['sources'] for r in test_results])) if test_results else 0
        }
    }
    
    # Save to JSON file
    output_file = os.path.join(config.OUTPUT_DIR, 'strategy1_chromadb_results.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    
    print(f"✅ Results saved to: {output_file}")
    return output_file

# Save results
if test_results:
    save_rag_results(test_results, config)

print("\n🏆 STRATEGY 1 CHROMADB RAG - COMPLETE!")
print("="*50)
print("✅ Vector database created and populated")
print("✅ Semantic search implemented") 
print("✅ Answer generation working")
print("✅ Test questions processed")
print("✅ Performance metrics collected")

print("\n🎯 NEXT STEPS:")
print("1. Implement your optimizations in the workspace above")
print("2. Test with more complex academic questions")
print("3. Compare with Strategy 2 (Advanced RAG)")
print("4. Consider building a web interface with Streamlit")

print("\n💡 READY FOR OPTIMIZATION!")
print("Head to the optimization workspace and make this RAG system even better! 🚀")

✅ Results saved to: ../data/vector_store/strategy1_chromadb_results.json

🏆 STRATEGY 1 CHROMADB RAG - COMPLETE!
✅ Vector database created and populated
✅ Semantic search implemented
✅ Answer generation working
✅ Test questions processed
✅ Performance metrics collected

🎯 NEXT STEPS:
1. Implement your optimizations in the workspace above
2. Test with more complex academic questions
3. Compare with Strategy 2 (Advanced RAG)
4. Consider building a web interface with Streamlit

💡 READY FOR OPTIMIZATION!
Head to the optimization workspace and make this RAG system even better! 🚀
