In [1]:
# Environmental Vector Database Setup using ChromaDB and OpenAI
import chromadb
from chromadb.config import Settings
import pandas as pd
import json
import os
from typing import List, Dict, Any
import numpy as np
from dotenv import load_dotenv
import glob
from pathlib import Path

# Load environment variables from a .env file (located with find_dotenv) so the
# OpenAI API key does not have to be written into the notebook itself.
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
# os.getenv returns None when OPENAI_API_KEY is not defined.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [2]:
class EnvironmentalVectorDB:
    """
    Thin wrapper around a persistent ChromaDB collection that stores
    environmental / carbon-footprint reference data.

    Documents are built from Excel sheets, PDF extractions and a free-text
    corpus. No embedding function is passed to ChromaDB anywhere in this
    class, so whatever embedding the collection is configured with is used.
    """

    # Window size and overlap (in characters) used when splitting long text.
    _CHUNK_SIZE = 1000
    _CHUNK_OVERLAP = 200

    def __init__(self, db_path: str = "./chroma_env_db", collection_name: str = "environmental_data"):
        """
        Open the on-disk ChromaDB store and get (or create) the collection.

        Args:
            db_path: Directory of the persistent ChromaDB store.
            collection_name: Name of the collection to open or create.
        """
        # Initialize ChromaDB client
        self.client = chromadb.PersistentClient(
            path=db_path,
            settings=Settings(
                anonymized_telemetry=False
            )
        )

        # Reuse the collection when it exists, otherwise create it.
        try:
            self.collection = self.client.get_collection(
                name=collection_name,
            )
            print(f"Retrieved existing collection '{collection_name}'")
        except Exception:
            # get_collection raises when the collection is missing; the exact
            # exception class is not pinned here (NOTE(review): it appears to
            # differ between chromadb releases). Catching Exception instead of
            # a bare except keeps KeyboardInterrupt/SystemExit propagating.
            self.collection = self.client.create_collection(
                name=collection_name,
                metadata={"description": "Environmental data for carbon footprint analysis"}
            )
            print(f"Created new collection '{collection_name}'")

        print(f"ChromaDB initialized at {db_path}")
        print(f"Collection '{collection_name}' ready with {self.collection.count()} documents")

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _to_metadata_value(value):
        """
        Coerce a cell value to a plain str/int/float/bool for use as metadata.

        numpy scalars are unwrapped with .item(); anything that is still not a
        primitive afterwards (datetimes, Timestamps, nested objects, ...) is
        stringified.
        """
        if isinstance(value, np.generic):
            try:
                value = value.item()
            except Exception:
                return str(value)
        if isinstance(value, (str, bool, int, float)):
            return value
        return str(value)

    def _row_to_document(self, row):
        """
        Turn one DataFrame row into (document_text, metadata).

        Returns None when the row has no non-empty cell, so callers can skip it.
        Column labels are stringified for metadata keys because spreadsheet
        headers may be numbers or dates.
        """
        text_parts = []
        metadata = {}
        for col, value in row.items():
            if not pd.notna(value):
                continue
            # Whitespace-only cells are left out of the text but kept in metadata.
            if str(value).strip():
                text_parts.append(f"{col}: {str(value)}")
            metadata[str(col)] = self._to_metadata_value(value)

        if not text_parts:
            return None
        return " | ".join(text_parts), metadata

    @classmethod
    def _chunk_text(cls, text):
        """
        Split text into overlapping windows.

        Returns a list of (start, end, chunk) tuples for the non-blank windows.
        Iteration stops as soon as a window reaches the end of the text, so no
        trailing chunk is produced that lies entirely inside the previous one.
        """
        step = cls._CHUNK_SIZE - cls._CHUNK_OVERLAP
        chunks = []
        for start in range(0, len(text), step):
            end = min(start + cls._CHUNK_SIZE, len(text))
            piece = text[start:end]
            if piece.strip():
                chunks.append((start, end, piece))
            if end >= len(text):
                break
        return chunks

    def _add_documents(self, documents, metadatas, ids):
        """Add the prepared documents to the collection; return how many were sent."""
        if not documents:
            return 0
        self.collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )
        return len(documents)

    # ------------------------------------------------------------------
    # Loading
    # ------------------------------------------------------------------
    def add_bulk_data_from_loaders(self, excel_datasets=None, pdf_datasets=None, text_corpus=None):
        """
        Add all data from the data loading functions (Excel, PDF, and text corpus)

        A failure in one dataset is reported and does not stop the others.

        Args:
            excel_datasets: Dictionary of Excel DataFrames from load_excel_data()
            pdf_datasets: Dictionary of PDF extracted data from load_pdf_extracted_data()
            text_corpus: List of text corpus documents from create_text_corpus_for_rag()

        Returns:
            Total number of documents sent to the collection.
        """
        print("="*60)
        print("ADDING BULK DATA TO ENVIRONMENTAL VECTOR DATABASE")
        print("="*60)

        total_added = 0

        # Process Excel datasets
        if excel_datasets:
            print(f"\n{'='*50}")
            print("PROCESSING EXCEL DATASETS")
            print(f"{'='*50}")

            for dataset_name, df in excel_datasets.items():
                try:
                    print(f"\nProcessing Excel dataset: {dataset_name}")
                    added = self._add_excel_dataframe(df, dataset_name)
                    total_added += added
                    print(f"✅ Added {added} documents from {dataset_name}")

                except Exception as e:
                    print(f"❌ Error processing Excel dataset {dataset_name}: {str(e)}")

        # Process PDF datasets
        if pdf_datasets:
            print(f"\n{'='*50}")
            print("PROCESSING PDF EXTRACTED DATASETS")
            print(f"{'='*50}")

            for dataset_name, data in pdf_datasets.items():
                try:
                    print(f"\nProcessing PDF dataset: {dataset_name}")
                    added = self._add_pdf_extracted_data(data, dataset_name)
                    total_added += added
                    print(f"✅ Added {added} documents from {dataset_name}")

                except Exception as e:
                    print(f"❌ Error processing PDF dataset {dataset_name}: {str(e)}")

        # Process text corpus
        if text_corpus:
            print(f"\n{'='*50}")
            print("PROCESSING TEXT CORPUS")
            print(f"{'='*50}")

            try:
                added = self._add_text_corpus(text_corpus)
                total_added += added
                print(f"✅ Added {added} documents from text corpus")

            except Exception as e:
                print(f"❌ Error processing text corpus: {str(e)}")

        print(f"\n{'='*60}")
        print("BULK DATA LOADING COMPLETE")
        print(f"Total documents added: {total_added}")
        print(f"{'='*60}")

        return total_added

    def _add_excel_dataframe(self, df, dataset_name):
        """
        Internal method to add Excel DataFrame to vector database

        Each non-empty row becomes one document ("col: value | col: value");
        the row's cells are also stored as metadata. IDs are built from the
        dataset name and the row's index label.
        """
        documents = []
        metadatas = []
        ids = []

        for idx, row in df.iterrows():
            converted = self._row_to_document(row)
            if converted is None:  # Only add if there's actual content
                continue
            document_text, metadata = converted

            metadata['source'] = f"excel_{dataset_name}"
            metadata['data_type'] = 'excel'
            metadata['dataset_name'] = dataset_name

            documents.append(document_text)
            metadatas.append(metadata)
            ids.append(f"excel_{dataset_name}_{idx}")

        return self._add_documents(documents, metadatas, ids)

    def _add_pdf_extracted_data(self, data, dataset_name):
        """
        Internal method to add PDF extracted data to vector database

        Supported shapes of `data`:
            DataFrame - one document per non-empty row (same as Excel)
            str       - split into overlapping character chunks
            dict      - one document holding the JSON dump
            list      - one document per item
        Any other type adds nothing and returns 0.
        """
        documents = []
        metadatas = []
        ids = []

        if isinstance(data, pd.DataFrame):
            # Handle DataFrame from PDF (similar to Excel)
            for idx, row in data.iterrows():
                converted = self._row_to_document(row)
                if converted is None:
                    continue
                document_text, metadata = converted

                metadata['source'] = f"pdf_{dataset_name}"
                metadata['data_type'] = 'pdf_tabular'
                metadata['dataset_name'] = dataset_name

                documents.append(document_text)
                metadatas.append(metadata)
                ids.append(f"pdf_table_{dataset_name}_{idx}")

        elif isinstance(data, str):
            # Handle text content from PDF - chunk for better retrieval
            text = data.strip()
            for chunk_index, (start, end, chunk) in enumerate(self._chunk_text(text)):
                documents.append(chunk)
                metadatas.append({
                    'source': f"pdf_{dataset_name}",
                    'data_type': 'pdf_text',
                    'dataset_name': dataset_name,
                    'chunk_index': chunk_index,
                    'chunk_start': start,
                    'chunk_end': end
                })
                ids.append(f"pdf_text_{dataset_name}_chunk_{chunk_index}")

        elif isinstance(data, dict):
            # Handle JSON/dict data from PDF
            text_content = f"Document: {dataset_name}\n"
            text_content += json.dumps(data, indent=2, ensure_ascii=False)

            documents.append(text_content)
            metadatas.append({
                'source': f"pdf_{dataset_name}",
                'data_type': 'pdf_json',
                'dataset_name': dataset_name,
                # Stored as one comma-separated string: metadata values are
                # kept to scalar types, a Python list would not be one.
                'keys': ", ".join(str(key) for key in data.keys())
            })
            ids.append(f"pdf_json_{dataset_name}")

        elif isinstance(data, list):
            # Handle list data from PDF
            for i, item in enumerate(data):
                documents.append(f"Item {i} from {dataset_name}: {str(item)}")
                metadatas.append({
                    'source': f"pdf_{dataset_name}",
                    'data_type': 'pdf_list',
                    'dataset_name': dataset_name,
                    'item_index': i
                })
                ids.append(f"pdf_list_{dataset_name}_{i}")

        return self._add_documents(documents, metadatas, ids)

    def _add_text_corpus(self, text_corpus):
        """
        Internal method to add text corpus documents to vector database

        Args:
            text_corpus: List of dicts with 'source', 'content' and 'type'
                keys (the shape produced by create_text_corpus_for_rag()).
                Entries with empty content are skipped; long content is split
                into overlapping chunks like PDF text.

        Returns:
            Number of chunks sent to the collection.
        """
        documents = []
        metadatas = []
        ids = []

        for doc_index, entry in enumerate(text_corpus):
            content = str(entry.get('content') or '').strip()
            if not content:
                continue

            source = str(entry.get('source') or f"corpus_{doc_index}")
            content_type = str(entry.get('type') or 'unknown')

            for chunk_index, (start, end, chunk) in enumerate(self._chunk_text(content)):
                documents.append(chunk)
                metadatas.append({
                    'source': source,
                    'data_type': 'text_corpus',
                    'dataset_name': source,
                    'content_type': content_type,
                    'chunk_index': chunk_index,
                    'chunk_start': start,
                    'chunk_end': end
                })
                # doc_index keeps IDs unique even if two entries share a source.
                ids.append(f"corpus_{doc_index}_{source}_chunk_{chunk_index}")

        return self._add_documents(documents, metadatas, ids)

    # ------------------------------------------------------------------
    # Querying
    # ------------------------------------------------------------------
    def search_similar_with_scores(self, query: str, n_results: int = 5,
                                  score_threshold: float = None) -> Dict[str, Any]:
        """
        Enhanced search with similarity scores and optional filtering

        Similarity is computed as ``1 - distance``.
        NOTE(review): that only lies in 0-1 when the collection's distance
        metric is bounded by 1; this class does not configure the metric, so
        confirm which one the collection uses before relying on thresholds.

        Args:
            query: Search query
            n_results: Number of results to return
            score_threshold: Optional minimum similarity score (higher is more similar)

        Returns:
            Dictionary with parallel lists under 'documents', 'metadatas',
            'similarities', 'distances' and 'ids'. An empty dict is returned
            when the query fails (the error is printed).
        """
        try:
            results = self.collection.query(
                query_texts=[query],
                n_results=n_results
            )

            def first_query(key):
                # Results are nested per query text; only one query is sent.
                values = results.get(key)
                return list(values[0]) if values else []

            documents = first_query('documents')
            metadatas = first_query('metadatas')
            distances = first_query('distances')
            ids = first_query('ids')

            # Convert distances to similarity scores (1 - distance)
            similarities = [1 - d for d in distances]

            # Filter by score threshold if provided
            if score_threshold is not None:
                keep = [i for i, similarity in enumerate(similarities)
                        if similarity >= score_threshold]
                documents = [documents[i] for i in keep]
                metadatas = [metadatas[i] for i in keep]
                distances = [distances[i] for i in keep]
                ids = [ids[i] for i in keep]
                similarities = [similarities[i] for i in keep]

            return {
                'documents': documents,
                'metadatas': metadatas,
                'similarities': similarities,
                'distances': distances,
                'ids': ids
            }

        except Exception as e:
            print(f"Error searching database: {str(e)}")
            return {}

    def query_environmental_data(self, user_question: str, max_results: int = 5,
                                score_threshold: float = 0.2) -> dict:
        """
        Enhanced query method specifically designed for chatbot integration
        Query the environmental database for carbon footprint information

        Args:
            user_question: User's question about environmental impact
            max_results: Maximum number of results to return
            score_threshold: Minimum similarity threshold (higher is more similar)

        Returns:
            Dictionary with 'success', 'query' and 'results'; on success also
            'total_results', otherwise a 'message' explaining the failure.
        """
        try:
            # Search the vector database
            results = self.search_similar_with_scores(
                query=user_question,
                n_results=max_results,
                score_threshold=score_threshold
            )

            if not results.get('documents'):
                return {
                    'success': False,
                    'message': 'No relevant environmental data found for your question.',
                    'query': user_question,
                    'results': []
                }

            # Format results for chatbot use
            reserved_keys = ('source', 'data_type', 'dataset_name')
            formatted_results = []
            for doc, metadata, similarity in zip(
                results['documents'],
                results['metadatas'],
                results['similarities']
            ):
                # Documents stored without metadata come back as None.
                metadata = metadata or {}
                formatted_results.append({
                    'content': doc,
                    'source': metadata.get('source', 'Unknown'),
                    'data_type': metadata.get('data_type', 'Unknown'),
                    'dataset_name': metadata.get('dataset_name', 'Unknown'),
                    'similarity_score': round(similarity, 3),
                    'metadata': {k: v for k, v in metadata.items()
                                 if k not in reserved_keys}
                })

            return {
                'success': True,
                'query': user_question,
                'results': formatted_results,
                'total_results': len(formatted_results)
            }

        except Exception as e:
            return {
                'success': False,
                'message': f'Error querying database: {str(e)}',
                'query': user_question,
                'results': []
            }

    def test_environmental_queries(self, test_queries=None):
        """
        Test the vector database with sample environmental queries

        Prints up to three matches per query using query_environmental_data().

        Args:
            test_queries: Optional list of custom test queries
        """
        print("="*60)
        print("TESTING ENVIRONMENTAL VECTOR DATABASE QUERIES")
        print("="*60)

        # Default test queries if none provided
        if test_queries is None:
            test_queries = [
                "carbon emissions from transportation",
                "energy consumption and CO2",
                "renewable energy sources",
                "manufacturing environmental impact",
                "agricultural carbon footprint",
                "building energy efficiency",
                "waste management emissions",
                "aviation fuel consumption"
            ]

        for i, query in enumerate(test_queries, 1):
            print(f"\n{'-'*50}")
            print(f"QUERY {i}: '{query}'")
            print(f"{'-'*50}")

            # Use the chatbot query method
            result = self.query_environmental_data(query, max_results=3)

            if result['success'] and result['results']:
                print(f"Found {result['total_results']} relevant results:\n")

                for j, res in enumerate(result['results'], 1):
                    print(f"  Result {j} (Similarity: {res['similarity_score']}):")
                    print(f"    Source: {res['source']}")
                    print(f"    Dataset: {res['dataset_name']}")
                    print(f"    Type: {res['data_type']}")

                    # Show preview of document
                    doc_preview = res['content'][:200] + "..." if len(res['content']) > 200 else res['content']
                    print(f"    Content: {doc_preview}")

                    # Show relevant metadata
                    if res['metadata']:
                        relevant_metadata = {k: v for k, v in res['metadata'].items()
                                          if k in ['category', 'emission_factor', 'unit'] and v}
                        if relevant_metadata:
                            print(f"    Metadata: {relevant_metadata}")
                    print()
            else:
                print(f"  {result['message']}")
        

In [4]:
# Open the persistent store at ./chroma_env_db and get (or create) the
# "carbon_footprint_data" collection used by the cells below.
env_db = EnvironmentalVectorDB(
    db_path="./chroma_env_db",
    collection_name="carbon_footprint_data"
)
# Shell command kept for reference (not executed by this cell):
#chroma run --host localhost --port 8000 --path ./chroma_env_db

Retrieved existing collection 'carbon_footprint_data'
ChromaDB initialized at ./chroma_env_db
Collection 'carbon_footprint_data' ready with 2500 documents


In [20]:
# NOTE: excel_datasets and pdf_datasets are assigned by the loader cells that
# appear further down in this file (load_excel_data / load_pdf_extracted_data);
# those cells must be run before this one on a fresh kernel.
env_db.add_bulk_data_from_loaders(
    excel_datasets=excel_datasets,
    pdf_datasets=pdf_datasets, 
)

ADDING BULK DATA TO ENVIRONMENTAL VECTOR DATABASE

PROCESSING EXCEL DATASETS

Processing Excel dataset: ghg-conversion-factors-2023-condensed-set-update_Introduction
✅ Added 38 documents from ghg-conversion-factors-2023-condensed-set-update_Introduction

Processing Excel dataset: ghg-conversion-factors-2023-condensed-set-update_What's new
✅ Added 36 documents from ghg-conversion-factors-2023-condensed-set-update_What's new

Processing Excel dataset: ghg-conversion-factors-2023-condensed-set-update_Index
✅ Added 95 documents from ghg-conversion-factors-2023-condensed-set-update_Index

Processing Excel dataset: ghg-conversion-factors-2023-condensed-set-update_Fuels
✅ Added 145 documents from ghg-conversion-factors-2023-condensed-set-update_Fuels

Processing Excel dataset: ghg-conversion-factors-2023-condensed-set-update_Bioenergy
✅ Added 70 documents from ghg-conversion-factors-2023-condensed-set-update_Bioenergy

Processing Excel dataset: ghg-conversion-factors-2023-condensed-set-update

2500

In [12]:

# Show which frontend (Vite) variables are available WITHOUT echoing the
# secret itself: cell output is saved with the notebook, so printing the key
# leaks it to anyone who can read the file. Copy the real value from .env.
# NOTE(review): Vite exposes VITE_-prefixed variables to client-side code;
# confirm that shipping an OpenAI key to the browser is really intended.
_openai_key = os.getenv('OPENAI_API_KEY')
print(f"VITE_OPENAI_API_KEY={'<set, value hidden>' if _openai_key else '<not set>'}")
print(f"VITE_CHROMA_DB_URL={os.getenv('CHROMA_DB_URL', 'http://localhost:8000')}")

import chromadb
from chromadb.config import Settings

# Connect to a ChromaDB server on localhost:8000. This creates a client only;
# the server itself has to be running already.
# NOTE(review): allow_reset=True presumably enables client.reset() — confirm
# that wiping the database from this client is acceptable.
client = chromadb.HttpClient(
    host="localhost",
    port=8000,

    settings=Settings(allow_reset=True)
)


VITE_OPENAI_API_KEY=[REDACTED - this key was exposed in saved notebook output and must be revoked and rotated]
VITE_CHROMA_DB_URL=http://localhost:8000


In [34]:
# Run the default sample queries defined in test_environmental_queries and
# print the top matches for each.
env_db.test_environmental_queries()

TESTING ENVIRONMENTAL VECTOR DATABASE QUERIES

--------------------------------------------------
QUERY 1: 'carbon emissions from transportation'
--------------------------------------------------
Found 3 relevant results:

  Result 1 (Similarity: 0.449):
    Source: EPA
    Dataset: Unknown
    Type: Unknown
    Content: Transportation: Cars emit approximately 4.6 metric tons of CO2 per year per vehicle
    Metadata: {'emission_factor': 4.6, 'category': 'transportation', 'unit': 'metric tons CO2/year'}

  Result 2 (Similarity: 0.354):
    Source: pdf_ZyPDF
    Dataset: ZyPDF
    Type: pdf_text
    Content: ion on the emissions of the transportation
sector as a whole?
Visit EPA’s Fast Facts on Transportation Greenhouse Gas Emissions and Carbon Pollution from
Transportation.
Annually EPA also publishes in...

  Result 3 (Similarity: 0.258):
    Source: excel_ghg-emission-factors-hub-2025
    Dataset: ghg-emission-factors-hub-2025
    Type: excel
    Content: Unnamed: 2: Source: 
CO2, CH

In [18]:
def load_excel_data(processed_dir='data/raw/excel'):
    """
    Load all Excel files (*.xlsx, *.xls, *.xlsm) found directly in a directory.

    Args:
        processed_dir: Directory to scan (not searched recursively).

    Returns:
        Dictionary of DataFrames. A single-sheet workbook is stored under the
        file stem; a multi-sheet workbook contributes one entry per sheet
        under "<file stem>_<sheet name>". Files that fail to load are
        reported and skipped.
    """
    excel_data = {}
    data_path = Path(processed_dir)

    # Sort within each extension so the load order is reproducible between runs.
    excel_files = [
        file_path
        for pattern in ('*.xlsx', '*.xls', '*.xlsm')
        for file_path in sorted(data_path.glob(pattern))
    ]

    if not excel_files:
        print(f"No Excel files found in {processed_dir}")
        return excel_data

    print(f"Found {len(excel_files)} Excel files:")

    for file_path in excel_files:
        try:
            print(f"Loading: {file_path.name}")

            # Open the workbook once and close it deterministically; every
            # sheet is read from the same handle instead of re-opening the file.
            with pd.ExcelFile(file_path) as workbook:
                sheet_names = workbook.sheet_names

                if len(sheet_names) == 1:
                    # Single sheet - store directly
                    df = pd.read_excel(workbook, sheet_name=0)
                    excel_data[file_path.stem] = df
                    print(f"  - Loaded {len(df)} rows, {len(df.columns)} columns")
                else:
                    # Multiple sheets - store each sheet separately
                    for sheet_name in sheet_names:
                        df = pd.read_excel(workbook, sheet_name=sheet_name)
                        key = f"{file_path.stem}_{sheet_name}"
                        excel_data[key] = df
                        print(f"  - Sheet '{sheet_name}': {len(df)} rows, {len(df.columns)} columns")

        except Exception as e:
            print(f"Error loading {file_path.name}: {str(e)}")

    print(f"\nSuccessfully loaded {len(excel_data)} datasets from Excel files")
    return excel_data

# Read every workbook from the default Excel directory
excel_datasets = load_excel_data()

# Print a short overview of each dataset: shape, leading columns, memory use
if excel_datasets:
    summary_rule = "=" * 50
    print("\n" + summary_rule)
    print("EXCEL DATA SUMMARY")
    print(summary_rule)
    for name, df in excel_datasets.items():
        leading_columns = list(df.columns[:5])
        more_marker = '...' if len(df.columns) > 5 else ''
        memory_mb = df.memory_usage(deep=True).sum() / 1024**2
        print(f"{name}:")
        print(f"  Shape: {df.shape}")
        print(f"  Columns: {leading_columns}{more_marker}")
        print(f"  Memory usage: {memory_mb:.2f} MB")
        print()

Found 2 Excel files:
Loading: ghg-conversion-factors-2023-condensed-set-update.xlsx
  - Sheet 'Introduction': 43 rows, 4 columns
  - Sheet 'What's new': 49 rows, 4 columns
  - Sheet 'Index': 103 rows, 4 columns
  - Sheet 'Fuels': 159 rows, 8 columns
  - Sheet 'Bioenergy': 83 rows, 6 columns
  - Sheet 'Refrigerant & other': 229 rows, 7 columns
  - Sheet 'Passenger vehicles': 118 rows, 35 columns
  - Sheet 'SECR kWh pass & delivery vehs': 137 rows, 11 columns
  - Sheet 'UK electricity': 31 rows, 8 columns
  - Sheet 'UK electricity for EVs': 86 rows, 11 columns
  - Sheet 'SECR kWh UK electricity for EVs': 77 rows, 6 columns
  - Sheet 'Transmission and distribution': 32 rows, 8 columns
  - Sheet 'UK electricity T&D for EVs': 88 rows, 11 columns
  - Sheet 'Water supply': 22 rows, 6 columns
  - Sheet 'Water treatment': 19 rows, 6 columns
  - Sheet 'Material use': 92 rows, 14 columns
  - Sheet 'Waste disposal': 97 rows, 14 columns
  - Sheet 'Business travel- air': 55 rows, 12 columns
  - Shee

  warn("""Cannot parse header or footer so it will be ignored""")
  for idx, row in parser.parse():


In [19]:

def load_pdf_extracted_data(processed_dir='data/processed'):
    """
    Read the files produced by PDF extraction from a directory.

    JSON files are parsed, CSV files become DataFrames and TXT files are read
    as plain text. Files whose name contains an Excel extension are ignored.

    Returns:
        Dictionary mapping each file's stem to its loaded content. Files that
        fail to load are reported and skipped.
    """
    loaded = {}
    root = Path(processed_dir)

    if not root.exists():
        print(f"Directory {processed_dir} does not exist!")
        return loaded

    # Collect candidates extension by extension (JSON, then CSV, then TXT)
    candidates = []
    for pattern in ('*.json', '*.csv', '*.txt'):
        candidates += list(root.glob(pattern))

    # Drop anything whose file name mentions an Excel extension
    excel_markers = ('.xlsx', '.xls', '.xlsm')
    candidates = [
        path for path in candidates
        if all(marker not in path.name.lower() for marker in excel_markers)
    ]

    if not candidates:
        print(f"No extracted PDF data files found in {processed_dir}")
        return loaded

    print(f"Found {len(candidates)} extracted PDF data files:")

    for path in candidates:
        try:
            print(f"Loading: {path.name}")
            suffix = path.suffix.lower()

            if suffix == '.json':
                with open(path, 'r', encoding='utf-8') as handle:
                    payload = json.load(handle)
                loaded[path.stem] = payload

                # Summarise according to the top-level JSON structure
                if isinstance(payload, dict):
                    print(f"  - JSON object with {len(payload)} keys")
                    if 'pages' in payload:
                        print(f"  - Contains {len(payload['pages'])} pages")
                elif isinstance(payload, list):
                    print(f"  - JSON array with {len(payload)} items")

            elif suffix == '.csv':
                table = pd.read_csv(path)
                loaded[path.stem] = table
                print(f"  - CSV with {len(table)} rows, {len(table.columns)} columns")

            elif suffix == '.txt':
                with open(path, 'r', encoding='utf-8') as handle:
                    body = handle.read()
                loaded[path.stem] = body

                line_count = len(body.split('\n'))
                word_count = len(body.split())
                char_count = len(body)
                print(f"  - Text file: {line_count} lines, {word_count} words, {char_count} characters")

        except Exception as e:
            print(f"Error loading {path.name}: {str(e)}")
            continue

    print(f"\nSuccessfully loaded {len(loaded)} datasets from extracted PDF files")
    return loaded

# Read everything the PDF extraction step left in the default directory
pdf_datasets = load_pdf_extracted_data()

# Print a type-specific overview of each loaded item
if pdf_datasets:
    pdf_summary_rule = "=" * 50
    print("\n" + pdf_summary_rule)
    print("EXTRACTED PDF DATA SUMMARY")
    print(pdf_summary_rule)
    for name, data in pdf_datasets.items():
        print(f"{name}:")

        if isinstance(data, pd.DataFrame):
            shown_columns = list(data.columns[:3])
            columns_tail = '...' if len(data.columns) > 3 else ''
            print(f"  Type: DataFrame")
            print(f"  Shape: {data.shape}")
            print(f"  Columns: {shown_columns}{columns_tail}")
        elif isinstance(data, dict):
            shown_keys = list(data.keys())[:5]
            keys_tail = '...' if len(data.keys()) > 5 else ''
            print(f"  Type: Dictionary")
            print(f"  Keys: {shown_keys}{keys_tail}")
        elif isinstance(data, str):
            preview_tail = '...' if len(data) > 100 else ''
            print(f"  Type: Text")
            print(f"  Length: {len(data)} characters")
            print(f"  Preview: {data[:100]}{preview_tail}")
        elif isinstance(data, list):
            print(f"  Type: List")
            print(f"  Items: {len(data)}")
        else:
            print(f"  Type: {type(data).__name__}")
        print()

Found 1 extracted PDF data files:
Loading: ZyPDF.txt
  - Text file: 187 lines, 1814 words, 11426 characters

Successfully loaded 1 datasets from extracted PDF files

EXTRACTED PDF DATA SUMMARY
ZyPDF:
  Type: Text
  Length: 11426 characters
  Preview: Questions
and
Answers
Office of Transportation and Air Quality
EPA-420-F-23-014
June 2023
Tailpipe G...



In [9]:
# Optional: Create a combined text corpus for RAG vector database
def create_text_corpus_for_rag(excel_data=None, pdf_data=None):
    """
    Create a combined text corpus from all loaded data for RAG implementation

    Args:
        excel_data: Optional dict of DataFrames. Defaults to the notebook-level
            `excel_datasets` variable, so zero-argument calls work as before.
        pdf_data: Optional dict of extracted PDF items (str, dict, list or
            DataFrame). Defaults to the notebook-level `pdf_datasets` variable.

    Returns:
        List of dicts with 'source', 'content' and 'type' keys. Items of any
        other type are skipped.
    """
    if excel_data is None:
        excel_data = excel_datasets
    if pdf_data is None:
        pdf_data = pdf_datasets

    def frame_to_text(name, frame):
        # Column labels may be numbers or dates, so stringify before joining.
        text = f"Dataset: {name}\n"
        text += f"Columns: {', '.join(str(col) for col in frame.columns)}\n"
        # to_string truncates to at most 100 rows / 10 columns.
        return text + frame.to_string(max_rows=100, max_cols=10)

    corpus = []

    # Process Excel data
    for name, df in excel_data.items():
        if isinstance(df, pd.DataFrame):
            corpus.append({
                'source': f"excel_{name}",
                'content': frame_to_text(name, df),
                'type': 'tabular_data'
            })

    # Process PDF extracted data
    for name, data in pdf_data.items():
        if isinstance(data, str):
            corpus.append({
                'source': f"pdf_{name}",
                'content': data,
                'type': 'text'
            })
        elif isinstance(data, (dict, list)):
            # Convert JSON objects and arrays to searchable text
            text_content = f"Document: {name}\n"
            text_content += json.dumps(data, indent=2)
            corpus.append({
                'source': f"pdf_{name}",
                'content': text_content,
                'type': 'structured_data'
            })
        elif isinstance(data, pd.DataFrame):
            corpus.append({
                'source': f"pdf_{name}",
                'content': frame_to_text(name, data),
                'type': 'tabular_data'
            })

    print(f"Created text corpus with {len(corpus)} documents for RAG")
    return corpus

# Build the RAG corpus from the excel_datasets / pdf_datasets variables
# populated by the loader cells (those must have been run first).
text_corpus = create_text_corpus_for_rag()

Created text corpus with 28 documents for RAG
