In [None]:
# -*- coding: utf-8 -*-
"""
# Structured Data in RAG Systems - Part 1: Setup and Introduction

This notebook is Part 1 of our structured data processing series, accompanying
Chapter 4 of "Mastering Retrieval Augmented Generation". We'll establish our
development environment and create sample data for subsequent sections.

## What's in this Series
1. Part 1 (Current): Setup and Sample Data Creation
2. Part 2: CSV Processing and Validation
3. Part 3: Directory Management
4. Part 4: Hierarchical Data (JSON/XML)
5. Part 5: Performance Optimization
"""

# First, let's install all necessary packages
!pip install pandas numpy chardet xmltodict pytest-cov

In [17]:
"""## Section 1: Understanding Our Tools

Before we start working with structured data, let's import our necessary libraries
and understand what each one does in our data processing toolkit.
"""

import os
import json
import pandas as pd
import numpy as np
import chardet
import xml.etree.ElementTree as ET
import xmltodict
from typing import List, Optional, Any, Dict
from datetime import datetime
import logging
import io
import hashlib
from pathlib import Path
from typing import Optional


# Set up logging for our operations: INFO and above, each line carrying a
# timestamp, the level name and the message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the sample-creation helpers defined below.
logger = logging.getLogger(__name__)

"""## Section 2: Creating Our Working Directory

We'll create a directory to store our sample files that demonstrate different
structured data scenarios we'll encounter in real-world applications.
"""

# Create directory for our samples. Done in pure Python so it does not depend
# on the shell providing `mkdir -p`; exist_ok makes the cell safe to re-run.
os.makedirs('rag_sample_data', exist_ok=True)

def create_clean_csv():
    """
    Write the "ideal case" sales CSV used by the basic demonstrations.

    The file is comma-delimited with a header row followed by five fully
    populated data rows, and is saved as rag_sample_data/sales_clean.csv.
    The rag_sample_data directory must already exist.
    """
    header = "Date,Product,Quantity,Revenue,Region"
    rows = [
        "2025-01-01,Widget A,100,5000,North",
        "2025-01-02,Widget B,150,8250,South",
        "2025-01-03,Widget A,120,6000,East",
        "2025-01-04,Widget C,80,4800,West",
        "2025-01-05,Widget B,200,11000,North",
    ]
    # Lines are joined without a trailing newline after the last record.
    csv_text = "\n".join([header, *rows])

    with open('rag_sample_data/sales_clean.csv', 'w') as out_file:
        out_file.write(csv_text)
    logger.info("Created clean CSV sample")

def create_messy_csv():
    """
    Write an inventory CSV that exhibits common real-world problems.

    The file uses a semicolon delimiter and leaves several Price, Stock and
    LastUpdated cells empty. It is saved as
    rag_sample_data/inventory_messy.csv; the directory must already exist.
    """
    records = (
        "Product;Price;Stock;LastUpdated;Supplier",
        "Widget A;49.99;100;2025-01-01;Acme Corp",
        "Widget B;55.00;;2025-01-02;TechSupply",
        "Widget C;75.50;75;;GlobalParts",
        "Widget D;;;2025-01-04;Acme Corp",
    )
    # No trailing newline after the final record.
    csv_text = "\n".join(records)

    with open('rag_sample_data/inventory_messy.csv', 'w') as out_file:
        out_file.write(csv_text)
    logger.info("Created messy CSV sample")

def create_sample_json():
    """
    Write a small nested product catalog as JSON.

    The document nests catalog -> categories -> products -> specifications
    and mixes strings, a float, a boolean and a list, so it exercises
    hierarchical processing. It is saved (indented by two spaces) as
    rag_sample_data/product_catalog.json; the directory must already exist.
    """
    # Build the document bottom-up so each nesting level has a name.
    smart_watch = {
        "id": "E001",
        "name": "Smart Watch",
        "price": 199.99,
        "specifications": {
            "battery": "24h",
            "waterproof": True,
            "sensors": ["heart rate", "GPS"]
        }
    }
    electronics = {
        "name": "Electronics",
        "products": [smart_watch]
    }
    product_catalog = {
        "catalog": {
            "last_updated": "2025-01-07T10:00:00Z",
            "categories": [electronics]
        }
    }

    serialized = json.dumps(product_catalog, indent=2)
    with open('rag_sample_data/product_catalog.json', 'w') as out_file:
        out_file.write(serialized)
    logger.info("Created JSON sample")

def create_sample_xml():
    """
    Write a small inventory document as XML.

    The document has an attribute on the root (last_updated), an attribute
    on the nested warehouse element (location) and one product with three
    child elements. It is saved as rag_sample_data/inventory.xml; the
    directory must already exist.
    """
    xml_lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<inventory last_updated="2025-01-07">',
        '    <warehouse location="North">',
        '        <product>',
        '            <sku>W001</sku>',
        '            <name>Widget Pro</name>',
        '            <stock>150</stock>',
        '        </product>',
        '    </warehouse>',
        '</inventory>',
    ]
    # The document ends with a newline after the closing root tag.
    xml_data = "\n".join(xml_lines) + "\n"

    with open('rag_sample_data/inventory.xml', 'w') as out_file:
        out_file.write(xml_data)
    logger.info("Created XML sample")

# Create all our sample files, in the same order as they are defined above
for make_sample in (
    create_clean_csv,
    create_messy_csv,
    create_sample_json,
    create_sample_xml,
):
    make_sample()

# Verify creation and show file sizes
print("\nSample files created in 'rag_sample_data' directory:")
for file in os.listdir('rag_sample_data'):
    size = os.path.getsize(os.path.join('rag_sample_data', file))
    print(f"- {file}: {size} bytes")

"""## What's Next?

In Part 2 of this series, we'll implement a robust CSV processor that can handle:
- Automatic encoding detection
- Different delimiters
- Missing value handling
- Data validation
- Error recovery

We'll use the sample files we've created here to test our implementation against
both ideal and challenging scenarios."""


Sample files created in 'rag_sample_data' directory:
- product_catalog.json: 492 bytes
- inventory_messy.csv: 181 bytes
- sales_clean.csv: 209 bytes
- inventory.xml: 272 bytes


"## What's Next?\n\nIn Part 2 of this series, we'll implement a robust CSV processor that can handle:\n- Automatic encoding detection\n- Different delimiters\n- Missing value handling\n- Data validation\n- Error recovery\n\nWe'll use the sample files we've created here to test our implementation against\nboth ideal and challenging scenarios."

In [None]:
# -*- coding: utf-8 -*-
"""
# Structured Data in RAG Systems - Part 2: CSV Processing

This notebook demonstrates how to build a robust CSV processor for RAG systems.
We'll create a comprehensive solution that can handle common challenges in
real-world CSV files.

Note: Make sure you've run Part 1 first to create the sample data files we'll use here.
"""

# First, let's install and import our required packages
!pip install pandas numpy chardet

import pandas as pd
import numpy as np
import chardet
import os
import json
from typing import List, Optional, Dict, Any
from datetime import datetime
import logging

# Set up logging for better visibility into our operations.
# NOTE: logging.basicConfig() only configures the root logger when it has no
# handlers yet, so if logging was already configured in this kernel this call
# leaves that configuration in place.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by EnhancedCSVLoader below.
logger = logging.getLogger(__name__)

class EnhancedCSVLoader:
    """
    A robust CSV loader designed for RAG systems that can handle:
    - Different encodings
    - Various delimiters
    - Missing values
    - Data validation
    - Error recovery

    Typical use is ``df, metadata = EnhancedCSVLoader(path).load()``.
    Counts and statistics placed in the validation report are converted to
    plain Python ``int``/``float`` values so the report can be passed
    straight to ``json.dumps``.
    """

    def __init__(self, file_path: str):
        """
        Initialize the CSV loader with configuration options.

        Args:
            file_path: Path to the CSV file to process
        """
        self.file_path = file_path
        self.metadata = {}
        # Both are filled in by detect_file_properties()
        self.delimiter = None
        self.encoding = None

    def detect_file_properties(self) -> dict:
        """
        Automatically detect CSV file properties including encoding and delimiter.
        This helps handle files from various sources reliably.

        The detection works on the first 10KB of the file. The delimiter is
        whichever of ',', ';', tab or '|' occurs most often in that sample;
        ties (including "none found") resolve to the earliest candidate in
        that order.

        Returns:
            Dictionary with the keys 'encoding', 'delimiter' and 'confidence'

        Raises:
            Any exception raised while reading or analysing the file is
            logged and re-raised.
        """
        try:
            with open(self.file_path, 'rb') as file:
                # Read a sample of the file for detection
                raw_data = file.read(10000)  # Read first 10KB

            # Detect encoding. chardet reports None when it cannot decide
            # (for example on an empty file), so fall back to UTF-8.
            result = chardet.detect(raw_data)
            self.encoding = result['encoding'] or 'utf-8'

            # Convert to string for delimiter detection. The 10KB cut-off can
            # land in the middle of a multi-byte character; errors='replace'
            # keeps that from aborting detection.
            sample_text = raw_data.decode(self.encoding, errors='replace')

            # Count potential delimiters
            delimiters = [',', ';', '\t', '|']
            delimiter_counts = {d: sample_text.count(d) for d in delimiters}

            # Choose the most common delimiter
            self.delimiter = max(delimiter_counts.items(), key=lambda x: x[1])[0]

            return {
                'encoding': self.encoding,
                'delimiter': self.delimiter,
                'confidence': result['confidence']
            }
        except Exception as e:
            logger.error(f"Error detecting file properties: {str(e)}")
            raise

    def validate_data(self, df: pd.DataFrame) -> dict:
        """
        Perform validation checks on the loaded data.

        Args:
            df: Pandas DataFrame containing the loaded CSV data

        Returns:
            Dictionary containing validation results and statistics:
            'total_rows', 'total_columns', 'missing_values' (per column),
            'column_types', 'duplicate_rows', 'column_statistics'
            (mean/std/min/max for numeric columns) and 'quality_checks'
            ('empty_columns', 'high_missing_ratio', 'constant_columns').

        Raises:
            Any exception raised during validation is logged and re-raised.
        """
        try:
            total_rows = len(df)
            missing_counts = df.isnull().sum()

            validation = {
                'total_rows': total_rows,
                'total_columns': len(df.columns),
                # int(...) turns numpy integers into JSON-serializable ints
                'missing_values': {
                    col: int(count) for col, count in missing_counts.items()
                },
                'column_types': df.dtypes.astype(str).to_dict(),
                'duplicate_rows': int(df.duplicated().sum()),
                'column_statistics': {}
            }

            # Calculate statistics for numeric columns
            numeric_columns = df.select_dtypes(include=[np.number]).columns
            for col in numeric_columns:
                validation['column_statistics'][col] = {
                    'mean': float(df[col].mean()),  # Convert numpy types to Python types
                    'std': float(df[col].std()),
                    'min': float(df[col].min()),
                    'max': float(df[col].max())
                }

            # Check for potential data quality issues
            validation['quality_checks'] = {
                'empty_columns': [col for col in df.columns if df[col].isnull().all()],
                # "total_rows and ..." avoids dividing by zero on an empty frame
                'high_missing_ratio': [
                    col for col in df.columns
                    if total_rows and df[col].isnull().sum() / total_rows > 0.5
                ],
                'constant_columns': [
                    col for col in df.columns
                    if df[col].nunique() == 1
                ]
            }

            return validation

        except Exception as e:
            logger.error(f"Error during data validation: {str(e)}")
            raise

    def load(self) -> tuple[pd.DataFrame, dict]:
        """
        Load and process the CSV file, returning both the data and metadata.
        Implements comprehensive error handling and data validation.

        Returns:
            Tuple containing:
            - Pandas DataFrame with the loaded data
            - Dictionary with metadata and validation results

        Raises:
            FileNotFoundError: If file_path does not exist.
            Any other exception raised while detecting, reading or validating
            is logged and re-raised.
        """
        try:
            # First check if file exists
            if not os.path.exists(self.file_path):
                raise FileNotFoundError(f"File not found: {self.file_path}")

            # Detect file properties
            properties = self.detect_file_properties()
            logger.info(f"Detected file properties: {properties}")

            # Read CSV with detected properties; malformed lines produce a
            # warning instead of aborting the read
            df = pd.read_csv(
                self.file_path,
                encoding=self.encoding,
                delimiter=self.delimiter,
                on_bad_lines='warn'
            )

            # Validate the data
            validation_results = self.validate_data(df)
            logger.info("Data validation completed")

            # Collect metadata
            self.metadata = {
                'file_properties': properties,
                'validation': validation_results,
                'file_size': os.path.getsize(self.file_path),
                'last_modified': datetime.fromtimestamp(
                    os.path.getmtime(self.file_path)
                ).isoformat(),
                'columns': list(df.columns)
            }

            return df, self.metadata

        except Exception as e:
            logger.error(f"Error loading CSV: {str(e)}")
            raise

"""## Testing Our Implementation

Let's test our CSV loader with both the clean and messy sample files we created
in Part 1. This will demonstrate how it handles different scenarios.
"""

def test_csv_loader():
    """
    Run the CSV loader against the clean and the messy sample file and print
    a preview plus the validation report for each.

    Any exception stops the run and is reported as a single printed line
    instead of being raised.
    """
    def _load_and_report(heading, csv_path, preview_title):
        # Load one file and print its preview and validation report.
        print(heading)
        print("-" * 50)

        df, metadata = EnhancedCSVLoader(csv_path).load()

        print(preview_title)
        print(df.head())
        print("\nValidation Results:")
        print(json.dumps(metadata['validation'], indent=2))

    try:
        # Test 1: Process the clean CSV
        _load_and_report(
            "Testing Clean CSV Processing",
            "rag_sample_data/sales_clean.csv",
            "Clean CSV Preview:",
        )

        # Test 2: Process the messy CSV
        _load_and_report(
            "\nTesting Messy CSV Processing",
            "rag_sample_data/inventory_messy.csv",
            "Messy CSV Preview:",
        )

    except Exception as e:
        print(f"Error during testing: {str(e)}")

# Run the loader demonstration against both sample CSV files
test_csv_loader()

"""## What's Next?

In Part 3, we'll explore directory management for handling collections of
structured data files. We'll build upon the CSV processing capabilities
we've developed here to handle multiple files efficiently."""

In [None]:
# -*- coding: utf-8 -*-
"""
# Structured Data in RAG Systems - Part 3: Directory Management

This notebook focuses on managing collections of structured data files in RAG systems.
We'll build a robust system for organizing, tracking, and processing multiple files
efficiently. This is particularly important when dealing with large document collections
or regularly updated datasets.

Make sure you've run Parts 1 and 2 first, as we'll build upon the CSV processing
capabilities we developed there.
"""

# Import required libraries
import os
import json
import pandas as pd
import hashlib
from datetime import datetime
from typing import List, Dict, Optional, Any
import logging
from pathlib import Path
import shutil

# Set up logging.
# NOTE: logging.basicConfig() only configures the root logger when it has no
# handlers yet, so if logging was already configured in this kernel this call
# leaves that configuration in place.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the classes defined below.
logger = logging.getLogger(__name__)

"""## Section 1: Building a Document Collection Manager

First, let's create a comprehensive system for managing collections of structured
data files. This will help us organize files, track changes, and maintain metadata
about our document collection.
"""

class DocumentCollectionManager:
    """
    Manages collections of structured data files with features for:
    - Organizing files by type and category
    - Tracking file changes via content hashes
    - Maintaining a JSON index of file metadata

    Layout under the root directory: csv_files/, json_files/ and xml_files/
    hold the collected files (optionally in category sub-folders), index/
    holds file_index.json, and archive/ is created but not otherwise used
    by this class.
    """

    # File extension -> sub-directory that stores files of that type.
    _TYPE_DIRECTORIES = {
        '.csv': 'csv_files',
        '.json': 'json_files',
        '.xml': 'xml_files',
    }

    # Top-level sub-directories that never contain collection documents.
    _RESERVED_DIRECTORIES = ('index', 'archive')

    def __init__(self, root_dir: str):
        """
        Initialize the document collection manager.

        Creates the directory layout if needed and restores the file index
        from a previously saved index/file_index.json when one exists.

        Args:
            root_dir: Root directory for the document collection
        """
        self.root_dir = Path(root_dir)
        self.file_index = {}
        self.metadata = {}

        # Create directory structure if it doesn't exist
        self._initialize_directory_structure()

        # Pick up the index written by an earlier manager for this root
        self._load_index()

    def _initialize_directory_structure(self):
        """
        Creates the necessary directory structure for organizing files.
        We'll create separate directories for different file types and
        maintain an index directory for metadata.
        """
        # Create main directories
        directories = [
            'csv_files',
            'json_files',
            'xml_files',
            'index',
            'archive'
        ]

        for dir_name in directories:
            dir_path = self.root_dir / dir_name
            dir_path.mkdir(parents=True, exist_ok=True)
            logger.info(f"Initialized directory: {dir_path}")

    def _index_path(self) -> Path:
        """Return the location of the persisted file index."""
        return self.root_dir / 'index' / 'file_index.json'

    def _load_index(self):
        """
        Restore the file index saved by _save_index(), if there is one.
        A missing, unreadable or malformed index file leaves the in-memory
        index empty; scan_directory() can rebuild it.
        """
        index_path = self._index_path()
        if not index_path.is_file():
            return

        try:
            with open(index_path, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError is a ValueError
            logger.warning(f"Could not read existing index {index_path}: {e}")
            return

        if isinstance(loaded, dict):
            self.file_index = loaded

    def _calculate_file_hash(self, file_path: Path) -> str:
        """
        Calculate a hash of the file contents for change detection.
        This helps us track when files have been modified.

        Args:
            file_path: Path to the file

        Returns:
            SHA-256 hash of the file contents
        """
        hasher = hashlib.sha256()
        with open(file_path, 'rb') as f:
            # Read in 4KB chunks so large files are never fully in memory
            for chunk in iter(lambda: f.read(4096), b''):
                hasher.update(chunk)
        return hasher.hexdigest()

    def _get_file_metadata(self, file_path: Path) -> dict:
        """
        Extract comprehensive metadata about a file.
        This includes file properties, modification times, and content hash.

        Args:
            file_path: Path to the file (must be located under root_dir)

        Returns:
            Dictionary containing file metadata
        """
        stats = file_path.stat()
        return {
            'filename': file_path.name,
            'extension': file_path.suffix.lower(),
            'size': stats.st_size,
            # NOTE: st_ctime is the creation time on Windows but the last
            # metadata change time on most Unix systems.
            'created': datetime.fromtimestamp(stats.st_ctime).isoformat(),
            'modified': datetime.fromtimestamp(stats.st_mtime).isoformat(),
            'content_hash': self._calculate_file_hash(file_path),
            'relative_path': str(file_path.relative_to(self.root_dir))
        }

    def add_file(self, source_path: str, category: Optional[str] = None) -> dict:
        """
        Add a new file to the collection, organizing it appropriately.

        The file is copied (the source is left in place) into the directory
        for its type, inside a category sub-folder when one is given. An
        existing file with the same name in that location is overwritten.

        Args:
            source_path: Path to the file to add
            category: Optional category for organizing files

        Returns:
            Metadata about the added file

        Raises:
            ValueError: If the file extension is not .csv, .json or .xml
        """
        source_path = Path(source_path)
        file_ext = source_path.suffix.lower()

        # Determine target directory based on file type
        type_directory = self._TYPE_DIRECTORIES.get(file_ext)
        if type_directory is None:
            raise ValueError(f"Unsupported file type: {file_ext}")
        target_dir = self.root_dir / type_directory

        # Add category subdirectory if specified
        if category:
            target_dir = target_dir / category
            target_dir.mkdir(parents=True, exist_ok=True)

        # Copy file to collection
        target_path = target_dir / source_path.name
        shutil.copy2(source_path, target_path)

        # Extract and store metadata
        metadata = self._get_file_metadata(target_path)
        self.file_index[str(target_path)] = metadata

        # Save updated index
        self._save_index()

        logger.info(f"Added file: {source_path.name} to {target_dir}")
        return metadata

    def _save_index(self):
        """
        Save the file index to disk for persistence.
        This maintains a record of all files and their metadata.
        """
        with open(self._index_path(), 'w', encoding='utf-8') as f:
            json.dump(self.file_index, f, indent=2)

    def scan_directory(self) -> dict:
        """
        Scan the entire directory structure and update the file index.
        This helps maintain an accurate record of all files in the collection.

        Only the top-level index/ and archive/ directories of the collection
        are skipped; folders with those names elsewhere (for example a
        category called "archive", or a parent directory of the root) do not
        affect the scan. Index entries whose file no longer exists are
        removed.

        Returns:
            Statistics about the document collection: 'total_files',
            'total_size' (bytes), 'file_types' (count per lower-cased
            extension) and 'last_scan' (ISO timestamp)
        """
        total_files = 0
        total_size = 0
        file_types = {}
        seen_keys = set()

        # Scan all subdirectories
        for root, _, files in os.walk(self.root_dir):
            root_path = Path(root)

            # Skip index and archive directories. Only the part of the path
            # below the collection root is inspected.
            relative_parts = root_path.relative_to(self.root_dir).parts
            if relative_parts and relative_parts[0] in self._RESERVED_DIRECTORIES:
                continue

            for file in files:
                file_path = root_path / file
                file_metadata = self._get_file_metadata(file_path)

                # Update file index
                index_key = str(file_path)
                self.file_index[index_key] = file_metadata
                seen_keys.add(index_key)

                # Update statistics (extension lower-cased to match the
                # 'extension' field stored in the metadata)
                total_files += 1
                total_size += file_metadata['size']
                suffix = file_path.suffix.lower()
                file_types[suffix] = file_types.get(suffix, 0) + 1

        # Drop entries for files that have disappeared since they were indexed
        stale_keys = [
            key for key in self.file_index
            if key not in seen_keys and not Path(key).exists()
        ]
        for key in stale_keys:
            del self.file_index[key]

        # Save updated index
        self._save_index()

        return {
            'total_files': total_files,
            'total_size': total_size,
            'file_types': file_types,
            'last_scan': datetime.now().isoformat()
        }

"""## Section 2: Testing Our Implementation

Let's test our document collection manager with the sample files we created
in Part 1. This will demonstrate how it handles different file types and
maintains organization.
"""

def test_document_manager():
    """
    Exercise the document collection manager with the Part 1 sample files.

    Each sample is added to a collection rooted at rag_test_collection under
    a category, its metadata is printed, and finally the collection is
    scanned and the resulting statistics are printed.
    """
    # Create a test collection
    manager = DocumentCollectionManager(Path("rag_test_collection"))

    # Sample file -> category it is filed under (insertion order is kept)
    categories_by_sample = {
        "rag_sample_data/sales_clean.csv": "sales",
        "rag_sample_data/inventory_messy.csv": "inventory",
        "rag_sample_data/product_catalog.json": "products",
        "rag_sample_data/inventory.xml": "inventory",
    }

    print("Adding sample files to collection...")
    for sample_path, sample_category in categories_by_sample.items():
        added = manager.add_file(sample_path, sample_category)
        print(f"\nAdded {added['filename']}:")
        print(json.dumps(added, indent=2))

    # Scan the directory and show statistics
    print("\nCollection Statistics:")
    print(json.dumps(manager.scan_directory(), indent=2))

# Run the document manager demonstration with the Part 1 sample files
test_document_manager()

"""## Section 3: Batch Processing Implementation

Now let's implement batch processing capabilities for handling multiple files
efficiently. This is particularly useful when working with large collections
or when files need regular updates.
"""

class BatchProcessor:
    """
    Processes multiple structured data files in batches with support for:
    - Selecting files from the collection index by extension
    - Per-file error handling, so one bad file does not stop the batch
    - Cumulative success/failure counters

    Files are processed sequentially, one after another.
    """

    def __init__(self, collection_manager: "DocumentCollectionManager"):
        """
        Initialize the batch processor.

        Args:
            collection_manager: Instance of DocumentCollectionManager whose
                file_index lists the files available for processing
        """
        self.collection_manager = collection_manager
        # Running totals across every process_files() call on this instance.
        # 'skipped' is reserved; nothing in this class increments it.
        self.processing_stats = {
            'processed': 0,
            'failed': 0,
            'skipped': 0
        }

    def process_files(self, file_type: str, processor_func: callable) -> dict:
        """
        Process all files of a specific type using the provided function.

        Args:
            file_type: File extension to process (e.g., '.csv'). Matching is
                case-insensitive and the leading dot is optional.
            processor_func: Function to apply to each file. It receives the
                file path as stored in the collection index; an exception
                raised by it marks that file as failed.

        Returns:
            Dictionary with:
            - 'stats': snapshot of the cumulative counters after this run
            - 'results': one entry per file with 'file', 'status' and either
              'result' (success) or 'error' (failure message)
        """
        results = []

        # Normalise the requested extension so 'CSV', 'csv' and '.csv' agree
        wanted_suffix = file_type.lower()
        if wanted_suffix and not wanted_suffix.startswith('.'):
            wanted_suffix = '.' + wanted_suffix

        # Get all files of specified type
        files_to_process = [
            path for path in self.collection_manager.file_index
            if Path(path).suffix.lower() == wanted_suffix
        ]

        logger.info(f"Starting batch processing of {len(files_to_process)} files")

        # Process each file
        for file_path in files_to_process:
            try:
                # Process the file
                result = processor_func(file_path)
                results.append({
                    'file': file_path,
                    'status': 'success',
                    'result': result
                })
                self.processing_stats['processed'] += 1

            except Exception as e:
                logger.error(f"Error processing {file_path}: {str(e)}")
                results.append({
                    'file': file_path,
                    'status': 'failed',
                    'error': str(e)
                })
                self.processing_stats['failed'] += 1

        return {
            # Copy so that later runs do not alter a result already returned
            'stats': dict(self.processing_stats),
            'results': results
        }

"""## Example: Batch Processing CSV Files

Let's demonstrate batch processing by analyzing all CSV files in our collection.
We'll calculate summary statistics for each file.
"""

def csv_analyzer(file_path: str) -> dict:
    """
    Analyze a CSV file and return summary statistics.
    This is an example processor function for batch processing.

    Args:
        file_path: Path to the CSV file

    Returns:
        Dictionary containing analysis results: 'row_count', 'column_count'
        and 'numeric_columns' (mean/std/min/max per numeric column). All
        statistics are plain Python floats so the result can be serialized
        with json.dumps.
    """
    # Use our EnhancedCSVLoader from Part 2. Inside the notebook the class is
    # already defined in the kernel namespace; importing it from a "Part2"
    # module is only a fallback for running this function standalone.
    loader_cls = globals().get('EnhancedCSVLoader')
    if loader_cls is None:
        from Part2 import EnhancedCSVLoader as loader_cls

    loader = loader_cls(file_path)
    df, _ = loader.load()

    # include='number' selects every numeric dtype without needing numpy
    # to be imported in this cell
    numeric_df = df.select_dtypes(include='number')

    # Calculate additional statistics
    analysis = {
        'row_count': len(df),
        'column_count': len(df.columns),
        'numeric_columns': {
            col: {
                # float(...) converts numpy scalars to JSON-friendly values
                'mean': float(numeric_df[col].mean()),
                'std': float(numeric_df[col].std()),
                'min': float(numeric_df[col].min()),
                'max': float(numeric_df[col].max())
            }
            for col in numeric_df.columns
        }
    }

    return analysis

# Test batch processing
collection_dir = Path("rag_test_collection")
manager = DocumentCollectionManager(collection_dir)
# Make sure the in-memory index reflects the files on disk; the batch
# processor only sees files that are listed in manager.file_index.
manager.scan_directory()
processor = BatchProcessor(manager)

print("Starting batch analysis of CSV files...")
results = processor.process_files('.csv', csv_analyzer)
print("\nProcessing Results:")
# default=str keeps the report printable even if a processor returns values
# that json cannot serialize natively (e.g. numpy scalars)
print(json.dumps(results, indent=2, default=str))

"""## What's Next?

In Part 4, we'll explore processing hierarchical data formats (JSON and XML),
building upon our directory management capabilities. We'll implement specialized
processors for these formats and show how to integrate them with our batch
processing system."""

In [None]:
# -*- coding: utf-8 -*-
"""
# Structured Data in RAG Systems - Part 4: Hierarchical Data Processing

This notebook focuses on processing hierarchical data formats (JSON and XML) in RAG systems.
We'll build robust processors that can handle complex nested structures while maintaining
data relationships and context. This is particularly important when working with API
responses, configuration files, and structured documents.

Make sure you've run the previous parts first, as we'll build upon concepts introduced
there and use some of the sample files we created.
"""

# First, let's install and import our required packages
!pip install pandas numpy xmltodict jsonpath-ng

import json
import xml.etree.ElementTree as ET
import xmltodict
from typing import List, Dict, Any, Optional
import logging
from pathlib import Path
from jsonpath_ng import jsonpath, parse
import pandas as pd
from datetime import datetime

# Set up logging for better visibility into our operations.
# NOTE: logging.basicConfig() only configures the root logger when it has no
# handlers yet, so if logging was already configured in this kernel this call
# leaves that configuration in place.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by HierarchicalDataProcessor below.
logger = logging.getLogger(__name__)

"""## Understanding Hierarchical Data Structures

Before we dive into implementation, let's understand why hierarchical data requires
special handling. Consider our product catalog JSON from Part 1:

1. It has multiple levels of nesting (catalog → categories → products)
2. It contains mixed data types (strings, numbers, booleans, arrays)
3. Some fields might be optional or have varying structures
4. Relationships between elements are important for context

Our implementation needs to handle these complexities while making the data
accessible for RAG systems.
"""

class HierarchicalDataProcessor:
    """
    A comprehensive processor for hierarchical data formats (JSON and XML).
    Handles complex nested structures while preserving relationships and context.

    Typical use: construct with a path, call load() to parse the file and
    obtain structural metadata, then use query() / to_tabular() to pull out
    specific parts of the parsed data.
    """

    def __init__(self, file_path: str):
        """
        Initialize the hierarchical data processor.

        Args:
            file_path: Path to the JSON or XML file to process

        Raises:
            ValueError: If the file extension is neither .json nor .xml
        """
        self.file_path = Path(file_path)
        self.file_type = self.file_path.suffix.lower()
        self.data = None  # populated by load()
        self.metadata = {}

        if self.file_type not in ['.json', '.xml']:
            raise ValueError(f"Unsupported file type: {self.file_type}")

    def _calculate_depth(self, obj: Any, current_depth: int = 0) -> int:
        """
        Calculate the maximum nesting depth of a hierarchical structure.
        This helps us understand the complexity of our data.

        Every non-empty dict or list adds one level; scalars and empty
        containers add none.

        Args:
            obj: The object to analyze
            current_depth: Current nesting level

        Returns:
            Maximum nesting depth found
        """
        if isinstance(obj, dict):
            if not obj:
                return current_depth
            return max(self._calculate_depth(v, current_depth + 1) for v in obj.values())
        elif isinstance(obj, list):
            if not obj:
                return current_depth
            return max(self._calculate_depth(item, current_depth + 1) for item in obj)
        else:
            return current_depth

    def _extract_schema(self, obj: Any, path: str = '') -> Dict:
        """
        Extract the implicit schema from the data structure.
        This helps understand the structure and relationships in our data.

        Args:
            obj: The object to analyze
            path: Current path in the hierarchy (threaded through the
                recursion; it is not part of the returned schema)

        Returns:
            Dictionary describing the data structure: objects list their
            'properties', arrays describe their 'items', and scalars report
            the Python type name of the value
        """
        if isinstance(obj, dict):
            return {
                'type': 'object',
                'properties': {
                    k: self._extract_schema(v, f"{path}.{k}" if path else k)
                    for k, v in obj.items()
                }
            }
        elif isinstance(obj, list):
            if obj:
                # Analyze the first item as an example
                return {
                    'type': 'array',
                    'items': self._extract_schema(obj[0], f"{path}[]")
                }
            return {'type': 'array', 'items': {}}
        else:
            return {'type': type(obj).__name__}

    def load(self) -> tuple[Any, dict]:
        """
        Load and process the hierarchical data file.

        Returns:
            Tuple containing:
            - The processed data structure
            - Dictionary with metadata and analysis results

        Raises:
            Any exception raised while reading or parsing the file is logged
            and re-raised.
        """
        try:
            # Read and parse the file based on its type
            if self.file_type == '.json':
                # utf-8-sig reads plain UTF-8 unchanged and additionally
                # tolerates a leading byte-order mark, which json rejects
                with open(self.file_path, 'r', encoding='utf-8-sig') as f:
                    self.data = json.load(f)
            else:  # XML
                # Hand the raw bytes to the parser so the encoding declared
                # in the XML prolog is honoured instead of assuming UTF-8
                with open(self.file_path, 'rb') as f:
                    self.data = xmltodict.parse(f.read())

            # Analyze the structure
            depth = self._calculate_depth(self.data)
            schema = self._extract_schema(self.data)

            # Collect metadata (a single stat() call serves both fields)
            file_stats = self.file_path.stat()
            self.metadata = {
                'file_info': {
                    'name': self.file_path.name,
                    'size': file_stats.st_size,
                    'last_modified': datetime.fromtimestamp(
                        file_stats.st_mtime
                    ).isoformat()
                },
                'structure': {
                    'max_depth': depth,
                    'schema': schema
                }
            }

            return self.data, self.metadata

        except Exception as e:
            logger.error(f"Error processing {self.file_path}: {str(e)}")
            raise

    def query(self, path_expr: str) -> List[Any]:
        """
        Query the data structure using JSONPath expressions.
        This provides a flexible way to extract specific information.

        Args:
            path_expr: JSONPath expression to evaluate

        Returns:
            List of matching elements

        Raises:
            ValueError: If load() has not been called yet
        """
        if self.data is None:
            raise ValueError("Data not loaded. Call load() first.")

        jsonpath_expr = parse(path_expr)
        return [match.value for match in jsonpath_expr.find(self.data)]

    def to_tabular(self, path_expr: str, columns: List[str]) -> pd.DataFrame:
        """
        Convert a portion of the hierarchical data to tabular format.
        This is useful when working with structured sections of the data.

        Args:
            path_expr: JSONPath expression to locate the records
            columns: List of column names to extract. A dotted name such as
                'specifications.battery' is resolved by walking nested
                dictionaries; anything that cannot be resolved yields None.

        Returns:
            Pandas DataFrame with one row per matched record and exactly the
            requested columns (also when nothing matched)
        """
        records = self.query(path_expr)

        # Extract specified columns from each record
        tabular_data = []
        for record in records:
            row = {}
            for col in columns:
                value = record
                try:
                    # Walk the dotted path; an undotted name is a path of one
                    for part in col.split('.'):
                        value = value.get(part, None)
                except AttributeError:
                    # The record or an intermediate value is not a dict
                    value = None
                row[col] = value
            tabular_data.append(row)

        # columns= keeps the requested columns even for an empty result
        return pd.DataFrame(tabular_data, columns=columns)

"""## Testing Our Implementation

Let's test our hierarchical data processor with the sample files we created in Part 1.
This will demonstrate how it handles different types of nested structures.
"""

def test_hierarchical_processor():
    """
    Exercise HierarchicalDataProcessor against the JSON and XML sample files.

    Prints the structure analysis of each file, a product listing, a tabular
    view of the catalog, and the per-warehouse stock. Any exception is caught
    and reported as a single printed line.
    """
    separator = "-" * 50
    product_path = "$.catalog.categories[*].products[*]"

    try:
        # Test 1: JSON catalog
        print("Testing JSON Processing")
        print(separator)

        catalog = HierarchicalDataProcessor("rag_sample_data/product_catalog.json")
        _, catalog_meta = catalog.load()

        print("JSON Structure Analysis:")
        print(json.dumps(catalog_meta['structure'], indent=2))

        # List every product found by the JSONPath query
        print("\nProduct Information:")
        for item in catalog.query(product_path):
            print(f"- {item['name']}: ${item['price']}")

        # Same products, flattened into a DataFrame
        product_table = catalog.to_tabular(product_path, ['id', 'name', 'price'])
        print("\nTabular Product Data:")
        print(product_table)

        # Test 2: XML inventory
        print("\nTesting XML Processing")
        print(separator)

        inventory = HierarchicalDataProcessor("rag_sample_data/inventory.xml")
        _, inventory_meta = inventory.load()

        print("XML Structure Analysis:")
        print(json.dumps(inventory_meta['structure'], indent=2))

        sites = inventory.query("$.inventory.warehouse")
        print("\nWarehouse Information:")
        for site in sites:
            print(f"Location: {site['@location']}")
            if 'product' not in site:
                continue
            # A single child element arrives as a dict rather than a list
            entries = site['product']
            entries = entries if isinstance(entries, list) else [entries]
            for entry in entries:
                print(f"- {entry['name']}: {entry['stock']} in stock")

    except Exception as e:
        print(f"Error during testing: {str(e)}")

# Run our tests now; any failure is printed by the function rather than raised
test_hierarchical_processor()

"""## Building a Search Index

For RAG systems, we often need to search through hierarchical data efficiently.
Let's implement a search index that makes our nested data searchable.
"""

class HierarchicalSearchIndex:
    """
    Creates a searchable index for hierarchical data structures.

    The index is an inverted index: every lower-cased, whitespace-separated
    token maps to the documents that contain it and, per document, to the
    key paths of the values in which the token occurs.
    """

    def __init__(self):
        """Initialize an empty search index."""
        # token -> {doc_id -> [path, path, ...]}
        self.index = {}
        # doc_id -> original content exactly as passed to add_document()
        self.documents = {}

    def _extract_text(self, obj: Any, path: List[str] = None) -> List[tuple[str, List[str]]]:
        """
        Recursively extract text content from nested structures.
        Maintains the path to each piece of text for context.

        Dict keys are appended to the path as-is; list positions are appended
        as "[i]". Every non-container value is converted with str().

        Args:
            obj: Object to process
            path: Current path in the hierarchy

        Returns:
            List of (text, path) tuples
        """
        if path is None:
            path = []

        results = []

        if isinstance(obj, dict):
            for key, value in obj.items():
                current_path = path + [key]
                if isinstance(value, (dict, list)):
                    results.extend(self._extract_text(value, current_path))
                else:
                    results.append((str(value), current_path))
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                current_path = path + [f"[{i}]"]
                results.extend(self._extract_text(item, current_path))
        else:
            results.append((str(obj), path))

        return results

    def _remove_postings(self, doc_id: str):
        """Drop every index entry that refers to doc_id, pruning emptied tokens."""
        for token in list(self.index):
            postings = self.index[token]
            if postings.pop(doc_id, None) is not None and not postings:
                del self.index[token]

    def add_document(self, doc_id: str, content: Any):
        """
        Add a document to the search index.

        Adding a document under an ID that is already present replaces the
        earlier one: its old index entries are removed first, so no stale or
        duplicated paths remain.

        Args:
            doc_id: Unique identifier for the document
            content: The hierarchical data structure to index
        """
        if doc_id in self.documents:
            self._remove_postings(doc_id)

        # Store the original document
        self.documents[doc_id] = content

        # Extract and index all text content
        for text, path in self._extract_text(content):
            for token in text.lower().split():
                self.index.setdefault(token, {}).setdefault(doc_id, []).append(path)

    def search(self, query: str, match_all: bool = False) -> Dict[str, List[List[str]]]:
        """
        Search the index for documents matching the query.

        By default a document is returned when it contains ANY of the query
        terms. Pass ``match_all=True`` to keep only documents that contain
        every term.

        Args:
            query: Search terms, separated by whitespace (case-insensitive)
            match_all: When True, require every term to occur in a document

        Returns:
            Dictionary mapping document IDs to lists of matching paths. The
            outer lists are fresh objects, so changing them does not affect
            the index; the individual path lists are shared with the index
            and should be treated as read-only.
        """
        tokens = query.lower().split()
        results = {}

        for token in tokens:
            for doc_id, paths in self.index.get(token, {}).items():
                # Always extend a list owned by `results`. Storing the index's
                # own posting list here and extending it later would silently
                # grow the index on every multi-term query.
                results.setdefault(doc_id, []).extend(paths)

        if match_all:
            results = {
                doc_id: paths
                for doc_id, paths in results.items()
                if all(doc_id in self.index.get(token, {}) for token in tokens)
            }

        return results

"""## Testing the Search Index

Let's test our search capabilities with our sample hierarchical data.
"""

def test_search_index():
    """Index the JSON and XML sample files, then print the hits for a few queries."""
    search_index = HierarchicalSearchIndex()

    # One processor per sample file, keyed by the document ID used in the index
    sources = {
        "product_catalog": HierarchicalDataProcessor("rag_sample_data/product_catalog.json"),
        "inventory": HierarchicalDataProcessor("rag_sample_data/inventory.xml"),
    }

    # Load everything first, then index
    contents = {}
    for doc_id, processor in sources.items():
        contents[doc_id], _ = processor.load()

    for doc_id, content in contents.items():
        search_index.add_document(doc_id, content)

    print("Search Results:")
    print("-" * 50)

    for term in ("watch", "warehouse", "stock", "GPS"):
        print(f"\nSearching for: {term}")
        hits = search_index.search(term)

        for doc_id in hits:
            print(f"\nDocument: {doc_id}")
            print("Matching paths:")
            for path in hits[doc_id]:
                print(f"- {' → '.join(path)}")

# Run the search index test (loads both sample files from rag_sample_data/)
test_search_index()

"""## What's Next?

In Part 5, we'll focus on performance optimization and testing. We'll explore:
1. Efficient processing of large hierarchical datasets
2. Caching strategies for improved performance
3. Comprehensive testing approaches
4. Integration with RAG system components"""

In [None]:
# -*- coding: utf-8 -*-
"""
# Structured Data in RAG Systems - Part 5: Performance Optimization and Testing

This notebook demonstrates practical approaches to optimizing and testing structured
data processing in RAG systems. We'll focus on real-world performance improvements
and reliable testing strategies.

Note: Make sure you've run the previous parts to have the sample data available.
"""

# Install required packages
!pip install pytest pandas numpy memory_profiler

import hashlib
import json
import logging
import os
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional

import numpy as np
import pandas as pd

# Set up logging with a clear format
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

"""## Section 1: Performance Monitoring

First, let's create a simple but effective performance monitoring system. This will
help us measure the impact of our optimizations.
"""

class PerformanceMonitor:
    """
    Tracks the execution time of named operations.

    Durations are accumulated per operation name so that repeated runs can
    be summarized with get_statistics(). Only elapsed time is measured;
    memory usage is not tracked.
    """

    def __init__(self):
        """Initialize the performance monitor."""
        # operation name -> list of recorded durations in seconds
        self.metrics = {}
        # operation name -> perf_counter reading of the run currently in flight
        self.start_times = {}

    def start_operation(self, operation_name: str):
        """
        Start timing an operation.

        Starting an operation that is already running restarts its timer.

        Args:
            operation_name: Name of the operation to track
        """
        # perf_counter is monotonic, so system clock adjustments cannot
        # produce negative or inflated durations
        self.start_times[operation_name] = time.perf_counter()

    def end_operation(self, operation_name: str) -> float:
        """
        End timing an operation and record its duration.

        Each start_operation() call can be ended once; the start time is
        consumed so that a stray second call cannot record a bogus duration.

        Args:
            operation_name: Name of the operation to record

        Returns:
            Duration of the operation in seconds

        Raises:
            ValueError: If the operation is not currently running
        """
        try:
            started = self.start_times.pop(operation_name)
        except KeyError:
            raise ValueError(f"Operation {operation_name} was not started") from None

        duration = time.perf_counter() - started
        self.metrics.setdefault(operation_name, []).append(duration)
        return duration

    def get_statistics(self, operation_name: str) -> Dict:
        """
        Get performance statistics for an operation.

        Args:
            operation_name: Name of the operation to analyze

        Returns:
            Dictionary with 'count', 'mean_duration', 'min_duration' and
            'max_duration' (durations in seconds, as plain Python floats)

        Raises:
            ValueError: If no duration has been recorded for the operation
        """
        if operation_name not in self.metrics:
            raise ValueError(f"No metrics found for {operation_name}")

        durations = self.metrics[operation_name]
        return {
            'count': len(durations),
            # float() keeps the result a plain Python float rather than np.float64
            'mean_duration': float(np.mean(durations)),
            'min_duration': min(durations),
            'max_duration': max(durations)
        }

"""Let's test our performance monitor with a simple operation:"""

def test_performance_monitor():
    """Time a one-second sleep with PerformanceMonitor and print what it recorded."""
    perf = PerformanceMonitor()
    op_name = "test_op"

    # Bracket the simulated workload with start/end calls
    perf.start_operation(op_name)
    time.sleep(1)  # Simulate work
    elapsed = perf.end_operation(op_name)

    print(f"Operation duration: {elapsed:.2f} seconds")
    print("\nOperation statistics:")
    summary = perf.get_statistics(op_name)
    print(json.dumps(summary, indent=2))

# Run the test (takes about a second because of the simulated work)
test_performance_monitor()

"""## Section 2: Optimized File Processing

Now let's create an optimized file processor that includes caching and
chunked processing capabilities.
"""

class OptimizedFileProcessor:
    """
    Processes CSV files using chunked reading and an on-disk result cache.

    The parsed DataFrame is pickled under ``cache_dir`` in a file named after
    the SHA-256 of the source file's bytes. An unchanged file is therefore
    parsed only once, while any edit to the file produces a new cache entry.
    """

    def __init__(self, file_path: str, chunk_size: int = 10000,
                 cache_dir: str = 'file_cache', use_cache: bool = True):
        """
        Initialize the optimized file processor.

        Args:
            file_path: Path to the file to process
            chunk_size: Number of rows per chunk when reading the file
            cache_dir: Directory holding the cached results; created
                (including parents) when caching is enabled
            use_cache: Set to False to always re-read the file and to write
                nothing to the cache, e.g. when benchmarking the parser
        """
        self.file_path = Path(file_path)
        self.chunk_size = chunk_size
        self.use_cache = use_cache
        self.monitor = PerformanceMonitor()

        self.cache_dir = Path(cache_dir)
        if self.use_cache:
            self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _calculate_file_hash(self) -> str:
        """
        Calculate a hash of the file content for cache validation.
        Uses chunked reading to handle large files efficiently.

        Returns:
            SHA-256 hash of the file content, as a hex string
        """
        hasher = hashlib.sha256()
        with open(self.file_path, 'rb') as f:
            while chunk := f.read(8192):  # Read in 8KB chunks
                hasher.update(chunk)
        return hasher.hexdigest()

    def _get_cache_path(self, file_hash: str) -> Path:
        """Get the cache file path for a given hash."""
        return self.cache_dir / f"{file_hash}.cache"

    def _load_cached(self, cache_path: Path) -> Optional[pd.DataFrame]:
        """Return the cached DataFrame, or None if it is missing or unreadable."""
        if not cache_path.exists():
            return None
        try:
            # SECURITY: unpickling can execute arbitrary code. This is only
            # acceptable because the cache directory is written by this class;
            # do not point cache_dir at a location other users can write to.
            cached = pd.read_pickle(cache_path)
        except Exception as e:
            # A truncated or corrupt entry must not block processing forever;
            # fall through to a normal read, which overwrites the bad entry.
            logger.warning(f"Ignoring unreadable cache file {cache_path}: {str(e)}")
            return None
        logger.info("Using cached result")
        return cached

    def process_file(self) -> pd.DataFrame:
        """
        Process a file with caching and performance monitoring.

        The elapsed time of every successful call is recorded on
        ``self.monitor`` under the name 'total_processing'.

        Returns:
            Processed DataFrame

        Raises:
            Exception: Any error raised while hashing, reading or caching
                the file is logged and re-raised unchanged
        """
        self.monitor.start_operation('total_processing')

        try:
            cache_path = None
            if self.use_cache:
                cache_path = self._get_cache_path(self._calculate_file_hash())
                cached = self._load_cached(cache_path)
                if cached is not None:
                    self.monitor.end_operation('total_processing')
                    return cached
                logger.info("Cache miss - processing file")

            # Process in chunks
            chunks = []
            for chunk_num, chunk in enumerate(pd.read_csv(self.file_path, chunksize=self.chunk_size)):
                logger.info(f"Processing chunk {chunk_num + 1}")
                chunks.append(chunk)

            # Combine chunks
            df = pd.concat(chunks, ignore_index=True)

            # Cache the result
            if cache_path is not None:
                df.to_pickle(cache_path)

            duration = self.monitor.end_operation('total_processing')
            logger.info(f"Processing completed in {duration:.2f} seconds")

            return df

        except Exception as e:
            logger.error(f"Error processing file: {str(e)}")
            raise

"""Let's test our optimized file processor with a sample CSV:"""

def test_optimized_processor():
    """
    Test the optimized file processor with sample data.

    Writes a small random CSV, processes it twice (the second run should be
    served from the cache), checks that both runs return the same data, and
    removes both the CSV and the cache entry it produced.
    """
    # Create a test CSV file
    test_data = pd.DataFrame({
        'id': range(100),
        'value': np.random.randn(100)
    })

    test_file = Path('test_data.csv')
    test_data.to_csv(test_file, index=False)

    cache_path = None
    try:
        processor = OptimizedFileProcessor(test_file)

        # The data is random, so every run of this test yields a new hash and
        # therefore a new cache file; remember it so it can be deleted below.
        cache_path = processor._get_cache_path(processor._calculate_file_hash())

        # Process the file twice to test caching
        print("First run (no cache):")
        result1 = processor.process_file()
        print(f"Result shape: {result1.shape}")

        print("\nSecond run (with cache):")
        result2 = processor.process_file()
        print(f"Result shape: {result2.shape}")

        # The cache must hand back exactly what a fresh parse produced
        assert result1.equals(result2), "cached result differs from the freshly parsed result"

    finally:
        # Clean up the test file and the cache entry it generated
        test_file.unlink(missing_ok=True)
        if cache_path is not None:
            cache_path.unlink(missing_ok=True)

# Run the test (writes and then removes test_data.csv in the working directory)
test_optimized_processor()

"""## Section 3: Performance Testing Framework

Let's create a framework for testing performance under different conditions.
"""

class PerformanceTester:
    """
    Runs performance tests on data processing operations.
    Helps compare different optimization strategies.
    """

    def __init__(self):
        """Initialize the performance tester."""
        # test name -> summary dict produced by run_test()
        self.results = {}

    def run_test(self, name: str, func: Callable[[], Any], iterations: int = 3):
        """
        Run a performance test multiple times.

        The summary (mean/min/max duration in seconds plus the iteration
        count) is stored in ``self.results[name]``, replacing any earlier
        result of the same name.

        Args:
            name: Name of the test
            func: Zero-argument callable to time; its return value is ignored
            iterations: Number of times to run the test (at least 1)

        Raises:
            ValueError: If iterations is less than 1
            Exception: Any error raised by func is logged and re-raised; no
                result is stored in that case
        """
        if iterations < 1:
            raise ValueError(f"iterations must be at least 1, got {iterations}")

        durations = []

        for i in range(iterations):
            logger.info(f"Running {name} - iteration {i + 1}")
            # perf_counter is monotonic and has the highest available resolution
            start_time = time.perf_counter()

            try:
                func()
            except Exception as e:
                logger.error(f"Error in test {name}: {str(e)}")
                raise

            durations.append(time.perf_counter() - start_time)

        self.results[name] = {
            'mean_duration': float(np.mean(durations)),
            'min_duration': min(durations),
            'max_duration': max(durations),
            'iterations': iterations
        }

    def compare_results(self):
        """Print a comparison of test results."""
        print("\nPerformance Test Results:")
        print("-" * 50)

        for name, metrics in self.results.items():
            print(f"\n{name}:")
            print(f"  Mean duration: {metrics['mean_duration']:.2f} seconds")
            print(f"  Min duration: {metrics['min_duration']:.2f} seconds")
            print(f"  Max duration: {metrics['max_duration']:.2f} seconds")

"""Let's use our performance testing framework to compare different processing approaches:"""

def test_performance_comparison():
    """
    Compare performance of different processing strategies.

    Times OptimizedFileProcessor.process_file for several chunk sizes. The
    cache entry is deleted before every timed run; otherwise only the very
    first run would parse the CSV and all others would merely load the
    pickle, making the chunk sizes indistinguishable. Each measurement
    therefore covers hashing, chunked parsing and writing the cache entry.
    """
    # Create test data
    test_data = pd.DataFrame({
        'id': range(1000),
        'value': np.random.randn(1000)
    })

    test_file = Path('perf_test.csv')
    test_data.to_csv(test_file, index=False)

    cache_path = None
    try:
        tester = PerformanceTester()

        # Test different chunk sizes
        for chunk_size in [100, 500, 1000]:
            processor = OptimizedFileProcessor(test_file, chunk_size=chunk_size)
            # The file does not change, so the cache location is the same for
            # every chunk size; compute it once per processor, outside the timing
            cache_path = processor._get_cache_path(processor._calculate_file_hash())

            # Default arguments bind this iteration's objects to the closure
            def process_uncached(processor=processor, cache_path=cache_path):
                cache_path.unlink(missing_ok=True)
                return processor.process_file()

            tester.run_test(
                f"Chunk size {chunk_size}",
                process_uncached
            )

        # Compare results
        tester.compare_results()

    finally:
        # Clean up the test file and the cache entry left by the last run
        test_file.unlink(missing_ok=True)
        if cache_path is not None:
            cache_path.unlink(missing_ok=True)

# Run the performance comparison (writes and then removes perf_test.csv in the working directory)
test_performance_comparison()

"""## Key Insights

Through our optimization and testing work, we've learned several important lessons:

1. Caching significantly improves performance for unchanged files
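2. Chunked processing helps manage memory usage with large files, provided each
   chunk is filtered or aggregated as it is read (our demo concatenates every
   chunk, so its peak memory is not reduced)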
3. Performance monitoring helps identify optimization opportunities
4. Regular testing ensures optimizations don't introduce bugs

These techniques can be applied to improve the performance of any RAG system
that processes structured data."""