In [None]:
# Install required packages
!pip install pandas numpy chardet xmltodict

# Import necessary libraries
import os
import json
import pandas as pd
import numpy as np
import chardet
import xmltodict
from typing import List, Dict, Any
from pathlib import Path
from datetime import datetime

print("Structured data processing environment setup complete!")

**CSV Files: The Workhorses of Data Analysis**

In [3]:
class EnhancedCSVLoader:
    """
    A comprehensive CSV loader for RAG systems.
    Handles encoding detection, delimiter inference, and data validation.

    Typical use is ``df, metadata = EnhancedCSVLoader(path).load()``.
    After a successful load, ``encoding``, ``delimiter`` and ``metadata``
    hold what was detected/collected for the file.
    """

    # Candidate separators; on a tie the earliest entry wins.
    _CANDIDATE_DELIMITERS = (',', ';', '\t', '|')
    # Number of leading bytes inspected for encoding/delimiter detection.
    _SAMPLE_SIZE = 10000
    # Columns with more than this percentage of missing cells get a warning.
    _MISSING_WARN_PCT = 20

    def __init__(self, file_path: str):
        self.file_path = file_path
        self.metadata = {}
        self.encoding = None
        self.delimiter = None

    def detect_file_properties(self) -> dict:
        """
        Automatically detect CSV file properties including
        encoding and delimiter.

        Only the first ``_SAMPLE_SIZE`` bytes are inspected. Sets
        ``self.encoding`` and ``self.delimiter`` and returns a dict with the
        keys ``encoding``, ``delimiter`` and ``confidence`` (chardet's
        confidence for its guess).
        """
        with open(self.file_path, 'rb') as file:
            raw_data = file.read(self._SAMPLE_SIZE)

        result = chardet.detect(raw_data)
        detected = result['encoding']
        # chardet yields None when it cannot decide (e.g. an empty file),
        # which would crash the decode below. A pure-ASCII sample says
        # nothing about the rest of the file, and UTF-8 decodes ASCII
        # identically, so UTF-8 is the safe choice in both cases.
        if detected is None or detected.lower() == 'ascii':
            detected = 'utf-8'
        self.encoding = detected

        # The sample may end in the middle of a multi-byte character, so a
        # strict decode could fail even though the encoding is right.
        sample_text = raw_data.decode(self.encoding, errors='replace')
        delimiter_counts = {
            d: sample_text.count(d) for d in self._CANDIDATE_DELIMITERS
        }
        self.delimiter = max(delimiter_counts, key=delimiter_counts.get)

        return {
            'encoding': self.encoding,
            'delimiter': self.delimiter,
            'confidence': result['confidence']
        }

    def validate_data(self, df: pd.DataFrame) -> dict:
        """
        Perform validation checks on the loaded data.
        Returns a dictionary of validation results.

        Keys: ``total_rows``, ``total_columns``, ``missing_values`` (per
        column), ``column_types`` (per column, as strings),
        ``duplicate_rows`` and ``warnings`` (list of human-readable strings).
        Counts are plain Python ints so the result can be passed to
        ``json.dumps`` directly.
        """
        total_rows = len(df)
        missing_counts = df.isnull().sum()

        validation = {
            'total_rows': total_rows,
            'total_columns': len(df.columns),
            'missing_values': {
                column: int(count) for column, count in missing_counts.items()
            },
            'column_types': df.dtypes.astype(str).to_dict(),
            # .sum() returns a numpy integer, which json cannot serialize.
            'duplicate_rows': int(df.duplicated().sum()),
            'warnings': []
        }
        warnings = validation['warnings']

        # Check for high percentage of missing values (skipped for an empty
        # frame, where the percentage is undefined).
        if total_rows:
            for column, count in missing_counts.items():
                pct = count / total_rows * 100
                if pct > self._MISSING_WARN_PCT:
                    warnings.append(
                        f"Column '{column}' has {pct:.1f}% missing values"
                    )

        # Check text/object columns for numeric content.
        for column in df.columns:
            series = df[column]
            dtype = series.dtype
            if not (pd.api.types.is_object_dtype(dtype)
                    or pd.api.types.is_string_dtype(dtype)):
                continue

            present = series.dropna()
            if present.empty:
                continue

            try:
                # Unparseable values become NaN instead of raising, so we can
                # count how many cells are numeric.
                as_numbers = pd.to_numeric(present, errors='coerce')
            except (TypeError, ValueError):
                # Values pandas cannot even attempt to convert (e.g. nested
                # containers); nothing to report for this column.
                continue

            numeric_cells = int(as_numbers.notna().sum())
            if numeric_cells == len(present):
                warnings.append(
                    f"Column '{column}' is stored as a non-numeric type "
                    f"but contains only numeric values"
                )
            elif numeric_cells > 0:
                warnings.append(
                    f"Column '{column}' contains mixed numeric and non-numeric values"
                )

        return validation

    def load(self) -> tuple[pd.DataFrame, dict]:
        """
        Load and process the CSV file.
        Returns both the data and metadata about the file and its contents.

        On any failure the error is printed and ``(None, None)`` is returned
        instead of raising, so callers must check for ``None``.
        """
        try:
            # Detect file properties
            properties = self.detect_file_properties()

            # Read the CSV file; malformed rows are reported as warnings by
            # pandas rather than aborting the whole load.
            df = pd.read_csv(
                self.file_path,
                encoding=self.encoding,
                delimiter=self.delimiter,
                on_bad_lines='warn'
            )

            # Validate the data
            validation_results = self.validate_data(df)

            # Collect metadata
            self.metadata = {
                'file_properties': properties,
                'validation': validation_results,
                'file_size': os.path.getsize(self.file_path),
                'last_modified': os.path.getmtime(self.file_path)
            }

            return df, self.metadata

        except Exception as e:
            # Deliberately broad: this is the loader's best-effort boundary
            # and callers rely on the (None, None) result.
            print(f"Error loading CSV: {str(e)}")
            return None, None

In [None]:
def test_csv_loader():
    """Test the CSV loader with sample data.

    Writes a small CSV (including one missing value and one float salary)
    to ``rag_sample_data/sample.csv``, loads it with EnhancedCSVLoader and
    prints a preview of the data plus the collected metadata.

    Raises:
        RuntimeError: if the loader reports a failure by returning
            ``(None, None)``.
    """

    # Create a test CSV file with various data scenarios
    sample_data = """
name,age,city,salary
John Doe,30,New York,75000
Jane Smith,25,Los Angeles,82000
Bob Johnson,,Chicago,68000
Alice Brown,35,Houston,91000.5
"""

    # Ensure we have a directory for our samples
    os.makedirs('rag_sample_data', exist_ok=True)

    # Write the sample data to a file (explicit encoding so the bytes on
    # disk do not depend on the platform default)
    sample_path = 'rag_sample_data/sample.csv'
    with open(sample_path, 'w', encoding='utf-8') as f:
        f.write(sample_data.strip())

    # Test our loader with the sample file
    loader = EnhancedCSVLoader(sample_path)
    df, metadata = loader.load()

    # load() signals failure with (None, None); fail with a clear message
    # instead of an opaque "NoneType is not subscriptable".
    if df is None or metadata is None:
        raise RuntimeError(f"EnhancedCSVLoader failed to load {sample_path}")

    # Convert numpy types to Python native types for JSON serialization
    validation = metadata['validation']
    for key in ('total_rows', 'total_columns', 'duplicate_rows'):
        validation[key] = int(validation[key])

    # Display the results
    print("Loaded Data Preview:")
    print(df.head())

    print("\nMetadata and Validation Results:")
    print(json.dumps(metadata, indent=2))

# Make sure the shared sample directory exists, then run the CSV loader demo.
# exist_ok=True makes the call a no-op when the directory is already there
# (test_csv_loader() creates it as well).
os.makedirs('rag_sample_data', exist_ok=True)
test_csv_loader()

**Directory Structures: Managing Document Collections**

In [5]:
class DocumentCollectionManager:
    """
    Manages collections of documents organized in directories.
    Supports various file types and maintains a searchable index.

    ``file_index`` maps each file's path (root-joined, as produced by the
    directory walk) to a dict with ``extension``, ``size``, ``modified``
    and ``relative_path``. ``metadata`` holds the summary returned by the
    most recent ``scan_directory()`` call.
    """
    def __init__(self, root_dir: str):
        self.root_dir = root_dir
        self.file_index = {}
        self.metadata = {}

    def _count_file_types(self) -> dict:
        """Count the number of files of each type (keyed by extension)."""
        type_counts = {}
        for file_info in self.file_index.values():
            ext = file_info['extension']
            type_counts[ext] = type_counts.get(ext, 0) + 1
        return type_counts

    def _calculate_max_depth(self) -> int:
        """Calculate the maximum directory depth.

        Depth is the number of path components relative to ``root_dir``,
        file name included: a file directly in the root has depth 1.
        Returns 0 when the index is empty.
        """
        return max(
            (
                len(os.path.relpath(file_path, self.root_dir).split(os.sep))
                for file_path in self.file_index
            ),
            default=0,
        )

    def scan_directory(self) -> dict:
        """
        Scan the directory structure and catalog all documents.
        Returns information about the document collection.

        The index is rebuilt from scratch on every call, so files removed
        since the previous scan no longer appear. Files that cannot be
        stat'ed are skipped.
        """
        fresh_index = {}

        for root, _dirs, files in os.walk(self.root_dir):
            for file in files:
                file_path = os.path.join(root, file)
                file_ext = os.path.splitext(file)[1].lower()

                # Get file metadata
                try:
                    stats = os.stat(file_path)
                except OSError:
                    # Broken symlink, permission problem, or the file
                    # disappeared mid-scan: skip it rather than abort.
                    continue

                # Store file information
                fresh_index[file_path] = {
                    'extension': file_ext,
                    'size': stats.st_size,
                    'modified': datetime.fromtimestamp(stats.st_mtime).isoformat(),
                    'relative_path': os.path.relpath(file_path, self.root_dir)
                }

        # Swap in the new index only after the walk, dropping stale entries.
        self.file_index = fresh_index

        # Collect collection statistics
        self.metadata = {
            'total_files': len(self.file_index),
            'file_types': self._count_file_types(),
            'total_size': sum(f['size'] for f in self.file_index.values()),
            'directory_depth': self._calculate_max_depth()
        }

        return self.metadata

In [None]:
def test_document_manager():
    """Exercise DocumentCollectionManager on a small generated directory tree.

    Builds ``rag_sample_data/documents`` with a ``texts`` and a ``pdfs``
    subfolder holding one file each, scans it, and prints the collection
    summary followed by every entry of the file index.
    """

    base_dir = 'rag_sample_data/documents'

    # (folder, file name, file content) for each sample document
    fixtures = (
        (f'{base_dir}/texts', 'doc1.txt', "Sample text document"),
        (f'{base_dir}/pdfs', 'doc2.pdf', "Sample PDF content"),
    )

    # Lay out the folders first, then drop the sample files into them
    for folder, _name, _body in fixtures:
        os.makedirs(folder, exist_ok=True)
    for folder, name, body in fixtures:
        with open(f'{folder}/{name}', 'w') as handle:
            handle.write(body)

    # Scan the tree we just created
    manager = DocumentCollectionManager(base_dir)
    summary = manager.scan_directory()

    print("Directory Analysis:")
    print(json.dumps(summary, indent=2))

    print("\nFile Index:")
    for indexed_path, details in manager.file_index.items():
        print(f"\nFile: {indexed_path}")
        print(json.dumps(details, indent=2))

# Run the document manager demo (creates rag_sample_data/documents and
# prints the scan results).
test_document_manager()

**JSON and XML: Handling Hierarchical Data**

In [9]:
import json
import xmltodict
from typing import Union, Any

class HierarchicalDataLoader:
    """
    Handles both JSON and XML files with support for schema validation
    and data transformation.

    ``file_type`` is the lower-cased file extension (including the dot).
    ``metadata`` is filled in by whichever ``_load_*`` method ran last.
    """
    def __init__(self, file_path: str):
        self.file_path = file_path
        self.file_type = os.path.splitext(file_path)[1].lower()
        self.metadata = {}

    def _load_json(self) -> tuple[dict, dict]:
        """Load and parse JSON files with error handling.

        Returns ``(data, metadata)`` where metadata has ``depth``, ``keys``
        and ``size``. Malformed JSON is reported by printing and returning
        ``(None, None)``; I/O errors (e.g. a missing file) propagate.
        """
        try:
            with open(self.file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Extract metadata about the JSON structure
            self.metadata = {
                'depth': self._calculate_json_depth(data),
                'keys': self._extract_json_keys(data),
                'size': os.path.getsize(self.file_path)
            }

            return data, self.metadata

        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {str(e)}")
            return None, None

    def _calculate_json_depth(self, obj: Any, current_depth: int = 1) -> int:
        """Calculate the maximum depth of a JSON object.

        A scalar (or empty container) counts as one level; every dict or
        list adds a level for its contents, so ``{"a": 1}`` has depth 2.
        """
        if isinstance(obj, dict):
            children = obj.values()
        elif isinstance(obj, list):
            children = obj
        else:
            return current_depth

        return max(
            (self._calculate_json_depth(child, current_depth + 1)
             for child in children),
            default=current_depth,
        )

    def _calculate_xml_depth(self, obj: Any, current_depth: int = 0) -> int:
        """Calculate the maximum element nesting depth of parsed XML.

        ``obj`` is the structure produced by ``xmltodict.parse``. The root
        element has depth 1 and each nested child element adds one. Lists
        (repeated sibling elements) do not add a level. Keys starting with
        ``@`` and the ``#text`` key are skipped: with xmltodict's default
        settings these hold attributes and text content, not child elements.
        """
        if isinstance(obj, list):
            return max(
                (self._calculate_xml_depth(item, current_depth) for item in obj),
                default=current_depth,
            )
        if isinstance(obj, dict):
            return max(
                (
                    self._calculate_xml_depth(value, current_depth + 1)
                    for key, value in obj.items()
                    if not key.startswith('@') and key != '#text'
                ),
                default=current_depth,
            )
        return current_depth

    def _load_xml(self) -> tuple[dict, dict]:
        """Load and parse XML files with error handling.

        Returns ``(data, metadata)`` where metadata has ``root_tag``,
        ``size`` and ``depth``. Any failure is reported by printing and
        returning ``(None, None)``.
        """
        try:
            with open(self.file_path, 'r', encoding='utf-8') as f:
                data = xmltodict.parse(f.read())

            # Extract metadata about the XML structure
            self.metadata = {
                # The parsed document is a dict whose first key is the root.
                'root_tag': next(iter(data)),
                'size': os.path.getsize(self.file_path),
                'depth': self._calculate_xml_depth(data)
            }

            return data, self.metadata

        except Exception as e:
            # Deliberately broad: best-effort boundary for the third-party
            # parser, which may raise several unrelated exception types.
            print(f"Error parsing XML: {str(e)}")
            return None, None

    def _extract_json_keys(self, obj: Any, prefix: str = '') -> list:
        """Extract all keys from a JSON object with their full paths.

        Dict keys are joined with ``.`` and list positions appear as
        ``[i]``, e.g. ``users[0].address.city``. List positions themselves
        are not emitted as keys; only the dict keys found beneath them are.
        """
        keys = []
        if isinstance(obj, dict):
            for key, value in obj.items():
                full_key = f"{prefix}.{key}" if prefix else key
                keys.append(full_key)
                if isinstance(value, (dict, list)):
                    keys.extend(self._extract_json_keys(value, full_key))
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                if isinstance(item, (dict, list)):
                    keys.extend(self._extract_json_keys(item, f"{prefix}[{i}]"))
        return keys

In [None]:
def test_hierarchical_loader():
    """Test the hierarchical data loader.

    Writes one JSON and one XML sample (the same user record in both
    formats) into ``rag_sample_data/``, loads each with
    HierarchicalDataLoader and prints the parsed content and metadata.
    """

    # Create the output directory here so this test does not depend on an
    # earlier cell having been run first.
    os.makedirs('rag_sample_data', exist_ok=True)

    # Create sample JSON file
    json_data = {
        "users": [
            {
                "id": 1,
                "name": "John Doe",
                "address": {
                    "city": "New York",
                    "country": "USA"
                }
            }
        ]
    }

    # The loaders read files as UTF-8, so write them as UTF-8 too.
    with open('rag_sample_data/sample.json', 'w', encoding='utf-8') as f:
        json.dump(json_data, f)

    # Create sample XML file
    xml_data = """<?xml version="1.0" encoding="UTF-8"?>
    <root>
        <users>
            <user>
                <id>1</id>
                <name>John Doe</name>
                <address>
                    <city>New York</city>
                    <country>USA</country>
                </address>
            </user>
        </users>
    </root>
    """

    with open('rag_sample_data/sample.xml', 'w', encoding='utf-8') as f:
        f.write(xml_data)

    # Test both formats
    json_loader = HierarchicalDataLoader('rag_sample_data/sample.json')
    json_content, json_metadata = json_loader._load_json()

    print("JSON Processing Results:")
    print("\nContent:")
    print(json.dumps(json_content, indent=2))
    print("\nMetadata:")
    print(json.dumps(json_metadata, indent=2))

    xml_loader = HierarchicalDataLoader('rag_sample_data/sample.xml')
    xml_content, xml_metadata = xml_loader._load_xml()

    # A failed load yields None for both values, which prints as "null".
    print("\nXML Processing Results:")
    print("\nContent:")
    print(json.dumps(xml_content, indent=2))
    print("\nMetadata:")
    print(json.dumps(xml_metadata, indent=2))

# Run the JSON/XML loader demo (writes sample.json and sample.xml under
# rag_sample_data/ and prints what was parsed).
test_hierarchical_loader()

**Bringing It All Together**

In [11]:
class UnifiedDataProcessor:
    """
    Single entry point for turning a directory of CSV, JSON and XML files
    into an in-memory knowledge base.

    ``collection_manager`` indexes the files under ``base_dir``;
    ``data_cache`` maps each successfully processed file path to a dict
    with its ``content`` and ``metadata``.
    """
    def __init__(self, base_dir: str):
        self.base_dir = base_dir
        self.collection_manager = DocumentCollectionManager(base_dir)
        self.data_cache = {}

    def process_document(self, file_path: str) -> tuple[Any, dict]:
        """
        Load one document with the loader that matches its extension.

        Returns whatever the chosen loader returns (a content/metadata
        pair). Raises ValueError for any extension other than ``.csv``,
        ``.json`` or ``.xml`` (compared case-insensitively).
        """
        _stem, raw_ext = os.path.splitext(file_path)
        extension = raw_ext.lower()

        if extension == '.csv':
            return EnhancedCSVLoader(file_path).load()
        if extension == '.json':
            return HierarchicalDataLoader(file_path)._load_json()
        if extension == '.xml':
            return HierarchicalDataLoader(file_path)._load_xml()

        raise ValueError(f"Unsupported file type: {extension}")

    def build_knowledge_base(self) -> dict:
        """
        Index the base directory, then process every indexed file.

        Files whose processing raises are reported with a printed message
        and skipped; files whose loader returns no content are skipped
        silently. Returns the collection summary together with the number
        of cached documents and the number of indexed files.
        """
        # Index everything under the base directory
        summary = self.collection_manager.scan_directory()
        indexed_files = self.collection_manager.file_index

        # Load each indexed file, keeping only those that yield content
        for file_path in indexed_files:
            try:
                content, metadata = self.process_document(file_path)
            except Exception as e:
                print(f"Error processing {file_path}: {str(e)}")
                continue

            if content is None:
                continue
            self.data_cache[file_path] = {
                'content': content,
                'metadata': metadata
            }

        return {
            'collection_metadata': summary,
            'processed_files': len(self.data_cache),
            'total_files': len(indexed_files)
        }

In [None]:
import numpy as np
from json import JSONEncoder

class CustomJSONEncoder(JSONEncoder):
    """Custom JSON encoder to handle numpy and other special types.

    Converts numpy booleans, integers, floats and arrays to their native
    Python equivalents. Anything else is passed to the base class, which
    raises TypeError for unsupported objects.
    """
    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``."""
        # np.bool_ is not an np.integer subclass, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

def test_unified_processor():
    """Run UnifiedDataProcessor over a folder holding one CSV, one JSON
    and one XML file, then print the overall summary and the metadata
    recorded for each processed document."""

    test_dir = 'rag_sample_data/unified_test'
    os.makedirs(test_dir, exist_ok=True)

    # CSV sample
    with open(f'{test_dir}/data.csv', 'w') as csv_file:
        csv_file.write("name,age\nJohn,30\nJane,25")

    # JSON sample
    settings = {"setting1": "value1", "setting2": 42}
    with open(f'{test_dir}/config.json', 'w') as json_file:
        json.dump(settings, json_file)

    # XML sample
    with open(f'{test_dir}/data.xml', 'w') as xml_file:
        xml_file.write('<root><item>Test</item></root>')

    # Build the knowledge base from the folder we just populated
    processor = UnifiedDataProcessor(test_dir)
    summary = processor.build_knowledge_base()

    print("Unified Processing Results:")
    print(json.dumps(summary, indent=2, cls=CustomJSONEncoder))

    print("\nProcessed Documents:")
    for doc_path, entry in processor.data_cache.items():
        print(f"\nFile: {doc_path}")
        rendered = json.dumps(entry['metadata'], indent=2, cls=CustomJSONEncoder)
        print("Metadata:", rendered)

# Run the unified processor demo (creates rag_sample_data/unified_test and
# prints the processing summary plus per-file metadata).
test_unified_processor()

**Best Practices and Common Challenges**

In [15]:
def validate_structured_data(data: Any, format_type: str) -> dict:
    """
    Comprehensive validation for structured data.
    Returns validation results and recommendations.

    Args:
        data: For ``'csv'`` a pandas DataFrame; for ``'json'``/``'xml'``
            the parsed structure handed to ``check_hierarchical_structure``.
        format_type: One of ``'csv'``, ``'json'`` or ``'xml'``.

    Returns:
        A dict with the list-valued keys ``passed_checks``, ``warnings``,
        ``errors`` and ``recommendations``. An unrecognized ``format_type``
        is reported in ``errors`` rather than silently passing.
    """
    validation_results = {
        'passed_checks': [],
        'warnings': [],
        'errors': [],
        'recommendations': []
    }

    if format_type == 'csv':
        # Check for missing values
        if data.isnull().any().any():
            validation_results['warnings'].append(
                'Missing values detected. Consider imputation strategy.'
            )

        # Check for data type consistency
        for column in data.columns:
            # Missing cells are dropped first: NaN/None would otherwise
            # count as an extra type and flag every column with a gap.
            # Missing data is already reported by the check above.
            unique_types = data[column].dropna().apply(type).unique()
            if len(unique_types) > 1:
                validation_results['warnings'].append(
                    f'Mixed data types in column {column}'
                )

    elif format_type in ('json', 'xml'):
        # Check for structural consistency
        # NOTE(review): check_hierarchical_structure is not defined in the
        # visible part of this file; it must be provided elsewhere and is
        # expected to return a dict that can be merged into the results.
        validation_results.update(
            check_hierarchical_structure(data)
        )

    else:
        validation_results['errors'].append(
            f'Unsupported format type: {format_type}'
        )

    return validation_results

In [16]:
class StreamingDataProcessor:
    """
    Process large structured data files in chunks to manage memory usage.

    ``chunk_size`` is the number of rows pandas reads per chunk.
    """
    def __init__(self, chunk_size: int = 1000):
        self.chunk_size = chunk_size

    def process_large_csv(self, file_path: str, processor_func):
        """Process a large CSV file in chunks.

        This is a generator: each chunk (a DataFrame of at most
        ``chunk_size`` rows) is passed to ``processor_func`` and the
        result is yielded. Nothing is read until iteration starts.

        The chunked reader is used as a context manager so the underlying
        file is closed even when the consumer stops iterating early or
        ``processor_func`` raises.
        """
        with pd.read_csv(file_path, chunksize=self.chunk_size) as reader:
            for chunk in reader:
                yield processor_func(chunk)

In [17]:
class StructuredDataLogger:
    """Append-only error log for structured data processing.

    ``error_counts`` tallies how many errors of each type have been
    logged through this instance.
    """
    def __init__(self, log_path: str):
        self.log_path = log_path
        self.error_counts = {}

    def log_processing_error(self, error_type: str, details: str):
        """Append one timestamped line for the error to the log file and
        bump the in-memory tally for ``error_type``."""
        entry = f"{pd.Timestamp.now()} - {error_type}: {details}\n"
        with open(self.log_path, 'a') as log_file:
            log_file.write(entry)

        # Tally only after the line has been written successfully
        seen_so_far = self.error_counts.get(error_type, 0)
        self.error_counts[error_type] = seen_so_far + 1