In [1]:
# -*- coding: utf-8 -*-
"""
# Structured Data in RAG Systems - Part 2: CSV Processing (Continued)

Continuing our CSV processing implementation with validation, error handling,
and practical examples using our sample files.
"""

class EnhancedCSVLoader:
    def validate_data(self, df: pd.DataFrame) -> dict:
        """
        Perform comprehensive validation checks on the loaded data.
        This helps identify potential issues before processing.

        Args:
            df: Pandas DataFrame containing the loaded CSV data

        Returns:
            Dictionary containing validation results and statistics
        """
        validation = {
            'total_rows': len(df),
            'total_columns': len(df.columns),
            'missing_values': df.isnull().sum().to_dict(),
            'column_types': df.dtypes.astype(str).to_dict(),
            'duplicate_rows': df.duplicated().sum(),
            'column_statistics': {}
        }

        # Calculate statistics for numeric columns
        numeric_columns = df.select_dtypes(include=[np.number]).columns
        for col in numeric_columns:
            validation['column_statistics'][col] = {
                'mean': df[col].mean(),
                'std': df[col].std(),
                'min': df[col].min(),
                'max': df[col].max()
            }

        # Check for potential data quality issues
        validation['quality_checks'] = {
            'empty_columns': [col for col in df.columns if df[col].isnull().all()],
            'high_missing_ratio': [
                col for col in df.columns
                if df[col].isnull().sum() / len(df) > 0.5
            ],
            'constant_columns': [
                col for col in df.columns
                if df[col].nunique() == 1
            ]
        }

        return validation

    def load(self) -> tuple[pd.DataFrame, dict]:
        """
        Load and process the CSV file, returning both the data and metadata.
        Implements comprehensive error handling and data validation.

        Returns:
            Tuple containing:
            - Pandas DataFrame with the loaded data
            - Dictionary with metadata and validation results
        """
        try:
            # Detect file properties
            properties = self.detect_file_properties()
            logger.info(f"Detected file properties: {properties}")

            # Read CSV with detected properties
            df = pd.read_csv(
                self.file_path,
                encoding=self.encoding,
                delimiter=self.delimiter,
                on_bad_lines='warn'
            )

            # Validate the data
            validation_results = self.validate_data(df)
            logger.info("Data validation completed")

            # Collect metadata
            self.metadata = {
                'file_properties': properties,
                'validation': validation_results,
                'file_size': os.path.getsize(self.file_path),
                'last_modified': datetime.fromtimestamp(
                    os.path.getmtime(self.file_path)
                ).isoformat(),
                'columns': list(df.columns)
            }

            return df, self.metadata

        except Exception as e:
            logger.error(f"Error loading CSV: {str(e)}")
            raise

NameError: name 'pd' is not defined