# My ETL Pipeline

In [1]:
import json
import logging
import os
import sys
import warnings
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Any, Callable, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.ensemble import IsolationForest
from sqlalchemy import create_engine, inspect, text
from sqlalchemy.exc import IntegrityError, SQLAlchemyError
# NOTE(review): this silences every warning category for the whole kernel,
# including deprecation warnings raised by pandas/NumPy calls in later cells;
# consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

In [2]:
# SQLAlchemy connection string (inside Docker network).
# The password is read from the ETL_DB_PASSWORD environment variable when it
# is set, so real credentials do not have to be stored in the notebook; the
# literal below is only the fallback used when the variable is absent.
DB_USER = "analytics_user"
DB_PASS = os.getenv("ETL_DB_PASSWORD", "analyticspass123")
DB_HOST = "mysql"          # service name from docker-compose
DB_PORT = "3306"
DB_NAME = "supply_chain_db"

connection_string = f"mysql+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(connection_string)

# Quick test: list tables
with engine.connect() as conn:
    tables = pd.read_sql("SHOW TABLES;", conn)
tables

Unnamed: 0,Tables_in_supply_chain_db
0,inventory
1,orders
2,price_history
3,products
4,sales
5,suppliers
6,warehouses


In [3]:
@dataclass
class ETLConfig:
    """Configuration for ETL pipeline.

    Groups the database connection settings, batching/retry values,
    data-quality thresholds and logging options in a single object.

    The password default is resolved when an instance is created: the
    ``ETL_DB_PASSWORD`` environment variable is used when it is set, otherwise
    the previous literal default applies. The field is excluded from
    ``repr()`` so that displaying a config object does not print the secret.
    An explicitly passed ``db_password`` always takes precedence.
    """
    # --- database connection ---
    db_host: str = "mysql"
    db_port: int = 3306
    db_user: str = "analytics_user"
    db_password: str = field(
        default_factory=lambda: os.getenv("ETL_DB_PASSWORD", "analyticspass123"),
        repr=False,
    )
    db_name: str = "supply_chain_db"
    # --- loading ---
    batch_size: int = 1000  # number of rows written per batch by the loader
    # NOTE(review): max_retries / retry_delay are not read by any code visible
    # in this part of the notebook; units of retry_delay are presumably seconds.
    max_retries: int = 3
    retry_delay: int = 5
    # --- data-quality thresholds, compared against fractions of the row count ---
    null_threshold: float = 0.05
    duplicate_threshold: float = 0.01
    # --- logging ---
    log_level: str = "INFO"
    log_file: str = "etl_pipeline.log"


@dataclass
class DataQualityReport:
    """Outcome of the data-quality checks run against a single table.

    A report starts out as passing; recording any issue through
    ``add_issue`` flips ``validation_passed`` to ``False``.
    """
    table_name: str
    total_rows: int
    # column name -> number of null values found in that column
    null_count: Dict[str, int] = field(default_factory=dict)
    duplicate_count: int = 0
    # foreign-key column name -> number of rows flagged for that column
    missing_foreign_keys: Dict[str, int] = field(default_factory=dict)
    validation_passed: bool = True
    # human-readable descriptions of every recorded problem
    issues: List[str] = field(default_factory=list)
    # set from datetime.now when the report object is created
    timestamp: datetime = field(default_factory=datetime.now)

    def add_issue(self, issue: str):
        """Mark the report as failed and store ``issue`` in ``issues``."""
        self.validation_passed = False
        self.issues.append(issue)

In [4]:
def setup_logging(config: ETLConfig) -> logging.Logger:
    """Configure the ``ETL_Pipeline`` logger with console and file handlers.

    The function can be called repeatedly (for example when a notebook cell is
    re-run): handlers attached by an earlier call are detached and closed
    before the new ones are added, so the previous log-file handle is released
    instead of being leaked.

    Args:
        config: Supplies ``log_level`` (name of a ``logging`` level, matched
            case-insensitively) and ``log_file`` (path of the log file).

    Returns:
        The logger named ``'ETL_Pipeline'`` with one stdout handler (INFO and
        above) and one file handler (DEBUG and above).

    Raises:
        AttributeError: If ``config.log_level`` does not name an attribute of
            the ``logging`` module.
    """
    logger = logging.getLogger('ETL_Pipeline')
    logger.setLevel(getattr(logging, str(config.log_level).upper()))

    # Detach AND close stale handlers; clearing the list alone would leave the
    # old FileHandler's file open.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.INFO)
    console_format = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    console_handler.setFormatter(console_format)

    # Explicit UTF-8: the pipeline's log messages contain non-ASCII symbols,
    # which a platform-default encoding may not be able to write.
    file_handler = logging.FileHandler(config.log_file, encoding='utf-8')
    file_handler.setLevel(logging.DEBUG)
    file_format = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s'
    )
    file_handler.setFormatter(file_format)

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

    return logger

In [5]:
class DatabaseConnection:
    """Context manager for database connections with error handling.

    Entering the context builds a SQLAlchemy engine from the ETL configuration
    and opens one connection; both are exposed as the ``engine`` and
    ``connection`` attributes. Leaving the context closes the connection and
    disposes the engine's connection pool so no pooled connection stays open.
    """

    def __init__(self, config: ETLConfig, logger: logging.Logger):
        """Store the configuration and logger; nothing is opened yet."""
        self.config = config
        self.logger = logger
        self.engine = None
        self.connection = None

    def __enter__(self):
        """Create the engine, open a connection and return ``self``.

        Raises:
            SQLAlchemyError: Re-raised after being logged when the engine or
                the connection cannot be created.
        """
        try:
            # NOTE(review): user and password are inserted verbatim, so values
            # containing URL delimiters such as '@' or '/' must already be
            # URL-encoded by the caller.
            connection_string = (
                f"mysql+pymysql://{self.config.db_user}:{self.config.db_password}"
                f"@{self.config.db_host}:{self.config.db_port}/{self.config.db_name}"
            )
            self.engine = create_engine(
                connection_string,
                pool_pre_ping=True,
                pool_recycle=3600,
                echo=False
            )
            self.connection = self.engine.connect()
            self.logger.info(f"‚úì Connected to database: {self.config.db_name}")
            return self
        except SQLAlchemyError as e:
            self.logger.error(f"‚úó Database connection failed: {str(e)}")
            # __exit__ is not called when __enter__ raises, so release the
            # engine's pool here.
            if self.engine is not None:
                self.engine.dispose()
            raise

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the connection, dispose the engine pool and log any error.

        Returns ``False`` so an exception raised inside the ``with`` block
        keeps propagating to the caller.
        """
        try:
            if self.connection:
                self.connection.close()
                self.logger.info("‚úì Database connection closed")
        finally:
            # Closing a connection only returns it to the pool; dispose() is
            # what actually closes the pooled database connections.
            if self.engine is not None:
                self.engine.dispose()
        if exc_type:
            self.logger.error(f"‚úó Error during database operation: {exc_val}")
        return False

In [6]:
class DataExtractor:
    """Extract data from the database into pandas DataFrames.

    All queries run on the open connection held by the ``DatabaseConnection``
    wrapper passed to the constructor (its ``connection`` attribute).
    """

    def __init__(self, connection: DatabaseConnection, logger: logging.Logger):
        self.connection = connection
        self.logger = logger

    def extract_table(self, table_name: str, 
                     date_column: Optional[str] = None,
                     start_date: Optional[datetime] = None,
                     end_date: Optional[datetime] = None) -> pd.DataFrame:
        """Read a table, optionally restricted to a date range.

        Args:
            table_name: Table to read with ``SELECT *``.
            date_column: Column the date bounds apply to.
            start_date: Inclusive lower bound. May be given without
                ``end_date`` to get an open-ended range.
            end_date: Inclusive upper bound. May be given without
                ``start_date``.

        Returns:
            DataFrame with the selected rows.

        Raises:
            SQLAlchemyError: Re-raised after being logged when the query fails.
        """
        try:
            self.logger.info(f"Extracting data from table: {table_name}")

            # NOTE(review): table and column names are interpolated into the
            # SQL text (identifiers cannot be bound as parameters), so they
            # must come from trusted code, never from user input.
            query = f"SELECT * FROM {table_name}"

            has_start = start_date is not None
            has_end = end_date is not None
            date_filter = None
            params = {}

            if date_column:
                # Previously a filter was applied only when BOTH bounds were
                # given; a single bound was silently dropped and the whole
                # table came back.
                if has_start and has_end:
                    date_filter = f"{date_column} BETWEEN :start_date AND :end_date"
                elif has_start:
                    date_filter = f"{date_column} >= :start_date"
                elif has_end:
                    date_filter = f"{date_column} <= :end_date"
                if has_start:
                    params['start_date'] = start_date
                if has_end:
                    params['end_date'] = end_date
            elif has_start or has_end:
                self.logger.warning(
                    f"Date bounds ignored for {table_name}: no date_column was given"
                )

            if date_filter:
                query += f" WHERE {date_filter}"
                df = pd.read_sql_query(text(query), self.connection.connection, params=params)
            else:
                df = pd.read_sql_query(query, self.connection.connection)

            self.logger.info(f"‚úì Extracted {len(df):,} rows from {table_name}")
            return df
        except SQLAlchemyError as e:
            self.logger.error(f"‚úó Failed to extract from {table_name}: {str(e)}")
            raise

    def extract_with_joins(self, query: str, params: Optional[Dict] = None) -> pd.DataFrame:
        """Run a caller-supplied SQL query and return its result set.

        Args:
            query: SQL text. When ``params`` is given it is wrapped in
                ``text()`` so ``:name`` placeholders are bound.
            params: Optional mapping of bind-parameter names to values.

        Returns:
            DataFrame with the query result.

        Raises:
            SQLAlchemyError: Re-raised after being logged when the query fails.
        """
        try:
            self.logger.info("Executing custom extraction query")
            if params:
                df = pd.read_sql_query(text(query), self.connection.connection, params=params)
            else:
                df = pd.read_sql_query(query, self.connection.connection)
            self.logger.info(f"‚úì Extracted {len(df):,} rows from custom query")
            return df
        except SQLAlchemyError as e:
            self.logger.error(f"‚úó Custom query failed: {str(e)}")
            raise


In [7]:
class DataTransformer:
    """Transform and clean extracted data.

    Every method returns a DataFrame and leaves the frame passed in by the
    caller unmodified, so re-running a transformation step on the same input
    gives the same result.
    """

    def __init__(self, logger: logging.Logger):
        self.logger = logger

    def clean_nulls(self, df: pd.DataFrame, strategy: str = 'drop') -> pd.DataFrame:
        """Handle null values using the named strategy.

        Args:
            df: Input frame.
            strategy: ``'drop'`` removes rows containing any null,
                ``'fill_mean'`` / ``'fill_median'`` fill numeric columns with
                the column mean / median, ``'fill_forward'`` propagates the
                last valid value downwards.

        Returns:
            The cleaned frame. For an unknown strategy a warning is logged and
            the original frame is returned as-is.
        """
        null_count = df.isnull().sum().sum()
        self.logger.info(f"Handling {null_count} null values using strategy: {strategy}")

        if strategy == 'drop':
            return df.dropna()
        if strategy == 'fill_mean':
            return df.fillna(df.mean(numeric_only=True))
        if strategy == 'fill_median':
            return df.fillna(df.median(numeric_only=True))
        if strategy == 'fill_forward':
            # DataFrame.ffill() is the supported spelling; the `method=`
            # argument of fillna() is deprecated in current pandas releases.
            return df.ffill()

        self.logger.warning(f"Unknown strategy '{strategy}', returning original DataFrame")
        return df

    def remove_duplicates(self, df: pd.DataFrame, 
                         subset: Optional[List[str]] = None) -> pd.DataFrame:
        """Drop duplicate rows, keeping the first occurrence.

        Args:
            df: Input frame.
            subset: Columns used to identify duplicates; all columns when
                ``None``.

        Returns:
            Frame without the duplicate rows. The number removed is logged.
        """
        initial_rows = len(df)
        df_clean = df.drop_duplicates(subset=subset, keep='first')
        removed = initial_rows - len(df_clean)

        if removed > 0:
            self.logger.warning(f"Removed {removed} duplicate rows")
        else:
            self.logger.info("‚úì No duplicates found")

        return df_clean

    def standardize_dates(self, df: pd.DataFrame, 
                         date_columns: List[str],
                         date_format: str = '%Y-%m-%d') -> pd.DataFrame:
        """Convert the given columns to pandas datetimes.

        Values that cannot be parsed become ``NaT``. Names in ``date_columns``
        that are not columns of ``df`` are skipped.

        Args:
            df: Input frame (not modified).
            date_columns: Candidate column names to convert.
            date_format: Accepted for interface compatibility; it is not used,
                parsing is left to ``pd.to_datetime``.

        Returns:
            A copy of ``df`` with the converted columns.
        """
        result = df.copy()  # never write into the caller's frame
        for col in date_columns:
            if col in result.columns:
                try:
                    result[col] = pd.to_datetime(result[col], errors='coerce')
                    self.logger.info(f"‚úì Standardized date column: {col}")
                except Exception as e:
                    self.logger.error(f"‚úó Failed to standardize {col}: {str(e)}")
        return result

    def add_derived_columns(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
        """Add table-specific computed columns.

        * ``inventory``: ``quantity_available`` = on hand - reserved.
        * ``orders``: ``delivery_delay_days`` (actual - expected, in days) and
          ``is_late`` (delay greater than zero).
        * ``sales``: ``unit_price`` = revenue / quantity sold; rows with a
          quantity of zero get ``NaN`` instead of an infinite price.

        Columns are only added when their source columns exist. The input
        frame is not modified; tables without derived columns are returned
        unchanged.
        """
        self.logger.info(f"Adding derived columns for {table_name}")

        if table_name == 'inventory':
            if 'quantity_on_hand' in df.columns and 'quantity_reserved' in df.columns:
                df = df.assign(
                    quantity_available=df['quantity_on_hand'] - df['quantity_reserved']
                )
                self.logger.info("‚úì Added 'quantity_available' column")

        elif table_name == 'orders':
            if 'expected_delivery_date' in df.columns and 'actual_delivery_date' in df.columns:
                delay_days = (
                    pd.to_datetime(df['actual_delivery_date']) - 
                    pd.to_datetime(df['expected_delivery_date'])
                ).dt.days
                df = df.assign(delivery_delay_days=delay_days, is_late=delay_days > 0)
                self.logger.info("‚úì Added delivery metrics columns")

        elif table_name == 'sales':
            if 'revenue' in df.columns and 'quantity_sold' in df.columns:
                quantity = df['quantity_sold']
                # Mask zero quantities so the division yields NaN, not +/-inf.
                df = df.assign(unit_price=df['revenue'] / quantity.where(quantity != 0))
                self.logger.info("‚úì Added 'unit_price' column")

        return df

    def apply_business_rules(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
        """Drop rows that violate the numeric rules defined for a table.

        Each rule requires a column to be strictly positive or non-negative.
        Rows whose value is null fail the comparison and are removed as well.
        Tables without rules are returned unchanged.

        Raises:
            KeyError: If a column required by the table's rules is missing.
        """
        self.logger.info(f"Applying business rules for {table_name}")
        initial_rows = len(df)

        # table -> (column, must_be_strictly_positive); non-strict means >= 0
        rules = {
            'products': (('unit_cost', True), ('reorder_level', False)),
            'inventory': (('quantity_on_hand', False), ('quantity_reserved', False)),
            'orders': (('order_quantity', True), ('order_cost', False)),
            'sales': (('quantity_sold', True), ('revenue', False)),
        }
        for column, strictly_positive in rules.get(table_name, ()):
            keep = df[column] > 0 if strictly_positive else df[column] >= 0
            df = df[keep]

        removed = initial_rows - len(df)
        if removed > 0:
            self.logger.warning(f"Removed {removed} rows violating business rules")

        return df

In [8]:
class DataQualityValidator:
    """Comprehensive data quality checks.

    Checks a DataFrame for missing required columns, null ratios, duplicate
    rows and foreign-key values that do not exist in the parent table, and
    collects the findings in a ``DataQualityReport``.
    """

    def __init__(self, connection: DatabaseConnection, 
                 config: ETLConfig, logger: logging.Logger):
        self.connection = connection
        self.config = config
        self.logger = logger

    def validate_table(self, df: pd.DataFrame, 
                      table_name: str,
                      required_columns: Optional[List[str]] = None) -> DataQualityReport:
        """Run all quality checks for one table.

        Args:
            df: Data to validate.
            table_name: Name used for the report and the foreign-key lookup.
            required_columns: Columns that must be present, if any.

        Returns:
            A report whose ``validation_passed`` is ``False`` as soon as one
            check records an issue. Null and duplicate counts are always
            stored; they only become issues when their share of the rows
            exceeds ``config.null_threshold`` / ``config.duplicate_threshold``.
        """
        self.logger.info(f"=== Validating data quality for: {table_name} ===")
        report = DataQualityReport(table_name=table_name, total_rows=len(df))

        if required_columns:
            missing_cols = set(required_columns) - set(df.columns)
            if missing_cols:
                report.add_issue(f"Missing required columns: {missing_cols}")
                self.logger.error(f"‚úó Missing columns: {missing_cols}")

        null_counts = df.isnull().sum()
        for col, raw_count in null_counts.items():
            # Plain int rather than numpy.int64, matching the report's
            # declared Dict[str, int] type.
            count = int(raw_count)
            if count > 0:
                null_pct = count / len(df)
                report.null_count[col] = count

                if null_pct > self.config.null_threshold:
                    report.add_issue(
                        f"Column '{col}' has {null_pct:.1%} nulls (threshold: {self.config.null_threshold:.1%})"
                    )
                    self.logger.warning(f"‚ö† High null count in '{col}': {count} ({null_pct:.1%})")

        if len(df) > 0:
            duplicate_count = int(df.duplicated().sum())
            report.duplicate_count = duplicate_count

            if duplicate_count > 0:
                dup_pct = duplicate_count / len(df)
                if dup_pct > self.config.duplicate_threshold:
                    report.add_issue(f"Found {duplicate_count} duplicates ({dup_pct:.1%})")
                    self.logger.warning(f"‚ö† Duplicates found: {duplicate_count}")

        fk_issues = self._validate_foreign_keys(df, table_name)
        if fk_issues:
            report.missing_foreign_keys = fk_issues
            for fk, count in fk_issues.items():
                report.add_issue(f"Missing foreign key references in '{fk}': {count} rows")
                self.logger.error(f"‚úó Foreign key issue in '{fk}': {count} orphaned rows")

        if report.validation_passed:
            self.logger.info(f"‚úì Data quality validation PASSED for {table_name}")
        else:
            self.logger.error(f"‚úó Data quality validation FAILED for {table_name}")

        return report

    def _validate_foreign_keys(self, df: pd.DataFrame, table_name: str) -> Dict[str, int]:
        """Count rows whose foreign-key value is absent from the parent table.

        Null foreign-key values are not counted: they reference nothing and
        are already covered by the null check. A lookup that fails is logged
        as a warning and that column is skipped.

        Returns:
            Mapping of foreign-key column -> number of orphaned rows, holding
            only columns with at least one orphan.
        """
        issues = {}
        # child table -> {foreign-key column -> parent table}
        fk_mapping = {
            'products': {'supplier_id': 'suppliers'},
            'inventory': {'product_id': 'products', 'warehouse_id': 'warehouses'},
            'orders': {'supplier_id': 'suppliers'},
            'sales': {'product_id': 'products', 'warehouse_id': 'warehouses'},
            'price_history': {'product_id': 'products', 'supplier_id': 'suppliers'}
        }

        if table_name not in fk_mapping:
            return issues

        for fk_col, parent_table in fk_mapping[table_name].items():
            if fk_col not in df.columns:
                continue

            try:
                # The parent's key column is looked up under the same name as
                # the foreign-key column.
                parent_ids = pd.read_sql_query(
                    f"SELECT DISTINCT {fk_col} FROM {parent_table}",
                    self.connection.connection
                )
                valid_ids = set(parent_ids.iloc[:, 0])
                fk_values = df[fk_col]
                orphaned = fk_values.notna() & ~fk_values.isin(valid_ids)
                orphaned_count = int(orphaned.sum())

                if orphaned_count > 0:
                    issues[fk_col] = orphaned_count
            except Exception as e:
                self.logger.warning(f"Could not validate FK {fk_col}: {str(e)}")

        return issues

    def generate_quality_summary(self, reports: List[DataQualityReport]) -> pd.DataFrame:
        """Flatten a list of reports into one summary row per report."""
        summary_data = [
            {
                'table_name': report.table_name,
                'total_rows': report.total_rows,
                'null_columns': len(report.null_count),
                'total_nulls': sum(report.null_count.values()),
                'duplicates': report.duplicate_count,
                'fk_issues': len(report.missing_foreign_keys),
                'validation_passed': report.validation_passed,
                'issue_count': len(report.issues),
                'timestamp': report.timestamp
            }
            for report in reports
        ]
        return pd.DataFrame(summary_data)


class DataLoader:
    """Load transformed data into database.

    Rows are written in batches of ``config.batch_size`` through the engine
    of the ``DatabaseConnection`` wrapper.
    """

    def __init__(self, connection: DatabaseConnection, 
                 config: ETLConfig, logger: logging.Logger):
        self.connection = connection
        self.config = config
        self.logger = logger

    def load_data(self, df: pd.DataFrame, table_name: str,
                  if_exists: str = 'append',
                  create_backup: bool = True) -> Tuple[int, int]:
        """Write ``df`` to ``table_name`` batch by batch.

        Args:
            df: Rows to write.
            table_name: Destination table.
            if_exists: Passed to ``DataFrame.to_sql`` for the first batch;
                every later batch is appended.
            create_backup: When true and ``if_exists == 'replace'``, a backup
                copy of the table is attempted first. A failed backup is
                logged as a warning and does not stop the load.

        Returns:
            ``(rows_loaded, rows_failed)``. A batch that raises a SQLAlchemy
            error is logged, counted as failed in full, and the load goes on
            with the next batch.

        Raises:
            Exception: Any other error is logged and re-raised.
        """
        self.logger.info(f"Loading {len(df)} rows into table: {table_name}")

        rows_loaded = 0
        rows_failed = 0

        try:
            if create_backup and if_exists == 'replace':
                self._create_backup(table_name)

            for i in range(0, len(df), self.config.batch_size):
                batch = df.iloc[i:i + self.config.batch_size]

                try:
                    batch.to_sql(
                        name=table_name,
                        con=self.connection.engine,
                        if_exists=if_exists if i == 0 else 'append',
                        index=False,
                        method='multi'
                    )
                    rows_loaded += len(batch)
                except IntegrityError as e:
                    self.logger.error(f"Integrity error in batch: {str(e)}")
                    rows_failed += len(batch)
                except SQLAlchemyError as e:
                    self.logger.error(f"Database error in batch: {str(e)}")
                    rows_failed += len(batch)

            self.logger.info(f"‚úì Load completed: {rows_loaded} rows loaded, {rows_failed} rows failed")
            return rows_loaded, rows_failed
        except Exception as e:
            self.logger.error(f"‚úó Load failed: {str(e)}")
            raise

    def _create_backup(self, table_name: str):
        """Copy ``table_name`` into a new timestamped ``<table>_backup_...`` table.

        Best effort: a SQLAlchemy error is logged as a warning, not raised.
        """
        backup_name = f"{table_name}_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        try:
            # Use a dedicated connection and transaction from the engine. The
            # shared connection has usually already run queries, and on
            # SQLAlchemy 2.x calling begin() on a connection that is already
            # in an (autobegun) transaction raises, which made the backup
            # fail silently just before a destructive replace.
            with self.connection.engine.begin() as backup_conn:
                backup_conn.execute(
                    text(f"CREATE TABLE {backup_name} AS SELECT * FROM {table_name}")
                )
            self.logger.info(f"‚úì Created backup table: {backup_name}")
        except SQLAlchemyError as e:
            self.logger.warning(f"Could not create backup: {str(e)}")


class ETLPipeline:
    """Main ETL pipeline orchestrator.

    Wires together extraction, transformation and data-quality validation for
    a list of tables and reports the outcome per table.
    """

    def __init__(self, config: Optional[ETLConfig] = None):
        """Create the pipeline and configure logging.

        Args:
            config: Pipeline settings; a default ``ETLConfig()`` is used when
                omitted.
        """
        self.config = config or ETLConfig()
        self.logger = setup_logging(self.config)
        self.quality_reports = []

        self.logger.info("=" * 70)
        self.logger.info("ETL PIPELINE INITIALIZED")
        self.logger.info("=" * 70)

    def run_full_pipeline(self, tables: List[str]) -> Dict[str, Any]:
        """Extract, transform and validate every table in ``tables``.

        Each table is processed independently: an exception or a failed
        validation adds the table to ``tables_failed`` and processing goes on
        with the next one. No load step is invoked here; tables that pass
        validation are reported as processed but nothing is written back.

        Args:
            tables: Table names, processed in the given order.

        Returns:
            Dict with ``success`` (true only when no table failed),
            ``tables_processed``, ``tables_failed``, ``quality_reports`` (the
            reports of this run) and ``summary`` (a DataFrame with one row per
            report). If the pipeline itself fails, e.g. the database cannot be
            reached, ``success`` is false and the report entries stay empty.
        """
        results = {
            'success': False,
            'tables_processed': [],
            'tables_failed': [],
            'quality_reports': [],
            'summary': {}
        }

        # Start every run with a fresh list so a second call on the same
        # pipeline object does not report the previous run's tables again.
        self.quality_reports = []

        try:
            with DatabaseConnection(self.config, self.logger) as db:
                extractor = DataExtractor(db, self.logger)
                transformer = DataTransformer(self.logger)
                validator = DataQualityValidator(db, self.config, self.logger)

                for table in tables:
                    try:
                        self.logger.info(f"\n{'='*70}")
                        self.logger.info(f"Processing table: {table}")
                        self.logger.info(f"{'='*70}")

                        df = extractor.extract_table(table)
                        # Null handling via transformer.clean_nulls is
                        # currently not part of this sequence.
                        df = transformer.remove_duplicates(df)
                        df = transformer.standardize_dates(
                            df, 
                            date_columns=[col for col in df.columns if 'date' in col.lower()]
                        )
                        df = transformer.add_derived_columns(df, table)
                        df = transformer.apply_business_rules(df, table)

                        report = validator.validate_table(df, table)
                        self.quality_reports.append(report)

                        if not report.validation_passed:
                            self.logger.warning(f"‚ö† Quality validation failed for {table}, skipping load")
                            results['tables_failed'].append(table)
                            continue

                        results['tables_processed'].append(table)
                        self.logger.info(f"‚úì Successfully processed {table}")
                    except Exception as e:
                        self.logger.error(f"‚úó Failed to process {table}: {str(e)}")
                        results['tables_failed'].append(table)

                results['quality_reports'] = self.quality_reports
                results['summary'] = validator.generate_quality_summary(self.quality_reports)
                results['success'] = len(results['tables_failed']) == 0

                self.logger.info("\n" + "="*70)
                self.logger.info("ETL PIPELINE COMPLETED")
                self.logger.info("="*70)
        except Exception as e:
            self.logger.error(f"‚úó Pipeline failed: {str(e)}")
            results['success'] = False

        return results


if __name__ == "__main__":
    # Run the pipeline with default settings over the seven tables below,
    # then print a verdict and the processed / failed table counts.
    config = ETLConfig()
    pipeline = ETLPipeline(config)
    tables_to_process = [
        'suppliers',
        'products',
        'warehouses',
        'inventory',
        'orders',
        'sales',
        'price_history',
    ]
    results = pipeline.run_full_pipeline(tables_to_process)

    print(
        "\n‚úì Pipeline completed successfully!"
        if results['success']
        else "\n‚úó Pipeline completed with errors"
    )

    print(f"\nProcessed: {len(results['tables_processed'])} tables")
    print(f"Failed: {len(results['tables_failed'])} tables")


2026-02-01 16:45:02 - ETL_Pipeline - INFO - ETL PIPELINE INITIALIZED
2026-02-01 16:45:02 - ETL_Pipeline - INFO - ‚úì Connected to database: supply_chain_db
2026-02-01 16:45:02 - ETL_Pipeline - INFO - 
2026-02-01 16:45:02 - ETL_Pipeline - INFO - Processing table: suppliers
2026-02-01 16:45:02 - ETL_Pipeline - INFO - Extracting data from table: suppliers
2026-02-01 16:45:02 - ETL_Pipeline - INFO - ‚úì Extracted 30 rows from suppliers
2026-02-01 16:45:02 - ETL_Pipeline - INFO - ‚úì No duplicates found
2026-02-01 16:45:02 - ETL_Pipeline - INFO - ‚úì Standardized date column: created_date
2026-02-01 16:45:02 - ETL_Pipeline - INFO - Adding derived columns for suppliers
2026-02-01 16:45:02 - ETL_Pipeline - INFO - Applying business rules for suppliers
2026-02-01 16:45:02 - ETL_Pipeline - INFO - === Validating data quality for: suppliers ===
2026-02-01 16:45:02 - ETL_Pipeline - INFO - ‚úì Data quality validation PASSED for suppliers
2026-02-01 16:45:02 - ETL_Pipeline - INFO - ‚úì Successfully p

In [9]:
@dataclass
class ValidationRule:
    """Declarative description of one data-quality rule.

    The rules engine in this notebook dispatches on ``rule_type``:
    ``'uniqueness'`` and ``'completeness'`` inspect ``column`` directly, while
    ``'validity'`` and ``'consistency'`` evaluate ``condition``.
    """
    rule_name: str
    rule_type: str
    # column the rule applies to
    column: Optional[str] = None
    # limit used by completeness checks
    threshold: Optional[float] = None
    # callable that receives the DataFrame and marks the rows that are valid
    condition: Optional[Callable] = None
    # severity label copied onto the rule's results
    severity: str = 'WARNING'
    # human-readable text used in result messages
    description: str = ""


@dataclass
class ValidationResult:
    """Outcome of running a single validation rule."""
    rule_name: str
    passed: bool
    severity: str
    message: str
    # number of rows that violated the rule
    affected_rows: int = 0
    # share of violating rows, as filled in by the code that builds the result
    affected_percentage: float = 0.0
    # extra, rule-specific values
    details: Dict = field(default_factory=dict)
    # set from datetime.now when the result object is created
    timestamp: datetime = field(default_factory=datetime.now)


class DataQualityRulesEngine:
    """Engine to define and execute data quality rules.

    Rules are registered per table name and executed against a DataFrame with
    ``execute_rules``. Every result is also appended to ``self.results`` so
    that ``get_summary`` can report across all executions.
    """

    def __init__(self):
        self.rules: Dict[str, List[ValidationRule]] = {}
        self.results: List[ValidationResult] = []

    def add_rule(self, table_name: str, rule: ValidationRule):
        """Add validation rule for a specific table"""
        self.rules.setdefault(table_name, []).append(rule)

    def define_standard_rules(self):
        """Define standard data quality rules for all tables.

        Rules are appended, so calling this more than once on the same engine
        registers (and later executes) them again.
        """

        # SUPPLIERS
        self.add_rule('suppliers', ValidationRule(
            rule_name='supplier_id_unique',
            rule_type='uniqueness',
            column='supplier_id',
            severity='CRITICAL',
            description='Supplier ID must be unique'
        ))

        self.add_rule('suppliers', ValidationRule(
            rule_name='reliability_score_range',
            rule_type='validity',
            column='reliability_score',
            condition=lambda df: (df['reliability_score'] >= 0) & (df['reliability_score'] <= 100),
            severity='CRITICAL',
            description='Reliability score must be between 0 and 100'
        ))

        # PRODUCTS
        self.add_rule('products', ValidationRule(
            rule_name='product_id_unique',
            rule_type='uniqueness',
            column='product_id',
            severity='CRITICAL',
            description='Product ID must be unique'
        ))

        self.add_rule('products', ValidationRule(
            rule_name='unit_cost_positive',
            rule_type='validity',
            column='unit_cost',
            condition=lambda df: df['unit_cost'] > 0,
            severity='CRITICAL',
            description='Unit cost must be positive'
        ))

        # INVENTORY
        self.add_rule('inventory', ValidationRule(
            rule_name='quantity_on_hand_valid',
            rule_type='validity',
            column='quantity_on_hand',
            condition=lambda df: df['quantity_on_hand'] >= 0,
            severity='CRITICAL',
            description='Quantity on hand cannot be negative'
        ))

        self.add_rule('inventory', ValidationRule(
            rule_name='reserved_not_exceed_onhand',
            rule_type='consistency',
            column='quantity_reserved',
            condition=lambda df: df['quantity_reserved'] <= df['quantity_on_hand'],
            severity='CRITICAL',
            description='Reserved quantity cannot exceed quantity on hand'
        ))

        # ORDERS
        self.add_rule('orders', ValidationRule(
            rule_name='order_quantity_positive',
            rule_type='validity',
            column='order_quantity',
            condition=lambda df: df['order_quantity'] > 0,
            severity='CRITICAL',
            description='Order quantity must be positive'
        ))

        # SALES
        self.add_rule('sales', ValidationRule(
            rule_name='quantity_sold_positive',
            rule_type='validity',
            column='quantity_sold',
            condition=lambda df: df['quantity_sold'] > 0,
            severity='CRITICAL',
            description='Quantity sold must be positive'
        ))

    def execute_rules(self, df: pd.DataFrame, table_name: str) -> List[ValidationResult]:
        """Execute all rules registered for ``table_name`` against ``df``.

        A rule that raises, or whose ``rule_type`` is not recognised, produces
        a failed result instead of being dropped, so a mistyped rule type
        cannot silently disable a check.

        Returns:
            The results of this call (also appended to ``self.results``);
            an empty list when the table has no rules.
        """
        if table_name not in self.rules:
            return []

        results = []
        total_rows = len(df)

        for rule in self.rules[table_name]:
            try:
                if rule.rule_type == 'uniqueness':
                    result = self._check_uniqueness(df, rule, total_rows)
                elif rule.rule_type == 'completeness':
                    result = self._check_completeness(df, rule, total_rows)
                elif rule.rule_type in ['validity', 'consistency']:
                    result = self._check_condition(df, rule, total_rows)
                else:
                    result = ValidationResult(
                        rule_name=rule.rule_name,
                        passed=False,
                        severity=rule.severity,
                        message=f"Unknown rule type '{rule.rule_type}': rule was not executed"
                    )

                results.append(result)
            except Exception as e:
                results.append(ValidationResult(
                    rule_name=rule.rule_name,
                    passed=False,
                    severity='CRITICAL',
                    message=f"Rule execution failed: {str(e)}"
                ))

        self.results.extend(results)
        return results

    @staticmethod
    def _status_message(passed: bool, description: str) -> str:
        """Build the PASS/FAIL message shared by all check types."""
        return f"{'‚úì PASS' if passed else '‚úó FAIL'}: {description}"

    def _check_uniqueness(self, df: pd.DataFrame, rule: ValidationRule, 
                         total_rows: int) -> ValidationResult:
        """Check if column values are unique"""
        # int(): keep numpy integer types out of the result's int fields.
        duplicates = int(df[rule.column].duplicated().sum())
        passed = duplicates == 0

        return ValidationResult(
            rule_name=rule.rule_name,
            passed=passed,
            severity=rule.severity,
            message=self._status_message(passed, rule.description),
            affected_rows=duplicates,
            affected_percentage=(duplicates / total_rows * 100) if total_rows > 0 else 0,
            details={'duplicate_count': duplicates}
        )

    def _check_completeness(self, df: pd.DataFrame, rule: ValidationRule,
                           total_rows: int) -> ValidationResult:
        """Check for null/missing values.

        The rule passes when the null share, expressed on a 0-100 percentage
        scale, does not exceed ``rule.threshold`` (a missing threshold is
        treated as 0).
        """
        null_count = int(df[rule.column].isnull().sum())
        null_pct = (null_count / total_rows * 100) if total_rows > 0 else 0
        passed = null_pct <= (rule.threshold or 0.0)

        return ValidationResult(
            rule_name=rule.rule_name,
            passed=passed,
            severity=rule.severity,
            message=self._status_message(passed, rule.description),
            affected_rows=null_count,
            affected_percentage=null_pct,
            details={'null_count': null_count, 'null_percentage': null_pct}
        )

    def _check_condition(self, df: pd.DataFrame, rule: ValidationRule,
                        total_rows: int) -> ValidationResult:
        """Check custom condition.

        ``rule.condition`` is called with the DataFrame; every element of its
        result that is not true counts as an invalid row. A missing condition
        or an exception during evaluation yields a failed CRITICAL result.
        """
        if rule.condition is None:
            return ValidationResult(
                rule_name=rule.rule_name,
                passed=False,
                severity='CRITICAL',
                message="No condition defined for rule"
            )

        try:
            valid = rule.condition(df)
            invalid_count = int((~valid).sum())
            passed = invalid_count == 0

            return ValidationResult(
                rule_name=rule.rule_name,
                passed=passed,
                severity=rule.severity,
                message=self._status_message(passed, rule.description),
                affected_rows=invalid_count,
                affected_percentage=(invalid_count / total_rows * 100) if total_rows > 0 else 0,
                details={'invalid_count': invalid_count}
            )
        except Exception as e:
            return ValidationResult(
                rule_name=rule.rule_name,
                passed=False,
                severity='CRITICAL',
                message=f"Condition evaluation failed: {str(e)}"
            )

    def get_summary(self) -> pd.DataFrame:
        """Get summary of all validation results, one row per result."""
        summary_data = [
            {
                'rule_name': result.rule_name,
                'status': '‚úì PASS' if result.passed else '‚úó FAIL',
                'severity': result.severity,
                'affected_rows': result.affected_rows,
                'affected_percentage': f"{result.affected_percentage:.2f}%",
                'timestamp': result.timestamp.strftime('%Y-%m-%d %H:%M:%S')
            }
            for result in self.results
        ]

        return pd.DataFrame(summary_data)


class DataProfiler:
    """Generate descriptive profiles for pandas Series and DataFrames."""

    @staticmethod
    def profile_numeric_column(series: pd.Series) -> Dict:
        """Profile a numeric column.

        Args:
            series: Numeric Series; nulls are counted under ``missing``.

        Returns:
            Dict with ``count``, ``missing``, ``mean``, ``median``, ``std``,
            ``min``, ``max``, ``q25``, ``q75``, ``skewness`` and ``kurtosis``,
            all converted to built-in ``int``/``float``.
        """
        return {
            'count': int(series.count()),
            'missing': int(series.isnull().sum()),
            'mean': float(series.mean()),
            'median': float(series.median()),
            'std': float(series.std()),
            'min': float(series.min()),
            'max': float(series.max()),
            'q25': float(series.quantile(0.25)),
            'q75': float(series.quantile(0.75)),
            'skewness': float(series.skew()),
            'kurtosis': float(series.kurtosis())
        }

    @staticmethod
    def profile_categorical_column(series: pd.Series) -> Dict:
        """Profile a categorical column.

        Args:
            series: Series of category-like values.

        Returns:
            Dict with ``count``, ``missing``, ``unique``, the most frequent
            value as a string (``top_value``, ``None`` when there are no
            non-null values), its frequency (``top_frequency``, 0 when there
            are none) and the ten most frequent values with their counts
            (``value_distribution``).
        """
        value_counts = series.value_counts()
        has_values = len(value_counts) > 0

        return {
            'count': int(series.count()),
            'missing': int(series.isnull().sum()),
            'unique': int(series.nunique()),
            # value_counts() orders by descending frequency, so position 0
            # holds the most common value and its count.
            'top_value': str(value_counts.index[0]) if has_values else None,
            'top_frequency': int(value_counts.iloc[0]) if has_values else 0,
            'value_distribution': value_counts.head(10).to_dict()
        }

    @staticmethod
    def profile_date_column(series: pd.Series) -> Dict:
        """Profile a date column.

        Values are parsed with ``pd.to_datetime(errors='coerce')``, so anything
        unparseable is counted as missing.

        Args:
            series: Series holding dates or date-like values.

        Returns:
            Dict with ``count``, ``missing``, ``min_date`` and ``max_date`` (as
            strings) and ``date_range_days`` (0 when nothing parsed).
        """
        series_dt = pd.to_datetime(series, errors='coerce')
        valid_count = int(series_dt.count())
        earliest = series_dt.min()
        latest = series_dt.max()

        return {
            'count': valid_count,
            'missing': int(series_dt.isnull().sum()),
            'min_date': str(earliest),
            'max_date': str(latest),
            'date_range_days': int((latest - earliest).days) if valid_count > 0 else 0
        }

    @staticmethod
    def profile_dataframe(df: pd.DataFrame, table_name: str) -> Dict:
        """Generate a profile for an entire DataFrame.

        Each column is routed to the numeric, date or categorical profiler:
        numeric dtypes first, then datetime dtypes or any column whose label
        contains ``date`` (case-insensitive), and everything else as
        categorical.

        Args:
            df: DataFrame to profile; may be empty.
            table_name: Label stored in the profile.

        Returns:
            Dict with table-level metrics (``row_count``, ``column_count``,
            ``memory_usage_mb``, ``duplicate_rows``) and a ``columns`` mapping
            of column label to ``dtype``, ``null_count``, ``null_percentage``
            and the type-specific ``stats``.
        """
        total_rows = len(df)
        profile = {
            'table_name': table_name,
            'row_count': total_rows,
            'column_count': len(df.columns),
            'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024**2,
            'duplicate_rows': int(df.duplicated().sum()),
            'columns': {}
        }

        for col in df.columns:
            column = df[col]
            null_count = int(column.isnull().sum())
            col_profile = {
                'dtype': str(column.dtype),
                'null_count': null_count,
                # An empty frame has no rows to be null; report 0% instead of
                # dividing by zero.
                'null_percentage': float(null_count / total_rows * 100) if total_rows > 0 else 0.0
            }

            if pd.api.types.is_numeric_dtype(column):
                col_profile['stats'] = DataProfiler.profile_numeric_column(column)
            # str(col): column labels are not guaranteed to be strings.
            elif pd.api.types.is_datetime64_any_dtype(column) or 'date' in str(col).lower():
                col_profile['stats'] = DataProfiler.profile_date_column(column)
            else:
                col_profile['stats'] = DataProfiler.profile_categorical_column(column)

            profile['columns'][col] = col_profile

        return profile


class AnomalyDetector:
    """Detect anomalies in data using statistical methods."""

    @staticmethod
    def detect_outliers_iqr(series: pd.Series, multiplier: float = 1.5) -> pd.Series:
        """Flag values outside ``[Q1 - multiplier*IQR, Q3 + multiplier*IQR]``.

        Args:
            series: Numeric Series to inspect.
            multiplier: Width of the fences in IQR units.

        Returns:
            Boolean Series on the same index as ``series``; ``True`` marks an
            outlier. Null entries compare as ``False``.
        """
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - multiplier * iqr
        upper_bound = q3 + multiplier * iqr

        return (series < lower_bound) | (series > upper_bound)

    @staticmethod
    def detect_outliers_zscore(series: pd.Series, threshold: float = 3.0) -> pd.Series:
        """Flag values whose absolute z-score exceeds ``threshold``.

        Args:
            series: Numeric Series to inspect.
            threshold: Absolute z-score above which a value is an outlier.

        Returns:
            Boolean Series indexed by the NON-NULL rows of ``series`` only
            (nulls are dropped before scoring), so it can be shorter than the
            input.
        """
        observed = series.dropna()
        if observed.empty:
            # Nothing to score; return an empty, correctly typed mask without
            # asking scipy to compute statistics on no data.
            return pd.Series([], index=observed.index, dtype=bool)

        z_scores = np.abs(stats.zscore(observed))
        # Vectorised comparison; NaN scores (zero variance) compare as False.
        return pd.Series(np.asarray(z_scores) > threshold, index=observed.index)

    @staticmethod
    def detect_outliers_isolation_forest(df: pd.DataFrame,
                                        columns: List[str],
                                        contamination: float = 0.1) -> np.ndarray:
        """Detect multivariate outliers using Isolation Forest.

        Only the numeric columns among ``columns`` are used, and rows with any
        null in them are dropped before fitting.

        Args:
            df: Source DataFrame.
            columns: Candidate feature columns.
            contamination: Passed to ``IsolationForest`` as ``contamination``.

        Returns:
            The array returned by ``fit_predict`` for the rows that survived
            ``dropna()`` (positional, not index-aligned with ``df``), or an
            empty array when there is nothing to fit.
        """
        X = df[columns].select_dtypes(include=[np.number]).dropna()

        # DataFrame.empty is True for zero rows OR zero columns; the latter
        # happens when none of the requested columns is numeric and would
        # otherwise make the estimator fail on a feature-less matrix.
        if X.empty:
            return np.array([])

        iso_forest = IsolationForest(
            contamination=contamination,
            random_state=42,  # fixed seed so repeated runs give the same labels
            n_estimators=100
        )

        return iso_forest.fit_predict(X)

    @staticmethod
    def analyze_table_anomalies(df: pd.DataFrame, table_name: str) -> Dict:
        """Run IQR outlier detection on every sufficiently varied numeric column.

        Columns with 10 or fewer distinct values are skipped.

        Args:
            df: Table to analyse.
            table_name: Label stored in the result.

        Returns:
            Dict with ``table_name``, ``total_rows``, ``columns_analyzed`` and
            ``outliers_detected`` (per column: ``method``, ``count``,
            ``percentage`` of all rows, and up to ten ``outlier_values``).
        """
        results = {
            'table_name': table_name,
            'total_rows': len(df),
            'columns_analyzed': [],
            'outliers_detected': {}
        }

        numeric_cols = df.select_dtypes(include=[np.number]).columns

        for col in numeric_cols:
            if df[col].nunique() <= 10:
                continue

            outlier_mask = AnomalyDetector.detect_outliers_iqr(df[col])
            outlier_count = outlier_mask.sum()

            results['columns_analyzed'].append(col)
            results['outliers_detected'][col] = {
                'method': 'IQR',
                'count': int(outlier_count),
                'percentage': float(outlier_count / len(df) * 100),
                # Take the first ten before materialising a Python list.
                'outlier_values': df[col][outlier_mask].head(10).tolist()
            }

        return results


class DataQualityReportGenerator:
    """Render validation results as a standalone HTML data quality report."""

    @staticmethod
    def generate_html_report(validation_results: List[ValidationResult],
                            profiles: Dict[str, Dict],
                            anomalies: Dict[str, Dict],
                            output_path: str = 'data_quality_report.html'):
        """Write an HTML data quality report and return its path.

        Args:
            validation_results: Results to list, one table row each. Each must
                expose ``passed``, ``rule_name``, ``severity``, ``message``,
                ``affected_rows`` and ``affected_percentage``.
            profiles: Per-table profiles; only the number of entries is shown
                (as "Tables Analyzed").
            anomalies: Accepted for interface compatibility; not rendered by
                this report.
            output_path: Destination file, overwritten if it exists.

        Returns:
            ``output_path``.
        """
        # Local import: stdlib only, needed for escaping interpolated text.
        from html import escape

        passed = sum(1 for r in validation_results if r.passed)
        failed = sum(1 for r in validation_results if not r.passed)

        row_fragments = []
        for result in validation_results:
            status_class = 'pass' if result.passed else 'fail'
            # Numeric character references (check mark / ballot X) render
            # correctly regardless of how this source file is encoded.
            status_mark = '&#10003;' if result.passed else '&#10007;'
            # Rule names, severities and messages are free text (messages can
            # carry exception strings), so escape them before embedding.
            severity_text = str(result.severity)
            row_fragments.append(f"""
            <tr>
                <td><span class="{status_class}">{status_mark}</span> {escape(str(result.rule_name))}</td>
                <td class="{escape(severity_text.lower())}">{escape(severity_text)}</td>
                <td>{escape(str(result.message))}</td>
                <td>{result.affected_rows}</td>
                <td>{result.affected_percentage:.2f}%</td>
            </tr>
            """)
        validation_rows = "".join(row_fragments)

        html_content = f"""
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>Data Quality Report</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; background: #f5f5f5; }}
        .container {{ max-width: 1200px; margin: 0 auto; background: white; padding: 30px; }}
        h1 {{ color: #2c3e50; border-bottom: 3px solid #3498db; }}
        .summary {{ display: flex; justify-content: space-around; margin: 20px 0; }}
        .metric {{ text-align: center; padding: 20px; background: #ecf0f1; border-radius: 8px; }}
        .metric-value {{ font-size: 2em; font-weight: bold; color: #2c3e50; }}
        .pass {{ color: #27ae60; }}
        .fail {{ color: #e74c3c; }}
        table {{ width: 100%; border-collapse: collapse; margin: 20px 0; }}
        th {{ background: #34495e; color: white; padding: 12px; text-align: left; }}
        td {{ padding: 10px; border-bottom: 1px solid #ecf0f1; }}
    </style>
</head>
<body>
    <div class="container">
        <h1>&#128269; Data Quality Report</h1>
        <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
        
        <div class="summary">
            <div class="metric">
                <div class="metric-value pass">{passed}</div>
                <div>Rules Passed</div>
            </div>
            <div class="metric">
                <div class="metric-value fail">{failed}</div>
                <div>Rules Failed</div>
            </div>
            <div class="metric">
                <div class="metric-value">{len(profiles)}</div>
                <div>Tables Analyzed</div>
            </div>
        </div>
        
        <h2>Validation Results</h2>
        <table>
            <tr>
                <th>Rule</th>
                <th>Severity</th>
                <th>Message</th>
                <th>Affected Rows</th>
                <th>Percentage</th>
            </tr>
            {validation_rows}
        </table>
    </div>
</body>
</html>
        """

        # The page declares charset UTF-8, so write it as UTF-8 instead of the
        # platform default encoding (which can fail on non-ASCII messages).
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        return output_path


# Print a readiness banner when this code runs as the top-level module.
if __name__ == "__main__":
    print("Data Quality Framework - Ready for integration with ETL Pipeline")

Data Quality Framework - Ready for integration with ETL Pipeline
