In [9]:
"""
Supply Chain Intelligence Hub - Data Quality Framework
======================================================
Advanced data quality validation framework with profiling,
anomaly detection, and automated reporting.
"""

import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Optional, Callable
from dataclasses import dataclass, field
from datetime import datetime
import json
from scipy import stats
from sklearn.ensemble import IsolationForest














@dataclass
class ValidationRule:
    """Declarative description of one data quality check.

    ``rule_type`` selects how the rule is evaluated by the rules engine
    ('uniqueness', 'completeness', 'validity' or 'consistency' are the types
    the engine dispatches on). ``condition`` is only consulted for
    condition-based rules and receives the DataFrame under test.
    """

    # --- identification -----------------------------------------------------
    rule_name: str
    rule_type: str

    # --- what to check (all optional; which ones matter depends on rule_type)
    column: Optional[str] = None
    threshold: Optional[float] = None      # used as max null percentage by completeness checks
    condition: Optional[Callable] = None   # called with the DataFrame; truthy entries mean "valid"

    # --- reporting ----------------------------------------------------------
    severity: str = 'WARNING'
    description: str = ''


@dataclass
class ValidationResult:
    """Outcome of evaluating a single validation rule.

    ``details`` and ``timestamp`` are created per instance: each result gets
    its own empty dict and the wall-clock time at construction unless the
    caller supplies values.
    """

    # --- required: which rule ran and how it went ---------------------------
    rule_name: str
    passed: bool
    severity: str
    message: str

    # --- optional: size of the problem --------------------------------------
    affected_rows: int = 0
    affected_percentage: float = 0.0

    # --- optional: free-form diagnostics and creation time ------------------
    details: Dict = field(default_factory=dict)
    timestamp: datetime = field(default_factory=datetime.now)


class DataQualityRulesEngine:
    """Engine to define and execute data quality rules.

    Rules are registered per table name with ``add_rule`` and evaluated
    against a DataFrame with ``execute_rules``. Every result produced is also
    appended to ``self.results`` so that ``get_summary`` can report across all
    executions performed on this engine instance.
    """

    # Status markers embedded in result messages and in the summary table.
    _PASS_LABEL = '✓ PASS'
    _FAIL_LABEL = '✗ FAIL'

    # Column order of the summary frame; also used when there are no results
    # so callers always get the same schema.
    _SUMMARY_COLUMNS = [
        'rule_name', 'status', 'severity',
        'affected_rows', 'affected_percentage', 'timestamp',
    ]

    def __init__(self):
        self.rules: Dict[str, List[ValidationRule]] = {}
        self.results: List[ValidationResult] = []

    def add_rule(self, table_name: str, rule: ValidationRule):
        """Register ``rule`` for ``table_name`` (rules run in insertion order)."""
        self.rules.setdefault(table_name, []).append(rule)

    def define_standard_rules(self):
        """Register the built-in rules for suppliers, products, inventory, orders and sales.

        Calling this more than once registers the same rules again.
        """

        # SUPPLIERS
        self.add_rule('suppliers', ValidationRule(
            rule_name='supplier_id_unique',
            rule_type='uniqueness',
            column='supplier_id',
            severity='CRITICAL',
            description='Supplier ID must be unique'
        ))

        self.add_rule('suppliers', ValidationRule(
            rule_name='reliability_score_range',
            rule_type='validity',
            column='reliability_score',
            condition=lambda df: (df['reliability_score'] >= 0) & (df['reliability_score'] <= 100),
            severity='CRITICAL',
            description='Reliability score must be between 0 and 100'
        ))

        # PRODUCTS
        self.add_rule('products', ValidationRule(
            rule_name='product_id_unique',
            rule_type='uniqueness',
            column='product_id',
            severity='CRITICAL',
            description='Product ID must be unique'
        ))

        self.add_rule('products', ValidationRule(
            rule_name='unit_cost_positive',
            rule_type='validity',
            column='unit_cost',
            condition=lambda df: df['unit_cost'] > 0,
            severity='CRITICAL',
            description='Unit cost must be positive'
        ))

        # INVENTORY
        self.add_rule('inventory', ValidationRule(
            rule_name='quantity_on_hand_valid',
            rule_type='validity',
            column='quantity_on_hand',
            condition=lambda df: df['quantity_on_hand'] >= 0,
            severity='CRITICAL',
            description='Quantity on hand cannot be negative'
        ))

        self.add_rule('inventory', ValidationRule(
            rule_name='reserved_not_exceed_onhand',
            rule_type='consistency',
            column='quantity_reserved',
            condition=lambda df: df['quantity_reserved'] <= df['quantity_on_hand'],
            severity='CRITICAL',
            description='Reserved quantity cannot exceed quantity on hand'
        ))

        # ORDERS
        self.add_rule('orders', ValidationRule(
            rule_name='order_quantity_positive',
            rule_type='validity',
            column='order_quantity',
            condition=lambda df: df['order_quantity'] > 0,
            severity='CRITICAL',
            description='Order quantity must be positive'
        ))

        # SALES
        self.add_rule('sales', ValidationRule(
            rule_name='quantity_sold_positive',
            rule_type='validity',
            column='quantity_sold',
            condition=lambda df: df['quantity_sold'] > 0,
            severity='CRITICAL',
            description='Quantity sold must be positive'
        ))

    def execute_rules(self, df: pd.DataFrame, table_name: str) -> List[ValidationResult]:
        """Run every rule registered for ``table_name`` against ``df``.

        Returns the results of this call (an empty list when the table has no
        rules) and also appends them to ``self.results``. A rule whose check
        raises is reported as a failed CRITICAL result instead of aborting the
        run. Rules with an unrecognised ``rule_type`` are skipped and produce
        no result.
        """
        if table_name not in self.rules:
            return []

        # rule_type -> bound checker; 'validity' and 'consistency' share one.
        checkers = {
            'uniqueness': self._check_uniqueness,
            'completeness': self._check_completeness,
            'validity': self._check_condition,
            'consistency': self._check_condition,
        }

        results = []
        total_rows = len(df)

        for rule in self.rules[table_name]:
            checker = checkers.get(rule.rule_type)
            if checker is None:
                continue

            try:
                results.append(checker(df, rule, total_rows))
            except Exception as e:
                # e.g. the rule's column is missing from df (KeyError).
                results.append(ValidationResult(
                    rule_name=rule.rule_name,
                    passed=False,
                    severity='CRITICAL',
                    message=f"Rule execution failed: {str(e)}"
                ))

        self.results.extend(results)
        return results

    @staticmethod
    def _percentage(count: int, total_rows: int) -> float:
        """Return ``count`` as a percentage of ``total_rows`` (0.0 for an empty table)."""
        return (count / total_rows * 100) if total_rows > 0 else 0.0

    def _build_result(self, rule: ValidationRule, passed: bool, affected_rows: int,
                      affected_percentage: float, details: Dict) -> ValidationResult:
        """Assemble the result for a rule that was evaluated without error.

        Values are coerced to built-in ``bool``/``int``/``float`` because the
        pandas reductions used by the checks return numpy scalars.
        """
        passed = bool(passed)
        label = self._PASS_LABEL if passed else self._FAIL_LABEL
        return ValidationResult(
            rule_name=rule.rule_name,
            passed=passed,
            severity=rule.severity,
            message=f"{label}: {rule.description}",
            affected_rows=int(affected_rows),
            affected_percentage=float(affected_percentage),
            details=details
        )

    def _check_uniqueness(self, df: pd.DataFrame, rule: ValidationRule,
                         total_rows: int) -> ValidationResult:
        """Check that ``rule.column`` holds no repeated values.

        ``affected_rows`` counts the repeats, i.e. every occurrence after the
        first one of each value.
        """
        duplicates = int(df[rule.column].duplicated().sum())

        return self._build_result(
            rule,
            passed=duplicates == 0,
            affected_rows=duplicates,
            affected_percentage=self._percentage(duplicates, total_rows),
            details={'duplicate_count': duplicates}
        )

    def _check_completeness(self, df: pd.DataFrame, rule: ValidationRule,
                           total_rows: int) -> ValidationResult:
        """Check that the null percentage of ``rule.column`` is within ``rule.threshold``.

        A missing threshold is treated as 0, i.e. no nulls are tolerated.
        """
        null_count = int(df[rule.column].isnull().sum())
        null_pct = self._percentage(null_count, total_rows)

        return self._build_result(
            rule,
            passed=null_pct <= (rule.threshold or 0.0),
            affected_rows=null_count,
            affected_percentage=null_pct,
            details={'null_count': null_count, 'null_percentage': null_pct}
        )

    def _check_condition(self, df: pd.DataFrame, rule: ValidationRule,
                        total_rows: int) -> ValidationResult:
        """Evaluate ``rule.condition(df)`` and count the entries that are not valid.

        A rule without a condition, or whose condition raises, yields a failed
        CRITICAL result rather than an exception.
        """
        if rule.condition is None:
            return ValidationResult(
                rule_name=rule.rule_name,
                passed=False,
                severity='CRITICAL',
                message="No condition defined for rule"
            )

        try:
            valid = rule.condition(df)
            invalid_count = int((~valid).sum())
        except Exception as e:
            return ValidationResult(
                rule_name=rule.rule_name,
                passed=False,
                severity='CRITICAL',
                message=f"Condition evaluation failed: {str(e)}"
            )

        return self._build_result(
            rule,
            passed=invalid_count == 0,
            affected_rows=invalid_count,
            affected_percentage=self._percentage(invalid_count, total_rows),
            details={'invalid_count': invalid_count}
        )

    def get_summary(self) -> pd.DataFrame:
        """Return one row per accumulated result, formatted for display.

        The frame always has the same columns, even when no rules have been
        executed yet.
        """
        summary_data = [
            {
                'rule_name': result.rule_name,
                'status': self._PASS_LABEL if result.passed else self._FAIL_LABEL,
                'severity': result.severity,
                'affected_rows': result.affected_rows,
                'affected_percentage': f"{result.affected_percentage:.2f}%",
                'timestamp': result.timestamp.strftime('%Y-%m-%d %H:%M:%S')
            }
            for result in self.results
        ]

        return pd.DataFrame(summary_data, columns=self._SUMMARY_COLUMNS)


class DataProfiler:
    """Generate summary-statistic profiles for Series and whole DataFrames.

    All methods are static and return plain dicts of built-in Python values.
    """

    @staticmethod
    def profile_numeric_column(series: pd.Series) -> Dict:
        """Return count, missing count and descriptive statistics for a numeric Series."""
        return {
            'count': int(series.count()),
            'missing': int(series.isnull().sum()),
            'mean': float(series.mean()),
            'median': float(series.median()),
            'std': float(series.std()),
            'min': float(series.min()),
            'max': float(series.max()),
            'q25': float(series.quantile(0.25)),
            'q75': float(series.quantile(0.75)),
            'skewness': float(series.skew()),
            'kurtosis': float(series.kurtosis())
        }

    @staticmethod
    def profile_categorical_column(series: pd.Series) -> Dict:
        """Return cardinality and the most frequent values of a categorical Series.

        ``top_value`` / ``top_frequency`` describe the single most frequent
        non-null value (``None`` / ``0`` when there is none);
        ``value_distribution`` holds the ten most frequent values.
        """
        # value_counts() sorts by descending frequency, so position 0 is the mode.
        value_counts = series.value_counts()
        has_values = len(value_counts) > 0

        return {
            'count': int(series.count()),
            'missing': int(series.isnull().sum()),
            'unique': int(series.nunique()),
            'top_value': str(value_counts.index[0]) if has_values else None,
            'top_frequency': int(value_counts.iloc[0]) if has_values else 0,
            'value_distribution': value_counts.head(10).to_dict()
        }

    @staticmethod
    def profile_date_column(series: pd.Series) -> Dict:
        """Return count, missing count and covered date range of a date-like Series.

        Values that cannot be parsed as dates are coerced to NaT and therefore
        counted as missing.
        """
        series_dt = pd.to_datetime(series, errors='coerce')
        earliest = series_dt.min()
        latest = series_dt.max()

        return {
            'count': int(series_dt.count()),
            'missing': int(series_dt.isnull().sum()),
            'min_date': str(earliest),
            'max_date': str(latest),
            # With no valid dates min/max are NaT and have no usable difference.
            'date_range_days': int((latest - earliest).days) if series_dt.count() > 0 else 0
        }

    @staticmethod
    def profile_dataframe(df: pd.DataFrame, table_name: str) -> Dict:
        """Return table-level metrics plus a per-column profile for ``df``.

        Each column is profiled as numeric, date or categorical. A column is
        treated as a date when it has a datetime dtype or when its name
        contains the substring 'date' (case-insensitive).
        """
        row_count = len(df)
        profile = {
            'table_name': table_name,
            'row_count': row_count,
            'column_count': len(df.columns),
            'memory_usage_mb': float(df.memory_usage(deep=True).sum() / 1024**2),
            'duplicate_rows': int(df.duplicated().sum()),
            'columns': {}
        }

        for col in df.columns:
            column = df[col]
            null_count = int(column.isnull().sum())
            col_profile = {
                'dtype': str(column.dtype),
                'null_count': null_count,
                # An empty frame has no rows to be null; avoid dividing by zero.
                'null_percentage': float(null_count / row_count * 100) if row_count else 0.0
            }

            if pd.api.types.is_numeric_dtype(column):
                col_profile['stats'] = DataProfiler.profile_numeric_column(column)
            # str(col): column labels are not guaranteed to be strings.
            elif pd.api.types.is_datetime64_any_dtype(column) or 'date' in str(col).lower():
                col_profile['stats'] = DataProfiler.profile_date_column(column)
            else:
                col_profile['stats'] = DataProfiler.profile_categorical_column(column)

            profile['columns'][col] = col_profile

        return profile


class AnomalyDetector:
    """Outlier detection helpers: IQR fences, z-scores and Isolation Forest."""

    @staticmethod
    def detect_outliers_iqr(series: pd.Series, multiplier: float = 1.5) -> pd.Series:
        """Return a boolean mask marking values outside the IQR fences.

        The fences sit ``multiplier`` interquartile ranges below the first
        quartile and above the third quartile.
        """
        q1, q3 = series.quantile(0.25), series.quantile(0.75)
        spread = q3 - q1

        below_fence = series < (q1 - multiplier * spread)
        above_fence = series > (q3 + multiplier * spread)
        return below_fence | above_fence

    @staticmethod
    def detect_outliers_zscore(series: pd.Series, threshold: float = 3.0) -> pd.Series:
        """Return a boolean mask of values whose absolute z-score exceeds ``threshold``.

        Nulls are dropped first, so the mask is indexed by the non-null rows only.
        """
        non_null = series.dropna()
        magnitudes = np.abs(stats.zscore(non_null))
        flags = [score > threshold for score in magnitudes]
        return pd.Series(flags, index=non_null.index)

    @staticmethod
    def detect_outliers_isolation_forest(df: pd.DataFrame, 
                                        columns: List[str],
                                        contamination: float = 0.1) -> np.ndarray:
        """Fit an Isolation Forest on the numeric ``columns`` and return its predictions.

        Rows containing a null in any selected column are dropped before
        fitting, so the returned array has one entry per remaining row. An
        empty array is returned when no rows remain.
        """
        features = df[columns].select_dtypes(include=[np.number]).dropna()
        if len(features) == 0:
            return np.array([])

        # Fixed seed keeps the predictions reproducible between runs.
        model = IsolationForest(
            contamination=contamination,
            random_state=42,
            n_estimators=100
        )
        return model.fit_predict(features)

    @staticmethod
    def analyze_table_anomalies(df: pd.DataFrame, table_name: str) -> Dict:
        """Run IQR outlier detection over every numeric column with more than 10 distinct values.

        Returns a dict with the table name, row count, the list of analysed
        columns and, per analysed column, the outlier count, percentage and up
        to ten example outlier values.
        """
        analyzed = []
        detected = {}
        row_total = len(df)

        for name in df.select_dtypes(include=[np.number]).columns:
            values = df[name]
            # Low-cardinality numeric columns are skipped.
            if not values.nunique() > 10:
                continue

            mask = AnomalyDetector.detect_outliers_iqr(values)
            hits = mask.sum()

            analyzed.append(name)
            detected[name] = {
                'method': 'IQR',
                'count': int(hits),
                'percentage': float(hits / row_total * 100),
                'outlier_values': values[mask].tolist()[:10]
            }

        return {
            'table_name': table_name,
            'total_rows': row_total,
            'columns_analyzed': analyzed,
            'outliers_detected': detected
        }


class DataQualityReportGenerator:
    """Render validation results as a standalone HTML report."""

    @staticmethod
    def generate_html_report(validation_results: List[ValidationResult],
                            profiles: Dict[str, Dict],
                            anomalies: Dict[str, Dict],
                            output_path: str = 'data_quality_report.html'):
        """Write an HTML data quality report and return the path written.

        Args:
            validation_results: Results to tabulate; also drive the pass/fail counters.
            profiles: Table profiles keyed by table name. Only the number of
                entries is shown (as "Tables Analyzed").
            anomalies: Accepted for interface compatibility; not rendered by
                this report.
            output_path: Destination file; overwritten if it exists.

        Returns:
            ``output_path``.
        """
        # Local import keeps this block self-contained.
        from html import escape

        passed = sum(1 for r in validation_results if r.passed)
        failed = sum(1 for r in validation_results if not r.passed)

        rows = []
        for result in validation_results:
            status_class = 'pass' if result.passed else 'fail'
            mark = '✓' if result.passed else '✗'
            severity = str(result.severity)
            # Rule names, severities and messages are free text (messages may
            # carry exception text), so escape them before embedding in HTML.
            rows.append(f"""
            <tr>
                <td><span class="{status_class}">{mark}</span> {escape(str(result.rule_name))}</td>
                <td class="{escape(severity.lower())}">{escape(severity)}</td>
                <td>{escape(str(result.message))}</td>
                <td>{result.affected_rows}</td>
                <td>{result.affected_percentage:.2f}%</td>
            </tr>
            """)
        validation_rows = "".join(rows)

        html_content = f"""
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>Data Quality Report</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; background: #f5f5f5; }}
        .container {{ max-width: 1200px; margin: 0 auto; background: white; padding: 30px; }}
        h1 {{ color: #2c3e50; border-bottom: 3px solid #3498db; }}
        .summary {{ display: flex; justify-content: space-around; margin: 20px 0; }}
        .metric {{ text-align: center; padding: 20px; background: #ecf0f1; border-radius: 8px; }}
        .metric-value {{ font-size: 2em; font-weight: bold; color: #2c3e50; }}
        .pass {{ color: #27ae60; }}
        .fail {{ color: #e74c3c; }}
        table {{ width: 100%; border-collapse: collapse; margin: 20px 0; }}
        th {{ background: #34495e; color: white; padding: 12px; text-align: left; }}
        td {{ padding: 10px; border-bottom: 1px solid #ecf0f1; }}
    </style>
</head>
<body>
    <div class="container">
        <h1>🔍 Data Quality Report</h1>
        <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
        
        <div class="summary">
            <div class="metric">
                <div class="metric-value pass">{passed}</div>
                <div>Rules Passed</div>
            </div>
            <div class="metric">
                <div class="metric-value fail">{failed}</div>
                <div>Rules Failed</div>
            </div>
            <div class="metric">
                <div class="metric-value">{len(profiles)}</div>
                <div>Tables Analyzed</div>
            </div>
        </div>
        
        <h2>Validation Results</h2>
        <table>
            <tr>
                <th>Rule</th>
                <th>Severity</th>
                <th>Message</th>
                <th>Affected Rows</th>
                <th>Percentage</th>
            </tr>
            {validation_rows}
        </table>
    </div>
</body>
</html>
        """

        # The page declares charset UTF-8 and contains non-ASCII glyphs, so the
        # file must be written as UTF-8 regardless of the platform default.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        return output_path


if __name__ == "__main__":
    # Running the module directly only announces that the framework loaded;
    # importing it prints nothing.
    print(
        "Data Quality Framework - "
        "Ready for integration with ETL Pipeline"
    )


Data Quality Framework - Ready for integration with ETL Pipeline


# Extract with Date Filter

In [7]:
# ETLConfig is instantiated below, so it has to be imported along with the
# connection and extractor classes.
from etl_pipeline import DatabaseConnection, DataExtractor, ETLConfig
from datetime import datetime

# NOTE(review): `logger` is not defined in this cell; it is assumed to have
# been created by an earlier cell - confirm before running this in isolation.
config = ETLConfig()

with DatabaseConnection(config, logger) as db:
    extractor = DataExtractor(db, logger)

    # Extract sales from specific date range
    df = extractor.extract_table(
        'sales',
        date_column='sale_date',
        start_date=datetime(2025, 1, 1),
        end_date=datetime(2026, 1, 31)
    )
    print(f"Extracted {len(df)} sales records")


ModuleNotFoundError: No module named 'etl_pipeline'

# Custom Transformation

In [11]:
# Example: clean a DataFrame step by step with DataTransformer.
# NOTE(review): `logger` and `df` are not defined in this cell; they are
# assumed to come from earlier cells - confirm before running in isolation.
from etl_pipeline import DataTransformer

transformer = DataTransformer(logger)

# Each step takes the previous step's output and rebinds `df_clean`, so the
# order of the calls below matters.
# NOTE(review): the column names and the 'orders' table argument suggest `df`
# is expected to hold order data here - verify against the cell that loads it.
df_clean = transformer.clean_nulls(df, strategy='drop')
df_clean = transformer.remove_duplicates(df_clean, subset=['order_id'])
df_clean = transformer.standardize_dates(df_clean, ['order_date', 'delivery_date'])
df_clean = transformer.add_derived_columns(df_clean, 'orders')
df_clean = transformer.apply_business_rules(df_clean, 'orders')


ModuleNotFoundError: No module named 'etl_pipeline'

# Custom Validation Rules

In [8]:
# Example: register a custom rule on a fresh engine and run it.
# NOTE(review): `df` is not defined in this cell; it is assumed to hold
# inventory data with `quantity_on_hand` and `reorder_level` columns - confirm.
from data_quality_framework import DataQualityRulesEngine, ValidationRule

engine = DataQualityRulesEngine()

# Add custom rule
# Only this rule is registered; define_standard_rules() is not called, so the
# built-in inventory rules do not run in this example.
engine.add_rule('inventory', ValidationRule(
    rule_name='low_stock_warning',
    rule_type='validity',
    column='quantity_on_hand',
    condition=lambda df: df['quantity_on_hand'] > df['reorder_level'],
    severity='WARNING',
    description='Stock should be above reorder point'
))

# Execute rules
results = engine.execute_rules(df, 'inventory')

# Get summary
# The summary covers every result accumulated on `engine`, not just the last call.
summary = engine.get_summary()
print(summary)


ModuleNotFoundError: No module named 'data_quality_framework'

# Data Profiling

In [None]:
# Example: profile a table and print table-level and per-column metrics.
# NOTE(review): `df` is not defined in this cell; it is assumed to come from
# an earlier cell - confirm before running in isolation.
from data_quality_framework import DataProfiler

# Profile entire table
profile = DataProfiler.profile_dataframe(df, 'products')

print(f"Table: {profile['table_name']}")
print(f"Rows: {profile['row_count']:,}")
print(f"Columns: {profile['column_count']}")
print(f"Memory: {profile['memory_usage_mb']:.2f} MB")
print(f"Duplicates: {profile['duplicate_rows']}")

# Column-level statistics
# The loop variable is deliberately not named `stats`: in a notebook namespace
# that name is bound to scipy.stats by the framework's imports, and rebinding
# it here would break AnomalyDetector.detect_outliers_zscore in later cells.
for col, col_profile in profile['columns'].items():
    print(f"\n{col}:")
    print(f"  Nulls: {col_profile['null_percentage']:.2f}%")
    # Only numeric profiles carry a 'mean'; date and categorical ones are skipped.
    if 'stats' in col_profile and 'mean' in col_profile['stats']:
        print(f"  Mean: {col_profile['stats']['mean']:.2f}")
        print(f"  Range: [{col_profile['stats']['min']:.2f}, {col_profile['stats']['max']:.2f}]")


# Anomaly Detection

In [None]:
# Example: compare the three outlier detectors on one table.
# NOTE(review): `df` is not defined in this cell; it is assumed to hold product
# data with `unit_cost`, `reorder_level` and `lead_time_days` columns - confirm.
from data_quality_framework import AnomalyDetector

# Detect outliers using IQR method
outliers_iqr = AnomalyDetector.detect_outliers_iqr(df['unit_cost'])
print(f"IQR outliers: {outliers_iqr.sum()}")

# Detect outliers using Z-score
outliers_z = AnomalyDetector.detect_outliers_zscore(df['unit_cost'], threshold=3.0)
print(f"Z-score outliers: {outliers_z.sum()}")

# Multivariate anomaly detection
predictions = AnomalyDetector.detect_outliers_isolation_forest(
    df,
    columns=['unit_cost', 'reorder_level', 'lead_time_days'],
    contamination=0.1
)
# This cell counts a prediction of -1 as an anomaly.
anomalies = predictions == -1
print(f"Isolation Forest anomalies: {anomalies.sum()}")

# Comprehensive analysis
# One entry per analysed numeric column, each with an outlier count and percentage.
report = AnomalyDetector.analyze_table_anomalies(df, 'products')
for col, details in report['outliers_detected'].items():
    print(f"\n{col}: {details['count']} outliers ({details['percentage']:.2f}%)")


# Complete Custom Workflow

In [None]:
# Example: end-to-end run for the 'products' table - extract, clean, validate,
# profile and scan for anomalies inside one database connection.
# NOTE(review): `logger` is not defined in this cell; it is assumed to come
# from an earlier cell - confirm before running in isolation.
# NOTE(review): `ETLPipeline` is imported but not used below.
from etl_pipeline import ETLPipeline, ETLConfig, DatabaseConnection
from etl_pipeline import DataExtractor, DataTransformer, DataQualityValidator
from data_quality_framework import DataQualityRulesEngine, DataProfiler, AnomalyDetector

config = ETLConfig()

with DatabaseConnection(config, logger) as db:
    extractor = DataExtractor(db, logger)
    transformer = DataTransformer(logger)
    # NOTE(review): `validator` is constructed but never used in this cell;
    # validation below goes through DataQualityRulesEngine instead.
    validator = DataQualityValidator(db, config, logger)
    
    # Extract
    df = extractor.extract_table('products')
    print(f"Extracted: {len(df)} rows")
    
    # Transform
    # Each step rebinds `df`, so the original extract is not kept.
    df = transformer.clean_nulls(df, strategy='drop')
    df = transformer.remove_duplicates(df)
    df = transformer.apply_business_rules(df, 'products')
    print(f"Cleaned: {len(df)} rows")
    
    # Validate with custom rules
    # Standard rules for every table are registered, but only those filed
    # under 'products' are executed against `df`.
    rules_engine = DataQualityRulesEngine()
    rules_engine.define_standard_rules()
    validation_results = rules_engine.execute_rules(df, 'products')
    
    passed = sum(1 for r in validation_results if r.passed)
    total = len(validation_results)
    print(f"Validation: {passed}/{total} rules passed")
    
    # Profile
    profile = DataProfiler.profile_dataframe(df, 'products')
    print(f"Profile: {profile['column_count']} columns analyzed")
    
    # Detect anomalies
    # Sums the per-column outlier counts across all analysed columns.
    anomalies = AnomalyDetector.analyze_table_anomalies(df, 'products')
    outlier_count = sum([v['count'] for v in anomalies['outliers_detected'].values()])
    print(f"Anomalies: {outlier_count} outliers detected")
