In [1]:
# Cell 1: Setup
from pipeline import ETLPipeline, setup_logging
from etl import ETLConfig

# Setup logging
logger = setup_logging(log_level='INFO', log_file='etl_execution.log')

# Create configuration
config = ETLConfig(
    db_host='mysql',
    db_port=3306,
    db_name='supply_chain_db'
)

# Cell 2: Run pipeline
pipeline = ETLPipeline(config, logger)

tables_to_process = ['suppliers', 'products', 'inventory', 'orders', 'sales']

stats = pipeline.run_full_pipeline(
    tables=tables_to_process,
    enable_validation=True,
    enable_transformation=True
)

# Cell 3: View results
print(f"Tables processed: {stats['tables_processed']}")
print(f"Total rows extracted: {stats['total_rows_extracted']:,}")
print(f"Duration: {(stats['end_time'] - stats['start_time']).total_seconds():.2f}s")


2026-02-01 23:52:44 - ETL_Pipeline - INFO - ETL PIPELINE EXECUTION STARTED
2026-02-01 23:52:44 - ETL_Pipeline - INFO - ✓ Connected to database: supply_chain_db
2026-02-01 23:52:44 - ETL_Pipeline - INFO - 
2026-02-01 23:52:44 - ETL_Pipeline - INFO - Processing table: suppliers
2026-02-01 23:52:44 - ETL_Pipeline - INFO - Extracting data from table: suppliers
2026-02-01 23:52:44 - ETL_Pipeline - INFO - ✓ Extracted 30 rows from suppliers
2026-02-01 23:52:44 - ETL_Pipeline - INFO - ✓ No duplicates found
2026-02-01 23:52:44 - ETL_Pipeline - INFO - ✓ Standardized date column: created_date
2026-02-01 23:52:44 - ETL_Pipeline - INFO - Adding derived columns for suppliers
2026-02-01 23:52:44 - ETL_Pipeline - INFO - Applying business rules for suppliers
2026-02-01 23:52:44 - ETL_Pipeline - INFO - === Validating data quality for: suppliers ===
2026-02-01 23:52:44 - ETL_Pipeline - INFO - ✓ Data quality validation PASSED for suppliers
2026-02-01 23:52:44 - ETL_Pipeline - INFO - ✓ Validation passed fo

Tables processed: 5
Total rows extracted: 572
Duration: 0.30s


In [2]:
# Cell 1: Imports
from etl import ETLConfig, DatabaseConnection, DataExtractor
from quality.rules_engine import DataQualityRulesEngine
from quality.profiler import DataProfiler
from quality.anomaly import AnomalyDetector
from quality.reporter import DataQualityReporter
from pipeline import setup_logging

logger = setup_logging()
config = ETLConfig()

# Cell 2: Extract data
with DatabaseConnection(config, logger) as db:
    extractor = DataExtractor(db, logger)
    products_df = extractor.extract_table('products')
    sales_df = extractor.extract_table('sales')

# Cell 3: Run validation rules
rules_engine = DataQualityRulesEngine()
rules_engine.define_standard_rules()

products_results = rules_engine.execute_rules(products_df, 'products')
sales_results = rules_engine.execute_rules(sales_df, 'sales')

# View summary
summary_df = rules_engine.get_summary()
summary_df

# Cell 4: Profile data
products_profile = DataProfiler.profile_dataframe(products_df, 'products')
sales_profile = DataProfiler.profile_dataframe(sales_df, 'sales')

print(f"Products: {products_profile['row_count']} rows, {products_profile['column_count']} columns")
print(f"Sales: {sales_profile['row_count']} rows, {sales_profile['column_count']} columns")

# Cell 5: Detect anomalies
products_anomalies = AnomalyDetector.analyze_table_anomalies(products_df, 'products')
sales_anomalies = AnomalyDetector.analyze_table_anomalies(sales_df, 'sales')

anomaly_summary = AnomalyDetector.get_outlier_summary(products_anomalies)
anomaly_summary

# Cell 6: Generate comprehensive report
all_results = products_results + sales_results
all_profiles = {'products': products_profile, 'sales': sales_profile}
all_anomalies = {'products': products_anomalies, 'sales': sales_anomalies}

# Generate HTML report
DataQualityReporter.generate_html_report(
    validation_results=all_results,
    profiles=all_profiles,
    anomalies=all_anomalies,
    output_path='supply_chain_quality_report.html'
)

# Export to JSON
summary_report = DataQualityReporter.generate_summary_report(
    validation_results=all_results,
    profiles=all_profiles,
    anomalies=all_anomalies
)
DataQualityReporter.export_to_json(summary_report, 'quality_summary.json')


2026-02-01 23:55:45 - ETL_Pipeline - INFO - ✓ Connected to database: supply_chain_db
2026-02-01 23:55:45 - ETL_Pipeline - INFO - ✓ Connected to database: supply_chain_db
2026-02-01 23:55:45 - ETL_Pipeline - INFO - Extracting data from table: products
2026-02-01 23:55:45 - ETL_Pipeline - INFO - Extracting data from table: products
2026-02-01 23:55:45 - ETL_Pipeline - INFO - ✓ Extracted 46 rows from products
2026-02-01 23:55:45 - ETL_Pipeline - INFO - ✓ Extracted 46 rows from products
2026-02-01 23:55:45 - ETL_Pipeline - INFO - Extracting data from table: sales
2026-02-01 23:55:45 - ETL_Pipeline - INFO - Extracting data from table: sales
2026-02-01 23:55:45 - ETL_Pipeline - INFO - ✓ Extracted 166 rows from sales
2026-02-01 23:55:45 - ETL_Pipeline - INFO - ✓ Extracted 166 rows from sales
2026-02-01 23:55:45 - ETL_Pipeline - INFO - ✓ Database connection closed
2026-02-01 23:55:45 - ETL_Pipeline - INFO - ✓ Database connection closed


Products: 46 rows, 6 columns
Sales: 166 rows, 6 columns
✓ HTML report generated: supply_chain_quality_report.html
✓ Report exported to: quality_summary.json


In [3]:
# Cell 1: Setup
from etl import ETLConfig, DatabaseConnection, DataExtractor
from quality.rules_engine import DataQualityRulesEngine, ValidationRule
from pipeline import setup_logging

logger = setup_logging()
config = ETLConfig()

# Cell 2: Define custom rules
rules_engine = DataQualityRulesEngine()

# Custom rule: Products with very low reorder levels
rules_engine.add_rule('products', ValidationRule(
    rule_name='reorder_level_reasonable',
    rule_type='validity',
    column='reorder_level',
    condition=lambda df: df['reorder_level'] >= 10,
    severity='WARNING',
    description='Reorder level should be at least 10'
))

# Custom rule: Sales revenue matches calculated value
rules_engine.add_rule('sales', ValidationRule(
    rule_name='revenue_matches_calculation',
    rule_type='consistency',
    column='revenue',
    condition=lambda df: abs(df['revenue'] - (df['quantity_sold'] * df['revenue'] / df['quantity_sold'])) < 0.01,
    severity='CRITICAL',
    description='Revenue should match quantity * unit_price'
))

# Cell 3: Execute custom rules
with DatabaseConnection(config, logger) as db:
    extractor = DataExtractor(db, logger)
    products_df = extractor.extract_table('products')
    sales_df = extractor.extract_table('sales')

products_results = rules_engine.execute_rules(products_df, 'products')
sales_results = rules_engine.execute_rules(sales_df, 'sales')

# Cell 4: View results
for result in products_results + sales_results:
    status = "✓ PASS" if result.passed else "✗ FAIL"
    print(f"{status} - {result.rule_name}: {result.message}")


2026-02-01 23:56:16 - ETL_Pipeline - INFO - ✓ Connected to database: supply_chain_db
2026-02-01 23:56:16 - ETL_Pipeline - INFO - ✓ Connected to database: supply_chain_db
2026-02-01 23:56:16 - ETL_Pipeline - INFO - ✓ Connected to database: supply_chain_db
2026-02-01 23:56:16 - ETL_Pipeline - INFO - Extracting data from table: products
2026-02-01 23:56:16 - ETL_Pipeline - INFO - Extracting data from table: products
2026-02-01 23:56:16 - ETL_Pipeline - INFO - Extracting data from table: products
2026-02-01 23:56:16 - ETL_Pipeline - INFO - ✓ Extracted 46 rows from products
2026-02-01 23:56:16 - ETL_Pipeline - INFO - ✓ Extracted 46 rows from products
2026-02-01 23:56:16 - ETL_Pipeline - INFO - ✓ Extracted 46 rows from products
2026-02-01 23:56:16 - ETL_Pipeline - INFO - Extracting data from table: sales
2026-02-01 23:56:16 - ETL_Pipeline - INFO - Extracting data from table: sales
2026-02-01 23:56:16 - ETL_Pipeline - INFO - Extracting data from table: sales
2026-02-01 23:56:16 - ETL_Pipeline

✓ PASS - reorder_level_reasonable: ✓ PASS: Reorder level should be at least 10
✓ PASS - revenue_matches_calculation: ✓ PASS: Revenue should match quantity * unit_price
