## 🔧 Environment Detection & Validation

In [2]:
# --- UNIFIED CONFIGURATION ---
import sys
import time
from pathlib import Path
from datetime import datetime

# Announce start-up and begin timing the whole configuration cell.
print("Starting configuration...")
start_time = time.time()

# Make the project root importable: step up one level when the kernel was
# started inside a "notebooks" directory, otherwise use the working directory.
_cwd = Path.cwd()
if _cwd.name == "notebooks":
    project_root = _cwd.parent
else:
    project_root = _cwd
_root_entry = str(project_root)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

try:
    # Preferred path: take every value from the centralized project configuration.
    from notebooks.config import settings
    from notebooks.lib import platform_utils, logging_utils
    from notebooks.lib.storage import StorageManager

    # Platform flag as reported by the shared helper module.
    IS_FABRIC = platform_utils.IS_FABRIC

    # Layer and config locations.
    BASE_DIR = settings.base_dir
    BRONZE_DIR = settings.bronze_dir
    SILVER_DIR = settings.silver_dir
    GOLD_DIR = settings.gold_dir
    CONFIG_DIR = settings.config_dir
    RESULTS_DIR = settings.validation_results_dir

    # Performance knobs.
    NUM_WORKERS = settings.max_workers
    SAMPLE_SIZE = settings.sample_size
    STORAGE_FORMAT = settings.storage_format

    # Shared services used by the later phases.
    storage_manager = StorageManager()
    logger = logging_utils.setup_notebook_logger("orchestration")

    _platform_label = 'Microsoft Fabric' if IS_FABRIC else 'Local'
    print(f"✅ Loaded configuration for environment: {settings.environment}")
    print(f"   Platform: {_platform_label}")

except ImportError as import_error:
    print(f"⚠️ Falling back to inline configuration: {import_error}")

    # Fallback: the lakehouse mount point is the only platform signal used here.
    IS_FABRIC = Path("/lakehouse/default/Files").exists()

    if IS_FABRIC:
        BASE_DIR = Path("/lakehouse/default/Files")
    else:
        # Inspect the working directory and its ancestors (five levels at most)
        # for a project marker; the first level that has one becomes BASE_DIR.
        _notebook_dir = Path.cwd()
        _markers = ("aims_data_platform", "pyproject.toml")
        _search_chain = [_notebook_dir, *_notebook_dir.parents][:5]
        BASE_DIR = next(
            (
                level
                for level in _search_chain
                if any((level / marker).exists() for marker in _markers)
            ),
            None,
        )
        if BASE_DIR is None:
            # No marker found: same heuristic as for project_root above.
            if _notebook_dir.name == "notebooks":
                BASE_DIR = _notebook_dir.parent
            else:
                BASE_DIR = _notebook_dir

    BRONZE_DIR = BASE_DIR / "data/Samples_LH_Bronze_Aims_26_parquet"
    SILVER_DIR = BASE_DIR / "data/Silver"
    GOLD_DIR = BASE_DIR / "data/Gold"
    CONFIG_DIR = BASE_DIR / "config/data_quality"
    RESULTS_DIR = BASE_DIR / "config/validation_results"

    if IS_FABRIC:
        NUM_WORKERS = 8
    else:
        NUM_WORKERS = 4
    SAMPLE_SIZE = 100000
    STORAGE_FORMAT = "parquet"

    # Output locations are created up front; Bronze is input and is only checked below.
    for dir_path in (CONFIG_DIR, RESULTS_DIR, SILVER_DIR, GOLD_DIR):
        dir_path.mkdir(parents=True, exist_ok=True)

    # Neither a storage manager nor a logger is available in fallback mode.
    storage_manager = None
    logger = None

    print(f"   Using fallback configuration for {'Fabric' if IS_FABRIC else 'Local'}")

print(f"Environment detection took {time.time() - start_time:.2f}s")

# The Bronze directory must exist before anything else can run.
if not BRONZE_DIR.exists():
    raise FileNotFoundError(f"Bronze directory not found: {BRONZE_DIR}")

# Collect the Bronze parquet files (non-recursive) and refuse to continue without any.
print(f"Scanning Bronze directory: {BRONZE_DIR}")
parquet_files = list(BRONZE_DIR.glob("*.parquet"))
if not parquet_files:
    raise FileNotFoundError(f"No parquet files found in {BRONZE_DIR}")

_summary_lines = [
    f"\nConfiguration Summary:",
    f"   Environment: {'Fabric' if IS_FABRIC else 'Local'}",
    f"   Bronze (Source): {BRONZE_DIR}",
    f"   Silver (Target): {SILVER_DIR}",
    f"   Config Dir: {CONFIG_DIR}",
    f"   Workers: {NUM_WORKERS}",
    f"   Found {len(parquet_files)} parquet files",
]
for _summary_line in _summary_lines:
    print(_summary_line)
print(f"Configuration complete in {time.time() - start_time:.2f}s")

Starting configuration...
✅ Loaded configuration for environment: local
   Platform: Local
Environment detection took 0.00s
Scanning Bronze directory: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/data/Samples_LH_Bronze_Aims_26_parquet

Configuration Summary:
   Environment: Local
   Bronze (Source): /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/data/Samples_LH_Bronze_Aims_26_parquet
   Silver (Target): /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/data/Silver
   Config Dir: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/data_quality
   Workers: 4
   Found 68 parquet files
Configuration complete in 0.00s


## 📦 Package Installation & Imports

In [2]:
#!pip install --quiet --upgrade great-expectations==0.18.8 ydata-profiling==4.5.1 pyarrow fastparquet

In [3]:
import os
import json
import pandas as pd
from datetime import datetime

# Disable Great Expectations analytics before the library is imported below.
# GX_ANALYTICS_ENABLED is the switch this notebook has always set.
# GE_USAGE_STATISTICS is the legacy opt-out used by 0.x releases; the pinned
# install line in this notebook targets 0.18.8.
# NOTE(review): confirm which variable the installed version actually honours.
# setdefault keeps any value the user exported deliberately.
os.environ["GX_ANALYTICS_ENABLED"] = "False"
os.environ.setdefault("GE_USAGE_STATISTICS", "FALSE")

# Use the local library to ensure end-to-end alignment
from aims_data_platform import BatchProfiler, DataQualityValidator, DataLoader, ConfigLoader

# Import logging utilities if available
try:
    from notebooks.lib.logging_utils import timed_operation, log_phase
    LOGGING_UTILS_AVAILABLE = True
except ImportError:
    LOGGING_UTILS_AVAILABLE = False
    # NOTE(review): only timed_operation gets a fallback here; log_phase stays
    # undefined in this branch, so any later use of it would raise NameError.
    # Imported locally so this cell does not depend on `time` having been
    # imported by a different cell.
    import time
    from contextlib import contextmanager

    @contextmanager
    def timed_operation(description, logger=None):
        """Print-based stand-in that reports how long the wrapped block took.

        Args:
            description: Label printed before and after the block.
            logger: Accepted so call sites match the real helper; ignored here.

        Yields:
            None. The ``with`` body runs while the context is suspended.

        Raises:
            Re-raises whatever the wrapped block raises, after printing the
            elapsed time so failed phases are still timed.
        """
        print(f"⏱️ {description}...")
        start = time.perf_counter()
        try:
            yield
        except Exception:
            print(f"⏱️ {description} failed after {time.perf_counter() - start:.2f}s")
            raise
        print(f"⏱️ {description} completed in {time.perf_counter() - start:.2f}s")

print("✅ Libraries imported successfully")

✅ Libraries imported successfully


## ⚙️ Pipeline Configuration

In [4]:
# Pipeline configuration - settings-driven when possible, hardcoded otherwise.
# Each entry: (config key, phase name looked up in settings, default when absent).
_PHASE_SWITCHES = (
    ("run_profiling", "profiling", True),
    ("run_ingestion", "ingestion", True),
    ("run_monitoring", "monitoring", True),
    ("run_dq_modeling", "dq_modeling", False),
    ("run_bi_analytics", "bi_analytics", False),
)

try:
    # A missing `settings` name (NameError) or a missing attribute on it
    # (AttributeError) sends us to the hardcoded defaults below.
    _phase_flags = settings.pipeline_phases
    PIPELINE_CONFIG = {
        config_key: _phase_flags.get(phase_name, default)
        for config_key, phase_name, default in _PHASE_SWITCHES
    }
    PIPELINE_CONFIG.update({
        "force_reprocess": False,
        "dq_threshold": settings.get_dq_threshold("medium"),
        "max_workers": settings.max_workers,
        "continue_on_error": False,
    })
except (NameError, AttributeError):
    PIPELINE_CONFIG = {
        config_key: default for config_key, _phase_name, default in _PHASE_SWITCHES
    }
    PIPELINE_CONFIG.update({
        "force_reprocess": False,
        "dq_threshold": 85.0,
        "max_workers": NUM_WORKERS,
        "continue_on_error": False,
    })

# Show the effective configuration, one option per line.
print("⚙️ Pipeline Configuration:")
for key, value in PIPELINE_CONFIG.items():
    print(f"   {key}: {value}")

# Run-level record; each phase cell appends its own entry to "phases".
execution_log = {}
execution_log["start_time"] = datetime.now().isoformat()
execution_log["environment"] = "Fabric" if IS_FABRIC else "Local"
execution_log["storage_format"] = STORAGE_FORMAT
execution_log["config"] = PIPELINE_CONFIG
execution_log["phases"] = []

print("\n✅ Configuration Complete")

⚙️ Pipeline Configuration:
   run_profiling: True
   run_ingestion: True
   run_monitoring: True
   run_dq_modeling: False
   run_bi_analytics: False
   force_reprocess: False
   dq_threshold: 85.0
   max_workers: 4
   continue_on_error: False

✅ Configuration Complete


## 🚀 Phase 1: Data Profiling

**Purpose:** Generate DQ validation configs for all Bronze layer tables

**Process:**
1. Profile each Bronze parquet file
2. Generate validation YAML configs
3. Save configs to `config/data_quality/`

In [5]:
# Phase 1 runs only when the run_profiling switch is on.
if not PIPELINE_CONFIG["run_profiling"]:
    print("⏭️ Skipping Phase 1: Data Profiling (disabled in config)")
else:
    print("\n" + "="*80)
    print("PHASE 1: DATA PROFILING")
    print("="*80)

    phase_start = datetime.now()

    try:
        with timed_operation("Phase 1: Data Profiling", logger):
            # Profiling entry point from the local platform library.
            from aims_data_platform import BatchProfiler

            print(f"\n📊 Profiling Bronze layer: {BRONZE_DIR}")
            print(f"   Workers: {PIPELINE_CONFIG['max_workers']}")
            print(f"   Output: {CONFIG_DIR}")

            # Options forwarded to the profiler as `thresholds`.
            custom_thresholds = {
                "severity_threshold": "medium",
                "null_tolerance": 5.0,
                "include_structural": True,
                "include_completeness": True,
                "include_validity": True
            }

            # Profile every Bronze file in parallel; configs are written to CONFIG_DIR.
            results = BatchProfiler.run_parallel_profiling(
                input_dir=str(BRONZE_DIR),
                output_dir=str(CONFIG_DIR),
                workers=PIPELINE_CONFIG['max_workers'],
                sample_size=SAMPLE_SIZE,
                thresholds=custom_thresholds
            )

            # Split results by status: anything not marked 'success' counts as an error.
            def _succeeded(entry):
                return entry.get('status') == 'success'

            success_results = list(filter(_succeeded, results))
            error_results = [entry for entry in results if not _succeeded(entry)]

            # Report the outcome; only the first five errors are listed.
            print(f"\n✅ Profiling Complete:")
            print(f"   Files Profiled: {len(success_results)}")
            print(f"   Configs Generated: {len(list(CONFIG_DIR.glob('*.yml')))}")
            if error_results:
                print(f"   Errors: {len(error_results)}")
                for err in error_results[:5]:
                    print(f"      - {err.get('file', 'unknown')}: {err.get('error', 'unknown error')}")

        # Record the phase outcome once the timed block has finished.
        profiling_record = {
            "phase": "profiling",
            "status": "partial" if error_results else "success",
            "duration_seconds": (datetime.now() - phase_start).total_seconds(),
            "files_profiled": len(success_results),
            "configs_generated": len(list(CONFIG_DIR.glob('*.yml'))),
            "errors": len(error_results)
        }
        execution_log["phases"].append(profiling_record)

    except Exception as exc:
        print(f"\n❌ Profiling Failed: {exc}")
        import traceback
        traceback.print_exc()

        failure_record = {
            "phase": "profiling",
            "status": "failed",
            "error": str(exc),
            "duration_seconds": (datetime.now() - phase_start).total_seconds()
        }
        execution_log["phases"].append(failure_record)

        # Stop the notebook unless the configuration asks to keep going.
        if not PIPELINE_CONFIG.get("continue_on_error", False):
            raise


PHASE 1: DATA PROFILING
2026-01-19 13:09:35 | INFO     | orchestration | ⏱️ Phase 1: Data Profiling...

📊 Profiling Bronze layer: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/data/Samples_LH_Bronze_Aims_26_parquet
   Workers: 4
   Output: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/data_quality

✅ Profiling Complete:
   Files Profiled: 68
   Configs Generated: 68
2026-01-19 13:09:43 | INFO     | orchestration | ⏱️ Phase 1: Data Profiling completed in 7.52s


## ✅ Phase 2: Data Validation & Ingestion

**Purpose:** Validate Bronze data and ingest to Silver layer

**Process:**
1. Load validation configs
2. Validate each Bronze table
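3. Ingest tables that pass validation to Silver (Delta Lake in Fabric, Parquet locally)
4. Quarantine failing records (not implemented in this cell yet: tables that fail validation are only reported and are not written to Silver)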

In [None]:
def _write_silver_parquet(frame, table_name):
    """Write ``frame`` straight to ``SILVER_DIR/<table_name>.parquet`` and return the path.

    The Silver directory is created first because the layer may have just been
    cleared, or may never have been created on the settings-driven path.
    """
    SILVER_DIR.mkdir(parents=True, exist_ok=True)
    target = SILVER_DIR / f"{table_name}.parquet"
    frame.to_parquet(target, index=False, engine='pyarrow')
    return target


# Phase 2: validate every Bronze parquet file against its generated config and
# write the tables that pass to Silver. Reads PIPELINE_CONFIG, parquet_files,
# CONFIG_DIR, SILVER_DIR, RESULTS_DIR, SAMPLE_SIZE, STORAGE_FORMAT,
# storage_manager and logger from earlier cells; appends one entry to
# execution_log["phases"] and writes validation_results.json.
if PIPELINE_CONFIG["run_ingestion"]:
    print("\n" + "="*80)
    print("PHASE 2: DATA VALIDATION & INGESTION")
    print("="*80)

    phase_start = datetime.now()

    try:
        with timed_operation("Phase 2: Validation & Ingestion", logger):
            # Import validation modules
            from aims_data_platform import DataQualityValidator, DataLoader

            # Clear Silver layer for complete overwrite (no append/delta)
            # Raw files are archived in landing zone with date stamps
            if storage_manager is not None:
                clear_result = storage_manager.clear_layer("silver")
                print(f"   Cleared Silver layer: {clear_result['files_cleared']} tables removed")
            elif SILVER_DIR.exists():
                for stale_file in SILVER_DIR.glob("*.parquet"):
                    stale_file.unlink()
                print(f"   Cleared Silver directory for fresh write")

            # Track validation results
            validation_results = {
                "timestamp": datetime.now().isoformat(),
                "threshold": PIPELINE_CONFIG['dq_threshold'],
                "storage_format": STORAGE_FORMAT,
                "files": {},
                "summary": {"total": 0, "passed": 0, "failed": 0, "skipped": 0, "errors": 0}
            }
            summary = validation_results["summary"]

            summary["total"] = len(parquet_files)
            print(f"   Found {len(parquet_files)} parquet files to validate\n")

            # Validate each file in a stable (sorted) order
            for parquet_file in sorted(parquet_files):
                table_name = parquet_file.stem
                config_file = CONFIG_DIR / f"{table_name}_validation.yml"

                if not config_file.exists():
                    print(f"⚠️ SKIPPED: {table_name}.parquet (no config)")
                    summary["skipped"] += 1
                    continue

                try:
                    # Load validator and data
                    validator = DataQualityValidator(config_path=str(config_file))

                    # NOTE(review): the frame is loaded with sample_size=SAMPLE_SIZE
                    # and the same frame is what gets written to Silver below. If
                    # DataLoader truncates to the sample, Silver would hold only the
                    # sample for larger tables - confirm against DataLoader.
                    df_batch = DataLoader.load_data(str(parquet_file), sample_size=SAMPLE_SIZE)
                    result = validator.validate(df_batch)

                    passed = result.get('success', False)
                    success_rate = result.get('success_rate', 0)

                    # Store results
                    validation_results["files"][table_name] = {
                        "overall_success": passed,
                        "success_percentage": result.get('success_rate', 0.0),
                        "statistics": {
                            "evaluated_expectations": result.get('evaluated_checks', 0),
                            "successful_expectations": result.get('successful_checks', 0)
                        }
                    }

                    # Update summary
                    if passed:
                        summary["passed"] += 1
                        print(f"✅ PASSED: {table_name}.parquet ({success_rate:.1f}%)")

                        if storage_manager is not None:
                            # Use storage manager for platform-aware write
                            try:
                                storage_manager.write_to_silver(df_batch, table_name)
                                print(f"   → Ingested to Silver via StorageManager: {table_name}")
                            except Exception as sm_err:
                                # Report why the platform-aware write failed before
                                # falling back, so the problem is not hidden.
                                print(f"   ⚠️ StorageManager write failed for {table_name}: {sm_err}")
                                silver_file = _write_silver_parquet(df_batch, table_name)
                                print(f"   → Ingested to Silver (fallback): {silver_file.name}")
                        else:
                            # Direct write fallback
                            silver_file = _write_silver_parquet(df_batch, table_name)
                            print(f"   → Ingested to Silver: {silver_file.name}")

                    else:
                        # Failed tables are counted and reported only; nothing is written.
                        summary["failed"] += 1
                        print(f"❌ FAILED: {table_name}.parquet ({success_rate:.1f}%)")

                except Exception as e:
                    # Best effort per table: one broken file must not stop the rest.
                    summary["errors"] += 1
                    print(f"💥 ERROR: {table_name}.parquet - {e}")

            # Save validation results (create the target directory if it is missing)
            RESULTS_DIR.mkdir(parents=True, exist_ok=True)
            results_file = RESULTS_DIR / "validation_results.json"
            with open(results_file, 'w', encoding='utf-8') as f:
                json.dump(validation_results, f, indent=2)

            # Display summary
            print(f"\n{'='*70}")
            print("VALIDATION SUMMARY")
            print(f"{'='*70}")
            print(f"Total Files:  {summary['total']}")
            print(f"✅ Passed:     {summary['passed']}")
            print(f"❌ Failed:     {summary['failed']}")
            print(f"⚠️  Skipped:    {summary['skipped']}")
            print(f"💥 Errors:     {summary['errors']}")
            print(f"\nPass Rate: {(summary['passed']/max(summary['total'], 1)*100):.1f}%")
            print(f"Results saved to: {results_file}")
            print(f"{'='*70}")

        # Log phase execution. As in Phase 1, per-file errors downgrade the
        # status to "partial"; data-quality failures are a normal outcome and
        # are reported through validation_summary.
        execution_log["phases"].append({
            "phase": "validation_ingestion",
            "status": "success" if validation_results["summary"]["errors"] == 0 else "partial",
            "duration_seconds": (datetime.now() - phase_start).total_seconds(),
            "validation_summary": validation_results["summary"]
        })

    except Exception as e:
        print(f"\n❌ Validation/Ingestion Failed: {e}")
        import traceback
        traceback.print_exc()

        execution_log["phases"].append({
            "phase": "validation_ingestion",
            "status": "failed",
            "error": str(e),
            "duration_seconds": (datetime.now() - phase_start).total_seconds()
        })

        # Stop the notebook unless the configuration asks to keep going.
        if not PIPELINE_CONFIG.get("continue_on_error", False):
            raise
else:
    print("⏭️ Skipping Phase 2: Data Validation & Ingestion (disabled in config)")


PHASE 2: DATA VALIDATION & INGESTION
2026-01-19 13:09:47 | INFO     | orchestration | ⏱️ Phase 2: Validation & Ingestion...
   Found 68 parquet files to validate



Calculating Metrics:   0%|          | 0/69 [00:00<?, ?it/s]

✅ PASSED: aims_activitydates.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_activitydates


Calculating Metrics:   0%|          | 0/526 [00:00<?, ?it/s]

✅ PASSED: aims_assetattributes.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_assetattributes


Calculating Metrics:   0%|          | 0/193 [00:00<?, ?it/s]

✅ PASSED: aims_assetclassattributes.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_assetclassattributes


Calculating Metrics:   0%|          | 0/46 [00:00<?, ?it/s]

✅ PASSED: aims_assetclasschangelogs.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_assetclasschangelogs


Calculating Metrics:   0%|          | 0/382 [00:00<?, ?it/s]

✅ PASSED: aims_assetclasses.parquet (99.1%)
   → Ingested to Silver via StorageManager: aims_assetclasses


Calculating Metrics:   0%|          | 0/140 [00:00<?, ?it/s]

✅ PASSED: aims_assetclassrelationships.parquet (91.5%)
   → Ingested to Silver via StorageManager: aims_assetclassrelationships


Calculating Metrics:   0%|          | 0/58 [00:00<?, ?it/s]

✅ PASSED: aims_assetconsents.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_assetconsents


Calculating Metrics:   0%|          | 0/39 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 85.7%)


❌ FAILED: aims_assethierarchymap.parquet (92.9%)


Calculating Metrics:   0%|          | 0/319 [00:00<?, ?it/s]

Validation FAILED: 3 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 97.4%)


❌ FAILED: aims_assetlocations.parquet (97.0%)


Calculating Metrics:   0%|          | 0/219 [00:00<?, ?it/s]

✅ PASSED: aims_assets.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_assets


Calculating Metrics:   0%|          | 0/127 [00:00<?, ?it/s]

✅ PASSED: aims_attributedomains.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_attributedomains


Calculating Metrics:   0%|          | 0/94 [00:00<?, ?it/s]

✅ PASSED: aims_attributedomainvalues.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_attributedomainvalues


Calculating Metrics:   0%|          | 0/110 [00:00<?, ?it/s]

✅ PASSED: aims_attributegroups.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_attributegroups


Calculating Metrics:   0%|          | 0/301 [00:00<?, ?it/s]

✅ PASSED: aims_attributes.parquet (96.7%)
   → Ingested to Silver via StorageManager: aims_attributes


Calculating Metrics:   0%|          | 0/54 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 90.9%)


❌ FAILED: aims_consentlinks.parquet (95.5%)


Calculating Metrics:   0%|          | 0/69 [00:00<?, ?it/s]

✅ PASSED: aims_consentmilestones.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_consentmilestones


Calculating Metrics:   0%|          | 0/46 [00:00<?, ?it/s]

✅ PASSED: aims_consentmilestonetypes.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_consentmilestonetypes


Calculating Metrics:   0%|          | 0/238 [00:00<?, ?it/s]

✅ PASSED: aims_consents.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_consents


Calculating Metrics:   0%|          | 0/90 [00:00<?, ?it/s]

✅ PASSED: aims_consenttypemilestones.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_consenttypemilestones


Calculating Metrics:   0%|          | 0/67 [00:00<?, ?it/s]

✅ PASSED: aims_consenttypes.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_consenttypes


Calculating Metrics:   0%|          | 0/106 [00:00<?, ?it/s]

✅ PASSED: aims_informationneedassetclass.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_informationneedassetclass


Calculating Metrics:   0%|          | 0/131 [00:00<?, ?it/s]

✅ PASSED: aims_informationneedattributes.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_informationneedattributes


Calculating Metrics:   0%|          | 0/74 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 93.3%)


❌ FAILED: aims_informationneeddocs.parquet (96.7%)


Calculating Metrics:   0%|          | 0/214 [00:00<?, ?it/s]

✅ PASSED: aims_informationneedgeometries.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_informationneedgeometries


Calculating Metrics:   0%|          | 0/148 [00:00<?, ?it/s]

✅ PASSED: aims_informationneedlinks.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_informationneedlinks


Calculating Metrics:   0%|          | 0/338 [00:00<?, ?it/s]

✅ PASSED: aims_informationneedpropchngs.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_informationneedpropchngs


Calculating Metrics:   0%|          | 0/240 [00:00<?, ?it/s]

✅ PASSED: aims_informationneeds.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_informationneeds


Calculating Metrics:   0%|          | 0/99 [00:00<?, ?it/s]

✅ PASSED: aims_informationneedsourcedocs.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_informationneedsourcedocs


Calculating Metrics:   0%|          | 0/73 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 90.0%)


❌ FAILED: aims_informationneedstatusupd.parquet (95.5%)


Calculating Metrics:   0%|          | 0/54 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 90.9%)


❌ FAILED: aims_informationpackages.parquet (95.5%)


Calculating Metrics:   0%|          | 0/126 [00:00<?, ?it/s]

✅ PASSED: aims_links.parquet (97.6%)
   → Ingested to Silver via StorageManager: aims_links


Calculating Metrics:   0%|          | 0/136 [00:00<?, ?it/s]

✅ PASSED: aims_linktypes.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_linktypes


Calculating Metrics:   0%|          | 0/1045 [00:00<?, ?it/s]

✅ PASSED: aims_noncompliances.parquet (98.7%)
   → Ingested to Silver via StorageManager: aims_noncompliances


Calculating Metrics:   0%|          | 0/89 [00:00<?, ?it/s]

✅ PASSED: aims_organisations.parquet (96.3%)
   → Ingested to Silver via StorageManager: aims_organisations


Calculating Metrics:   0%|          | 0/98 [00:00<?, ?it/s]

✅ PASSED: aims_owners.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_owners


Calculating Metrics:   0%|          | 0/138 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 94.1%)


❌ FAILED: aims_people.parquet (97.6%)


Calculating Metrics:   0%|          | 0/91 [00:00<?, ?it/s]

✅ PASSED: aims_phases.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_phases


Calculating Metrics:   0%|          | 0/59 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 91.7%)


❌ FAILED: aims_productassetclasses.parquet (95.8%)


Calculating Metrics:   0%|          | 0/104 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 95.2%)


❌ FAILED: aims_productcharacteristics.parquet (97.6%)


Calculating Metrics:   0%|          | 0/49 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 90.0%)


❌ FAILED: aims_productlinks.parquet (95.0%)


Calculating Metrics:   0%|          | 0/109 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 95.5%)


❌ FAILED: aims_products.parquet (97.7%)


Calculating Metrics:   0%|          | 0/255 [00:00<?, ?it/s]

✅ PASSED: aims_projectitemactions.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_projectitemactions


Calculating Metrics:   0%|          | 0/83 [00:00<?, ?it/s]

✅ PASSED: aims_projectitemassignedroles.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_projectitemassignedroles


Calculating Metrics:   0%|          | 0/221 [00:00<?, ?it/s]

✅ PASSED: aims_projectitemattributes.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_projectitemattributes


Calculating Metrics:   0%|          | 0/64 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 92.3%)


❌ FAILED: aims_projectitemlinks.parquet (96.2%)


Calculating Metrics:   0%|          | 0/121 [00:00<?, ?it/s]

✅ PASSED: aims_projectitems.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_projectitems


Calculating Metrics:   0%|          | 0/104 [00:00<?, ?it/s]

✅ PASSED: aims_relationships.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_relationships


Calculating Metrics:   0%|          | 0/176 [00:00<?, ?it/s]

✅ PASSED: aims_relationshiptypes.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_relationshiptypes


Calculating Metrics:   0%|          | 0/139 [00:00<?, ?it/s]

✅ PASSED: aims_routes.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_routes


Calculating Metrics:   0%|          | 0/87 [00:00<?, ?it/s]

✅ PASSED: aims_secondaryassetclasscodes.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_secondaryassetclasscodes


Calculating Metrics:   0%|          | 0/91 [00:00<?, ?it/s]

✅ PASSED: aims_stages.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_stages


Calculating Metrics:   0%|          | 0/178 [00:00<?, ?it/s]

✅ PASSED: aims_taskdefinitions.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_taskdefinitions


Calculating Metrics:   0%|          | 0/41 [00:00<?, ?it/s]

✅ PASSED: aims_tracks.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_tracks


Calculating Metrics:   0%|          | 0/93 [00:00<?, ?it/s]

✅ PASSED: aims_ua_beneficiaries.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_ua_beneficiaries


Calculating Metrics:   0%|          | 0/83 [00:00<?, ?it/s]

✅ PASSED: aims_ua_comments.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_ua_comments


Calculating Metrics:   0%|          | 0/90 [00:00<?, ?it/s]

✅ PASSED: aims_ua_entities.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_ua_entities


Calculating Metrics:   0%|          | 0/81 [00:00<?, ?it/s]

✅ PASSED: aims_ua_meetingattendees.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_ua_meetingattendees


Calculating Metrics:   0%|          | 0/88 [00:00<?, ?it/s]

✅ PASSED: aims_ua_meetings.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_ua_meetings


Calculating Metrics:   0%|          | 0/94 [00:00<?, ?it/s]

✅ PASSED: aims_ua_noncompimppartytypes.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_ua_noncompimppartytypes


Calculating Metrics:   0%|          | 0/91 [00:00<?, ?it/s]

✅ PASSED: aims_ua_noncomplianceimpacts.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_ua_noncomplianceimpacts


Calculating Metrics:   0%|          | 0/112 [00:00<?, ?it/s]

✅ PASSED: aims_ua_noncompotheruas.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_ua_noncompotheruas


Calculating Metrics:   0%|          | 0/46 [00:00<?, ?it/s]

✅ PASSED: aims_ua_optionvalues.parquet (100.0%)
   → Ingested to Silver via StorageManager: aims_ua_optionvalues


Calculating Metrics:   0%|          | 0/283 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 97.3%)


❌ FAILED: aims_undertakings_assurances.parquet (98.9%)


Calculating Metrics:   0%|          | 0/89 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 94.4%)


❌ FAILED: aims_workbanks.parquet (97.2%)


Calculating Metrics:   0%|          | 0/54 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 90.9%)


❌ FAILED: aims_workbankworkorders.parquet (95.5%)


Calculating Metrics:   0%|          | 0/89 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 94.4%)


❌ FAILED: aims_workorderattributes.parquet (97.2%)


Calculating Metrics:   0%|          | 0/119 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 95.8%)


❌ FAILED: aims_workorders.parquet (97.9%)


Calculating Metrics:   0%|          | 0/89 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 94.4%)


❌ FAILED: aims_workorderstatustransition.parquet (97.2%)

VALIDATION SUMMARY
Total Files:  68
✅ Passed:     50
❌ Failed:     18
⚠️  Skipped:    0
💥 Errors:     0

Pass Rate: 73.5%
Results saved to: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/notebooks/config/validation_results/validation_results.json
2026-01-19 13:10:51 | INFO     | orchestration | ⏱️ Phase 2: Validation & Ingestion completed in 64.40s


## 📈 Phase 3: Data Quality Monitoring

**Purpose:** Generate DQ dashboards and monitoring reports

In [7]:
# Phase 3 - Data Quality Monitoring.
# Reads the validation results JSON stored under RESULTS_DIR, prints a per-table
# summary and aggregate metrics, and records the outcome in execution_log["phases"].
if PIPELINE_CONFIG["run_monitoring"]:
    print("\n" + "="*80)
    print("PHASE 3: DATA QUALITY MONITORING")
    print("="*80)

    phase_start = datetime.now()

    # Remove any "monitoring" entry left by an earlier run of this cell so that
    # re-running it does not double-count the phase (and its duration) in the
    # execution summary. Slice assignment keeps the same list object.
    execution_log["phases"][:] = [
        entry for entry in execution_log["phases"]
        if entry.get("phase") != "monitoring"
    ]

    try:
        with timed_operation("Phase 3: DQ Monitoring", logger):
            # Load validation results
            results_file = RESULTS_DIR / "validation_results.json"

            # NOTE: the two "nothing to monitor" branches below only print a
            # message; they do not append an entry to execution_log["phases"].
            if not results_file.exists():
                print("⚠️ No validation results found. Skipping monitoring.")
            else:
                # Explicit encoding so the read does not depend on the platform default.
                with open(results_file, 'r', encoding="utf-8") as f:
                    validation_data = json.load(f)

                print("\n📊 Generating monitoring dashboards...")
                print(f"   Data source: {results_file}")

                # Check if we have file results
                files_data = validation_data.get("files", {})
                if not files_data:
                    print("⚠️ No file validation results available. Run validation first.")
                    print("\n📊 Summary Statistics:")
                    summary = validation_data.get("summary", {})
                    print(f"   Total Files: {summary.get('total', 0)}")
                    print(f"   Passed: {summary.get('passed', 0)}")
                    print(f"   Failed: {summary.get('failed', 0)}")
                    print(f"   Skipped: {summary.get('skipped', 0)}")
                    print(f"   Errors: {summary.get('errors', 0)}")
                else:
                    # Build one summary row per validated table.
                    summary_data = []
                    for table_name, result in files_data.items():
                        # "statistics" may be absent or null; treat both as empty.
                        stats = result.get("statistics") or {}
                        summary_data.append({
                            "Table": table_name,
                            "Success %": result.get("success_percentage", 0),
                            "Status": "Passed" if result.get("overall_success") else "Failed",
                            "Evaluated": stats.get("evaluated_expectations", 0),
                            "Successful": stats.get("successful_expectations", 0)
                        })

                    df_summary = pd.DataFrame(summary_data)

                    # Only the first 10 rows are printed to keep the output short.
                    print("\n📋 DQ Summary:")
                    print(df_summary.head(10).to_string(index=False))

                    # Calculate key metrics (files_data is non-empty here, so
                    # len(df_summary) is at least 1).
                    avg_quality = df_summary["Success %"].mean()
                    pass_rate = (df_summary["Status"] == "Passed").sum() / len(df_summary) * 100

                    print("\n📊 Key Metrics:")
                    print(f"   Average Quality Score: {avg_quality:.1f}%")
                    print(f"   Pass Rate: {pass_rate:.1f}%")
                    print(f"   Tables Monitored: {len(df_summary)}")

                    # Log phase execution; metrics are cast to plain floats so
                    # the log stays JSON-serializable.
                    execution_log["phases"].append({
                        "phase": "monitoring",
                        "status": "success",
                        "duration_seconds": (datetime.now() - phase_start).total_seconds(),
                        "metrics": {
                            "avg_quality_score": float(avg_quality),
                            "pass_rate": float(pass_rate),
                            "tables_monitored": len(df_summary)
                        }
                    })

    except Exception as e:
        print(f"\n❌ Monitoring Failed: {e}")
        import traceback
        traceback.print_exc()

        execution_log["phases"].append({
            "phase": "monitoring",
            "status": "failed",
            "error": str(e),
            "duration_seconds": (datetime.now() - phase_start).total_seconds()
        })

        # Re-raise unless the pipeline is configured to keep going after errors.
        if not PIPELINE_CONFIG.get("continue_on_error", False):
            raise
else:
    print("⏭️ Skipping Phase 3: Monitoring (disabled in config)")


PHASE 3: DATA QUALITY MONITORING
2026-01-19 13:10:58 | INFO     | orchestration | ⏱️ Phase 3: DQ Monitoring...

📊 Generating monitoring dashboards...
   Data source: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/notebooks/config/validation_results/validation_results.json

📋 DQ Summary:
                       Table  Success % Status  Evaluated  Successful
          aims_activitydates 100.000000 Passed         23          23
        aims_assetattributes 100.000000 Passed        159         159
   aims_assetclassattributes 100.000000 Passed         63          63
   aims_assetclasschangelogs 100.000000 Passed         15          15
           aims_assetclasses  99.137931 Passed        116         115
aims_assetclassrelationships  91.489362 Passed         47          43
          aims_assetconsents 100.000000 Passed         17          17
      aims_assethierarchymap  92.857143 Failed         14          13
         aims_assetlocations  97.000000 Failed        100         

## 📝 Pipeline Execution Summary

In [8]:
# Pipeline execution summary: compute the phase success rate, finalize and
# persist execution_log as JSON under RESULTS_DIR, and print the overall outcome.

# A phase counts as successful when its status is "success" or "partial".
successful_phases = sum(
    1 for p in execution_log["phases"] if p.get("status") in ("success", "partial")
)
total_phases = len(execution_log["phases"])
success_rate = (successful_phases / total_phases * 100) if total_phases > 0 else 0

# Use a single timestamp so end_time and the log file name always agree.
finished_at = datetime.now()

# Finalize execution log
execution_log["end_time"] = finished_at.isoformat()
execution_log["total_duration_seconds"] = sum(
    p.get("duration_seconds", 0) for p in execution_log["phases"]
)

print("\n📊 Overall Status:")
print(f"   Phases Completed: {successful_phases}/{total_phases}")
print(f"   Success Rate: {success_rate:.1f}%")
print(f"   Total Duration: {execution_log['total_duration_seconds']:.2f}s")

# Save execution log. The directory is created first because it may not exist
# yet when no earlier phase wrote results into it.
Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)
log_file = RESULTS_DIR / f"orchestration_log_{finished_at.strftime('%Y%m%d_%H%M%S')}.json"
with open(log_file, 'w', encoding="utf-8") as f:
    # default=str keeps one non-JSON value (e.g. a Path or datetime stored by
    # a phase) from aborting the write and losing the whole log.
    json.dump(execution_log, f, indent=2, default=str)

print(f"\n💾 Execution log saved to: {log_file}")
print("\n" + "="*80)

if total_phases == 0:
    # Nothing was recorded, so neither "success" nor "errors" is accurate.
    print("⚠️ NO PHASES WERE EXECUTED")
elif success_rate == 100:
    print("🎉 ALL PHASES COMPLETED SUCCESSFULLY!")
elif success_rate >= 80:
    print("⚠️ PIPELINE COMPLETED WITH WARNINGS")
else:
    print("❌ PIPELINE COMPLETED WITH ERRORS")

print("="*80)

# Log final summary if logger available
if logger:
    logger.info(f"Pipeline completed: {successful_phases}/{total_phases} phases successful")


📊 Overall Status:
   Phases Completed: 3/3
   Success Rate: 100.0%
   Total Duration: 71.93s

💾 Execution log saved to: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/notebooks/config/validation_results/orchestration_log_20260119_131102.json

🎉 ALL PHASES COMPLETED SUCCESSFULLY!
2026-01-19 13:11:02 | INFO     | orchestration | Pipeline completed: 3/3 phases successful
