## üîß Environment Detection & Validation

In [None]:
# --- CONFIGURATION ---
import os
import sys
import time
from pathlib import Path
from dotenv import load_dotenv

print("Starting configuration...")
start_time = time.time()

# 1. Detect Environment (avoid slow/hanging imports locally)
print("Detecting environment...")
IS_FABRIC = Path("/lakehouse/default/Files").exists()
if IS_FABRIC:
    try:
        from notebookutils import mssparkutils  # noqa: F401
    except Exception:
        mssparkutils = None
    print("Running in Microsoft Fabric")
else:
    print("Running Locally (Fabric path not found)")

print(f"Environment detection took {time.time() - start_time:.2f}s")

# 2. Define Paths based on Environment
if IS_FABRIC:
    # Fabric: Use Lakehouse Paths (OneLake mounted path)
    BASE_DIR = Path("/lakehouse/default/Files")
    
    # Try to load .env from the Lakehouse Files root if it exists
    env_path = BASE_DIR / ".env"
    if env_path.exists():
        load_dotenv(dotenv_path=env_path)
        print(f"Loaded configuration from {env_path}")
    
    # Medallion Architecture Paths
    # Use env vars if loaded, otherwise default to standard Fabric structure
    BRONZE_DIR = BASE_DIR / os.getenv("BRONZE_PATH", "data/Samples_LH_Bronze_Aims_26_parquet")
    SILVER_DIR = BASE_DIR / os.getenv("SILVER_PATH", "data/Silver")
    GOLD_DIR   = BASE_DIR / os.getenv("GOLD_PATH", "data/Gold")
    
    CONFIG_DIR = BASE_DIR / "config/data_quality"
    RESULTS_DIR = BASE_DIR / "config/validation_results"
    
    # Ensure Output Directory Exists
    try:
        CONFIG_DIR.mkdir(exist_ok=True, parents=True)
        RESULTS_DIR.mkdir(exist_ok=True, parents=True)
    except OSError:
        print("Warning: Could not create output directory. Ensure you have write permissions.")
    
    # Performance Settings for Spark/Fabric
    NUM_WORKERS = 8 
else:
    # Local: Set BASE_DIR first, then load other env vars
    # This ensures we use the correct project directory
    BASE_DIR = Path("/home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026")
    
    print(f"Loading .env from {BASE_DIR}...")
    # Load .env but don't override BASE_DIR
    load_dotenv(dotenv_path=BASE_DIR / ".env", override=False)
    
    BRONZE_DIR = BASE_DIR / "data/Samples_LH_Bronze_Aims_26_parquet"
    SILVER_DIR = BASE_DIR / "data/Silver"
    GOLD_DIR   = BASE_DIR / "data/Gold"
    
    CONFIG_DIR = BASE_DIR / "config/data_quality"
    RESULTS_DIR = BASE_DIR / "config/validation_results"
    
    # Ensure Output Directory Exists
    CONFIG_DIR.mkdir(exist_ok=True, parents=True)
    RESULTS_DIR.mkdir(exist_ok=True, parents=True)
    SILVER_DIR.mkdir(exist_ok=True, parents=True)
    GOLD_DIR.mkdir(exist_ok=True, parents=True)
    
    # Performance Settings for Local
    NUM_WORKERS = 4

# Common Settings
SAMPLE_SIZE = 100000 
STORAGE_FORMAT = "parquet"

# Validate Bronze data exists
if not BRONZE_DIR.exists():
    raise FileNotFoundError(f"Bronze directory not found: {BRONZE_DIR}")

# Count Bronze files
print(f"Scanning Bronze directory: {BRONZE_DIR}")
parquet_files = list(BRONZE_DIR.glob("*.parquet"))
if len(parquet_files) == 0:
    raise FileNotFoundError(f"No parquet files found in {BRONZE_DIR}")

print(f"Configuration:\n Environment: {'Fabric' if IS_FABRIC else 'Local'}")
print(f" Bronze (Source): {BRONZE_DIR}")
print(f" Silver (Target): {SILVER_DIR}")
print(f" Config Dir:  {CONFIG_DIR}")
print(f" Workers: {NUM_WORKERS}")
print(f" Found {len(parquet_files)} parquet files")
print(f"Configuration complete in {time.time() - start_time:.2f}s")

Starting configuration...
Detecting environment...
Running Locally (Fabric path not found)
Environment detection took 0.00s
Loading .env from /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026...
Scanning Bronze directory: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/data/Samples_LH_Bronze_Aims_26_parquet
Configuration:
 Environment: Local
 Bronze (Source): /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/data/Samples_LH_Bronze_Aims_26_parquet
 Silver (Target): /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/data/Silver
 Config Dir:  /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/data_quality
 Workers: 4
 Found 68 parquet files
Configuration complete in 0.01s


## üì¶ Package Installation & Imports

In [2]:
#!pip install --quiet --upgrade great-expectations==0.18.8 ydata-profiling==4.5.1 pyarrow fastparquet

In [3]:
import os
import json
import pandas as pd
from datetime import datetime

# Disable Great Expectations analytics to speed up import
os.environ["GX_ANALYTICS_ENABLED"] = "False"

# Use the local library to ensure end-to-end alignment
from aims_data_platform import BatchProfiler, DataQualityValidator, DataLoader, ConfigLoader

print("‚úÖ Libraries imported successfully")

‚úÖ Libraries imported successfully


## ‚öôÔ∏è Pipeline Configuration

In [4]:
PIPELINE_CONFIG = {
    "run_profiling": True,          # Phase 1: Generate DQ configs
    "run_ingestion": True,          # Phase 2: Bronze ‚Üí Silver with validation
    "run_monitoring": True,         # Phase 3: DQ monitoring dashboards
    "run_dq_modeling": False,       # Phase 4: Advanced DQ modeling (optional)
    "run_bi_analytics": False,      # Phase 5: BI analytics (optional)
    "force_reprocess": False,       # Force reprocessing even if files exist
    "dq_threshold": 85.0,          # Global DQ threshold (85%)
    "max_workers": 8 if IS_FABRIC else 4,  # Parallel processing workers
    "continue_on_error": False,    # Continue pipeline even if phase fails
}

# Display Configuration
print("‚öôÔ∏è Pipeline Configuration:")
for key, value in PIPELINE_CONFIG.items():
    print(f"   {key}: {value}")

# Initialize Execution Log
execution_log = {
    "start_time": datetime.now().isoformat(),
    "environment": "Fabric" if IS_FABRIC else "Local",
    "storage_format": STORAGE_FORMAT,
    "config": PIPELINE_CONFIG,
    "phases": []
}

print("\n‚úÖ Configuration Complete")

‚öôÔ∏è Pipeline Configuration:
   run_profiling: True
   run_ingestion: True
   run_monitoring: True
   run_dq_modeling: False
   run_bi_analytics: False
   force_reprocess: False
   dq_threshold: 85.0
   max_workers: 4
   continue_on_error: False

‚úÖ Configuration Complete


## üöÄ Phase 1: Data Profiling

**Purpose:** Generate DQ validation configs for all Bronze layer tables

**Process:**
1. Profile each Bronze parquet file
2. Generate validation YAML configs
3. Save configs to `config/data_quality/`

In [5]:
if PIPELINE_CONFIG["run_profiling"]:
    print("\n" + "="*80)
    print("PHASE 1: DATA PROFILING")
    print("="*80)
    
    phase_start = datetime.now()
    
    try:
        # Add project root to path for imports
        if str(BASE_DIR) not in sys.path:
            sys.path.insert(0, str(BASE_DIR))
        
        # Import profiling modules
        from aims_data_platform import BatchProfiler
        
        print(f"\nüìä Profiling Bronze layer: {BRONZE_DIR}")
        print(f"   Workers: {PIPELINE_CONFIG['max_workers']}")
        print(f"   Output: {CONFIG_DIR}")
        
        # Define custom thresholds
        custom_thresholds = {
            "severity_threshold": "medium",
            "null_tolerance": 5.0,
            "include_structural": True,
            "include_completeness": True,
            "include_validity": True
        }
        
        # Run parallel profiling using BatchProfiler
        results = BatchProfiler.run_parallel_profiling(
            input_dir=str(BRONZE_DIR),
            output_dir=str(CONFIG_DIR),
            workers=PIPELINE_CONFIG['max_workers'],
            sample_size=100000,
            thresholds=custom_thresholds
        )
        
        # Count successes and errors
        success_results = [r for r in results if r.get('status') == 'success']
        error_results = [r for r in results if r.get('status') != 'success']
        
        # Display results
        print(f"\n‚úÖ Profiling Complete:")
        print(f"   Files Profiled: {len(success_results)}")
        print(f"   Configs Generated: {len(list(CONFIG_DIR.glob('*.yml')))}")
        if error_results:
            print(f"   Errors: {len(error_results)}")
            for err in error_results[:5]:
                print(f"      - {err.get('file', 'unknown')}: {err.get('error', 'unknown error')}")
        
        # Log phase execution
        execution_log["phases"].append({
            "phase": "profiling",
            "status": "success" if len(error_results) == 0 else "partial",
            "duration_seconds": (datetime.now() - phase_start).total_seconds(),
            "files_profiled": len(success_results),
            "configs_generated": len(list(CONFIG_DIR.glob('*.yml'))),
            "errors": len(error_results)
        })
        
    except Exception as e:
        print(f"\n‚ùå Profiling Failed: {e}")
        import traceback
        traceback.print_exc()
        
        execution_log["phases"].append({
            "phase": "profiling",
            "status": "failed",
            "error": str(e),
            "duration_seconds": (datetime.now() - phase_start).total_seconds()
        })
        
        if not PIPELINE_CONFIG.get("continue_on_error", False):
            raise
else:
    print("‚è≠Ô∏è Skipping Phase 1: Data Profiling (disabled in config)")


PHASE 1: DATA PROFILING

üìä Profiling Bronze layer: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/data/Samples_LH_Bronze_Aims_26_parquet
   Workers: 4
   Output: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/data_quality
Found 68 files. Starting processing with 4 workers...
Configuration saved: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/data_quality/aims_organisations_validation.yml
   - 27 expectations generated
Configuration saved: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/data_quality/aims_attributegroups_validation.yml
   - 32 expectations generated
Configuration saved: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/data_quality/aims_organisations_validation.yml
   - 27 expectations generated
Configuration saved: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/data_quality/aims_attributegroups_validation.yml
   - 32 expectations generated
Configu

## ‚úÖ Phase 2: Data Validation & Ingestion

**Purpose:** Validate Bronze data and ingest to Silver layer

**Process:**
1. Load validation configs
2. Validate each Bronze table
3. Ingest passing records to Silver (Delta Lake in Fabric, Parquet locally)
4. Quarantine failing records

In [6]:
if PIPELINE_CONFIG["run_ingestion"]:
    print("\n" + "="*80)
    print("PHASE 2: DATA VALIDATION & INGESTION")
    print("="*80)
    
    phase_start = datetime.now()
    
    try:
        # Import validation modules
        from aims_data_platform import DataQualityValidator, DataLoader
        
        # Track validation results
        validation_results = {
            "timestamp": datetime.now().isoformat(),
            "threshold": PIPELINE_CONFIG['dq_threshold'],
            "storage_format": STORAGE_FORMAT,
            "files": {},
            "summary": {"total": 0, "passed": 0, "failed": 0, "skipped": 0, "errors": 0}
        }
        
        validation_results["summary"]["total"] = len(parquet_files)
        print(f"   Found {len(parquet_files)} parquet files to validate\n")
        
        # Validate each file
        for parquet_file in sorted(parquet_files):
            table_name = parquet_file.stem
            config_file = CONFIG_DIR / f"{table_name}_validation.yml"
            
            if not config_file.exists():
                print(f"‚ö†Ô∏è SKIPPED: {table_name}.parquet (no config)")
                validation_results["summary"]["skipped"] += 1
                continue
            
            try:
                # Load validator and data
                validator = DataQualityValidator(config_path=str(config_file))
                
                # Use DataLoader for safe loading (handles sampling)
                df_batch = DataLoader.load_data(str(parquet_file), sample_size=100000)
                result = validator.validate(df_batch)
                
                # Store results
                # Use top-level keys as per installed dq_framework version
                validation_results["files"][table_name] = {
                    "overall_success": result.get('success', False),
                    "success_percentage": result.get('success_rate', 0.0),
                    "statistics": {
                        "evaluated_expectations": result.get('evaluated_checks', 0),
                        "successful_expectations": result.get('successful_checks', 0)
                    }
                }
                
                # Update summary
                if result.get('success', False):
                    validation_results["summary"]["passed"] += 1
                    print(f"‚úÖ PASSED: {table_name}.parquet ({result.get('success_rate', 0):.1f}%)")
                    
                    # Ingest to Silver layer
                    silver_file = SILVER_DIR / f"{table_name}.parquet"
                    
                    # Write validated data to Silver layer
                    df_batch.to_parquet(silver_file, index=False, engine='pyarrow')
                    print(f"   ‚Üí Ingested to Silver: {silver_file.name}")
                        
                    # Simple file copy for ingestion
                    df_batch.to_parquet(silver_file, index=False, engine='pyarrow')
                    print(f"   ‚Üí Ingested to Silver: {silver_file.name}")
                        
                else:
                    validation_results["summary"]["failed"] += 1
                    print(f"‚ùå FAILED: {table_name}.parquet ({result.get('success_rate', 0):.1f}%)")
                    
            except Exception as e:
                validation_results["summary"]["errors"] += 1
                print(f"üí• ERROR: {table_name}.parquet - {e}")
        
        # Save validation results
        results_file = RESULTS_DIR / "validation_results.json"
        with open(results_file, 'w') as f:
            json.dump(validation_results, f, indent=2)
        
        # Display summary
        print(f"\n{'='*70}")
        print("VALIDATION SUMMARY")
        print(f"{'='*70}")
        summary = validation_results["summary"]
        print(f"Total Files:  {summary['total']}")
        print(f"‚úÖ Passed:     {summary['passed']}")
        print(f"‚ùå Failed:     {summary['failed']}")
        print(f"‚ö†Ô∏è  Skipped:    {summary['skipped']}")
        print(f"üí• Errors:     {summary['errors']}")
        print(f"\nPass Rate: {(summary['passed']/summary['total']*100):.1f}%")
        print(f"Results saved to: {results_file}")
        print(f"{'='*70}")
        
        # Log phase execution
        execution_log["phases"].append({
            "phase": "validation_ingestion",
            "status": "success",
            "duration_seconds": (datetime.now() - phase_start).total_seconds(),
            "validation_summary": validation_results["summary"]
        })
        
    except Exception as e:
        print(f"\n‚ùå Validation/Ingestion Failed: {e}")
        import traceback
        traceback.print_exc()
        
        execution_log["phases"].append({
            "phase": "validation_ingestion",
            "status": "failed",
            "error": str(e),
            "duration_seconds": (datetime.now() - phase_start).total_seconds()
        })
        
        if not PIPELINE_CONFIG.get("continue_on_error", False):
            raise


PHASE 2: DATA VALIDATION & INGESTION
   Found 68 parquet files to validate



Calculating Metrics:   0%|          | 0/69 [00:00<?, ?it/s]

‚úÖ PASSED: aims_activitydates.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_activitydates.parquet
   ‚Üí Ingested to Silver: aims_activitydates.parquet


Calculating Metrics:   0%|          | 0/526 [00:00<?, ?it/s]

‚úÖ PASSED: aims_assetattributes.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_assetattributes.parquet
   ‚Üí Ingested to Silver: aims_assetattributes.parquet
   ‚Üí Ingested to Silver: aims_assetattributes.parquet
   ‚Üí Ingested to Silver: aims_assetattributes.parquet


Calculating Metrics:   0%|          | 0/193 [00:00<?, ?it/s]

‚úÖ PASSED: aims_assetclassattributes.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_assetclassattributes.parquet
   ‚Üí Ingested to Silver: aims_assetclassattributes.parquet
   ‚Üí Ingested to Silver: aims_assetclassattributes.parquet


Calculating Metrics:   0%|          | 0/46 [00:00<?, ?it/s]

‚úÖ PASSED: aims_assetclasschangelogs.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_assetclasschangelogs.parquet
   ‚Üí Ingested to Silver: aims_assetclasschangelogs.parquet


Calculating Metrics:   0%|          | 0/382 [00:00<?, ?it/s]

‚úÖ PASSED: aims_assetclasses.parquet (99.1%)
   ‚Üí Ingested to Silver: aims_assetclasses.parquet
   ‚Üí Ingested to Silver: aims_assetclasses.parquet


Calculating Metrics:   0%|          | 0/140 [00:00<?, ?it/s]

‚úÖ PASSED: aims_assetclassrelationships.parquet (91.5%)
   ‚Üí Ingested to Silver: aims_assetclassrelationships.parquet
   ‚Üí Ingested to Silver: aims_assetclassrelationships.parquet


Calculating Metrics:   0%|          | 0/58 [00:00<?, ?it/s]

‚úÖ PASSED: aims_assetconsents.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_assetconsents.parquet
   ‚Üí Ingested to Silver: aims_assetconsents.parquet


Calculating Metrics:   0%|          | 0/39 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 85.7%)


‚ùå FAILED: aims_assethierarchymap.parquet (92.9%)


Calculating Metrics:   0%|          | 0/319 [00:00<?, ?it/s]

Validation FAILED: 3 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 97.4%)


‚ùå FAILED: aims_assetlocations.parquet (97.0%)


Calculating Metrics:   0%|          | 0/219 [00:00<?, ?it/s]

‚úÖ PASSED: aims_assets.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_assets.parquet
   ‚Üí Ingested to Silver: aims_assets.parquet
   ‚Üí Ingested to Silver: aims_assets.parquet


Calculating Metrics:   0%|          | 0/127 [00:00<?, ?it/s]

‚úÖ PASSED: aims_attributedomains.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_attributedomains.parquet
   ‚Üí Ingested to Silver: aims_attributedomains.parquet


Calculating Metrics:   0%|          | 0/94 [00:00<?, ?it/s]

‚úÖ PASSED: aims_attributedomainvalues.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_attributedomainvalues.parquet
   ‚Üí Ingested to Silver: aims_attributedomainvalues.parquet


Calculating Metrics:   0%|          | 0/110 [00:00<?, ?it/s]

‚úÖ PASSED: aims_attributegroups.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_attributegroups.parquet
   ‚Üí Ingested to Silver: aims_attributegroups.parquet


Calculating Metrics:   0%|          | 0/301 [00:00<?, ?it/s]

‚úÖ PASSED: aims_attributes.parquet (96.7%)
   ‚Üí Ingested to Silver: aims_attributes.parquet
   ‚Üí Ingested to Silver: aims_attributes.parquet


Calculating Metrics:   0%|          | 0/54 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 90.9%)


‚ùå FAILED: aims_consentlinks.parquet (95.5%)


Calculating Metrics:   0%|          | 0/69 [00:00<?, ?it/s]

‚úÖ PASSED: aims_consentmilestones.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_consentmilestones.parquet
   ‚Üí Ingested to Silver: aims_consentmilestones.parquet


Calculating Metrics:   0%|          | 0/46 [00:00<?, ?it/s]

‚úÖ PASSED: aims_consentmilestonetypes.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_consentmilestonetypes.parquet
   ‚Üí Ingested to Silver: aims_consentmilestonetypes.parquet


Calculating Metrics:   0%|          | 0/238 [00:00<?, ?it/s]

‚úÖ PASSED: aims_consents.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_consents.parquet
   ‚Üí Ingested to Silver: aims_consents.parquet


Calculating Metrics:   0%|          | 0/90 [00:00<?, ?it/s]

‚úÖ PASSED: aims_consenttypemilestones.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_consenttypemilestones.parquet
   ‚Üí Ingested to Silver: aims_consenttypemilestones.parquet


Calculating Metrics:   0%|          | 0/67 [00:00<?, ?it/s]

‚úÖ PASSED: aims_consenttypes.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_consenttypes.parquet
   ‚Üí Ingested to Silver: aims_consenttypes.parquet


Calculating Metrics:   0%|          | 0/106 [00:00<?, ?it/s]

‚úÖ PASSED: aims_informationneedassetclass.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_informationneedassetclass.parquet
   ‚Üí Ingested to Silver: aims_informationneedassetclass.parquet


Calculating Metrics:   0%|          | 0/131 [00:00<?, ?it/s]

‚úÖ PASSED: aims_informationneedattributes.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_informationneedattributes.parquet
   ‚Üí Ingested to Silver: aims_informationneedattributes.parquet


Calculating Metrics:   0%|          | 0/74 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 93.3%)


‚ùå FAILED: aims_informationneeddocs.parquet (96.7%)


Calculating Metrics:   0%|          | 0/214 [00:00<?, ?it/s]

‚úÖ PASSED: aims_informationneedgeometries.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_informationneedgeometries.parquet
   ‚Üí Ingested to Silver: aims_informationneedgeometries.parquet


Calculating Metrics:   0%|          | 0/148 [00:00<?, ?it/s]

‚úÖ PASSED: aims_informationneedlinks.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_informationneedlinks.parquet
   ‚Üí Ingested to Silver: aims_informationneedlinks.parquet


Calculating Metrics:   0%|          | 0/338 [00:00<?, ?it/s]

‚úÖ PASSED: aims_informationneedpropchngs.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_informationneedpropchngs.parquet
   ‚Üí Ingested to Silver: aims_informationneedpropchngs.parquet


Calculating Metrics:   0%|          | 0/240 [00:00<?, ?it/s]

‚úÖ PASSED: aims_informationneeds.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_informationneeds.parquet
   ‚Üí Ingested to Silver: aims_informationneeds.parquet


Calculating Metrics:   0%|          | 0/99 [00:00<?, ?it/s]

‚úÖ PASSED: aims_informationneedsourcedocs.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_informationneedsourcedocs.parquet
   ‚Üí Ingested to Silver: aims_informationneedsourcedocs.parquet


Calculating Metrics:   0%|          | 0/73 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 90.0%)


‚ùå FAILED: aims_informationneedstatusupd.parquet (95.5%)


Calculating Metrics:   0%|          | 0/54 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 90.9%)


‚ùå FAILED: aims_informationpackages.parquet (95.5%)


Calculating Metrics:   0%|          | 0/126 [00:00<?, ?it/s]

‚úÖ PASSED: aims_links.parquet (97.6%)
   ‚Üí Ingested to Silver: aims_links.parquet
   ‚Üí Ingested to Silver: aims_links.parquet


Calculating Metrics:   0%|          | 0/136 [00:00<?, ?it/s]

‚úÖ PASSED: aims_linktypes.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_linktypes.parquet
   ‚Üí Ingested to Silver: aims_linktypes.parquet


Calculating Metrics:   0%|          | 0/1045 [00:00<?, ?it/s]

‚úÖ PASSED: aims_noncompliances.parquet (98.7%)
   ‚Üí Ingested to Silver: aims_noncompliances.parquet
   ‚Üí Ingested to Silver: aims_noncompliances.parquet


Calculating Metrics:   0%|          | 0/89 [00:00<?, ?it/s]

‚úÖ PASSED: aims_organisations.parquet (96.3%)
   ‚Üí Ingested to Silver: aims_organisations.parquet
   ‚Üí Ingested to Silver: aims_organisations.parquet


Calculating Metrics:   0%|          | 0/98 [00:00<?, ?it/s]

‚úÖ PASSED: aims_owners.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_owners.parquet
   ‚Üí Ingested to Silver: aims_owners.parquet


Calculating Metrics:   0%|          | 0/138 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 94.1%)


‚ùå FAILED: aims_people.parquet (97.6%)


Calculating Metrics:   0%|          | 0/91 [00:00<?, ?it/s]

‚úÖ PASSED: aims_phases.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_phases.parquet
   ‚Üí Ingested to Silver: aims_phases.parquet


Calculating Metrics:   0%|          | 0/59 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 91.7%)


‚ùå FAILED: aims_productassetclasses.parquet (95.8%)


Calculating Metrics:   0%|          | 0/104 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 95.2%)


‚ùå FAILED: aims_productcharacteristics.parquet (97.6%)


Calculating Metrics:   0%|          | 0/49 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 90.0%)


‚ùå FAILED: aims_productlinks.parquet (95.0%)


Calculating Metrics:   0%|          | 0/109 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 95.5%)


‚ùå FAILED: aims_products.parquet (97.7%)


Calculating Metrics:   0%|          | 0/255 [00:00<?, ?it/s]

‚úÖ PASSED: aims_projectitemactions.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_projectitemactions.parquet
   ‚Üí Ingested to Silver: aims_projectitemactions.parquet


Calculating Metrics:   0%|          | 0/83 [00:00<?, ?it/s]

‚úÖ PASSED: aims_projectitemassignedroles.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_projectitemassignedroles.parquet
   ‚Üí Ingested to Silver: aims_projectitemassignedroles.parquet


Calculating Metrics:   0%|          | 0/221 [00:00<?, ?it/s]

‚úÖ PASSED: aims_projectitemattributes.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_projectitemattributes.parquet
   ‚Üí Ingested to Silver: aims_projectitemattributes.parquet


Calculating Metrics:   0%|          | 0/64 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 92.3%)


‚ùå FAILED: aims_projectitemlinks.parquet (96.2%)


Calculating Metrics:   0%|          | 0/121 [00:00<?, ?it/s]

‚úÖ PASSED: aims_projectitems.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_projectitems.parquet
   ‚Üí Ingested to Silver: aims_projectitems.parquet


Calculating Metrics:   0%|          | 0/104 [00:00<?, ?it/s]

‚úÖ PASSED: aims_relationships.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_relationships.parquet
   ‚Üí Ingested to Silver: aims_relationships.parquet
   ‚Üí Ingested to Silver: aims_relationships.parquet


Calculating Metrics:   0%|          | 0/176 [00:00<?, ?it/s]

‚úÖ PASSED: aims_relationshiptypes.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_relationshiptypes.parquet
   ‚Üí Ingested to Silver: aims_relationshiptypes.parquet


Calculating Metrics:   0%|          | 0/139 [00:00<?, ?it/s]

‚úÖ PASSED: aims_routes.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_routes.parquet
   ‚Üí Ingested to Silver: aims_routes.parquet


Calculating Metrics:   0%|          | 0/87 [00:00<?, ?it/s]

‚úÖ PASSED: aims_secondaryassetclasscodes.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_secondaryassetclasscodes.parquet
   ‚Üí Ingested to Silver: aims_secondaryassetclasscodes.parquet


Calculating Metrics:   0%|          | 0/91 [00:00<?, ?it/s]

‚úÖ PASSED: aims_stages.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_stages.parquet
   ‚Üí Ingested to Silver: aims_stages.parquet


Calculating Metrics:   0%|          | 0/178 [00:00<?, ?it/s]

‚úÖ PASSED: aims_taskdefinitions.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_taskdefinitions.parquet
   ‚Üí Ingested to Silver: aims_taskdefinitions.parquet


Calculating Metrics:   0%|          | 0/41 [00:00<?, ?it/s]

‚úÖ PASSED: aims_tracks.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_tracks.parquet
   ‚Üí Ingested to Silver: aims_tracks.parquet
   ‚Üí Ingested to Silver: aims_tracks.parquet
   ‚Üí Ingested to Silver: aims_tracks.parquet


Calculating Metrics:   0%|          | 0/93 [00:00<?, ?it/s]

‚úÖ PASSED: aims_ua_beneficiaries.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_ua_beneficiaries.parquet
   ‚Üí Ingested to Silver: aims_ua_beneficiaries.parquet


Calculating Metrics:   0%|          | 0/83 [00:00<?, ?it/s]

‚úÖ PASSED: aims_ua_comments.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_ua_comments.parquet
   ‚Üí Ingested to Silver: aims_ua_comments.parquet


Calculating Metrics:   0%|          | 0/90 [00:00<?, ?it/s]

‚úÖ PASSED: aims_ua_entities.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_ua_entities.parquet
   ‚Üí Ingested to Silver: aims_ua_entities.parquet


Calculating Metrics:   0%|          | 0/81 [00:00<?, ?it/s]

‚úÖ PASSED: aims_ua_meetingattendees.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_ua_meetingattendees.parquet
   ‚Üí Ingested to Silver: aims_ua_meetingattendees.parquet


Calculating Metrics:   0%|          | 0/88 [00:00<?, ?it/s]

‚úÖ PASSED: aims_ua_meetings.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_ua_meetings.parquet
   ‚Üí Ingested to Silver: aims_ua_meetings.parquet


Calculating Metrics:   0%|          | 0/94 [00:00<?, ?it/s]

‚úÖ PASSED: aims_ua_noncompimppartytypes.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_ua_noncompimppartytypes.parquet
   ‚Üí Ingested to Silver: aims_ua_noncompimppartytypes.parquet


Calculating Metrics:   0%|          | 0/91 [00:00<?, ?it/s]

‚úÖ PASSED: aims_ua_noncomplianceimpacts.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_ua_noncomplianceimpacts.parquet
   ‚Üí Ingested to Silver: aims_ua_noncomplianceimpacts.parquet


Calculating Metrics:   0%|          | 0/112 [00:00<?, ?it/s]

‚úÖ PASSED: aims_ua_noncompotheruas.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_ua_noncompotheruas.parquet
   ‚Üí Ingested to Silver: aims_ua_noncompotheruas.parquet


Calculating Metrics:   0%|          | 0/46 [00:00<?, ?it/s]

‚úÖ PASSED: aims_ua_optionvalues.parquet (100.0%)
   ‚Üí Ingested to Silver: aims_ua_optionvalues.parquet
   ‚Üí Ingested to Silver: aims_ua_optionvalues.parquet
   ‚Üí Ingested to Silver: aims_ua_optionvalues.parquet
   ‚Üí Ingested to Silver: aims_ua_optionvalues.parquet


Calculating Metrics:   0%|          | 0/283 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 97.3%)


‚ùå FAILED: aims_undertakings_assurances.parquet (98.9%)


Calculating Metrics:   0%|          | 0/89 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 94.4%)


‚ùå FAILED: aims_workbanks.parquet (97.2%)


Calculating Metrics:   0%|          | 0/54 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 90.9%)


‚ùå FAILED: aims_workbankworkorders.parquet (95.5%)


Calculating Metrics:   0%|          | 0/89 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 94.4%)


‚ùå FAILED: aims_workorderattributes.parquet (97.2%)


Calculating Metrics:   0%|          | 0/119 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 95.8%)


‚ùå FAILED: aims_workorders.parquet (97.9%)


Calculating Metrics:   0%|          | 0/89 [00:00<?, ?it/s]

Validation FAILED: 1 checks failed. Reasons: Severity 'critical' threshold 100.0% failed (actual: 94.4%)


‚ùå FAILED: aims_workorderstatustransition.parquet (97.2%)

VALIDATION SUMMARY
Total Files:  68
‚úÖ Passed:     50
‚ùå Failed:     18
‚ö†Ô∏è  Skipped:    0
üí• Errors:     0

Pass Rate: 73.5%
Results saved to: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/validation_results/validation_results.json


## üìà Phase 3: Data Quality Monitoring

**Purpose:** Generate DQ dashboards and monitoring reports

In [7]:
if PIPELINE_CONFIG["run_monitoring"]:
    print("\n" + "="*80)
    print("PHASE 3: DATA QUALITY MONITORING")
    print("="*80)
    
    phase_start = datetime.now()
    
    try:
        # Load validation results
        results_file = RESULTS_DIR / "validation_results.json"
        
        if not results_file.exists():
            print("‚ö†Ô∏è No validation results found. Skipping monitoring.")
        else:
            with open(results_file, 'r') as f:
                validation_data = json.load(f)
            
            print(f"\nüìä Generating monitoring dashboards...")
            print(f"   Data source: {results_file}")
            
            # Check if we have file results
            files_data = validation_data.get("files", {})
            if not files_data:
                print("‚ö†Ô∏è No file validation results available. Run validation first.")
                print(f"\nüìä Summary Statistics:")
                summary = validation_data.get("summary", {})
                print(f"   Total Files: {summary.get('total', 0)}")
                print(f"   Passed: {summary.get('passed', 0)}")
                print(f"   Failed: {summary.get('failed', 0)}")
                print(f"   Skipped: {summary.get('skipped', 0)}")
                print(f"   Errors: {summary.get('errors', 0)}")
            else:
                # Create summary DataFrame
                summary_data = []
                for table_name, result in files_data.items():
                    summary_data.append({
                        "Table": table_name,
                        "Success %": result.get("success_percentage", 0),
                        "Status": "Passed" if result.get("overall_success") else "Failed",
                        "Evaluated": result.get("statistics", {}).get("evaluated_expectations", 0),
                        "Successful": result.get("statistics", {}).get("successful_expectations", 0)
                    })
                
                df_summary = pd.DataFrame(summary_data)
                
                print(f"\nüìã DQ Summary:")
                print(df_summary.head(10).to_string(index=False))
                
                # Calculate key metrics
                avg_quality = df_summary["Success %"].mean()
                pass_rate = (df_summary["Status"] == "Passed").sum() / len(df_summary) * 100
                
                print(f"\nüìä Key Metrics:")
                print(f"   Average Quality Score: {avg_quality:.1f}%")
                print(f"   Pass Rate: {pass_rate:.1f}%")
                print(f"   Tables Monitored: {len(df_summary)}")
                
                # Log phase execution
                execution_log["phases"].append({
                    "phase": "monitoring",
                    "status": "success",
                    "duration_seconds": (datetime.now() - phase_start).total_seconds(),
                    "metrics": {
                        "avg_quality_score": float(avg_quality),
                        "pass_rate": float(pass_rate),
                        "tables_monitored": len(df_summary)
                    }
                })
                
    except Exception as e:
        print(f"\n‚ùå Monitoring Failed: {e}")
        import traceback
        traceback.print_exc()
        
        execution_log["phases"].append({
            "phase": "monitoring",
            "status": "failed",
            "error": str(e),
            "duration_seconds": (datetime.now() - phase_start).total_seconds()
        })
        
        if not PIPELINE_CONFIG.get("continue_on_error", False):
            raise
else:
    print("‚è≠Ô∏è Skipping Phase 3: Monitoring (disabled in config)")


PHASE 3: DATA QUALITY MONITORING

üìä Generating monitoring dashboards...
   Data source: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/validation_results/validation_results.json

üìã DQ Summary:
                       Table  Success % Status  Evaluated  Successful
          aims_activitydates 100.000000 Passed         23          23
        aims_assetattributes 100.000000 Passed        159         159
   aims_assetclassattributes 100.000000 Passed         63          63
   aims_assetclasschangelogs 100.000000 Passed         15          15
           aims_assetclasses  99.137931 Passed        116         115
aims_assetclassrelationships  91.489362 Passed         47          43
          aims_assetconsents 100.000000 Passed         17          17
      aims_assethierarchymap  92.857143 Failed         14          13
         aims_assetlocations  97.000000 Failed        100          97
                 aims_assets 100.000000 Passed         70          70

üìä Ke

## üìù Pipeline Execution Summary

In [8]:
# Calculate success rate
successful_phases = sum(1 for p in execution_log["phases"] if p["status"] in ["success", "partial"])
total_phases = len(execution_log["phases"])
success_rate = (successful_phases / total_phases * 100) if total_phases > 0 else 0

print(f"\nüìä Overall Status:")
print(f"   Phases Completed: {successful_phases}/{total_phases}")
print(f"   Success Rate: {success_rate:.1f}%")

# Save execution log
log_file = RESULTS_DIR / f"orchestration_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(log_file, 'w') as f:
    json.dump(execution_log, f, indent=2)

print(f"\nüíæ Execution log saved to: {log_file}")
print("\n" + "="*80)

if success_rate == 100:
    print("üéâ ALL PHASES COMPLETED SUCCESSFULLY!")
elif success_rate >= 80:
    print("‚ö†Ô∏è PIPELINE COMPLETED WITH WARNINGS")
else:
    print("‚ùå PIPELINE COMPLETED WITH ERRORS")

print("="*80)


üìä Overall Status:
   Phases Completed: 3/3
   Success Rate: 100.0%

üíæ Execution log saved to: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/validation_results/orchestration_log_20251211_001530.json

üéâ ALL PHASES COMPLETED SUCCESSFULLY!
