## üîß Environment Detection & Validation

In [1]:
# --- ENVIRONMENT DETECTION & VALIDATION ---
import os
import sys
from pathlib import Path
from datetime import datetime
import json

# Detect Environment
try:
    from notebookutils import mssparkutils
    IS_FABRIC = True
    print("üåê Running in Microsoft Fabric")
except ImportError:
    IS_FABRIC = False
    print("üíª Running Locally")

# Fabric-Specific Validation
if IS_FABRIC:
    # Validate Lakehouse attachment
    try:
        workspace_id = mssparkutils.env.getWorkspaceId()
        print(f"‚úÖ Workspace ID: {workspace_id}")
    except Exception as e:
        raise RuntimeError(
            "‚ùå No Lakehouse attached to this notebook!\n"
            "Please attach a Lakehouse: Notebook toolbar > Add Lakehouse > Select your lakehouse\n"
            f"Error: {e}"
        )

print(f"\n‚úÖ Environment Detection Complete")

üíª Running Locally

‚úÖ Environment Detection Complete


## üì¶ Package Installation & Imports

In [2]:
# --- PACKAGE INSTALLATION ---

# Set base directory first
if IS_FABRIC:
    BASE_DIR = Path("/lakehouse/default/Files")
else:
    # Local: Use project root (parent of notebooks directory)
    current_dir = Path.cwd()
    if current_dir.name == "notebooks":
        BASE_DIR = current_dir.parent
    else:
        BASE_DIR = current_dir

print(f"üìÇ Base Directory: {BASE_DIR}")

# Install dq_framework package if needed
try:
    import dq_framework
    print(f"‚úÖ dq_framework already installed")
except ImportError:
    print("‚ö†Ô∏è dq_framework not found. Installing...")
    
    if IS_FABRIC:
        # Try to install from Lakehouse Files/libs/
        wheel_path = BASE_DIR / "libs/fabric_data_quality-1.2.0-py3-none-any.whl"
        
        if wheel_path.exists():
            print(f"üì¶ Installing from: {wheel_path}")
            %pip install {str(wheel_path)} --quiet
            print("‚úÖ Package installed successfully")
        else:
            raise FileNotFoundError(
                f"‚ùå Wheel file not found at: {wheel_path}\n"
                f"Please upload fabric_data_quality-1.2.0-py3-none-any.whl to Lakehouse Files/libs/\n"
                f"Or install via Fabric Environment in Workspace Settings"
            )
    else:
        # Local: Install from local dist or editable install
        print("üì¶ Installing locally (editable mode)...")
        %pip install -e {str(BASE_DIR)} --quiet
        print("‚úÖ Package installed")

# Import required modules
from dotenv import load_dotenv
import pandas as pd

# Load environment variables
if IS_FABRIC:
    # Try to load .env from Lakehouse if it exists
    env_path = BASE_DIR / ".env"
    if env_path.exists():
        load_dotenv(dotenv_path=env_path)
        print("‚úÖ Loaded .env from Lakehouse")
else:
    load_dotenv()
    print("‚úÖ Loaded .env from local filesystem")

print("\n‚úÖ Package Installation Complete")

üìÇ Base Directory: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026


‚úÖ dq_framework already installed
‚úÖ Loaded .env from local filesystem

‚úÖ Package Installation Complete


## üìÇ Path Configuration & Validation

In [3]:
# --- PATH CONFIGURATION ---

# Configure Paths based on Environment
if IS_FABRIC:
    # Fabric: Use Lakehouse Paths
    BRONZE_DIR = BASE_DIR / "data/Samples_LH_Bronze_Aims_26_parquet"
    SILVER_DIR = BASE_DIR / "data/Silver"
    GOLD_DIR = BASE_DIR / "data/Gold"
    CONFIG_DIR = BASE_DIR / "config/data_quality"
    RESULTS_DIR = BASE_DIR / "config/validation_results"
    NOTEBOOK_DIR = BASE_DIR / "notebooks"
    
    # Storage format
    STORAGE_FORMAT = "delta"  # Use Delta Lake in Fabric
else:
    # Local: Use environment variables or defaults
    BRONZE_DIR = BASE_DIR / "data/Samples_LH_Bronze_Aims_26_parquet"
    SILVER_DIR = BASE_DIR / "data/Silver"
    GOLD_DIR = BASE_DIR / "data/Gold"
    CONFIG_DIR = BASE_DIR / "config/data_quality"
    RESULTS_DIR = BASE_DIR / "config/validation_results"
    NOTEBOOK_DIR = BASE_DIR / "notebooks"
    
    # Storage format
    STORAGE_FORMAT = "parquet"  # Use Parquet locally

# Ensure directories exist
for directory in [SILVER_DIR, GOLD_DIR, CONFIG_DIR, RESULTS_DIR]:
    directory.mkdir(exist_ok=True, parents=True)

print(f"\nüìÇ Configuration:")
print(f"   Environment: {'Fabric' if IS_FABRIC else 'Local'}")
print(f"   Base Directory: {BASE_DIR}")
print(f"   Bronze Layer: {BRONZE_DIR}")
print(f"   Silver Layer: {SILVER_DIR}")
print(f"   Gold Layer: {GOLD_DIR}")
print(f"   Config Directory: {CONFIG_DIR}")
print(f"   Results Directory: {RESULTS_DIR}")
print(f"   Storage Format: {STORAGE_FORMAT}")

# Validate Bronze data exists
if not BRONZE_DIR.exists():
    raise FileNotFoundError(
        f"‚ùå Bronze data directory not found!\n"
        f"Expected location: {BRONZE_DIR}\n"
        f"{'Please upload parquet files to Lakehouse Files/data/Samples_LH_Bronze_Aims_26_parquet/' if IS_FABRIC else 'Please check your data directory path'}"
    )

# Count Bronze files
parquet_files = list(BRONZE_DIR.glob("*.parquet"))
if len(parquet_files) == 0:
    raise FileNotFoundError(
        f"‚ùå No parquet files found in {BRONZE_DIR}\n"
        f"Expected: 68 parquet files"
    )

print(f"\n‚úÖ Found {len(parquet_files)} Bronze parquet files")
print(f"‚úÖ Path Validation Complete")


üìÇ Configuration:
   Environment: Local
   Base Directory: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026
   Bronze Layer: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/data/Samples_LH_Bronze_Aims_26_parquet
   Silver Layer: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/data/Silver
   Gold Layer: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/data/Gold
   Config Directory: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/data_quality
   Results Directory: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/validation_results
   Storage Format: parquet

‚úÖ Found 68 Bronze parquet files
‚úÖ Path Validation Complete


## ‚öôÔ∏è Pipeline Configuration

In [4]:
PIPELINE_CONFIG = {
    "run_profiling": True,          # Phase 1: Generate DQ configs
    "run_ingestion": True,          # Phase 2: Bronze ‚Üí Silver with validation
    "run_monitoring": True,         # Phase 3: DQ monitoring dashboards
    "run_dq_modeling": False,       # Phase 4: Advanced DQ modeling (optional)
    "run_bi_analytics": False,      # Phase 5: BI analytics (optional)
    "force_reprocess": False,       # Force reprocessing even if files exist
    "dq_threshold": 85.0,          # Global DQ threshold (85%)
    "max_workers": 8 if IS_FABRIC else 4,  # Parallel processing workers
    "continue_on_error": False,    # Continue pipeline even if phase fails
}

# Display Configuration
print("‚öôÔ∏è Pipeline Configuration:")
for key, value in PIPELINE_CONFIG.items():
    print(f"   {key}: {value}")

# Initialize Execution Log
execution_log = {
    "start_time": datetime.now().isoformat(),
    "environment": "Fabric" if IS_FABRIC else "Local",
    "storage_format": STORAGE_FORMAT,
    "config": PIPELINE_CONFIG,
    "phases": []
}

print("\n‚úÖ Configuration Complete")

‚öôÔ∏è Pipeline Configuration:
   run_profiling: True
   run_ingestion: True
   run_monitoring: True
   run_dq_modeling: False
   run_bi_analytics: False
   force_reprocess: False
   dq_threshold: 85.0
   max_workers: 4
   continue_on_error: False

‚úÖ Configuration Complete


## üöÄ Phase 1: Data Profiling

**Purpose:** Generate DQ validation configs for all Bronze layer tables

**Process:**
1. Profile each Bronze parquet file
2. Generate validation YAML configs
3. Save configs to `config/data_quality/`

In [5]:
if PIPELINE_CONFIG["run_profiling"]:
    print("\n" + "="*80)
    print("PHASE 1: DATA PROFILING")
    print("="*80)
    
    phase_start = datetime.now()
    
    try:
        # Add project root to path for imports
        if str(BASE_DIR) not in sys.path:
            sys.path.insert(0, str(BASE_DIR))
        
        # Import profiling modules
        from dq_framework import DataProfiler
        
        print(f"\nüìä Profiling Bronze layer: {BRONZE_DIR}")
        print(f"   Workers: {PIPELINE_CONFIG['max_workers']}")
        print(f"   Output: {CONFIG_DIR}")
        
        # Profile each parquet file
        profiled_files = []
        errors = []
        
        for parquet_file in sorted(parquet_files):
            table_name = parquet_file.stem
            config_file = CONFIG_DIR / f"{table_name}_validation.yml"
            
            try:
                print(f"   Profiling: {table_name}.parquet...", end=" ")
                
                # Profile and generate config
                profiler = DataProfiler(str(parquet_file))
                config = profiler.generate_validation_config()
                
                # Save config
                with open(config_file, 'w') as f:
                    import yaml
                    yaml.dump(config, f, default_flow_style=False)
                
                profiled_files.append(table_name)
                print("‚úÖ")
                
            except Exception as e:
                errors.append({"table": table_name, "error": str(e)})
                print(f"‚ùå Error: {e}")
        
        # Display results
        print(f"\n‚úÖ Profiling Complete:")
        print(f"   Files Profiled: {len(profiled_files)}")
        print(f"   Configs Generated: {len(list(CONFIG_DIR.glob('*.yml')))}")
        if errors:
            print(f"   Errors: {len(errors)}")
        
        # Log phase execution
        execution_log["phases"].append({
            "phase": "profiling",
            "status": "success" if len(errors) == 0 else "partial",
            "duration_seconds": (datetime.now() - phase_start).total_seconds(),
            "files_profiled": len(profiled_files),
            "configs_generated": len(list(CONFIG_DIR.glob('*.yml'))),
            "errors": len(errors)
        })
        
    except Exception as e:
        print(f"\n‚ùå Profiling Failed: {e}")
        import traceback
        traceback.print_exc()
        
        execution_log["phases"].append({
            "phase": "profiling",
            "status": "failed",
            "error": str(e),
            "duration_seconds": (datetime.now() - phase_start).total_seconds()
        })
        
        if not PIPELINE_CONFIG.get("continue_on_error", False):
            raise
else:
    print("‚è≠Ô∏è Skipping Phase 1: Data Profiling (disabled in config)")


PHASE 1: DATA PROFILING

üìä Profiling Bronze layer: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/data/Samples_LH_Bronze_Aims_26_parquet
   Workers: 4
   Output: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/data_quality
   Profiling: aims_activitydates.parquet... ‚ùå Error: 'DataProfiler' object has no attribute 'generate_validation_config'
   Profiling: aims_assetattributes.parquet... ‚ùå Error: 'DataProfiler' object has no attribute 'generate_validation_config'
   Profiling: aims_assetclassattributes.parquet... ‚ùå Error: 'DataProfiler' object has no attribute 'generate_validation_config'
   Profiling: aims_assetclasschangelogs.parquet... ‚ùå Error: 'DataProfiler' object has no attribute 'generate_validation_config'
   Profiling: aims_assetclasses.parquet... ‚ùå Error: 'DataProfiler' object has no attribute 'generate_validation_config'
   Profiling: aims_assetclassrelationships.parquet... ‚ùå Error: 'DataProfiler' object has no attribute 'ge

## ‚úÖ Phase 2: Data Validation & Ingestion

**Purpose:** Validate Bronze data and ingest to Silver layer

**Process:**
1. Load validation configs
2. Validate each Bronze table
3. Ingest passing records to Silver (Delta Lake in Fabric, Parquet locally)
4. Quarantine failing records

In [6]:
if PIPELINE_CONFIG["run_ingestion"]:
    print("\n" + "="*80)
    print("PHASE 2: DATA VALIDATION & INGESTION")
    print("="*80)
    
    phase_start = datetime.now()
    
    try:
        
        # Track validation results
        validation_results = {
            "timestamp": datetime.now().isoformat(),
            "threshold": PIPELINE_CONFIG['dq_threshold'],
            "storage_format": STORAGE_FORMAT,
            "files": {},
            "summary": {"total": 0, "passed": 0, "failed": 0, "skipped": 0, "errors": 0}
        }
        
        validation_results["summary"]["total"] = len(parquet_files)
        print(f"   Found {len(parquet_files)} parquet files to validate\n")
        
        # Validate each file
        for parquet_file in sorted(parquet_files):
            table_name = parquet_file.stem
            config_file = CONFIG_DIR / f"{table_name}_validation.yml"
            
            if not config_file.exists():
                print(f"‚ö†Ô∏è SKIPPED: {table_name}.parquet (no config)")
                validation_results["summary"]["skipped"] += 1
                continue
            
            try:
                # Load and validate
                validator = DataValidator(str(config_file))
                result = validator.validate(str(parquet_file))
                
                # Store results
                validation_results["files"][table_name] = result
                
                # Update summary
                if result.get("overall_success", False):
                    validation_results["summary"]["passed"] += 1
                    print(f"‚úÖ PASSED: {table_name}.parquet ({result.get('success_percentage', 0):.1f}%)")
                    
                    # Ingest to Silver layer (pandas + parquet works in both Local and Fabric)
                    df = pd.read_parquet(parquet_file)
                    silver_file = SILVER_DIR / f"{table_name}.parquet"
                    df.to_parquet(silver_file, index=False)
                    
                else:
                    validation_results["summary"]["failed"] += 1
                    print(f"‚ùå FAILED: {table_name}.parquet ({result.get('success_percentage', 0):.1f}%)")
                    
            except Exception as e:
                validation_results["summary"]["errors"] += 1
                print(f"üí• ERROR: {table_name}.parquet - {e}")
        
        # Save validation results
        results_file = RESULTS_DIR / "validation_results.json"
        with open(results_file, 'w') as f:
            json.dump(validation_results, f, indent=2)
        
        # Display summary
        print(f"\n{'='*70}")
        print("VALIDATION SUMMARY")
        print(f"{'='*70}")
        summary = validation_results["summary"]
        print(f"Total Files:  {summary['total']}")
        print(f"‚úÖ Passed:     {summary['passed']}")
        print(f"‚ùå Failed:     {summary['failed']}")
        print(f"‚ö†Ô∏è  Skipped:    {summary['skipped']}")
        print(f"üí• Errors:     {summary['errors']}")
        print(f"\nPass Rate: {(summary['passed']/summary['total']*100):.1f}%")
        print(f"Results saved to: {results_file}")
        print(f"{'='*70}")
        
        # Log phase execution
        execution_log["phases"].append({
            "phase": "validation_ingestion",
            "status": "success",
            "duration_seconds": (datetime.now() - phase_start).total_seconds(),
            "validation_summary": validation_results["summary"]
        })
        
    except Exception as e:
        print(f"\n‚ùå Validation/Ingestion Failed: {e}")
        import traceback
        traceback.print_exc()
        
        execution_log["phases"].append({
            "phase": "validation_ingestion",
            "status": "failed",
            "error": str(e),
            "duration_seconds": (datetime.now() - phase_start).total_seconds()
        })
        
        if not PIPELINE_CONFIG.get("continue_on_error", False):
            raise
else:
    print("‚è≠Ô∏è Skipping Phase 2: Validation & Ingestion (disabled in config)")


PHASE 2: DATA VALIDATION & INGESTION
   Found 68 parquet files to validate

üí• ERROR: aims_activitydates.parquet - name 'DataValidator' is not defined
üí• ERROR: aims_assetattributes.parquet - name 'DataValidator' is not defined
üí• ERROR: aims_assetclassattributes.parquet - name 'DataValidator' is not defined
üí• ERROR: aims_assetclasschangelogs.parquet - name 'DataValidator' is not defined
üí• ERROR: aims_assetclasses.parquet - name 'DataValidator' is not defined
üí• ERROR: aims_assetclassrelationships.parquet - name 'DataValidator' is not defined
üí• ERROR: aims_assetconsents.parquet - name 'DataValidator' is not defined
üí• ERROR: aims_assethierarchymap.parquet - name 'DataValidator' is not defined
üí• ERROR: aims_assetlocations.parquet - name 'DataValidator' is not defined
üí• ERROR: aims_assets.parquet - name 'DataValidator' is not defined
üí• ERROR: aims_attributedomains.parquet - name 'DataValidator' is not defined
üí• ERROR: aims_attributedomainvalues.parquet - n

## üìà Phase 3: Data Quality Monitoring

**Purpose:** Generate DQ dashboards and monitoring reports

In [7]:
if PIPELINE_CONFIG["run_monitoring"]:
    print("\n" + "="*80)
    print("PHASE 3: DATA QUALITY MONITORING")
    print("="*80)
    
    phase_start = datetime.now()
    
    try:
        # Load validation results
        results_file = RESULTS_DIR / "validation_results.json"
        
        if not results_file.exists():
            print("‚ö†Ô∏è No validation results found. Skipping monitoring.")
        else:
            with open(results_file, 'r') as f:
                validation_data = json.load(f)
            
            print(f"\nüìä Generating monitoring dashboards...")
            print(f"   Data source: {results_file}")
            
            # Check if we have file results
            files_data = validation_data.get("files", {})
            if not files_data:
                print("‚ö†Ô∏è No file validation results available. Run validation first.")
                print(f"\nüìä Summary Statistics:")
                summary = validation_data.get("summary", {})
                print(f"   Total Files: {summary.get('total', 0)}")
                print(f"   Passed: {summary.get('passed', 0)}")
                print(f"   Failed: {summary.get('failed', 0)}")
                print(f"   Skipped: {summary.get('skipped', 0)}")
                print(f"   Errors: {summary.get('errors', 0)}")
            else:
                # Create summary DataFrame
                summary_data = []
                for table_name, result in files_data.items():
                    summary_data.append({
                        "Table": table_name,
                        "Success %": result.get("success_percentage", 0),
                        "Status": "Passed" if result.get("overall_success") else "Failed",
                        "Evaluated": result.get("statistics", {}).get("evaluated_expectations", 0),
                        "Successful": result.get("statistics", {}).get("successful_expectations", 0)
                    })
                
                df_summary = pd.DataFrame(summary_data)
                
                print(f"\nüìã DQ Summary:")
                print(df_summary.head(10).to_string(index=False))
                
                # Calculate key metrics
                avg_quality = df_summary["Success %"].mean()
                pass_rate = (df_summary["Status"] == "Passed").sum() / len(df_summary) * 100
                
                print(f"\nüìä Key Metrics:")
                print(f"   Average Quality Score: {avg_quality:.1f}%")
                print(f"   Pass Rate: {pass_rate:.1f}%")
                print(f"   Tables Monitored: {len(df_summary)}")
                
                # Log phase execution
                execution_log["phases"].append({
                    "phase": "monitoring",
                    "status": "success",
                    "duration_seconds": (datetime.now() - phase_start).total_seconds(),
                    "metrics": {
                        "avg_quality_score": float(avg_quality),
                        "pass_rate": float(pass_rate),
                        "tables_monitored": len(df_summary)
                    }
                })
                
    except Exception as e:
        print(f"\n‚ùå Monitoring Failed: {e}")
        import traceback
        traceback.print_exc()
        
        execution_log["phases"].append({
            "phase": "monitoring",
            "status": "failed",
            "error": str(e),
            "duration_seconds": (datetime.now() - phase_start).total_seconds()
        })
        
        if not PIPELINE_CONFIG.get("continue_on_error", False):
            raise
else:
    print("‚è≠Ô∏è Skipping Phase 3: Monitoring (disabled in config)")


PHASE 3: DATA QUALITY MONITORING

üìä Generating monitoring dashboards...
   Data source: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/validation_results/validation_results.json
‚ö†Ô∏è No file validation results available. Run validation first.

üìä Summary Statistics:
   Total Files: 68
   Passed: 0
   Failed: 0
   Skipped: 0
   Errors: 68


## üìù Pipeline Execution Summary

In [8]:
# Calculate success rate
successful_phases = sum(1 for p in execution_log["phases"] if p["status"] in ["success", "partial"])
total_phases = len(execution_log["phases"])
success_rate = (successful_phases / total_phases * 100) if total_phases > 0 else 0

print(f"\nüìä Overall Status:")
print(f"   Phases Completed: {successful_phases}/{total_phases}")
print(f"   Success Rate: {success_rate:.1f}%")

# Save execution log
log_file = RESULTS_DIR / f"orchestration_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
with open(log_file, 'w') as f:
    json.dump(execution_log, f, indent=2)

print(f"\nüíæ Execution log saved to: {log_file}")
print("\n" + "="*80)

if success_rate == 100:
    print("üéâ ALL PHASES COMPLETED SUCCESSFULLY!")
elif success_rate >= 80:
    print("‚ö†Ô∏è PIPELINE COMPLETED WITH WARNINGS")
else:
    print("‚ùå PIPELINE COMPLETED WITH ERRORS")

print("="*80)


üìä Overall Status:
   Phases Completed: 2/2
   Success Rate: 100.0%

üíæ Execution log saved to: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/1_AIMS_LOCAL_2026/config/validation_results/orchestration_log_20251210_143930.json

üéâ ALL PHASES COMPLETED SUCCESSFULLY!
