# AIMS Data Ingestion Pipeline with DQ Gatekeeping

This notebook executes the data ingestion process with Data Quality checks.

## Purpose
- Load data files from local source directory
- Validate each file against its DQ config
- If PASSED: Process and mark as complete
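- If FAILED: Block the file from ingestion, flag it for quarantine (simulated in this notebook — the file is not moved), and log the result for review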
- Track processed files using watermarks

## Local Execution
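By default this notebook runs entirely locally using the sample data in `data/Samples_LH_Bronze_Aims_26_parquet/`. When a Microsoft Fabric Lakehouse is detected (`/lakehouse/default/Files` exists), it switches to the Lakehouse paths instead.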

In [None]:
# Step 1: Install the library (if not already installed in the environment)
from pathlib import Path

# Detect Environment (avoid slow/hanging imports locally)
IS_FABRIC = Path("/lakehouse/default/Files").exists()
if IS_FABRIC:
    try:
        from notebookutils import mssparkutils  # noqa: F401
    except Exception:
        mssparkutils = None
    print("Running in Microsoft Fabric")
else:
    print("Running Locally")

if not IS_FABRIC:
    wheel_candidates = []
    wheel_candidates += sorted(Path("../dq_great_expectations/dq_package_dist").glob("fabric_data_quality-*.whl"))
    wheel_candidates += sorted(Path("../dist").glob("fabric_data_quality-*.whl"))
    wheel_candidates += sorted(Path("../../2_DATA_QUALITY_LIBRARY/dist").glob("fabric_data_quality-*.whl"))

    if not wheel_candidates:
        raise FileNotFoundError(
            "No fabric_data_quality wheel found. Build one in 2_DATA_QUALITY_LIBRARY/dist or place it in dq_great_expectations/dq_package_dist."
        )

    wheel_path = wheel_candidates[-1]
    print(f"Installing: {wheel_path}")
    %pip install {wheel_path} --force-reinstall
    # %pip install adlfs
else:
    # Fabric: We assume the library is installed via the Fabric Environment (Workspace Settings).
    # If you need to install it manually from Files, uncomment the line below:
    # %pip install /lakehouse/default/Files/libs/fabric_data_quality-*.whl --force-reinstall
    pass

In [None]:
# Step 2: Configuration & Setup
import os
import json
from datetime import datetime
import glob
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
import pyarrow.parquet as pq  # Added for empty file detection

# Define Paths based on Environment
if IS_FABRIC:
    # Fabric Paths
    BASE_DIR = Path("/lakehouse/default/Files")
    
    # Try to load .env from the Lakehouse Files root
    env_path = BASE_DIR / ".env"
    if env_path.exists():
        load_dotenv(dotenv_path=env_path)
        print(f"Loaded configuration from {env_path}")

outputs
# Step 3: Import DQ Libraries
from dq_framework import DataLoader, DataQualityValidator

print("Libraries imported successfully")

Configuration:
   Environment: Local
   Bronze (Source): /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL/data/Samples_LH_Bronze_Aims_26_parquet
   Silver (Target): /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL/data/Silver
   Gold (Target):   /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL/data/Gold
   DQ Configs: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL/dq_great_expectations/generated_configs
   Watermarks: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL/data/state/watermarks.json
   DQ Logs: /home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL/data/state/dq_logs.jsonl


In [9]:
# Step 4: Watermark & Logging Helper Functions

def load_watermarks():
    """Load watermarks from the JSON file at ``WATERMARK_FILE``.

    Returns:
        dict: The decoded JSON object (``save_watermark`` stores
        ``file name -> ISO timestamp``). An empty dict is returned when the
        file does not exist or is empty.

    Raises:
        json.JSONDecodeError: If the file has content that is not valid JSON.
    """
    # Open directly instead of checking exists() first, so the file cannot
    # disappear between the check and the read.
    try:
        with open(WATERMARK_FILE, 'r', encoding='utf-8') as f:
            raw_text = f.read()
    except FileNotFoundError:
        return {}

    # A zero-byte file (e.g. left behind by an interrupted write) means
    # "no watermarks yet" rather than a fatal decode error.
    if not raw_text.strip():
        return {}
    return json.loads(raw_text)

def save_watermark(file_name):
    """Mark a file as processed.

    Reads the current watermarks, sets ``file_name`` to the current local
    time (ISO 8601 string), and writes the full mapping back to
    ``WATERMARK_FILE``.

    Args:
        file_name: Key to record; the pipeline passes the parquet file's name.
    """
    watermarks = load_watermarks()
    watermarks[file_name] = datetime.now().isoformat()

    # Make sure the state directory exists on a first run.
    WATERMARK_FILE.parent.mkdir(parents=True, exist_ok=True)

    # Write to a sibling temp file and rename it over the target, so an
    # interrupted write never leaves a truncated watermark file behind.
    temp_file = WATERMARK_FILE.with_name(WATERMARK_FILE.name + ".tmp")
    with open(temp_file, 'w', encoding='utf-8') as f:
        json.dump(watermarks, f, indent=2)
    temp_file.replace(WATERMARK_FILE)

def is_processed(file_name):
    """Return True when ``file_name`` already has a watermark entry."""
    # The watermark store is re-read on every call, so entries written since
    # the previous check are seen.
    return file_name in load_watermarks()

def log_dq_result(file_name, status, score, details=None):
    """Append one validation result to the JSONL log at ``DQ_LOG_FILE``.

    Args:
        file_name: Name of the file the result belongs to.
        status: Outcome label; the pipeline uses "PASSED", "FAILED" and "ERROR".
        score: Score reported for the file.
        details: Optional dict of extra information; stored as ``{}`` when
            omitted or empty.
    """
    entry = {
        "timestamp": datetime.now().isoformat(),
        "file": file_name,
        "status": status,
        "score": score,
        "details": details or {}
    }
    # default=str: details may carry values json cannot encode natively
    # (timestamps, paths, numpy scalars). Storing their string form keeps a
    # logging problem from being reported as a pipeline failure.
    # Serialise before opening the file so a failure writes nothing.
    serialized_entry = json.dumps(entry, default=str)
    with open(DQ_LOG_FILE, "a", encoding="utf-8") as f:
        f.write(serialized_entry + "\n")

def quarantine_file(file_path, reason):
    """Report a file as quarantined.

    This is a simulation: nothing is moved or copied, a message with the
    file's base name and the reason is printed.
    """
    quarantined_name = os.path.basename(file_path)
    message = f"   QUARANTINED: {quarantined_name} -> Reason: {reason}"
    print(message)

print("Helper functions defined")

Helper functions defined


In [10]:
# Step 5: Execute Ingestion Pipeline with DQ Gatekeeping
#
# For every parquet file in the Bronze directory:
#   1. skip it when it is already watermarked or has zero rows,
#   2. validate it against `<file stem>_validation.yml` when that config exists,
#   3. on PASS copy it to Silver and watermark it; on FAIL flag it for quarantine.
import shutil

# Value passed as `sample_size` to DataLoader.load_data for each file.
DQ_SAMPLE_SIZE = 100000

# Discover all parquet files in source directory (Bronze).
# Sorted so the processing order (and therefore the logs) is reproducible.
source_files = sorted(DATA_PATH.glob("*.parquet"))
print(f"Starting Ingestion Pipeline for {len(source_files)} files...\n")

processed_count = 0
passed_count = 0
failed_count = 0   # DQ failures AND pipeline errors (kept for backward compatibility)
error_count = 0    # pipeline errors only (subset of failed_count)
skipped_count = 0

# Use tqdm for progress bar
for file_path in tqdm(source_files, desc="Processing Files"):
    file_name = file_path.name

    # Check Watermark (Skip if already processed)
    if is_processed(file_name):
        print(f"Skipping {file_name} (Already Processed)")
        skipped_count += 1
        continue

    # Check for Empty File (0 rows)
    # We skip these to prevent skewing DQ metrics
    try:
        if pq.read_metadata(file_path).num_rows == 0:
            print(f"Skipping {file_name} (Empty File - 0 rows)")
            skipped_count += 1
            continue
    except Exception as metadata_error:
        # Best-effort check: report the problem, then let the main pipeline
        # below handle (and log) an unreadable file.
        print(f"Warning: Could not read parquet metadata for {file_name}: {metadata_error}")

    print(f"Processing: {file_name}...")

    try:
        # --- PHASE 1: DQ GATEKEEPING (Validation) ---
        # Look for corresponding validation config. The name is built from
        # the stem so only the trailing extension is dropped.
        config_path = CONFIG_DIR / f"{file_path.stem}_validation.yml"
        dq_checked = config_path.exists()

        if not dq_checked:
            print("   Warning: No validation config found. Skipping DQ check.")
            validation_passed = True  # or False for strict mode
            score = 0.0
            failures = []
        else:
            # Load data and validate
            validator = DataQualityValidator(config_path=str(config_path))
            df_batch = DataLoader.load_data(str(file_path), sample_size=DQ_SAMPLE_SIZE)
            result = validator.validate(df_batch)

            validation_passed = result['success']
            score = result['success_rate']
            failures = result.get('failed_expectations', [])

            # Log the result
            log_dq_result(
                file_name,
                "PASSED" if validation_passed else "FAILED",
                score,
                {"failed_count": len(failures), "failures": failures[:5]}  # Log first 5 failures
            )

        # --- PHASE 2: ACTION (Ingest to Silver) ---
        if validation_passed:
            if dq_checked:
                print(f"   DQ Passed (Score: {score:.1f}%). Ingesting to Silver...")
            else:
                # No validation ran, so do not report a pass with a 0.0% score.
                print("   DQ Not Checked (no validation config). Ingesting to Silver...")

            # Define Silver Path
            silver_path = SILVER_DIR / file_name

            # In a real Spark/Fabric scenario, you would read into a DataFrame and write to Delta
            # Example:
            # df = spark.read.parquet(str(file_path))
            # df.write.format("delta").mode("overwrite").save(str(silver_path))

            # For this local/demo simulation, we'll just copy the file if it doesn't exist
            if not silver_path.exists():
                # Create the target directory on first use so the copy cannot
                # fail just because Silver has not been created yet.
                silver_path.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(file_path, silver_path)
                print(f"   -> Copied to {silver_path}")
            else:
                print("   -> File already exists in Silver")

            # Mark as processed
            save_watermark(file_name)
            print("   Marked as processed.")
            passed_count += 1

        else:
            print(f"   DQ FAILED (Score: {score:.1f}%). Blocked from ingestion.")
            quarantine_file(file_path, f"Failed {len(failures)} checks")
            failed_count += 1

        processed_count += 1

    except Exception as e:
        # Any failure while validating or ingesting one file is logged and
        # counted; the loop then moves on to the next file.
        print(f"   Pipeline Error: {e}")
        log_dq_result(file_name, "ERROR", 0.0, {"error": str(e)})
        failed_count += 1
        error_count += 1

print(f"\n{'='*60}")
print("Pipeline Execution Complete")
print(f"{'='*60}")
print(f"Total Files: {len(source_files)}")
print(f"Processed: {processed_count}")
print(f"Passed DQ: {passed_count}")
print(f"Failed DQ: {failed_count}")
print(f"Pipeline Errors (included in Failed DQ): {error_count}")
print(f"Skipped (Already Processed or Empty): {skipped_count}")
print("\nView results in Notebook 03 (Monitoring Dashboard)")

Starting Ingestion Pipeline for 0 files...



Processing Files: 0it [00:00, ?it/s]


Pipeline Execution Complete
Total Files: 0
Processed: 0
Passed DQ: 0
Failed DQ: 0
Skipped (Already Processed or Empty): 0

View results in Notebook 03 (Monitoring Dashboard)



