# 01. AIMS Data Profiling & Configuration
**Role: The Architect**

This notebook is the first step in the Data Quality Lifecycle.
1.  **Profiles** your raw data to understand its structure and quality.
2.  **Generates** validation rules (YAML configurations) automatically.
3.  **Validates** a sample batch to ensure the rules are sensible.

**Output:** A set of YAML configuration files used by the Ingestion Pipeline (Notebook 02).

In [None]:
# --- CONFIGURATION ---
# Resolves where the notebook is running (Microsoft Fabric vs. a local
# machine) and derives the input/output locations and worker count from that.
import os
import sys
from pathlib import Path

# 1. Detect Environment
# `notebookutils` is importable only inside Fabric, so a failed import is the
# signal that we are running locally.
try:
    from notebookutils import mssparkutils
except ImportError:
    IS_FABRIC = False
    print("Running Locally")
else:
    IS_FABRIC = True
    print("Running in Microsoft Fabric")

# 2. Define Paths based on Environment
if not IS_FABRIC:
    # Local: values come from a .env file / environment variables, falling
    # back to the defaults below when a variable is not set.
    from dotenv import load_dotenv
    load_dotenv()

    BASE_DIR = Path(os.getenv("BASE_DIR", "/home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL"))
    DATA_PATH = BASE_DIR / os.getenv("DATA_PATH", "data/Samples_LH_Bronze_Aims_26_parquet")
    OUTPUT_DIR = BASE_DIR / os.getenv("CONFIG_DIR", "dq_great_expectations/generated_configs")

    # Performance Settings for Local
    NUM_WORKERS = 4
else:
    # Fabric: paths live under the default Lakehouse, which must be attached.
    BASE_DIR = Path("/lakehouse/default/Files")

    # Adjust these subpaths to match your Fabric Lakehouse structure
    DATA_PATH = BASE_DIR / "data/Samples_LH_Bronze_Aims_26_parquet"
    OUTPUT_DIR = BASE_DIR / "dq_configs"

    # Performance Settings for Spark/Fabric
    NUM_WORKERS = 8

# Common Settings (shared by both environments)
SAMPLE_SIZE = 100000

# Later cells pass these to APIs as plain strings, so drop the Path wrappers.
DATA_PATH, OUTPUT_DIR = str(DATA_PATH), str(OUTPUT_DIR)

print(
    "Configuration:\n"
    f" Environment: {'Fabric' if IS_FABRIC else 'Local'}\n"
    f" Input: {DATA_PATH}\n"
    f" Output: {OUTPUT_DIR}\n"
    f" Workers: {NUM_WORKERS}"
)

# Configuration
Set up the input path, output destination, and performance settings.

In [None]:
# Install the library
# Local: Installs from the local dist folder
# Fabric: Uncomment the line below and ensure the wheel is uploaded to your Lakehouse Files/libs folder
#
# NOTE: `IS_FABRIC` is defined by the configuration cell, so that cell must be
# run before this one. `%pip` is an IPython line magic, so this cell only works
# inside a notebook kernel (it is not valid plain Python).

if not IS_FABRIC:
    # The wheel path is relative, so it depends on the kernel's working directory.
    # --force-reinstall makes pip reinstall the wheel even if it is already installed.
    %pip install ../dq_great_expectations/dq_package_dist/fabric_data_quality-1.1.2-py3-none-any.whl --force-reinstall
else:
    # Example Fabric install command (adjust path as needed)
    # %pip install /lakehouse/default/Files/libs/fabric_data_quality-1.1.2-py3-none-any.whl --force-reinstall
    # Nothing is installed on Fabric until the line above is uncommented.
    pass

In [None]:
# Import the standard tooling plus the data-quality framework installed by the
# previous cell. If the `dq_framework` import fails, (re)run the install cell.
import os
import pandas as pd
from dq_framework import DataProfiler, DataQualityValidator, ConfigLoader

# Reaching this line means every import above resolved.
print("Libraries imported successfully")

In [None]:
# Run the parallel profiling using the library's BatchProfiler
# Inputs (from the configuration cell): DATA_PATH, OUTPUT_DIR, NUM_WORKERS,
# SAMPLE_SIZE. Output: `results`, the value returned by
# BatchProfiler.run_parallel_profiling, which later cells iterate over.
from dq_framework import BatchProfiler

# Define custom thresholds (optional)
# These override the defaults in the library
custom_thresholds = {
    "severity_threshold": "medium",
    "null_tolerance": 5.0,  # tolerance (5%)
    "include_structural": True,
    "include_completeness": True,
    "include_validity": True,
}

# Always bind `results` before profiling. Without this, a missing input path
# left `results` undefined (NameError in the next cell) or, on a re-run, left
# stale results from a previous successful run in place.
results = []

if os.path.exists(DATA_PATH):
    results = BatchProfiler.run_parallel_profiling(
        input_dir=DATA_PATH,
        output_dir=OUTPUT_DIR,
        workers=NUM_WORKERS,
        sample_size=SAMPLE_SIZE,
        thresholds=custom_thresholds,
    )
else:
    print(f"Input path does not exist: {DATA_PATH}")

In [None]:
# Inspect one of the generated config files
# Prints the contents of the config produced for the first successfully
# profiled file, as a quick sanity check of what the profiler generated.
#
# `results` exists only if the profiling cell has been run, so check for the
# name before using it instead of failing with a NameError.
if 'results' in locals() and results:
    success_results = [r for r in results if r['status'] == 'success']
    if success_results:
        # NOTE: despite its name, `last_output` is the FIRST successful result;
        # the name is kept so any other cell that refers to it keeps working.
        last_output = success_results[0]['output']
        print(f"Reading generated config: {last_output}")

        # Read as UTF-8 explicitly so the output does not depend on the
        # platform's default encoding.
        with open(last_output, 'r', encoding='utf-8') as f:
            print(f.read())
    else:
        print("No successful results to inspect.")
else:
    print("No profiling results available. Please run the profiling cell first.")

## Part 2: Data Validation Workflow

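Now that we have profiled the data and generated a configuration (a set of expectations) for each file, we can use these configurations to validate batches of data. In this notebook, each profiled file is re-loaded as a sample and validated against its own generated configuration as a sanity check; the same workflow applies to new batches of data.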

This implements the workflow:
**Load New Data Batch** -> **Load YAML Config** -> **Run Validation** -> **Get Pass/Fail Results**

In [None]:
# 1. Iterate through all profiled files and validate them
# For every successfully profiled file: load its generated config, load a
# sample of the file, run the validator, and collect one summary row per file
# plus one detail row per failed expectation. Finishes by printing a dashboard.
from dq_framework import DataLoader
import pandas as pd
from IPython.display import display


def _summarise_validation(file_name, validation_results):
    """Convert one validation result into dashboard rows.

    Args:
        file_name: Name of the validated file, used to label the rows.
        validation_results: Mapping returned by ``validator.validate``. This
            cell reads the keys ``success``, ``success_rate``,
            ``successful_checks``, ``evaluated_checks`` and (optionally)
            ``failed_expectations``.

    Returns:
        A ``(summary_row, failure_rows)`` tuple: a dict for the summary table
        and a list of dicts for the failure-details table (empty on success).

    Raises:
        KeyError: If one of the required result keys is missing.
    """
    success = validation_results['success']
    # Treat a missing or None list the same way: no failed expectations.
    failed_expectations = validation_results.get('failed_expectations') or []

    summary_row = {
        'File': file_name,
        'Status': 'PASSED' if success else 'FAILED',
        'Score (%)': round(validation_results['success_rate'], 1),
        'Passed Checks': validation_results['successful_checks'],
        'Total Checks': validation_results['evaluated_checks'],
        'Failed Checks': len(failed_expectations),
    }

    failure_rows = []
    if not success:
        for failure in failed_expectations:
            details = failure.get('details')
            unexpected_pct = (
                details.get('unexpected_percent') if isinstance(details, dict) else None
            )
            unexpected_str = f"{unexpected_pct:.1f}%" if unexpected_pct is not None else "N/A"

            failure_rows.append({
                'File': file_name,
                # Not every failure entry is guaranteed to carry these keys
                # (e.g. a check that is not tied to a single column), so fall
                # back to a placeholder instead of raising.
                'Column': failure.get('column', 'N/A'),
                'Expectation': failure.get('expectation', 'N/A'),
                'Unexpected %': unexpected_str,
                'Details': str(failure.get('details', '')),
            })

    return summary_row, failure_rows


if 'results' in locals() and results:
    print(f"Starting Batch Validation for {len(results)} files...\n")

    validation_summary = []
    failure_details = []

    for i, result in enumerate(results):
        # Only files that were profiled successfully have a config to validate against.
        if result['status'] != 'success':
            continue

        # "\r" rewrites the same console line to act as a progress indicator.
        print(f"[{i+1}/{len(results)}] Validating: {result['file']}...", end="\r")

        # Setup paths
        config_path = result['output']
        data_file_path = os.path.join(DATA_PATH, result['file'])

        try:
            # 2. Initialize Validator
            validator = DataQualityValidator(config_path=config_path)

            # 3. Load Data Batch (Safe Mode) - same sample size as profiling
            df_batch = DataLoader.load_data(data_file_path, sample_size=SAMPLE_SIZE)

            # 4. Run Validation
            validation_results = validator.validate(df_batch)

            # 5. Collect Results
            summary_entry, file_failures = _summarise_validation(
                result['file'], validation_results
            )
        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole batch.
            # The file is recorded as ERROR instead.
            print(f"\nError validating {result['file']}: {str(e)}")
            validation_summary.append({
                'File': result['file'],
                'Status': 'ERROR',
                'Score (%)': 0.0,
                'Passed Checks': 0,
                'Total Checks': 0,
                'Failed Checks': 0
            })
        else:
            # Record rows only once everything succeeded, so a file can never
            # end up with both a PASSED/FAILED row and an ERROR row.
            validation_summary.append(summary_entry)
            failure_details.extend(file_failures)

    print(f"\n\n{'='*60}")
    print("VALIDATION DASHBOARD")
    print(f"{'='*60}")

    if validation_summary:
        df_summary = pd.DataFrame(validation_summary)

        # Calculate Stats (exact status match; ERROR rows are counted separately)
        total = len(df_summary)
        passed = int((df_summary['Status'] == 'PASSED').sum())
        failed = int((df_summary['Status'] == 'FAILED').sum())
        errors = int((df_summary['Status'] == 'ERROR').sum())
        # ERROR rows carry a score of 0.0 and are included in the average.
        avg_score = df_summary['Score (%)'].mean()

        # Display Metrics
        print(
            f"Total Files: {total} | Passed: {passed} | Failed: {failed} | "
            f"Errors: {errors} | Avg Score: {avg_score:.1f}%"
        )

        # Display Summary Table
        print("\nSummary Report:")
        display(df_summary)

        # Display Failure Details Table
        if failure_details:
            print("\nFailure Details Report:")
            df_failures = pd.DataFrame(failure_details)
            # Reorder columns for better readability
            cols = ['File', 'Column', 'Expectation', 'Unexpected %', 'Details']
            display(df_failures[cols])
        elif failed > 0:
            print("\nFailures detected but no details available.")
        elif errors > 0:
            print("\nNo validation failures detected, but some files could not be validated (see ERROR rows).")
        else:
            print("\nNo failures detected across all files.")
    else:
        print("No validation results generated.")

else:
    print("No profiling results available. Please run Part 1 first.")