In [1]:
# Step 4: Data Quality Monitoring Dashboard
print("\n" + "="*60)
print("üìä DATA QUALITY DASHBOARD")
print("="*60)

if DQ_LOG_FILE.exists():
    # Read all DQ logs
    logs = []
    with open(DQ_LOG_FILE, 'r') as f:
        for line in f:
            if line.strip():
                logs.append(json.loads(line))
    
    if logs:
        df_logs = pd.DataFrame(logs)
        df_logs['timestamp'] = pd.to_datetime(df_logs['timestamp'])
        
        # --- HIGH-LEVEL METRICS ---
        total_runs = len(df_logs)
        passed_runs = len(df_logs[df_logs['status'] == 'PASSED'])
        failed_runs = len(df_logs[df_logs['status'] == 'FAILED'])
        error_runs = len(df_logs[df_logs['status'] == 'ERROR'])
        avg_score = df_logs['score'].mean()
        
        print(f"\nüìà Summary Statistics:")
        print(f"   Total Validations: {total_runs}")
        print(f"   ‚úÖ Passed: {passed_runs} ({passed_runs/total_runs*100:.1f}%)")
        print(f"   ‚ùå Failed: {failed_runs} ({failed_runs/total_runs*100:.1f}%)")
        print(f"   üí• Errors: {error_runs}")
        print(f"   üìä Average Quality Score: {avg_score:.1f}%")
        
        # --- RECENT ACTIVITY TABLE ---
        print("\nüìã Recent Validation Runs:")
        display_cols = ['timestamp', 'file', 'status', 'score']
        display(df_logs[display_cols].sort_values('timestamp', ascending=False).head(15))
        
        # --- TREND CHART ---
        if len(df_logs) > 1:
            print("\nüìà Quality Score Trend:")
            plt.figure(figsize=(12, 4))
            df_plot = df_logs.sort_values('timestamp')
            plt.plot(df_plot['timestamp'], df_plot['score'], marker='o', linestyle='-', linewidth=2)
            plt.axhline(y=100, color='g', linestyle='--', alpha=0.3, label='Perfect Score')
            plt.axhline(y=90, color='orange', linestyle='--', alpha=0.3, label='Warning Threshold')
            plt.title('Data Quality Score Over Time')
            plt.xlabel('Timestamp')
            plt.ylabel('Quality Score (%)')
            plt.ylim(0, 105)
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.show()
        
        # --- FAILURES BREAKDOWN ---
        if failed_runs > 0:
            print(f"\n‚ùå Failed Files ({failed_runs} total):")
            failed_df = df_logs[df_logs['status'] == 'FAILED'][['timestamp', 'file', 'score']].copy()
            display(failed_df.sort_values('score'))
        
    else:
        print("\n‚ö†Ô∏è  DQ log file exists but is empty.")
else:
    print("\n‚ùå DQ log file not found. Run Notebook 02 (Ingestion) first.")


üìä DATA QUALITY DASHBOARD


NameError: name 'DQ_LOG_FILE' is not defined

In [None]:
# Step 5: Alerting System (Simulation)
print("\n" + "="*60)
print("üö® ACTIVE ALERTS")
print("="*60)

if 'df_logs' in locals() and not df_logs.empty:
    # Filter for failures
    failures = df_logs[df_logs['status'] == 'FAILED'].copy()
    
    if not failures.empty:
        print(f"\n‚ö†Ô∏è  Found {len(failures)} failed runs requiring attention.\n")
        
        for idx, (_, row) in enumerate(failures.iterrows(), 1):
            # Construct Alert Payload (for Slack/Teams/PagerDuty integration)
            alert_payload = {
                "alert_type": "DataQualityFailure",
                "severity": "High" if row['score'] < 70 else "Medium",
                "source": "AIMS_Ingestion_Pipeline",
                "timestamp": row['timestamp'].isoformat(),
                "file": row['file'],
                "score": row['score'],
                "failed_checks": row.get('details', {}).get('failed_count', 'Unknown'),
                "action_required": "Review quarantined file and fix data quality issues"
            }
            
            print(f"üî¥ ALERT #{idx}: {row['file']}")
            print(f"   Severity: {alert_payload['severity']}")
            print(f"   Score: {row['score']:.1f}%")
            print(f"   Time: {row['timestamp']}")
            print(f"   Payload: {json.dumps(alert_payload, indent=2)}")
            print("-" * 60)
    else:
        print("\n‚úÖ No active alerts. All validations passed!")
else:
    print("\n‚ö†Ô∏è  No data available for alert analysis.")

In [None]:
# Step 3: View Watermark Status
print("="*60)
print("üìã WATERMARK STATUS - Processed Files")
print("="*60)

if WATERMARK_FILE.exists():
    with open(WATERMARK_FILE, 'r') as f:
        watermarks = json.load(f)
    
    if watermarks:
        df_watermarks = pd.DataFrame(
            list(watermarks.items()), 
            columns=['Filename', 'Processed_Timestamp']
        )
        df_watermarks['Processed_Timestamp'] = pd.to_datetime(df_watermarks['Processed_Timestamp'])
        df_watermarks = df_watermarks.sort_values('Processed_Timestamp', ascending=False)
        
        print(f"\n‚úÖ Total Processed Files: {len(df_watermarks)}")
        print("\nMost Recent Ingestions:")
        display(df_watermarks.head(10))
    else:
        print("\n‚ö†Ô∏è  No files have been processed yet.")
else:
    print("\n‚ùå Watermark file not found. Run Notebook 02 (Ingestion) first.")

In [None]:
# Step 2: Configuration - Point to Local State Files
BASE_DIR = Path("/home/sanmi/Documents/HS2/HS2_PROJECTS_2025/AIMS_LOCAL")
STATE_DIR = BASE_DIR / "data/state"

WATERMARK_FILE = STATE_DIR / "watermarks.json"
DQ_LOG_FILE = STATE_DIR / "dq_logs.jsonl"

print("‚úÖ Monitoring Configuration:")
print(f"   State Directory: {STATE_DIR}")
print(f"   Watermarks: {WATERMARK_FILE}")
print(f"   DQ Logs: {DQ_LOG_FILE}")
print(f"   Files exist: Watermarks={WATERMARK_FILE.exists()}, Logs={DQ_LOG_FILE.exists()}")

In [None]:
# Step 1: Import Libraries
import pandas as pd
import json
import os
from pathlib import Path
import matplotlib.pyplot as plt
from IPython.display import display

print("‚úÖ Libraries imported successfully")

# AIMS Data Quality Monitoring Dashboard

This notebook provides monitoring and observability for the AIMS Data Platform.

## Purpose
- View watermark status (which files have been processed)
- Review Data Quality validation results
- Analyze DQ trends and patterns
- Generate alerts for failures

## Local Execution
Reads logs from `data/state/` directory generated by Notebook 02