# VA Patronage Data Freshness Monitor (Modularized)

**Purpose:** Daily job that monitors data freshness for the modularized VA patronage pipeline  
**Alert Mechanism:** The notebook fails if processing gaps or critical freshness issues are detected, which triggers Databricks email notifications  
**Schedule:** Daily automated execution via Databricks Jobs  
**Time Zone:** All timestamps and comparisons are in UTC to ensure consistency across data sources and processing times.

## 1. Initialize Environment

In [None]:
# Import required modules and initialize modular pipeline configuration
from datetime import datetime, timedelta, timezone
from typing import Optional, Dict, Any, List
import sys

from databricks.sdk.runtime import dbutils

# Databricks: put the project root on sys.path so `import patronage_modularized` resolves.
# The root is taken as a fixed-depth prefix of this notebook's own workspace path.
notebook_path = (
    dbutils.notebook.entry_point.getDbutils()
    .notebook()
    .getContext()
    .notebookPath()
    .get()
)
parts = notebook_path.strip("/").split("/")
if parts[0] != "Workspace":
    # Path has no leading "Workspace" segment: prepend it and keep three segments.
    base = "/Workspace/" + "/".join(parts[:3])
else:
    base = "/" + "/".join(parts[:4])

if base not in sys.path:
    sys.path.insert(0, base)

# These imports must come after the sys.path adjustment above.
from patronage_modularized import config as pipeline
import patronage_modularized.discovery as discovery

PIPELINE_CONFIG = pipeline.PIPELINE_CONFIG
TARGET_TABLE_NAME = pipeline.PATRONAGE_TABLE_NAME

# Enable verbose logging so DEBUG messages are shown
pipeline.LOGGING_VERBOSE = True

# Run banner: title, separator, execution time, target table, purpose.
execution_stamp = datetime.now(timezone.utc).strftime(pipeline.PY_DATETIME_FORMAT)
for banner_line in (
    'VA Patronage Data Freshness Monitor (Modularized)',
    '=' * 50,
    f"Execution Time (UTC): {execution_stamp} UTC",
    f'Target Table: {TARGET_TABLE_NAME}',
    'Purpose: Daily gap detection with automated alerts (UTC aligned)',
):
    pipeline.log_message(banner_line)

## 2. Data Freshness Analysis

In [None]:
# Data freshness monitoring: helper functions, per-source configuration, and the monitoring run

def _normalize_dbutils_dir(path: str) -> str:
    """Return `path` in the form used with dbutils.fs.ls.

    Two spellings are rewritten to `dbfs:/mnt/...`:
    `dbfs/mnt/...` (colon missing) and `/mnt/...` (bare mount path).
    Every other value, including an empty or None path, is returned untouched.
    """
    # (prefix to detect, text that replaces that prefix); first match wins.
    rewrites = (
        ('dbfs/mnt/', 'dbfs:/mnt/'),
        ('/mnt/', 'dbfs:/mnt/'),
    )

    if path:
        for prefix, replacement in rewrites:
            if path.startswith(prefix):
                return replacement + path[len(prefix):]

    return path


def get_latest_timestamp(config: Dict[str, Any]) -> Optional[datetime]:
    """Return the most recent timestamp for one monitored source, or None.

    The lookup is selected by ``config['type']``:

    * ``'table'``: MAX(SDP_Event_Created_Timestamp) over rows of
      TARGET_TABLE_NAME with ``Batch_CD = config['batch_cd']`` and
      ``RecordStatus = true``.
    * ``'delta'``: ``lastModified`` from DESCRIBE DETAIL on the delta table
      path stored under the SCD source's ``pt_sources`` in PIPELINE_CONFIG.
    * ``'directory'``: newest ``modificationTime`` among the non-directory
      entries of ``config['path']``, returned as a UTC-aware datetime. Any
      exception while listing is logged at WARN and None is returned.

    Any other type returns None.
    """
    kind = config.get('type')

    if kind == 'table':
        query = f"""
            SELECT MAX(SDP_Event_Created_Timestamp) as latest_timestamp
            FROM {TARGET_TABLE_NAME}
            WHERE Batch_CD = '{config['batch_cd']}' AND RecordStatus = true
        """
        rows = spark.sql(query).collect()
        if not rows:
            return None
        newest = rows[0]['latest_timestamp']
        return newest if newest else None

    if kind == 'delta':
        delta_path = PIPELINE_CONFIG[pipeline.SOURCE_TYPE_SCD]['pt_sources']['delta_table']
        detail_rows = spark.sql(f"DESCRIBE DETAIL delta.`{delta_path}`").collect()
        if detail_rows:
            return detail_rows[0]['lastModified']
        return None

    if kind == 'directory':
        dmdc_dir = _normalize_dbutils_dir(config.get('path'))
        try:
            pipeline.log_message(f'   Checking directory: {dmdc_dir}', level='DEBUG')
            entries = dbutils.fs.ls(dmdc_dir)
            pipeline.log_message(f'   Found {len(entries)} items in directory', level='DEBUG')

            if not entries:
                pipeline.log_message('   Directory is empty', level='DEBUG')
                return None

            # Entries whose path ends with '/' are treated as sub-directories and skipped.
            only_files = [entry for entry in entries if not entry.path.endswith('/')]
            pipeline.log_message(f'   Filtered to {len(only_files)} files', level='DEBUG')

            if not only_files:
                pipeline.log_message('   No files found (all items were directories)', level='DEBUG')
                return None

            newest_file = max(only_files, key=lambda entry: entry.modificationTime)
            pipeline.log_message(f'   Latest file: {newest_file.name}', level='DEBUG')

            # modificationTime is divided by 1000 before conversion
            # (presumably epoch milliseconds; confirm against dbutils docs).
            return datetime.fromtimestamp(newest_file.modificationTime / 1000, tz=timezone.utc)
        except Exception as exc:
            # Best effort: an unreadable directory is reported as "no timestamp".
            pipeline.log_message(f'   Error reading export directory: {str(exc)}', level='WARN')
            pipeline.log_message(f'   Directory path checked: {dmdc_dir}', level='WARN')
            return None

    return None


def _add_months_utc(dt: datetime, months: int) -> datetime:
    """Add calendar months to a datetime and return the result in UTC.

    Args:
        dt: Starting point. A naive value is taken to already be in UTC. An
            aware value in another zone is converted to UTC first, so the
            instant is preserved instead of its wall-clock fields being
            relabelled as UTC.
        months: Number of calendar months to add (may be negative).

    Returns:
        A UTC-aware datetime in the target month with the same UTC time of
        day. When the source day does not exist in the target month
        (e.g. Jan 31 + 1 month), the day is clamped to that month's last day.

    Raises:
        ValueError: If the resulting year is outside the range datetime supports.
    """
    from calendar import monthrange  # local import: only this helper needs it

    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    else:
        dt = dt.astimezone(timezone.utc)

    # Zero-based month index lets divmod carry whole years, including for
    # negative offsets (floor division rounds toward the earlier year).
    years_carried, month_index = divmod(dt.month - 1 + months, 12)
    year = dt.year + years_carried
    month = month_index + 1

    last_day = monthrange(year, month)[1]
    return dt.replace(year=year, month=month, day=min(dt.day, last_day))


def log_freshness_check(source_name: str, config: Dict[str, Any], files: List):
    """Report freshness for one source and record any problems found.

    Reads the module-level ``current_time_utc`` and appends messages to the
    module-level ``critical_issues`` and ``gaps_detected`` lists. Returns None.

    Args:
        source_name: Display name used in log lines and issue messages.
        config: Source settings, also passed to get_latest_timestamp. If it
            has ``get_due_date`` (callable: latest timestamp -> due datetime)
            the source is OVERDUE once the current time passes that due date;
            otherwise it is OVERDUE when the whole days since the latest
            timestamp exceed ``expected_days``. An optional
            ``get_next_expected`` (callable: current time -> label) adds a
            "Next expected" log line.
        files: Unprocessed files for this source. Element ``[1]`` of each
            entry is used as its timestamp (a datetime or an ISO-format string).
    """
    stamp_format = '%Y-%m-%d %H:%M:%S %Z'

    pipeline.log_message(f'\n{source_name} FILES')
    latest_ts = get_latest_timestamp(config)

    if not latest_ts:
        pipeline.log_message('   Latest processed: NO DATA')
        critical_issues.append(f'{source_name} has no processed data')
    else:
        # A naive timestamp is taken to be UTC so it compares with current_time_utc.
        if hasattr(latest_ts, 'tzinfo') and latest_ts.tzinfo is None:
            latest_ts = latest_ts.replace(tzinfo=timezone.utc)

        if 'get_due_date' not in config:
            # Age-based rule: compare whole days elapsed with the allowed window.
            age_days = (current_time_utc - latest_ts).days
            within_window = age_days <= config['expected_days']
            verdict = 'ON TIME' if within_window else 'OVERDUE'
            pipeline.log_message(f"   Latest processed: {latest_ts.strftime(stamp_format)} ({age_days}d ago) - {verdict}")

            if not within_window:
                critical_issues.append(f"{source_name} is overdue by {age_days} days (UTC)")
        else:
            # Deadline-based rule: the source supplies its own due date.
            deadline = config['get_due_date'](latest_ts)
            if deadline.tzinfo is None:
                deadline = deadline.replace(tzinfo=timezone.utc)

            met_deadline = current_time_utc <= deadline
            verdict = 'ON TIME' if met_deadline else 'OVERDUE'
            pipeline.log_message(f"   Latest processed: {latest_ts.strftime(stamp_format)} - Due by: {deadline.strftime(stamp_format)} - {verdict}")

            if not met_deadline:
                late_by_days = (current_time_utc - deadline).days
                critical_issues.append(f"{source_name} is overdue by {late_by_days} days (UTC)")

        if 'get_next_expected' in config:
            upcoming = config['get_next_expected'](current_time_utc)
            pipeline.log_message(f'   Next expected: {upcoming}')

    if not files:
        pipeline.log_message('   No unprocessed files detected')
        return

    pipeline.log_message(f'   Unprocessed files available: {len(files)}')
    if not latest_ts:
        # Without a processed timestamp there is nothing to measure a gap against.
        return

    newest_available = max(entry[1] for entry in files)
    if isinstance(newest_available, str):
        newest_available = datetime.fromisoformat(newest_available)
    if newest_available.tzinfo is None:
        newest_available = newest_available.replace(tzinfo=timezone.utc)

    if newest_available > latest_ts:
        lag_seconds = (newest_available - latest_ts).total_seconds()
        gap_hours = round(lag_seconds / 3600)
        pipeline.log_message(f'   GAP DETECTED: {gap_hours}h between latest available and processed (UTC)')
        gaps_detected.append(f'{source_name}: {gap_hours}h processing gap with {len(files)} unprocessed files (UTC)')


# One reference "now" (UTC) shared by every check in this run, plus the
# module-level accumulators that log_freshness_check appends to.
current_time_utc = datetime.now(timezone.utc)
gaps_detected = []
critical_issues = []

# Per-source monitoring rules, keyed by the display name used in log output.
#   type              -> lookup strategy used by get_latest_timestamp
#   batch_cd          -> Batch_CD filter for 'table' sources; also the key used
#                        to look up that source's unprocessed files
#   path              -> directory to list for 'directory' sources
#   expected_days     -> whole days allowed since the latest timestamp
#   get_due_date      -> callable(latest_ts) -> due datetime (replaces expected_days)
#   get_next_expected -> callable(now) -> label logged as "Next expected"
MONITORING_CONFIG = {
    'CG': {
        'type': 'table',
        'batch_cd': pipeline.SOURCE_TYPE_CG,
        'expected_days': 1,
    },
    'SCD': {
        'type': 'table',
        'batch_cd': pipeline.SOURCE_TYPE_SCD,
        'expected_days': 4,
        # weekday(): Monday=0. Mon/Tue -> 'Wednesday'; Wed-Fri -> 'Saturday';
        # Sat/Sun -> 'Next Wednesday'.
        'get_next_expected': lambda today: 'Wednesday' if today.weekday() < 2 else 'Saturday' if today.weekday() < 5 else 'Next Wednesday',
    },
    'PT Delta Table': {
        'type': 'delta',
        # Due one calendar month after the latest modification.
        'get_due_date': lambda latest_ts: _add_months_utc(latest_ts, 1),
    },
    'DMDC Export': {
        'type': 'directory',
        'path': pipeline.DMDC_EXPORT_DIR,
        'expected_days': 4,
    }
}

pipeline.log_message('Data Source Freshness & Gap Analysis (UTC)')
pipeline.log_message('=' * 50)

try:
    unprocessed_files = discovery.discover_unprocessed_files('update')

    for source_name, config in MONITORING_CONFIG.items():
        # A failure in one source is recorded as a critical issue and must not
        # stop the remaining sources from being checked.
        try:
            if config.get('type') == 'directory':
                # Directory sources are not matched against discovered files.
                files = []
            else:
                files = unprocessed_files.get(config.get('batch_cd', ''), [])

            log_freshness_check(source_name, config, files)
        except Exception as e:
            pipeline.log_message(f'ERROR processing {source_name}: {str(e)}', level='ERROR')
            critical_issues.append(f'{source_name} monitoring failed: {str(e)}')

    pipeline.log_message(f"\nMonitoring completed at {current_time_utc.strftime('%Y-%m-%d %H:%M:%S %Z')}")

except Exception as e:
    # Run-level failure (e.g. file discovery itself failed): record it so the
    # alert logic still fires, and log at ERROR like the per-source handler.
    critical_issues.append(f'Monitoring execution failed: {str(e)}')
    pipeline.log_message(f'Freshness monitoring failed: {str(e)}', level='ERROR')

pipeline.log_message('=' * 50)

## 3. Gap Detection Summary & Alert Logic

In [None]:
# Summarize the findings of the freshness run and decide whether to alert
pipeline.log_message('\nGap Detection Summary')

if not gaps_detected:
    pipeline.log_message('No processing gaps detected')
else:
    pipeline.log_message(f'PROCESSING GAPS DETECTED: {len(gaps_detected)}')
    for number, gap_text in enumerate(gaps_detected, start=1):
        pipeline.log_message(f'   {number}. {gap_text}')

if not critical_issues:
    pipeline.log_message('\nNo critical issues detected')
else:
    pipeline.log_message(f'\nCRITICAL ISSUES DETECTED: {len(critical_issues)}')
    for number, issue_text in enumerate(critical_issues, start=1):
        pipeline.log_message(f'   {number}. {issue_text}')

# Any gap or critical issue is enough to require an alert.
total_issues = len(critical_issues) + len(gaps_detected)
alert_required = bool(total_issues)

pipeline.log_message('\nAlert Decision:')
pipeline.log_message(f'   Total Issues: {total_issues}')
pipeline.log_message(f"   Alert Required: {'YES' if alert_required else 'NO'}")
pipeline.log_message(
    '   Action: Job will fail to trigger team notifications'
    if alert_required
    else '   Action: Job will complete successfully'
)

## 4. Forced Failure for Alert Notifications

In [None]:
# Force notebook failure if gaps or critical issues detected.
# `alert_required`, `gaps_detected` and `critical_issues` come from the earlier
# cells; this cell only reports them and, when needed, raises.
if alert_required:
    # Failure report: banner, itemized findings, then suggested actions.
    pipeline.log_message('\n' + '=' * 110)
    pipeline.log_message('CRITICAL: DATA PROCESSING GAPS DETECTED')
    pipeline.log_message('=' * 110)

    pipeline.log_message('\nIssue Summary:')
    if gaps_detected:
        pipeline.log_message('\nProcessing Gaps:')
        for gap in gaps_detected:
            pipeline.log_message(f'   - {gap}')

    if critical_issues:
        pipeline.log_message('\nCritical Issues:')
        for issue in critical_issues:
            pipeline.log_message(f'   - {issue}')

    pipeline.log_message('\nImmediate Actions Required:')
    pipeline.log_message('   1. Review unprocessed files in source directories')
    pipeline.log_message('   2. Execute patronage pipeline to process pending data')
    pipeline.log_message('   3. Investigate any system or data delivery issues')
    pipeline.log_message('   4. Verify data source availability and accessibility')

    pipeline.log_message(f"\nAlert Time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S %Z')}")
    pipeline.log_message('Team will be notified via Databricks job failure email')
    pipeline.log_message('=' * 110)

    # The uncaught exception is what fails the notebook run. Email delivery is
    # not done here; it presumably relies on the job's failure notification
    # settings (confirm in the Databricks job configuration).
    raise Exception(
        f"VA Patronage Data Freshness Alert: {len(gaps_detected)} processing gaps and "
        f"{len(critical_issues)} critical issues detected. Immediate attention required."
    )
else:
    # Nothing to report: log a success summary and let the cell finish normally.
    pipeline.log_message('\n' + '=' * 110)
    pipeline.log_message('SUCCESS: All data sources are current')
    pipeline.log_message('\nAll systems operating normally:')
    pipeline.log_message('   - No processing gaps detected')
    pipeline.log_message('   - No critical issues identified')
    pipeline.log_message('   - Data freshness within acceptable parameters')
    # The date shown is simply one day after the current UTC time.
    pipeline.log_message(f"\nNext monitoring check: {(datetime.now(timezone.utc) + timedelta(days=1)).strftime('%Y-%m-%d')}")