In [None]:
import contextlib
import io
from dataclasses import dataclass

from pyspark.sql import SparkSession


@dataclass
class IntegrityCheck:
    """A titled SQL query that DataIntegrityChecker runs as one integrity check."""

    # Label included in the error message when the query returns rows.
    title: str
    # SQL text handed to `spark.sql`; any returned row counts as a failure.
    query: str


class DataIntegrityError(Exception):
    """Raised by DataIntegrityChecker.check when at least one check returns rows."""


@dataclass
class DataIntegrityChecker:
    """
    Run a list of integrity checks through Spark and fail if any finds rows.

    Accepts a spark session and a list of Integrity Checks, which simply
    consist of a title and a query. Every query is run; if any of them return
    results, `check` raises once at the end with the titles of all failing
    queries, as well as the first `max_rows` rows of their result set. If no
    queries return any results, `check` returns with no errors.
    """

    spark: SparkSession
    integrity_checks: list[IntegrityCheck]
    # Maximum number of result rows rendered per failing check.
    max_rows: int = 20

    def check(self) -> None:
        """
        Execute every integrity check and report all failures together.

        Raises:
            DataIntegrityError: if one or more queries return at least one
                row. It subclasses Exception, so handlers written for the
                previous bare Exception keep working.
        """
        failures = []
        for integrity_check in self.integrity_checks:
            df = self.spark.sql(integrity_check.query)
            if df.isEmpty():
                continue
            # Title and table are joined by a bare newline so the table's top
            # border lines up with the rows printed below it.
            failures.append(f"{integrity_check.title}\n{self._render_rows(df)}")

        if not failures:
            return

        error_messages = "\n".join(
            f"{index}. {message}" for index, message in enumerate(failures, start=1)
        )
        raise DataIntegrityError(f"Integrity Issues Found:\n{error_messages}")

    def _render_rows(self, df) -> str:
        """Return the untruncated table that `df.show` prints for the first `max_rows` rows."""
        # `DataFrame.show` is public API, unlike the private `_jdf` handle
        # (NOTE(review): `_jdf` is reportedly unavailable on Spark Connect
        # sessions - confirm for the target runtime). `show` only prints, so
        # its output is captured from stdout.
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):
            df.show(self.max_rows, truncate=False)
        # Drop the single newline that `print` appends to the rendered table.
        return buffer.getvalue().removesuffix("\n")

In [None]:
# Integrity checks for the NMIS device event tables.
# Each query selects the rows that VIOLATE an invariant, so an empty result
# means the check passed. Titles describe the violation because they are what
# gets printed under "Integrity Issues Found:" when a check fails.
# Queries carry no trailing semicolon: `spark.sql` takes a single statement.
checks = [
    # --- Uniqueness checks: any key occurring more than once is reported. ---
    IntegrityCheck(
        "Duplicate event_ids in events table.",
        """
        SELECT
            event_id,
            count(1) occurrences
        FROM edl_current.nmis_device_events
        GROUP by event_id
        HAVING occurrences > 1
        """,
    ),
    IntegrityCheck(
        "Duplicate event and sensor in temperature table.",
        """
        SELECT
            event_id,
            sensor,
            count(1) occurrences
        FROM edl.nmis_device_event_temperatures
        GROUP by event_id, sensor
        HAVING occurrences > 1
        """,
    ),
    IntegrityCheck(
        "Duplicate event and interface_id in interface table.",
        """
        SELECT
            event_id,
            interface_id,
            count(1) occurrences
        FROM edl.nmis_device_event_interfaces
        GROUP by event_id, interface_id
        HAVING occurrences > 1
        """,
    ),
    IntegrityCheck(
        "Duplicate event and index, type, element, value for statuses table.",
        """
        SELECT
            event_id,
            type,
            element,
            value,
            index,
            count(1) occurrences
        FROM
            edl.nmis_device_event_statuses
        GROUP BY
            event_id,
            type,
            element,
            value,
            index
        HAVING
            occurrences > 1
        """,
    ),
    # --- Referential checks: rows whose parent key is missing are reported. ---
    IntegrityCheck(
        "Locations in the event table not found in the locations table",
        """
        SELECT e.event_id, e.location_id
        FROM edl_current.nmis_device_events e
        LEFT JOIN edl_current.device_locations l
            ON e.location_id = l.location_id
        WHERE e.location_id is not null
            AND l.location_id is null
        """,
    ),
    IntegrityCheck(
        "Event_ids in edl.nmis_device_event_statuses not found in edl_current.nmis_device_events table",
        """
        SELECT DISTINCT es.event_id
        FROM edl.nmis_device_event_statuses es
        WHERE NOT EXISTS (
            SELECT 1
            FROM edl_current.nmis_device_events e
            WHERE e.event_id = es.event_id
        )
        """,
    ),
    IntegrityCheck(
        "Event_ids in edl.nmis_device_event_interfaces not found in edl_current.nmis_device_events table",
        """
        SELECT DISTINCT i.event_id
        FROM edl.nmis_device_event_interfaces i
        WHERE NOT EXISTS (
            SELECT 1
            FROM edl_current.nmis_device_events e
            WHERE e.event_id = i.event_id
        )
        """,
    ),
    IntegrityCheck(
        "Event_ids in edl.nmis_device_event_temperatures not found in edl_current.nmis_device_events table",
        """
        SELECT DISTINCT et.event_id
        FROM edl.nmis_device_event_temperatures et
        WHERE NOT EXISTS (
            SELECT 1
            FROM edl_current.nmis_device_events e
            WHERE e.event_id = et.event_id
        )
        """,
    ),
]

# `spark` is not defined in this file; presumably the notebook runtime
# provides the active SparkSession - TODO confirm.
DataIntegrityChecker(spark, checks).check()