# Setup Environment
Import the necessary libraries: PySpark, a YAML parser (PyYAML), and the standard-library utilities (`os`, `logging`) used throughout this Databricks notebook.

In [None]:
# Import necessary libraries
from pyspark.sql import SparkSession  # For creating and managing Spark sessions
import yaml  # For parsing YAML configuration files
import os  # For handling file paths and environment variables
import logging  # For logging purposes

# Obtain the Spark session for this notebook (named "DataQualityManager")
spark = (
    SparkSession.builder
    .appName("DataQualityManager")
    .getOrCreate()
)

# Configure logging at INFO level and create the logger shared by all cells
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("DataQualityManager")

# Confirm that the setup cell ran to completion
logger.info("Spark session initialized successfully.")

# Define DataQualityManager Class
Create the main DataQualityManager class with methods for configuration interpretation, execution coordination, and result management.

In [None]:
# Define the DataQualityManager class
class DataQualityManager:
    """
    Coordinate configuration loading, expectation execution, and result handling.

    The manager reads a YAML configuration, delegates the execution of the
    configured expectations to a strategy object, and keeps the results of the
    most recent run in ``self.results``.
    """

    def __init__(self, config_path):
        """
        Initialize the DataQualityManager with the path to the YAML configuration file.

        :param config_path: Path to the YAML configuration file.
        """
        self.config_path = config_path
        self.config = None  # Set by load_config()
        self.execution_strategy = None  # Set by set_execution_strategy()
        self.results = []  # Overwritten by each apply_expectations() call

    def load_config(self):
        """
        Load and parse the YAML configuration file into ``self.config``.

        :raises FileNotFoundError: If nothing exists at ``self.config_path``.
        """
        if not os.path.exists(self.config_path):
            raise FileNotFoundError(f"Configuration file not found: {self.config_path}")

        # Read as UTF-8 explicitly so parsing does not depend on the platform's
        # default locale encoding.
        with open(self.config_path, 'r', encoding='utf-8') as file:
            self.config = yaml.safe_load(file)

        logger.info("Configuration loaded successfully.")

    def set_execution_strategy(self, strategy):
        """
        Set the execution strategy for applying data quality expectations.

        :param strategy: Object whose ``execute(dataframe, expectations)`` method
            is called by :meth:`apply_expectations`.
        """
        self.execution_strategy = strategy
        logger.info(f"Execution strategy set to: {strategy.__class__.__name__}")

    def apply_expectations(self, dataframe):
        """
        Apply data quality expectations to the given Spark DataFrame using the configured strategy.

        The value returned by the strategy is stored in ``self.results``.

        :param dataframe: DataFrame handed to the strategy unchanged.
        :raises ValueError: If no strategy is set, or the loaded configuration
            has no ``expectations`` entry.
        """
        if not self.execution_strategy:
            raise ValueError("Execution strategy is not set.")

        if not self.config or 'expectations' not in self.config:
            raise ValueError("Expectations are not defined in the configuration.")

        expectations = self.config['expectations']
        logger.info(f"Applying {len(expectations)} expectations to the DataFrame.")

        self.results = self.execution_strategy.execute(dataframe, expectations)
        logger.info("Expectations applied successfully.")

    def aggregate_results(self):
        """
        Aggregate and process the results of the executed expectations.

        :return: Dictionary with the number of ``passed`` and ``failed`` results.
        :raises ValueError: If there are no results to aggregate.
        """
        if not self.results:
            raise ValueError("No results to aggregate.")

        # Example aggregation logic: count passed and failed expectations
        passed = sum(1 for result in self.results if result['status'] == 'passed')
        failed = sum(1 for result in self.results if result['status'] == 'failed')

        logger.info(f"Results aggregated: {passed} passed, {failed} failed.")
        return {"passed": passed, "failed": failed}

    def handle_fallbacks(self):
        """
        Handle fallback procedures as defined in the configuration.

        Does nothing (beyond logging) when the configuration has not been
        loaded yet, is empty, or has no ``fallbacks`` entry.
        """
        # self.config is None until load_config() runs (and for an empty YAML
        # file), so it must be checked before the membership test.
        if not self.config or 'fallbacks' not in self.config:
            logger.info("No fallback procedures defined in the configuration.")
            return

        fallbacks = self.config['fallbacks']
        logger.info(f"Executing {len(fallbacks)} fallback procedures.")

        for fallback in fallbacks:
            # Example: log the fallback action
            logger.info(f"Executing fallback: {fallback}")
            # Add actual fallback logic here

# Implement Configuration Loading
Build functions to load and parse YAML configuration files that define data quality expectations and execution parameters.

In [None]:
def load_yaml_config(file_path):
    """
    Read a YAML configuration file and return its parsed content.

    :param file_path: Path to the YAML configuration file.
    :return: The value produced by ``yaml.safe_load`` for the file.
    :raises FileNotFoundError: If nothing exists at ``file_path``.
    """
    config_exists = os.path.exists(file_path)
    if not config_exists:
        raise FileNotFoundError(f"YAML configuration file not found: {file_path}")

    with open(file_path, 'r') as stream:
        parsed = yaml.safe_load(stream)

    logger.info(f"YAML configuration loaded successfully from {file_path}.")
    return parsed

def validate_config(config):
    """
    Validate the structure and required fields of the YAML configuration.

    :param config: Parsed YAML configuration as a dictionary.
    :return: None. Raises ValueError if validation fails.
    :raises ValueError: If ``config`` is not a mapping (for example ``None``
        from an empty YAML file, or a list/string document) or if a required
        key is missing.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    from collections.abc import Mapping

    # A bare `key in config` check would raise TypeError for None and would
    # wrongly succeed for lists or strings that happen to contain the key names.
    if not isinstance(config, Mapping):
        raise ValueError(
            f"Configuration must be a mapping, got {type(config).__name__}"
        )

    required_keys = ['expectations', 'execution_strategy']
    for key in required_keys:
        if key not in config:
            raise ValueError(f"Missing required configuration key: {key}")

    logger.info("YAML configuration validated successfully.")

# Example usage
# Replace 'path/to/config.yaml' with the actual path to your YAML file
config_path = 'path/to/config.yaml'
try:
    # Load first, then check that the required keys are present
    config = load_yaml_config(config_path)
    validate_config(config)
except Exception as exc:
    logger.error(f"Error loading or validating configuration: {exc}")

# Create Execution Strategies
Implement different execution strategies (sequential, parallel, micro-batch) as separate classes with a common interface.

In [None]:
from concurrent.futures import ThreadPoolExecutor
from pyspark.sql import DataFrame

class ExecutionStrategy:
    """
    Common interface shared by all execution strategies.
    """
    def execute(self, dataframe: DataFrame, expectations: list):
        """
        Run the given expectations against a DataFrame.

        :param dataframe: Spark DataFrame the expectations are applied to.
        :param expectations: List of expectations to apply.
        :return: List with one result per expectation (in subclasses).
        :raises NotImplementedError: Always, in this base class.
        """
        message = "Subclasses must implement the execute method."
        raise NotImplementedError(message)

class SequentialExecutionStrategy(ExecutionStrategy):
    """
    Strategy that applies the expectations one after another, in list order.
    """
    def execute(self, dataframe: DataFrame, expectations: list):
        """
        Apply every expectation in turn and collect the results.

        :param dataframe: Spark DataFrame passed to each expectation.
        :param expectations: List of expectation dictionaries (each needs a 'name').
        :return: List of result dictionaries, in the order of ``expectations``.
        """
        return [self._apply_expectation(dataframe, item) for item in expectations]

    def _apply_expectation(self, dataframe: DataFrame, expectation: dict):
        """
        Placeholder for real expectation logic: log the name and report success.
        """
        expectation_name = expectation['name']
        logger.info(f"Applying expectation: {expectation_name}")
        # The outcome is simulated; the DataFrame is not inspected here
        return dict(name=expectation_name, status="passed")

class ParallelExecutionStrategy(ExecutionStrategy):
    """
    Strategy that submits every expectation to a ThreadPoolExecutor.
    """
    def execute(self, dataframe: DataFrame, expectations: list):
        """
        Submit all expectations to a thread pool and gather their results.

        :param dataframe: Spark DataFrame passed to each expectation.
        :param expectations: List of expectation dictionaries (each needs a 'name').
        :return: List of result dictionaries, in submission order.
        """
        with ThreadPoolExecutor() as pool:
            pending = [
                pool.submit(self._apply_expectation, dataframe, item)
                for item in expectations
            ]
            # Reading results in submission order keeps the output aligned
            # with the input list regardless of completion order
            return [task.result() for task in pending]

    def _apply_expectation(self, dataframe: DataFrame, expectation: dict):
        """
        Placeholder for real expectation logic: log the name and report success.
        """
        expectation_name = expectation['name']
        logger.info(f"Applying expectation in parallel: {expectation_name}")
        # The outcome is simulated; the DataFrame is not inspected here
        return dict(name=expectation_name, status="passed")

class MicroBatchExecutionStrategy(ExecutionStrategy):
    """
    Execute expectations in micro-batches.

    The expectation list is cut into consecutive slices of ``batch_size``
    items; the items of each slice are applied one after another.
    """
    def __init__(self, batch_size: int):
        """
        :param batch_size: Number of expectations per batch; must be at least 1.
        :raises ValueError: If ``batch_size`` is smaller than 1.
        """
        # range() rejects a zero step, and a negative step yields no batches
        # at all, which would silently skip every expectation. Fail early.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
        self.batch_size = batch_size

    def execute(self, dataframe: DataFrame, expectations: list):
        """
        Apply the expectations batch by batch.

        :param dataframe: Spark DataFrame passed to each expectation.
        :param expectations: List of expectation dictionaries (each needs a 'name').
        :return: List of result dictionaries, in the order of ``expectations``.
        """
        results = []
        for start in range(0, len(expectations), self.batch_size):
            batch = expectations[start:start + self.batch_size]
            logger.info(f"Processing batch of size {len(batch)}")
            results.extend(
                self._apply_expectation(dataframe, expectation) for expectation in batch
            )
        return results

    def _apply_expectation(self, dataframe: DataFrame, expectation: dict):
        """
        Placeholder for real expectation logic: log the name and report success.
        """
        # Example logic for applying an expectation
        logger.info(f"Applying expectation in micro-batch: {expectation['name']}")
        # Simulate expectation result
        return {"name": expectation['name'], "status": "passed"}

# Define Data Quality Expectations
Create a framework for defining various data quality checks (completeness, uniqueness, range validation, etc.) that can be applied to Spark DataFrames.

In [None]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, count, when

class DataQualityExpectation:
    """
    Common interface for all data quality expectations.
    """
    def validate(self, dataframe: DataFrame):
        """
        Check this expectation against a DataFrame.

        :param dataframe: Spark DataFrame to validate.
        :return: Dictionary describing the validation result (in subclasses).
        :raises NotImplementedError: Always, in this base class.
        """
        message = "Subclasses must implement the validate method."
        raise NotImplementedError(message)

class CompletenessExpectation(DataQualityExpectation):
    """
    Check for completeness of a column (no null values).
    """
    def __init__(self, column_name: str):
        """
        :param column_name: Name of the column that must not contain nulls.
        """
        self.column_name = column_name

    def validate(self, dataframe: DataFrame):
        """
        Count null values in the column and compare against zero.

        :param dataframe: Spark DataFrame to validate.
        :return: Dictionary with ``expectation``, ``status`` ("passed" when the
            column has no nulls, otherwise "failed") and ``details``.
        """
        # Compute both counts in one aggregation so the DataFrame is evaluated
        # once instead of by two separate count() actions. count() over a
        # when() without otherwise() only counts rows where the condition holds.
        stats = dataframe.agg(
            count("*").alias("total_count"),
            count(when(col(self.column_name).isNull(), 1)).alias("null_count"),
        ).first()
        null_count = stats["null_count"]
        total_count = stats["total_count"]

        status = "passed" if null_count == 0 else "failed"
        return {
            "expectation": f"Completeness check for column {self.column_name}",
            "status": status,
            "details": f"{null_count} null values out of {total_count} rows"
        }

class UniquenessExpectation(DataQualityExpectation):
    """
    Expectation that every row holds a different value in one column.
    """
    def __init__(self, column_name: str):
        """
        :param column_name: Name of the column whose values must be unique.
        """
        self.column_name = column_name

    def validate(self, dataframe: DataFrame):
        """
        Compare the row count with the number of distinct column values.

        :param dataframe: Spark DataFrame to validate.
        :return: Dictionary with ``expectation``, ``status`` and ``details``.
        """
        total_count = dataframe.count()
        distinct_values = dataframe.select(self.column_name).distinct()
        unique_count = distinct_values.count()

        # The column is unique exactly when no value collapses under distinct()
        if total_count == unique_count:
            status = "passed"
        else:
            status = "failed"

        description = f"Uniqueness check for column {self.column_name}"
        summary = f"{unique_count} unique values out of {total_count} rows"
        return {"expectation": description, "status": status, "details": summary}

class RangeExpectation(DataQualityExpectation):
    """
    Check if values in a column fall within a specified range.

    Both bounds are inclusive: only values strictly below ``min_value`` or
    strictly above ``max_value`` count as out of range.
    """
    def __init__(self, column_name: str, min_value: float, max_value: float):
        """
        :param column_name: Name of the column to check.
        :param min_value: Smallest accepted value.
        :param max_value: Largest accepted value.
        :raises ValueError: If ``min_value`` is greater than ``max_value``.
        """
        # An inverted range can never be satisfied and is almost certainly a
        # configuration mistake, so reject it up front.
        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError(
                f"min_value ({min_value}) must not be greater than max_value ({max_value})"
            )
        self.column_name = column_name
        self.min_value = min_value
        self.max_value = max_value

    def validate(self, dataframe: DataFrame):
        """
        Count the values outside the range and compare against zero.

        :param dataframe: Spark DataFrame to validate.
        :return: Dictionary with ``expectation``, ``status`` ("passed" when no
            value is out of range, otherwise "failed") and ``details``.
        """
        column = col(self.column_name)
        out_of_range = (column < self.min_value) | (column > self.max_value)

        # Compute both counts in one aggregation so the DataFrame is evaluated
        # once instead of by two separate count() actions. count() over a
        # when() without otherwise() only counts rows where the condition holds.
        stats = dataframe.agg(
            count("*").alias("total_count"),
            count(when(out_of_range, 1)).alias("out_of_range_count"),
        ).first()
        out_of_range_count = stats["out_of_range_count"]
        total_count = stats["total_count"]

        status = "passed" if out_of_range_count == 0 else "failed"
        return {
            "expectation": f"Range check for column {self.column_name} ({self.min_value} to {self.max_value})",
            "status": status,
            "details": f"{out_of_range_count} out-of-range values out of {total_count} rows"
        }

# Implement Result Aggregation
Develop methods to collect, aggregate, and analyze results from multiple quality checks, with support for blocking policies based on failure thresholds.

In [None]:
class ResultAggregator:
    """
    Summarize the outcome of several quality checks.

    Counts passed and failed checks, derives the failure rate, and flags the
    run as blocked when that rate exceeds the configured threshold.
    """
    def __init__(self, failure_threshold: float = 0.1):
        """
        Store the failure threshold used for the blocking decision.
        :param failure_threshold: Proportion of failed checks above which blocking is triggered.
        """
        self.failure_threshold = failure_threshold

    @staticmethod
    def _count_with_status(results: list, status: str):
        """
        Return how many results carry the given 'status' value.
        """
        return sum(1 for entry in results if entry['status'] == status)

    def aggregate_results(self, results: list):
        """
        Count outcomes, compute the failure rate, and decide on blocking.
        :param results: List of dictionaries, each with a 'status' entry.
        :return: Dictionary with 'passed', 'failed', 'failure_rate' and 'block_execution'.
        :raises ValueError: If ``results`` is empty.
        """
        if not results:
            raise ValueError("No results provided for aggregation.")

        # Only 'passed' and 'failed' statuses contribute to the totals
        passed = self._count_with_status(results, 'passed')
        failed = self._count_with_status(results, 'failed')
        total = passed + failed

        # Guard against division by zero when no result passed or failed
        if total > 0:
            failure_rate = failed / total
        else:
            failure_rate = 0

        # Blocking requires the rate to be strictly above the threshold
        block_execution = failure_rate > self.failure_threshold

        logger.info(f"Aggregation complete: {passed} passed, {failed} failed, failure rate: {failure_rate:.2%}")
        logger.info(f"Blocking policy triggered: {block_execution}")

        summary = {}
        summary["passed"] = passed
        summary["failed"] = failed
        summary["failure_rate"] = failure_rate
        summary["block_execution"] = block_execution
        return summary

# Example usage of ResultAggregator
# Simulated results from quality checks: two passing and two failing
simulated_results = [
    {"name": "Check 1", "status": "passed"},
    {"name": "Check 2", "status": "failed"},
    {"name": "Check 3", "status": "passed"},
    {"name": "Check 4", "status": "failed"}
]

try:
    # Use a failure threshold of 20% for the blocking decision
    aggregator = ResultAggregator(failure_threshold=0.2)

    # Summarize the simulated results and log the outcome
    aggregated_results = aggregator.aggregate_results(simulated_results)
    logger.info(f"Aggregated Results: {aggregated_results}")
except Exception as exc:
    logger.error(f"Error during result aggregation: {exc}")

# Create Fallback Mechanism
Implement fallback procedures that trigger when quality checks fail, based on the configuration settings.

In [None]:
class FallbackHandler:
    """
    Class to handle fallback procedures when quality checks fail.
    """
    def __init__(self, fallback_config: list):
        """
        Initialize the FallbackHandler with the fallback configuration.
        :param fallback_config: List of fallback procedures defined in the configuration.
            Each entry is a dictionary with an "action" and optional "params".
        """
        self.fallback_config = fallback_config

    def execute_fallbacks(self):
        """
        Execute the fallback procedures as defined in the configuration.

        Entries are processed in list order. An empty or missing configuration
        only produces an informational log message.
        """
        if not self.fallback_config:
            logger.info("No fallback procedures to execute.")
            return

        for fallback in self.fallback_config:
            action = fallback.get("action")
            # A "params" key that is present but empty yields None, so
            # .get("params", {}) is not enough to guarantee a dictionary.
            params = fallback.get("params") or {}
            logger.info(f"Executing fallback action: {action} with params: {params}")
            self._execute_action(action, params)

    def _execute_action(self, action: str, params: dict):
        """
        Execute a specific fallback action.

        All actions are currently reported through the logger only; unknown
        actions are logged as errors rather than raising.

        :param action: The action to execute ("log_message", "send_alert" or "retry_operation").
        :param params: Parameters for the action; ``None`` is treated as no parameters.
        """
        params = params or {}

        if action == "log_message":
            message = params.get("message", "No message provided.")
            logger.warning(f"Fallback action - Log Message: {message}")
        elif action == "send_alert":
            alert_type = params.get("type", "email")
            recipient = params.get("recipient", "admin@example.com")
            logger.warning(f"Fallback action - Send Alert: Type={alert_type}, Recipient={recipient}")
        elif action == "retry_operation":
            retries = params.get("retries", 3)
            logger.warning(f"Fallback action - Retry Operation: Retries={retries}")
            # Add retry logic here
        else:
            logger.error(f"Unknown fallback action: {action}")

# Example usage of FallbackHandler
# Simulated fallback configuration: one entry per supported action
simulated_fallbacks = [
    {"action": "log_message", "params": {"message": "Data quality check failed."}},
    {"action": "send_alert", "params": {"type": "email", "recipient": "data_team@example.com"}},
    {"action": "retry_operation", "params": {"retries": 5}}
]

try:
    # Build the handler from the simulated configuration and run every entry
    fallback_handler = FallbackHandler(simulated_fallbacks)
    fallback_handler.execute_fallbacks()
except Exception as exc:
    logger.error(f"Error during fallback execution: {exc}")

# Demo with Example DataFrames
Demonstrate the DataQualityManager with example Spark DataFrames, showing how to apply configurations and interpret results.

In [None]:
# Demo with Example DataFrames

# Create example DataFrames
# First DataFrame: people with an age column that contains two missing values
schema_1 = ["id", "name", "age"]
example_data_1 = [
    (1, "Alice", 25),
    (2, "Bob", None),
    (3, "Charlie", 30),
    (4, "David", 35),
    (5, "Eve", None)
]
df1 = spark.createDataFrame(example_data_1, schema_1)

# Second DataFrame: the same people with a fully populated salary column
schema_2 = ["id", "name", "salary"]
example_data_2 = [
    (1, "Alice", 1000),
    (2, "Bob", 2000),
    (3, "Charlie", 3000),
    (4, "David", 4000),
    (5, "Eve", 5000)
]
df2 = spark.createDataFrame(example_data_2, schema_2)

# Display the example DataFrames
logger.info("Example DataFrame 1:")
df1.show()

logger.info("Example DataFrame 2:")
df2.show()

# Define expectations for the demo. Each DataFrame gets its own set because the
# schemas differ: df1 has an "age" column, while df2 has "salary" instead, so
# applying the "age" checks to df2 would fail on an unknown column.
demo_expectations = [
    CompletenessExpectation(column_name="age"),
    UniquenessExpectation(column_name="id"),
    RangeExpectation(column_name="age", min_value=20, max_value=40)
]
demo_expectations_df2 = [
    CompletenessExpectation(column_name="salary"),
    UniquenessExpectation(column_name="id"),
    RangeExpectation(column_name="salary", min_value=1000, max_value=5000)
]

# Apply expectations to the first DataFrame
logger.info("Applying expectations to DataFrame 1...")
results_df1 = [expectation.validate(df1) for expectation in demo_expectations]

# Log results for DataFrame 1
logger.info("Results for DataFrame 1:")
for result in results_df1:
    logger.info(result)

# Apply expectations to the second DataFrame
logger.info("Applying expectations to DataFrame 2...")
results_df2 = [expectation.validate(df2) for expectation in demo_expectations_df2]

# Log results for DataFrame 2
logger.info("Results for DataFrame 2:")
for result in results_df2:
    logger.info(result)

# Aggregate results using ResultAggregator (blocking above a 20% failure rate)
aggregator = ResultAggregator(failure_threshold=0.2)

logger.info("Aggregating results for DataFrame 1...")
aggregated_results_df1 = aggregator.aggregate_results(results_df1)
logger.info(f"Aggregated Results for DataFrame 1: {aggregated_results_df1}")

logger.info("Aggregating results for DataFrame 2...")
aggregated_results_df2 = aggregator.aggregate_results(results_df2)
logger.info(f"Aggregated Results for DataFrame 2: {aggregated_results_df2}")

# Handle fallbacks if necessary. The fallback configuration is built per
# DataFrame so the logged message names the DataFrame that triggered it.
for label, aggregated in (
    ("DataFrame 1", aggregated_results_df1),
    ("DataFrame 2", aggregated_results_df2),
):
    if not aggregated["block_execution"]:
        continue

    fallback_config = [
        {"action": "log_message", "params": {"message": f"Fallback triggered for {label}"}},
        {"action": "send_alert", "params": {"type": "email", "recipient": "data_team@example.com"}}
    ]
    fallback_handler = FallbackHandler(fallback_config)

    logger.warning(f"Blocking policy triggered for {label}. Executing fallbacks...")
    fallback_handler.execute_fallbacks()