## Architecture to Monitor Data Quality Over Time

**Description**: Design a monitoring system in Python that checks and logs data quality metrics (accuracy, completeness) for a dataset over time.

**Steps to follow:**
1. Implement a Scheduled Script:
    - Use the `schedule` library to run the script periodically.
2. Script to Calculate Metrics:
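    - For simplicity, use a single function, `calculate_quality_metrics()`, that calculates and logs metrics such as the completeness rate (the complement of the missing rate) and the accuracy rate (the complement of the mismatch rate).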
3. Store Logs:
    - Use Python's logging library to save these metrics over time.

In [None]:
import schedule
import time
import pandas as pd
import logging
from datetime import datetime

# Route every log record (INFO and above) to a file so that the metrics
# accumulate over successive runs; each record carries a timestamp and level.
logging.basicConfig(
    filename='data_quality_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# --- Configuration ---
# CSV file whose quality is monitored. Replace with the actual path.
DATA_FILE_PATH = 'your_data.csv'
# Optional reference CSV used for the accuracy check; a falsy value disables it.
TRUSTED_DATA_FILE_PATH = 'trusted_data.csv'
# Optional identifier column shared by both files; used as the join key.
ACCURACY_COLUMN = 'your_id_column'
# Columns evaluated for completeness (and compared during the accuracy check).
DATA_COLUMNS_TO_CHECK = [
    'column1',
    'column2',
    'column_with_dates',
]
# Column holding time information (for trend analysis if needed).
DATE_COLUMN = 'column_with_dates'

def calculate_completeness(df, columns_to_check):
    """Return the percentage of non-missing values for each requested column.

    Args:
        df: DataFrame to inspect.
        columns_to_check: Iterable of column names to evaluate.

    Returns:
        A dict mapping ``'<column>_completeness'`` to a string such as
        ``'97.50%'``. Columns absent from ``df`` are skipped with a logged
        warning. An empty ``df`` yields an empty dict and a logged warning.
    """
    row_count = len(df)
    if row_count == 0:
        logging.warning("Dataframe is empty, cannot calculate completeness.")
        return {}

    report = {}
    for column in columns_to_check:
        if column not in df.columns:
            logging.warning(f"Column '{column}' not found in the data.")
            continue
        # Count the populated cells directly instead of subtracting the gaps.
        populated = df[column].notna().sum()
        percentage = (populated / row_count) * 100
        report[f'{column}_completeness'] = f"{percentage:.2f}%"
    return report

def calculate_accuracy(df, trusted_df, on_column, columns_to_compare):
    """Calculate per-column accuracy of ``df`` against a trusted source.

    The two frames are inner-joined on ``on_column``. For each column in
    ``columns_to_compare`` that exists in both frames, accuracy is the
    percentage of joined rows whose current value agrees with the trusted
    value. A value that is missing in both frames counts as agreement.

    Args:
        df: Current DataFrame, or ``None``.
        trusted_df: Trusted reference DataFrame, or ``None``.
        on_column: Name of the identifier column used as the join key.
        columns_to_compare: Iterable of column names to compare.

    Returns:
        A dict mapping ``'<column>_accuracy'`` to a string such as
        ``'98.25%'``. The dict is empty (and a warning is logged) when
        either frame is ``None``, the join column is missing from either
        frame, or the join produces no rows. Columns that cannot be
        compared are skipped with a logged warning.
    """
    inputs_usable = (
        df is not None
        and trusted_df is not None
        and on_column in df.columns
        and on_column in trusted_df.columns
    )
    if not inputs_usable:
        logging.warning("Current or trusted data or the join column is missing, cannot calculate accuracy.")
        return {}

    merged_df = pd.merge(df, trusted_df, on=on_column, suffixes=('_current', '_trusted'), how='inner')
    total_compared = len(merged_df)
    if total_compared == 0:
        logging.warning(f"No matching records found on '{on_column}' to calculate accuracy.")
        return {}

    accuracy_metrics = {}
    for col in columns_to_compare:
        current_col = f'{col}_current'
        trusted_col = f'{col}_trusted'
        # The merge only adds suffixes to column names shared by both frames,
        # so both suffixed names exist only when the column was in each input.
        if current_col not in merged_df.columns or trusted_col not in merged_df.columns:
            logging.warning(f"Column(s) '{current_col}' or '{trusted_col}' not found in merged data for accuracy check.")
            continue

        current_values = merged_df[current_col]
        trusted_values = merged_df[trusted_col]
        # `==` evaluates NaN == NaN as False, which would report a value that
        # is missing in BOTH sources as a mismatch. Treat that as a match.
        both_missing = current_values.isna() & trusted_values.isna()
        match_count = ((current_values == trusted_values) | both_missing).sum()
        accuracy_rate = (match_count / total_compared) * 100
        accuracy_metrics[f'{col}_accuracy'] = f"{accuracy_rate:.2f}%"
    return accuracy_metrics

def calculate_quality_metrics():
    """Run one data quality pass and write the results to the log.

    Loads the CSV at ``DATA_FILE_PATH``, logs completeness for
    ``DATA_COLUMNS_TO_CHECK`` and, when both ``TRUSTED_DATA_FILE_PATH`` and
    ``ACCURACY_COLUMN`` are set, logs accuracy against the trusted CSV.
    Nothing is returned and no exception derived from ``Exception``
    escapes: failures are logged instead.
    """
    logging.info("--- Running Data Quality Check ---")
    try:
        current_data = pd.read_csv(DATA_FILE_PATH)

        # Completeness is always reported.
        completeness_report = calculate_completeness(current_data, DATA_COLUMNS_TO_CHECK)
        logging.info(f"Completeness Metrics: {completeness_report}")

        # Accuracy needs both a trusted file path and a join column.
        if not (TRUSTED_DATA_FILE_PATH and ACCURACY_COLUMN):
            logging.info("Trusted data path or accuracy column not configured, skipping accuracy check.")
            return

        try:
            reference_data = pd.read_csv(TRUSTED_DATA_FILE_PATH)
            accuracy_report = calculate_accuracy(
                current_data, reference_data, ACCURACY_COLUMN, DATA_COLUMNS_TO_CHECK
            )
            logging.info(f"Accuracy Metrics: {accuracy_report}")
        except FileNotFoundError:
            # A missing reference file only skips the accuracy part of the run.
            logging.warning(f"Trusted data file '{TRUSTED_DATA_FILE_PATH}' not found, skipping accuracy check.")

    except FileNotFoundError:
        logging.error(f"Data file '{DATA_FILE_PATH}' not found.")
    except Exception as e:
        # Log and swallow so that a single failed run does not propagate.
        logging.error(f"An error occurred during data quality check: {e}")

# --- Schedule the job ---
# The polling loop below never returns, so it is started only when this file
# is executed directly (as a script or as a notebook cell) and not as a side
# effect of importing it. No availability check for `schedule` is needed: a
# failed `import schedule` raises ImportError at the top of the file, long
# before this point is reached.
if __name__ == "__main__":
    schedule.every().day.at("09:00").do(calculate_quality_metrics)  # Run daily at 9:00 AM
    # For testing, run every 5 minutes instead:
    # schedule.every(5).minutes.do(calculate_quality_metrics)

    while True:
        # Execute any job whose scheduled time has arrived, then wait a
        # second before polling again.
        schedule.run_pending()
        time.sleep(1)