# In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Read the CSV dataset under assessment into a DataFrame
df = pd.read_csv(filepath_or_buffer="/mnt/data/file-PGUdoCDJHVQAiB5rTTjGuJ")

# Define data quality dimension functions
def completeness(df):
    """Return the percentage of cells in ``df`` that are not missing.

    Args:
        df: DataFrame to assess.

    Returns:
        The share of non-null cells as a percentage rounded to two decimals,
        or ``np.nan`` when ``df`` contains no cells at all.
    """
    total_cells = df.size
    if total_cells == 0:
        # Nothing to measure: the ratio is undefined, so report NaN rather
        # than dividing by zero.
        return np.nan
    missing_cells = df.isnull().sum().sum()
    return round(100 * (1 - missing_cells / total_cells), 2)

def uniqueness(df):
    """Return the percentage of rows in ``df`` that are not duplicates.

    A row counts as a duplicate when ``DataFrame.duplicated()`` flags it,
    i.e. it repeats an earlier row (the first occurrence is kept).

    Args:
        df: DataFrame to assess.

    Returns:
        The share of non-duplicate rows as a percentage rounded to two
        decimals, or ``np.nan`` when ``df`` has no rows.
    """
    row_count = len(df)
    if row_count == 0:
        # Undefined for an empty frame; avoid dividing by zero.
        return np.nan
    duplicate_rows = df.duplicated().sum()
    return round(100 * (1 - duplicate_rows / row_count), 2)

def validity(df, rules=None):
    """Return the percentage of non-null values that satisfy their rule.

    Args:
        df: DataFrame to assess.
        rules: Optional mapping ``{column: function}``. Each function is
            called with a single cell value and returns a boolean telling
            whether that value is valid.

    Returns:
        Valid values as a percentage of all checked (non-null) values,
        rounded to two decimals, or ``np.nan`` when there are no rules or
        no non-null values to check.

    Raises:
        KeyError: If a column named in ``rules`` is not present in ``df``.
    """
    if not rules:
        return np.nan

    valid_count = 0
    checked_count = 0
    for column, is_valid in rules.items():
        # Evaluate the rule only on non-null values so the numerator and the
        # denominator cover exactly the same cells; missing values are a
        # completeness concern, not a validity one.
        present = df[column].dropna()
        if present.empty:
            continue
        checked_count += len(present)
        valid_count += present.apply(is_valid).sum()

    if checked_count == 0:
        return np.nan
    return round(100 * (valid_count / checked_count), 2)

def consistency(df, constraints=None):
    """Return the percentage of (row, constraint) checks that pass.

    Args:
        df: DataFrame to assess.
        constraints: Optional sequence of constraints. Each item is either a
            precomputed boolean mask with one entry per row (for example a
            ``pd.Series`` or NumPy array) or a callable that takes ``df`` and
            returns such a mask.

    Returns:
        Passed checks as a percentage of ``len(df) * len(constraints)``,
        rounded to two decimals, or ``np.nan`` when there are no constraints
        or ``df`` has no rows.
    """
    if not constraints or len(df) == 0:
        # No checks can be performed; the ratio would be 0/0.
        return np.nan

    passed = 0
    for rule in constraints:
        # Callables build their mask from df; anything else is used as a
        # ready-made mask.
        mask = rule(df) if callable(rule) else rule
        passed += mask.sum()

    total = len(df) * len(constraints)
    return round(100 * (passed / total), 2)

def accuracy(df):
    """Placeholder accuracy score; always ``np.nan``.

    Measuring accuracy typically needs a trusted source-of-truth dataset to
    compare against. None is consulted here, so ``df`` is ignored and the
    score is reported as undefined. Replace the body once known-correct
    reference data is available.
    """
    undefined_score = np.nan
    return undefined_score

def timeliness(df, date_column, threshold_days=365):
    """Return the percentage of non-null dates that fall within the window.

    A value is "recent" when it parses as a date/time that is no older than
    ``threshold_days`` days before the current time. Non-null values that
    cannot be parsed are coerced to ``NaT``; they stay in the denominator and
    therefore count as not recent.

    Args:
        df: DataFrame to assess.
        date_column: Name of the column holding the dates.
        threshold_days: Size of the recency window in days.

    Returns:
        Recent values as a percentage of the non-null values in
        ``date_column``, rounded to two decimals, or ``np.nan`` when the
        column is absent or holds no non-null values.
    """
    if date_column not in df.columns:
        return np.nan

    non_null_count = df[date_column].notnull().sum()
    if non_null_count == 0:
        # Nothing to score; avoid dividing by zero.
        return np.nan

    parsed = pd.to_datetime(df[date_column], errors='coerce')
    # Timezone-aware columns cannot be compared with a naive timestamp, so
    # build the cutoff in the column's own timezone (None keeps it naive).
    column_tz = getattr(parsed.dtype, 'tz', None)
    cutoff = datetime.now(tz=column_tz) - timedelta(days=threshold_days)
    is_recent = parsed >= cutoff
    return round(100 * is_recent.sum() / non_null_count, 2)

# Sample validity checks, keyed by column name. Each check takes one cell
# value and reports whether it looks valid; non-string values never pass.
validity_rules = {
    'Email': lambda x: '@' in x if isinstance(x, str) else False,
    'Phone': lambda x: False if not isinstance(x, str) else (x.startswith('(') and len(x) == 14),
}

# Sample consistency check (stand-in for something like "country code
# matches region"). Each entry takes the DataFrame and returns a row mask.
consistency_rules = [
    lambda df: df['Country'].eq('USA'),  # placeholder only
]

# Score each data quality dimension, in the order it appears in the report
scores = {}
scores['Completeness'] = completeness(df)
scores['Uniqueness'] = uniqueness(df)
scores['Validity'] = validity(df, validity_rules)
scores['Consistency'] = consistency(df, consistency_rules)
scores['Accuracy'] = accuracy(df)
# Swap in the dataset's own timestamp column name if it differs
scores['Timeliness'] = timeliness(df, date_column='LastUpdated')

# Tabulate the scores as one (Dimension, Score) row per dimension
result_df = pd.DataFrame(data=[*scores.items()], columns=["Dimension", "Score"])

# Show the report: title line first, then the score table
print("ISO 8000 Data Quality Assessment", result_df, sep="\n")