# Measuring Completeness

**Activity Overview**: Evaluate data completeness by checking missing data rates and handling partially available records.

## Title: Customer Profiles

**Task**: Calculate the missing data rate for customer profiles.

**Steps**:
1. List all required fields for a complete customer profile (e.g., name, address, email, phone number).
2. Analyze the dataset to count how many profiles have missing fields.
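3. Calculate the percentage of missing data fields across all profiles: (number of missing required-field values ÷ (number of profiles × number of required fields)) × 100.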

In [1]:
# Write your code from here
import pandas as pd
import numpy as np

def calculate_missing_data_rate(dataset_path, required_fields):
    """Measure how complete the required fields of a CSV dataset are.

    Args:
        dataset_path: Path of the CSV file (or any source accepted by
            ``pandas.read_csv``) holding one profile per row.
        required_fields: Iterable of column names that must be filled in for
            a profile to count as complete. Repeated names are counted once.

    Returns:
        A ``(missing_percentage, incomplete_profiles)`` tuple:

        * ``missing_percentage`` -- percentage of required cells that are
          missing; ``0.0`` when there are no required cells at all (no rows
          or no required fields).
        * ``incomplete_profiles`` -- copy of the rows with at least one
          missing required field, plus a ``Missing_Fields`` column listing
          the names of those fields; an empty ``DataFrame`` when every
          profile is complete.

        ``(None, None)`` is returned, after printing a message, when the
        file is not found, a required column is absent from the dataset, or
        any other error occurs.
    """
    try:
        df = pd.read_csv(dataset_path)

        # Normalise to a list of unique names (order preserved): a tuple would
        # be treated by ``df[...]`` as a single key, and duplicates would be
        # counted twice in both the numerator and the denominator.
        fields = list(dict.fromkeys(required_fields))

        absent_columns = [field for field in fields if field not in df.columns]
        if absent_columns:
            # Report every absent column so they can all be fixed in one go.
            for field in absent_columns:
                print(f"Error: Required field '{field}' not found in the dataset.")
            return None, None

        # Boolean mask of missing cells, restricted to the required columns.
        missing_mask = df[fields].isna()
        missing_per_profile = missing_mask.sum(axis=1)
        total_possible_fields = len(fields) * len(df)
        missing_percentage = (
            float(missing_per_profile.sum()) / total_possible_fields * 100
            if total_possible_fields
            else 0.0
        )

        has_missing = missing_per_profile > 0
        if not has_missing.any():
            print("No profiles with missing fields found.")
            return missing_percentage, pd.DataFrame()

        incomplete_profiles = df[has_missing].copy()
        # Derive the per-row list of missing field names from the mask already
        # computed above instead of re-testing every cell row by row.
        incomplete_profiles['Missing_Fields'] = pd.Series(
            [
                [field for field, is_missing in zip(fields, row) if is_missing]
                for row in missing_mask[has_missing].to_numpy()
            ],
            index=incomplete_profiles.index,
        )
        return missing_percentage, incomplete_profiles
    except FileNotFoundError as e:
        print(f"Error: File not found: {e.filename}")
        return None, None
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and signal
        # failure through the (None, None) return value.
        print(f"An unexpected error occurred: {e}")
        return None, None

if __name__ == '__main__':
    # Demo run: build a small sample of customer profiles, persist it as a
    # CSV file, then report how complete the required fields are.
    data = {
        'Name': ['John Doe', 'Jane Smith', 'Peter Jones', 'Alice Brown', 'Bob Williams'],
        'Address': ['123 Main St', '456 Oak Ave', None, '101 Pine Ln', '222 Elm Rd'],
        'Email': ['john.doe@example.com', 'jane.smith@example.com', 'peter.jones@sample.org', None, 'bob.williams@work.net'],
        'Phone Number': ['555-1234', '555-5678', '555-9012', '555-3456', None],
        'City': ['Anytown', 'Someville', 'Othertown', 'Bigcity', 'Smallville'],
    }
    dataset_file_path = 'customer_profiles.csv'
    # 'City' is intentionally left out: it is not required for completeness.
    required_fields = ['Name', 'Address', 'Email', 'Phone Number']

    df = pd.DataFrame(data)
    df.to_csv(dataset_file_path, index=False)

    missing_percentage, profiles_with_missing_fields_df = calculate_missing_data_rate(
        dataset_file_path, required_fields
    )

    if missing_percentage is None:
        # The function signals failure by returning None for the rate.
        print("An error occurred during the process.")
    else:
        print(f"Overall Percentage of Missing Data Fields: {missing_percentage:.2f}%")
        if profiles_with_missing_fields_df is not None:
            if profiles_with_missing_fields_df.empty:
                print("\nNo profiles with missing fields.")
            else:
                print("\nProfiles with Missing Fields:")
                print(profiles_with_missing_fields_df.to_string())


Overall Percentage of Missing Data Fields: 15.00%

Profiles with Missing Fields:
           Name      Address                   Email Phone Number        City  Missing_Fields
2   Peter Jones          NaN  peter.jones@sample.org     555-9012   Othertown       [Address]
3   Alice Brown  101 Pine Ln                     NaN     555-3456     Bigcity         [Email]
4  Bob Williams   222 Elm Rd   bob.williams@work.net          NaN  Smallville  [Phone Number]
