In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Step 1: Load or Create Sample Dataset
# Sample records deliberately seeded with quality problems: a repeated id,
# missing name/age/email entries, and date strings in several formats.
data = pd.DataFrame(
    {
        "id": [1, 2, 2, 4, 5],
        "name": ["Alice", "Bob", "Bob", "David", np.nan],
        "age": [25, np.nan, 30, 22, 40],
        "email": [
            "alice@mail.com",
            "bob@mail.com",
            "bob@mail.com",
            None,
            "david@mail.com",
        ],
        "date_joined": [
            "01-01-2021",
            "2021-01-01",
            "01-01-2021",
            "March 5, 2021",
            "invalid-date",
        ],
    }
)

print("=== Dataset Overview ===")
print(data.head(), end=" \n\n")

# Step 2: Count the missing (NaN / None) entries in each column
print("=== Missing Values per Column ===")
print(data.isna().sum(), end=" \n\n")

# Step 3: Collect rows that repeat an earlier row in every column
duplicates = data.loc[data.duplicated()]
print("=== Duplicate Rows ===")
if duplicates.empty:
    print("No duplicates found.", end=" \n\n")
else:
    print(duplicates, end=" \n\n")

# Step 4: Standardize Date Format and Handle Inconsistencies
def standardize_date(date_str):
    """Convert a date string in one of several known layouts to ``YYYY-MM-DD``.

    The layouts are tried in order and the first one that parses wins:
    ``DD-MM-YYYY``, ``YYYY-MM-DD``, ``MM-DD-YYYY``, then ``Month D, YYYY``.
    Because day-first is tried before month-first, an ambiguous value such as
    ``'03-04-2021'`` is read as 3 April; the month-first layout only takes
    effect when the day-first reading is impossible (e.g. ``'01-25-2021'``).

    Args:
        date_str: The raw value to normalize. Leading and trailing whitespace
            is ignored. Non-string values (``None``, ``NaN``, numbers) are
            treated as unparseable.

    Returns:
        The date as an ISO ``YYYY-MM-DD`` string, or the literal string
        ``'Invalid'`` when no layout matches.
    """
    # Missing values (None / NaN) and other non-strings can never match a
    # layout; reject them explicitly rather than via a swallowed TypeError.
    if not isinstance(date_str, str):
        return 'Invalid'

    text = date_str.strip()
    formats = ('%d-%m-%Y', '%Y-%m-%d', '%m-%d-%Y', '%B %d, %Y')
    for fmt in formats:
        try:
            parsed = datetime.strptime(text, fmt)
        except ValueError:
            # Only a layout mismatch is expected here; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            continue
        # isoformat() always zero-pads the year, unlike strftime('%Y') on
        # some platforms.
        return parsed.date().isoformat()
    return 'Invalid'

# Normalize every raw join date; values that match no known layout come back
# as whatever standardize_date returns for unparseable input.
raw_dates = data['date_joined']
data['standardized_date'] = raw_dates.apply(standardize_date)
print("=== Standardized Date Column ===")
print(data[['date_joined', 'standardized_date']], end=" \n\n")

# Step 5: Summarize each column's dtype, missing count, and distinct count
print("=== Data Quality Report ===")
report_columns = {
    'Column': data.columns,
    'Data Type': data.dtypes.astype(str),
    'Missing Values': data.isna().sum(),
    'Unique Values': data.nunique(),
}
report = pd.DataFrame(report_columns)
print(report)