In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Sample dataset creation
# A small, deliberately messy table used by the checks below: id 2 appears
# twice, one name and one email are missing, one age is NaN, and the join
# dates are written in several different layouts (plus one unparseable value).
data = pd.DataFrame(
    dict(
        id=[1, 2, 2, 4, 5],
        name=['Alice', 'Bob', 'Bob', 'David', None],
        age=[25, np.nan, 30, 22, 40],
        email=['alice@mail.com', 'bob@mail.com', 'bob@mail.com', None, 'david@mail.com'],
        date_joined=[
            '01-01-2021',
            '2021-01-01',
            '01-01-2021',
            'March 5, 2021',
            'invalid-date',
        ],
    )
)

# Step 1: Overview of the Dataset
# Print the leading rows so the raw values can be eyeballed before any checks.
print("=== Dataset Overview ===")
print(data.head(), end=" \n\n")

# Step 2: Checking for Missing Values
# Count null entries (None / NaN) column by column.
missing_values = data.isna().sum()
print("=== Missing Values per Column ===")
print(missing_values, end=" \n\n")

# Step 3: Identifying Duplicates
# duplicated() flags a row only when every column matches an earlier row,
# so rows that merely share an id are not reported here.
duplicates = data.loc[data.duplicated()]
print("=== Duplicate Rows ===")
print("No duplicates found." if duplicates.empty else duplicates, end=" \n\n")

# Step 4: Standardizing Date Formats and Handling Inconsistent Dates
def standardize_date(date_str):
    """Convert a date string in one of several known layouts to ``YYYY-MM-DD``.

    The layouts are tried in order: day-month-year (``01-02-2021``),
    ISO year-month-day (``2021-02-01``), month-day-year (``02-13-2021``) and
    long form (``March 5, 2021``). Because day-first is tried before
    month-first, an ambiguous value such as ``03-04-2021`` is read as
    3 April 2021.

    Args:
        date_str: The raw date value. Anything that is not a string
            (e.g. ``None`` or ``NaN`` from a missing cell) is treated as
            unparseable.

    Returns:
        The date formatted as ``'YYYY-MM-DD'``, or the literal string
        ``'Invalid'`` when no layout matches.
    """
    # Missing cells arrive as None/NaN; they can never match a format, so
    # report them as invalid up front instead of relying on a caught TypeError.
    if not isinstance(date_str, str):
        return 'Invalid'

    formats = ('%d-%m-%Y', '%Y-%m-%d', '%m-%d-%Y', '%B %d, %Y')
    for fmt in formats:
        try:
            parsed = datetime.strptime(date_str, fmt)
        except ValueError:
            # Only a format mismatch is expected here; anything else
            # (KeyboardInterrupt, genuine bugs) must propagate.
            continue
        return parsed.strftime('%Y-%m-%d')
    return 'Invalid'  # Returns 'Invalid' for unrecognized date formats

# Run every raw join date through standardize_date and keep the result in a
# new column, leaving the original values in place for side-by-side comparison.
data['standardized_date'] = data['date_joined'].map(standardize_date)
print("=== Standardized Date Column ===")
print(data.loc[:, ['date_joined', 'standardized_date']], end=" \n\n")

# Step 5: Generating Data Quality Report
# One summary row per column of `data`: its name, dtype (as text), number of
# null entries, and number of distinct non-null values.
print("=== Data Quality Report ===")
data_quality_report = pd.DataFrame(
    {
        'Column': data.columns,
        'Data Type': data.dtypes.astype(str),
        'Missing Values': data.isna().sum(),
        'Unique Values': data.nunique(),
    }
)
print(data_quality_report, end=" \n\n")