# In [None]:
import pandas as pd
import numpy as np

# Sample dataset: a small, deliberately messy table used to exercise the
# checks below. It contains a repeated id/name pair (id 2, 'Bob'), missing
# name/age/email values, and 'date_joined' strings written in several formats.
data = pd.DataFrame({
    'id': [1, 2, 2, 4, 5],
    'name': ['Alice', 'Bob', 'Bob', 'David', None],
    'age': [25, np.nan, 30, 22, 40],
    'email': ['alice@mail.com', 'bob@mail.com', 'bob@mail.com', None, 'david@mail.com'],
    'date_joined': ['01-01-2021', '2021-01-01', '01-01-2021', 'March 5, 2021', 'invalid-date']
})

# Step 1: Basic Info
print("Dataset Overview:")
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called on its own; wrapping it in print() would emit a stray "None".
data.info()
print("\nFirst 5 Rows:")
print(data.head())

# Step 2: Missing Values
# Per-column count of null entries (None / NaN).
print("\nMissing Values:")
print(data.isnull().sum())

# Step 3: Duplicate Rows
# duplicated() flags a row only when *every* column matches an earlier row.
# The two id-2 'Bob' rows differ in 'age' and 'date_joined', so they are not
# reported here and this frame is empty for the sample data.
duplicates = data[data.duplicated()]
print("\nDuplicate Rows:")
print(duplicates)

# Step 4: Inconsistent Date Formats
from datetime import datetime

def standardize_date(date_str):
    """Convert a date string in one of the known formats to ``YYYY-MM-DD``.

    The candidate formats are tried in order and the first successful parse
    wins, so an ambiguous value such as ``'01-02-2021'`` is read day-first
    (2 January 2021) because ``'%d-%m-%Y'`` precedes ``'%m-%d-%Y'``.

    Args:
        date_str: Raw date text. Surrounding whitespace is ignored.
            Non-string values (``None``, ``NaN``, numbers) are treated as
            unparseable rather than raising.

    Returns:
        The date formatted with ``'%Y-%m-%d'``, or the string ``'Invalid'``
        when the input is not a string or matches none of the formats.
    """
    # Missing cells arrive as None/NaN; reject them explicitly instead of
    # relying on an exception from strptime.
    if not isinstance(date_str, str):
        return 'Invalid'

    formats = ('%d-%m-%Y', '%Y-%m-%d', '%m-%d-%Y', '%B %d, %Y')
    text = date_str.strip()
    for fmt in formats:
        try:
            parsed = datetime.strptime(text, fmt)
        except ValueError:
            # Text does not fit this pattern; fall through to the next one.
            continue
        return parsed.strftime('%Y-%m-%d')
    return 'Invalid'

# Run every raw 'date_joined' value through standardize_date and store the
# result in a new column next to the original.
raw_dates = data['date_joined']
data['standardized_date'] = raw_dates.apply(standardize_date)

# Step 5: Data Quality Report
# One row per column of `data` (the new standardized column included), giving
# its dtype as text, its null count, and its number of unique values.
print("\nData Quality Report:")
report_fields = {}
report_fields['Column'] = data.columns
report_fields['Data Type'] = data.dtypes.astype(str)
report_fields['Missing Values'] = data.isnull().sum()
report_fields['Unique Values'] = data.nunique()
report = pd.DataFrame(report_fields)
print(report)

# Show cleaned date column
# Raw and standardized values side by side for visual comparison.
print("\nStandardized Dates:")
date_columns = ['date_joined', 'standardized_date']
print(data[date_columns])