In [None]:
import pandas as pd
import numpy as np

# Data-quality report for raw_data.csv.
#
# Loads the CSV into `df` and prints a series of checks: schema/dtype
# validation, duplicate ids, per-column null counts, IQR-based outliers for
# every numeric column, and range checks on `age` and `salary`. The only
# change made to `df` is converting `join_date` to datetime. Any check that
# needs a specific column is skipped when that column is absent, so a missing
# column is reported by the schema check instead of aborting with a KeyError.

# Load dataset
df = pd.read_csv("raw_data.csv")
print("Data Loaded. Shape:", df.shape)

# 1. Schema and Type Checks
# Maps each required column to the dtype pandas is expected to infer for it.
expected_columns = {
    'id': 'int64',
    'name': 'object',
    'age': 'int64',
    'email': 'object',
    'salary': 'float64',
    'join_date': 'object',  # Raw CSV text; converted to datetime in step 2
}

print("\n=== Schema Validation ===")
for col, expected_type in expected_columns.items():
    if col not in df.columns:
        print(f"Missing column: {col}")
    elif df[col].dtype != expected_type:
        print(f"Type mismatch in '{col}': Expected {expected_type}, Found {df[col].dtype}")

# 2. Data Type Fixes
# errors='coerce' turns unparseable dates into NaT. Count how many NaT values
# the conversion itself introduced so they are not mistaken, in the
# missing-value report below, for values that were already empty in the file.
if 'join_date' in df.columns:
    missing_before = df['join_date'].isna().sum()
    df['join_date'] = pd.to_datetime(df['join_date'], errors='coerce')
    unparseable_dates = df['join_date'].isna().sum() - missing_before
    if unparseable_dates > 0:
        print(f"Unparseable join_date values coerced to NaT: {unparseable_dates}")

# 3. Uniqueness Check
print("\n=== Uniqueness Checks ===")
# Guarded like the range checks below: a missing 'id' column was already
# reported by the schema validation and must not raise here.
if 'id' in df.columns and df['id'].duplicated().any():
    print("Duplicate IDs found!")

# 4. Missing Value Checks
print("\n=== Missing Values ===")
print(df.isnull().sum())

# 5. Outlier Detection (using IQR)
# A value is flagged when it lies more than 1.5 * IQR below the first quartile
# or above the third quartile. Comparisons against NaN are False, so rows with
# a missing value in the column are never counted as outliers.
print("\n=== Outlier Detection ===")
numeric_cols = df.select_dtypes(include=np.number).columns
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower) | (df[col] > upper)]
    if not outliers.empty:
        print(f"{col}: {outliers.shape[0]} outliers detected")

# 6. Value Range Checks
print("\n=== Value Range Validation ===")
if 'age' in df.columns:
    # Ages outside 0-120 (inclusive bounds are valid) are treated as invalid.
    invalid_ages = df[(df['age'] < 0) | (df['age'] > 120)]
    print(f"Invalid ages: {invalid_ages.shape[0]}")
if 'salary' in df.columns:
    negative_salary = df[df['salary'] < 0]
    print(f"Negative salaries: {negative_salary.shape[0]}")

# 7. Summary (console output only; nothing is written to disk)
print("\n=== Summary ===")
print("Advanced data quality checks completed.")