# In [None]:
import pandas as pd
import numpy as np

# Load the raw dataset; the checks in this report read from this DataFrame.
df = pd.read_csv("raw_data.csv")

print("=== DATA ISSUE REPORT ===")

# 1. Missing Values
# Count null cells per column and report only the columns that have any.
missing = df.isnull().sum()
# Print the header and the Series separately: passing both to one print()
# call inserts the default " " separator after the header's newline, which
# indents the first row and misaligns it with the rest of the table.
print("\nMissing Values:")
print(missing[missing > 0])

# 2. Duplicate Rows
# Flag every row that repeats an earlier row in full, then total the flags.
repeated_rows = df.duplicated(keep="first")
duplicates = repeated_rows.sum()
print(f"\nDuplicate Rows: {duplicates}")

# 3. Data Type Mismatches
# Show the dtype pandas assigned to each column so mismatches can be
# spotted by eye.
print("\nColumn Data Types:", df.dtypes, sep="\n")

# 4. Inconsistent String Formatting
print("\nString Columns with Leading/Trailing Spaces:")
text_columns = df.select_dtypes(include='object').columns
for name in text_columns:
    # Nulls are skipped; the remaining values are stringified so that
    # object columns holding mixed types can still be scanned.
    values = df[name].dropna().astype(str)
    has_padding = values.str.contains(r'^\s+|\s+$', regex=True)
    if not has_padding.any():
        continue
    print(f"- {name}")

# 5. Outliers (IQR Method)
print("\nOutlier Detection (IQR):")
numeric_columns = df.select_dtypes(include=np.number).columns
for name in numeric_columns:
    values = df[name]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    # Fences sit 1.5 interquartile ranges beyond the quartiles.
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread
    # Count rows outside either fence; missing values satisfy neither
    # comparison and are therefore never counted.
    outside = (values < low_fence) | (values > high_fence)
    n_outside = int(outside.sum())
    if n_outside:
        print(f"- {name}: {n_outside} outliers")

# 6. Custom Checks (e.g., negative values for age/salary)
def _count_negative(values):
    """Return how many entries of *values* are numerically below zero.

    Values are coerced with ``pd.to_numeric(errors="coerce")`` first, so a
    column that was read as text (for example because it contains stray
    non-numeric entries) is still checked instead of raising ``TypeError``
    on the ``< 0`` comparison. Entries that cannot be parsed become NaN and
    are not counted.
    """
    numeric = pd.to_numeric(values, errors="coerce")
    return int((numeric < 0).sum())


print("\nInvalid Value Checks:")
if 'age' in df.columns:
    neg_ages = _count_negative(df['age'])
    print(f"- Negative ages: {neg_ages}")
if 'salary' in df.columns:
    neg_salary = _count_negative(df['salary'])
    print(f"- Negative salaries: {neg_salary}")