In [None]:
 pip install pandas numpy
 pip install ydata-profiling
 import pandas as pd
import numpy as np

# Data-cleaning pipeline: load raw_data.csv, print a summary, de-duplicate,
# impute missing values, convert object columns, and write cleaned_data.csv.


def _fill_missing_values(df):
    """Impute missing values in *df* in place.

    Integer/float columns are filled with their median; every other column
    is filled with its mode (the first mode when several values tie).
    Columns without missing values are not touched. A column with no
    non-missing values has no median or mode, so it is reported and left
    unchanged instead of raising.
    """
    for col in df.columns:
        series = df[col]
        if not series.isnull().any():
            continue

        is_integer = pd.api.types.is_integer_dtype(series.dtype)
        if is_integer or pd.api.types.is_float_dtype(series.dtype):
            median = series.median()
            if pd.isna(median):
                print(f"Column '{col}' has no non-missing values; left unchanged.")
                continue
            fill_value = median
            if is_integer:
                # Only nullable integer dtypes can hold missing values, and
                # they reject a fractional fill value: use an exact int when
                # the median is whole, otherwise widen the column to Float64.
                if float(median).is_integer():
                    fill_value = int(median)
                else:
                    series = series.astype("Float64")
            # Assign back to the frame: `df[col].fillna(..., inplace=True)`
            # acts on a temporary and does nothing under Copy-on-Write.
            df[col] = series.fillna(fill_value)
            print(f"Filled missing values in numeric column '{col}' with median: {median}")
        else:
            modes = series.mode()
            if modes.empty:
                print(f"Column '{col}' has no non-missing values; left unchanged.")
                continue
            mode = modes.iloc[0]
            df[col] = series.fillna(mode)
            print(f"Filled missing values in categorical column '{col}' with mode: {mode}")


def _convert_object_columns(df):
    """Convert object columns of *df* in place to datetime or category.

    A column whose name contains "date" (case-insensitive) is parsed with
    ``pd.to_datetime``; if parsing fails the column is left as is and the
    error is printed. Every other object column becomes ``category``.
    """
    for col in df.select_dtypes(include='object'):
        # str() keeps non-string column labels (e.g. integers) from raising.
        if "date" in str(col).lower():
            try:
                df[col] = pd.to_datetime(df[col])
                print(f"Converted column '{col}' to datetime.")
            except Exception as e:
                # Best-effort conversion: report the failure and carry on.
                print(f"Could not convert '{col}' to datetime: {e}")
        else:
            df[col] = df[col].astype('category')
            print(f"Converted column '{col}' to category.")


def clean_dataframe(df):
    """Clean *df* in place and return it.

    Steps, in order: drop duplicate rows, impute missing values (median for
    numeric columns, mode for the rest), then convert object columns to
    datetime or category. Progress is reported with ``print``.

    Args:
        df: The ``pandas.DataFrame`` to clean. It is modified in place.

    Returns:
        The same ``DataFrame`` object, for convenience.
    """
    # Remove duplicate rows
    before = df.shape[0]
    df.drop_duplicates(inplace=True)
    after = df.shape[0]
    print(f"Removed {before - after} duplicate rows.")

    # Handle missing values
    _fill_missing_values(df)

    # Convert object columns to category or datetime
    _convert_object_columns(df)
    return df


if __name__ == "__main__":
    # Step 1: Load Data
    df = pd.read_csv("raw_data.csv")

    # Step 2: Initial Summary
    print("Initial Data Overview:")
    df.info()  # info() prints its own report and returns None
    print(df.describe(include='all'))

    # Step 3: Data Cleaning Automation
    clean_dataframe(df)

    # Step 4: Save cleaned data
    df.to_csv("cleaned_data.csv", index=False)
    print("Cleaned data saved to 'cleaned_data.csv'")