In [None]:
pip install pandas numpy
import pandas as pd
import numpy as np

# Step 1: Load Raw Data
# Read the raw CSV (path is relative to the current working directory) into
# the DataFrame that every later step operates on.
df = pd.read_csv(filepath_or_buffer="raw_data.csv")

# Step 2: Initial Data Overview
# Report the (rows, columns) shape and the number of missing cells in each
# column before any cleaning is applied.
missing_counts = df.isna().sum()
print("Initial Shape:", df.shape)
print("Missing values per column:\n", missing_counts)

# Step 3: Remove Duplicates
# Count the rows that repeat an earlier row exactly (the first occurrence is
# not counted), then drop those repeats in place so `df` stays the same object.
duplicates = df.duplicated(keep="first").sum()
df.drop_duplicates(keep="first", inplace=True)
print(f"Removed {duplicates} duplicate rows.")

# Step 4: Handle Missing Values
# Fill numeric columns with their median and every other column with its most
# frequent value (mode). Columns without missing values are left untouched.
for col in df.columns:
    if not df[col].isnull().any():
        continue

    if df[col].isnull().all():
        # Nothing to impute from: the median would be NaN and mode() would be
        # empty (indexing it with .iloc[0] raises IndexError), so skip the
        # column and say so instead of crashing or reporting a bogus fill.
        print(f"Skipped '{col}': no non-missing values to fill from.")
        continue

    # Assign the filled Series back to the frame. Calling
    # df[col].fillna(..., inplace=True) only mutates a temporary Series: it
    # warns on pandas 2.x and leaves `df` unchanged under Copy-on-Write.
    if pd.api.types.is_numeric_dtype(df[col]):
        median = df[col].median()
        df[col] = df[col].fillna(median)
        print(f"Filled missing values in '{col}' with median: {median}")
    else:
        # mode() may return several values on ties; take the first one.
        mode = df[col].mode().iloc[0]
        df[col] = df[col].fillna(mode)
        print(f"Filled missing values in '{col}' with mode: {mode}")

# Step 5: Standardize Column Names
# Normalize every header: trim surrounding whitespace, lowercase it, and turn
# each remaining space into an underscore (literal match, not a regex).
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
)
print("Standardized column names.")

# Step 6: Convert Dates Automatically
# Try to parse every object-dtype column whose name contains 'date' or 'time'
# as datetimes. This is best-effort: a column that cannot be parsed is left
# unchanged and reported, and processing continues with the next column.
for col in df.select_dtypes(include='object').columns:
    if 'date' not in col and 'time' not in col:
        continue
    try:
        # Keep only the parse inside the try so unrelated errors surface.
        parsed = pd.to_datetime(df[col])
    except (ValueError, TypeError, OverflowError) as exc:
        # Catch only parsing failures (pandas' parser and out-of-bounds errors
        # derive from ValueError); a bare `except` would also swallow
        # KeyboardInterrupt/SystemExit and hide the reason for the skip.
        print(f"Could not convert '{col}' to datetime, left unchanged: {exc}")
        continue
    df[col] = parsed
    print(f"Converted '{col}' to datetime.")

# Step 7: Convert Object Columns to Category
# Every column that is still object dtype at this point is re-typed as a
# pandas categorical, one column at a time.
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype(dtype='category')
    print(f"Converted '{col}' to category.")

# Step 8: Save Cleaned Data
# Write the cleaned frame to a CSV in the working directory; the row index is
# omitted from the output file.
df.to_csv(path_or_buf="cleaned_data.csv", index=False)
print("Cleaned data saved as 'cleaned_data.csv'")