# Notebook cell (exported as "In [None]:").
# Install the dependencies from a shell before running: pip install pandas numpy
import pandas as pd
import numpy as np

# Data-cleaning pipeline for raw_data.csv.
#
# Each cleaning step is a small function that takes a DataFrame, updates it in
# place and returns it. clean_dataframe() chains steps 3-9 on a copy of its
# input; main() adds the CSV load/overview/save (steps 1, 2 and 10).


def _is_text_column(series):
    """Return True when *series* has an object or string dtype."""
    return (pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series))


def _strip_if_str(value):
    """Strip surrounding whitespace from strings; return anything else as-is."""
    return value.strip() if isinstance(value, str) else value


def remove_exact_duplicates(df):
    """Step 3: drop rows that exactly duplicate an earlier row (in place)."""
    dups = int(df.duplicated().sum())
    df.drop_duplicates(inplace=True)
    print(f"Removed {dups} duplicate rows.")
    return df


def strip_string_columns(df):
    """Step 4: strip surrounding whitespace from string values.

    Only ``str`` values are touched. Mapping element-wise (instead of using
    the ``.str`` accessor) keeps numbers and other non-string values found in
    mixed-type columns, which ``.str.strip()`` would replace with NaN.
    """
    for col in df.columns:
        if _is_text_column(df[col]):
            df[col] = df[col].map(_strip_if_str)
    return df


def standardize_column_names(df):
    """Step 5: trim and lower-case column names, keeping only ``[a-z0-9_]``."""
    df.columns = (
        df.columns.str.strip()
        .str.lower()
        .str.replace(r'[^a-z0-9_]', '', regex=True)
    )
    return df


def fill_missing_values(df):
    """Step 6: fill gaps with the median (numeric) or the mode (other columns).

    Columns without any usable value (median is NaN / no mode) are reported
    and left untouched instead of raising.
    """
    for col in df.columns:
        if not df[col].isnull().any():
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            median = df[col].median()
            if pd.isna(median):
                print(f"Skipped '{col}': no non-missing values to compute a median.")
                continue
            # Assign the result back: fillna(inplace=True) on df[col] acts on
            # a temporary object and is not guaranteed to update df.
            df[col] = df[col].fillna(median)
            print(f"Filled missing numeric values in '{col}' with median: {median}")
        else:
            modes = df[col].mode()
            if modes.empty:
                print(f"Skipped '{col}': no non-missing values to compute a mode.")
                continue
            mode = modes.iloc[0]
            df[col] = df[col].fillna(mode)
            print(f"Filled missing categorical values in '{col}' with mode: {mode}")
    return df


def convert_datetime_columns(df):
    """Step 7: parse columns whose name contains "date" or "time" as datetimes.

    The name match is a plain substring test, so it also hits names such as
    "lifetime" or "sentiment". To avoid destroying such columns:

    * numeric columns are skipped (their numbers would be read as epoch
      offsets, not dates);
    * a column is left unchanged when none of its values parses.

    Values that fail to parse in an otherwise convertible column become NaT,
    and their count is reported.
    """
    for col in df.columns:
        if "date" not in col and "time" not in col:
            continue
        series = df[col]
        if pd.api.types.is_numeric_dtype(series):
            print(f"Skipped '{col}': numeric column, not converted to datetime.")
            continue
        try:
            parsed = pd.to_datetime(series, errors='coerce')
        except (ValueError, TypeError, OverflowError) as exc:
            print(f"Could not convert '{col}' to datetime: {exc}")
            continue
        present = int(series.notna().sum())
        parsed_ok = int(parsed.notna().sum())
        if present > 0 and parsed_ok == 0:
            print(f"Skipped '{col}': no value could be parsed as a datetime.")
            continue
        df[col] = parsed
        print(f"Converted '{col}' to datetime.")
        if parsed_ok < present:
            print(f"  {present - parsed_ok} unparseable values in '{col}' became NaT.")
    return df


def convert_numeric_strings(df):
    """Step 8: convert text columns that hold only numbers to a numeric dtype."""
    for col in df.columns:
        if not _is_text_column(df[col]):
            continue
        try:
            converted = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # Expected for genuine text columns: leave them unchanged.
            continue
        df[col] = converted
        print(f"Converted '{col}' to numeric.")
    return df


def clip_outliers_iqr(df, iqr_factor=1.5):
    """Step 9: clip numeric values outside ``[Q1 - k*IQR, Q3 + k*IQR]``.

    Args:
        df: frame to update in place.
        iqr_factor: the multiplier ``k`` applied to the IQR (default 1.5).

    Columns whose IQR is zero or NaN are skipped: with a zero IQR both bounds
    collapse onto one value and clipping would flatten the whole column
    (e.g. a mostly-zero flag). Columns without outliers are not rewritten, so
    integer columns keep their dtype.
    """
    for col in df.select_dtypes(include=np.number).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        if not iqr > 0:  # also False when iqr is NaN
            continue
        lower = q1 - iqr_factor * iqr
        upper = q3 + iqr_factor * iqr
        outliers = int(((df[col] < lower) | (df[col] > upper)).sum())
        if outliers > 0:
            df[col] = df[col].clip(lower=lower, upper=upper)
            print(f"Clipped {outliers} outliers in column '{col}' using IQR.")
    return df


def clean_dataframe(df):
    """Run cleaning steps 3-9 on a copy of *df* and return the cleaned copy."""
    cleaned = df.copy()
    steps = (
        remove_exact_duplicates,
        strip_string_columns,
        standardize_column_names,
        fill_missing_values,
        convert_datetime_columns,
        convert_numeric_strings,
        clip_outliers_iqr,
    )
    for step in steps:
        cleaned = step(cleaned)
    return cleaned


def main(input_path="raw_data.csv", output_path="cleaned_data.csv"):
    """Load the raw CSV, print an overview, clean it and save the result.

    Returns the cleaned DataFrame.
    """
    # Step 1: Load raw data
    df = pd.read_csv(input_path)

    # Step 2: Initial overview
    print("Initial shape:", df.shape)
    print("Missing values:\n", df.isnull().sum())

    # Steps 3-9: cleaning
    df = clean_dataframe(df)

    # Step 10: Save cleaned data
    df.to_csv(output_path, index=False)
    print(f"Cleaned data saved as '{output_path}'")
    return df


if __name__ == "__main__":
    # Keep `df` bound at top level so later notebook cells can still use it.
    df = main()