# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Step 1: Load Data
# Read the raw dataset from the working directory and report its shape.
df = pd.read_csv("raw_data.csv")
print("Initial Shape:", df.shape)

# Step 2: Handle Missing Values
# Fill numeric columns with their median and every other column with its
# mode (the first value returned by `mode()` when several values tie).
for col in df.columns:
    column = df[col]
    if not column.isnull().any():
        continue  # nothing to impute in this column
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.median()
    else:
        fill_value = column.mode()[0]
    # Assign the filled column back rather than calling
    # `df[col].fillna(..., inplace=True)`: that chained in-place call acts on
    # a temporary Series, is deprecated since pandas 2.2, and leaves `df`
    # unchanged once Copy-on-Write is active (the default in pandas 3.0).
    df[col] = column.fillna(fill_value)

# Step 3: Remove Duplicates
# Runs after imputation, so rows are compared on their filled values.
df.drop_duplicates(inplace=True)

# Step 4: Encode Categorical Variables
# One LabelEncoder per object-dtype column. Each fitted encoder is kept in
# `label_encoders` under its column name, and the column itself is replaced
# by the encoder's output.
label_encoders = {
    col: LabelEncoder()
    for col in df.select_dtypes(include='object').columns
}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Step 5: Normalize/Scale Numerical Columns
# The numeric selection is made at this point in the script, i.e. after the
# encoding step, so it also picks up the columns that step re-encoded.
# NOTE(review): standardizing label-encoded columns may be unintended - confirm.
numeric_cols = df.select_dtypes(include=np.number).columns
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_values

# Step 6: Convert Data Types (optional)
# Example: cast every scaled column to float32; the stated motivation is
# efficiency.
df = df.astype({col: np.float32 for col in numeric_cols})

# Step 7: Save Preprocessed Data
# Written without the index so the file contains only the data columns.
df.to_csv("preprocessed_data.csv", index=False)
print("Preprocessed data saved as 'preprocessed_data.csv'")