# Preprocessing

Short idea: load data from `data/raw/`, clean it, transform it, and save the final dataset to `data/processed/`.

In [None]:
import pandas as pd

def _fill_missing(df, numeric_cols, categorical_cols):
    """Impute NaNs in place: column mean for numeric, most frequent value otherwise."""
    for col in numeric_cols:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].mean())

    for col in categorical_cols:
        if df[col].isna().any():
            # mode() may return several values on a tie; the first one is used.
            modes = df[col].mode(dropna=True)
            fill_value = modes.iloc[0] if not modes.empty else "UNKNOWN"
            df[col] = df[col].fillna(fill_value)


def _standardize(df, numeric_cols):
    """Scale the given columns in place to zero mean and unit standard deviation."""
    for col in numeric_cols:
        mean = df[col].mean()
        std = df[col].std()

        if pd.isna(std) or std == 0:
            # Constant column, or too few rows for a standard deviation:
            # there is no spread to scale by, so the column carries no signal.
            # Use a float zero so every scaled column shares a float dtype.
            df[col] = 0.0
        else:
            df[col] = (df[col] - mean) / std


def clean_csv(input_path) -> pd.DataFrame:
    """Load a CSV file and return a cleaned, model-ready DataFrame.

    Steps, in order:
      1. Read the CSV.
      2. Drop columns that are entirely empty, then columns whose values
         duplicate an earlier column.
      3. Fill missing values: mean for numeric columns, most frequent value
         for all other columns.
      4. One-hot encode every non-numeric column.
      5. Standardize the numeric columns with ``(x - mean) / std``; columns
         with zero or undefined standard deviation become all zeros.

    Args:
        input_path: Anything ``pandas.read_csv`` accepts as its first
            argument (for example a file path or an open file-like object).

    Returns:
        The transformed DataFrame. If no column survives step 2 (for
        example a header-only file), the column-less frame is returned
        as is.
    """
    # 1) Read CSV
    df = pd.read_csv(input_path)

    # 2) Remove empty columns
    df = df.dropna(axis=1, how="all")

    # Nothing left to impute, encode or scale; stop here instead of passing
    # a column-less frame to get_dummies, which has nothing to concatenate.
    if df.shape[1] == 0:
        return df

    # 2b) Remove duplicate columns. The explicit copy gives the steps below
    # a frame of their own to assign columns into.
    df = df.loc[:, ~df.T.duplicated()].copy()

    # 3) Fill missing values
    numeric_cols = df.select_dtypes(include="number").columns
    categorical_cols = df.columns.difference(numeric_cols)
    _fill_missing(df, numeric_cols, categorical_cols)

    # 4) One-hot encoding
    if not categorical_cols.empty:
        df = pd.get_dummies(df, columns=categorical_cols)

    # 5) Standard scaling (numeric column names are untouched by encoding)
    _standardize(df, numeric_cols)

    return df