<a href="https://colab.research.google.com/github/8in7r/past/blob/main/Git.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np

def clean_data_project(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df_raw``; the input frame is not modified.

    The function reads the columns ``"age"``, ``"income"``, ``"city"`` and
    ``"signup_time"`` and applies, in order:

    1. Type casting: ``age``/``income`` to numeric and ``signup_time`` to
       timezone-aware UTC datetimes. Unparseable values become NaN / NaT.
    2. Missing indicators: ``age_missing`` and ``income_missing`` (0/1 ints)
       recorded *before* imputation.
    3. Median imputation of ``age`` and ``income``.
    4. Capping ``income`` at its 99th percentile, then ``log1p``.
    5. ``city`` converted to pandas string dtype, stripped and lower-cased.

    Args:
        df_raw: Frame containing the four columns listed above.

    Returns:
        A new DataFrame with the cleaned columns plus the two indicator
        columns.

    Raises:
        KeyError: If any of the required columns is absent.
    """
    df = df_raw.copy()

    # --------------------
    # Type casting
    # --------------------
    df["age"] = pd.to_numeric(df["age"], errors="coerce")
    df["income"] = pd.to_numeric(df["income"], errors="coerce")
    # utc=True treats naive values as UTC and converts aware values to UTC,
    # so the column always ends up as a single UTC datetime dtype. Without
    # it, input with mixed offsets (or mixed naive/aware values) does not
    # yield a datetime column and any later ``.dt`` access fails.
    df["signup_time"] = pd.to_datetime(
        df["signup_time"], errors="coerce", utc=True
    )

    # --------------------
    # Missing indicators
    # --------------------
    # Must be computed before imputation, otherwise the information about
    # which rows were originally missing is lost.
    df["age_missing"] = df["age"].isna().astype(int)
    df["income_missing"] = df["income"].isna().astype(int)

    # --------------------
    # Imputation (median)
    # --------------------
    df["age"] = df["age"].fillna(df["age"].median())
    df["income"] = df["income"].fillna(df["income"].median())

    # --------------------
    # Outlier handling
    # --------------------
    # Only the upper tail is capped; the cap is the 99th percentile of the
    # already-imputed income column.
    income_cap = df["income"].quantile(0.99)
    df["income"] = df["income"].clip(upper=income_cap)

    # log1p transformation
    # NOTE(review): presumably income is non-negative; values <= -1 would
    # produce NaN / -inf here — confirm against the data source.
    df["income"] = np.log1p(df["income"])

    # --------------------
    # City cleaning
    # --------------------
    df["city"] = (
        df["city"]
        .astype("string")
        .str.strip()
        .str.lower()
    )

    return df
