In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import re

In [2]:
def processData(input_path="data/household.csv", output_path="data/cleaned_household.csv"):
    """Run the household cleaning pipeline end to end and write the result to CSV.

    Steps, in order:
      1. Read ``input_path`` and drop rows missing ``dwelltype``, ``owndwell``
         or ``homesubregion_ASGS``.
      2. Replace ``hhinc_group`` with the parsed bracket string returned by
         ``processIncome`` and add ``hhinc_group_median`` with the numeric
         value it returns; brackets that could not be parsed are filled with
         the column median.
      3. Apply ``processNumberedFeatures``, ``labelEncode``, ``oneHotEncode``
         and ``createBins``.
      4. Write the frame to ``output_path`` (the DataFrame index is written
         too, as ``to_csv`` does by default).

    Parameters
    ----------
    input_path : str or path-like, default "data/household.csv"
        CSV file to read.
    output_path : str or path-like, default "data/cleaned_household.csv"
        CSV file to write.

    Returns
    -------
    None
    """
    required_columns = ["dwelltype", "owndwell", "homesubregion_ASGS"]
    income_columns = ["hhinc_group", "hhinc_group_median"]

    df = pd.read_csv(input_path)
    df = df.dropna(subset=required_columns)

    # Build the two parsed columns in one DataFrame construction instead of
    # creating a pd.Series per row inside .apply, which is very slow on large
    # frames. The index is carried over so the assignment stays row-aligned
    # after dropna() has left gaps in it.
    parsed_income = pd.DataFrame(
        [processIncome(value) for value in df["hhinc_group"]],
        index=df.index,
        columns=income_columns,
    )
    df[income_columns] = parsed_income

    # Unparseable brackets come back as None/NaN; fill them with the median.
    income_imputer = SimpleImputer(strategy="median")
    df[["hhinc_group_median"]] = income_imputer.fit_transform(df[["hhinc_group_median"]])

    df = processNumberedFeatures(df)
    df = labelEncode(df)
    df = oneHotEncode(df)
    df = createBins(df)
    df.to_csv(output_path)

In [3]:
def processIncome(value):
    """Parse the parenthesised dollar bracket out of an income label.

    Two shapes are recognised inside parentheses:

    * ``($LOW-$HIGH)``     -> ``("LOW-HIGH", (LOW + HIGH) / 2)``
    * ``($LOW or more)``   -> ``("LOW-inf", LOW)``

    Thousands separators in the amounts are removed. Anything else is
    returned unchanged together with ``None``.

    Parameters
    ----------
    value : object
        Raw label; it is converted with ``str()`` before matching.

    Returns
    -------
    tuple
        ``(bracket, number)`` as described above, or ``(value, None)`` when
        neither shape is found.
    """
    text = str(value)

    def to_int(amount):
        # "10,399" -> 10399
        return int(amount.replace(",", ""))

    # Closed bracket: both bounds are present.
    bounded = re.search(r"\(\$(\d{1,3}(?:,\d{3})*|\d+)-\$(\d{1,3}(?:,\d{3})*|\d+)\)", text)
    if bounded is not None:
        lower, upper = (to_int(part) for part in bounded.groups())
        return f"{lower}-{upper}", (lower + upper) / 2

    # Open-ended top bracket: only a lower bound is present.
    open_ended = re.search(r"\(\$(\d{1,3}(?:,\d{3})(?:,\d{3})*|\d+) or more\)", text)
    if open_ended is None:
        return value, None

    floor = to_int(open_ended.group(1))
    return f"{floor}-inf", floor

In [4]:
def processNumberedFeatures(df):
    """Coerce the count columns to numbers and fill gaps with the median.

    ``hhsize``, ``totalvehs`` and ``totalbikes`` are converted with
    ``pd.to_numeric(errors="coerce")``, so values that cannot be parsed
    become NaN; each column's NaNs are then replaced by that column's median.

    The frame passed in is modified in place and also returned.
    """
    numeric_columns = ["hhsize", "totalvehs", "totalbikes"]

    # Non-numeric entries turn into NaN so the imputer can handle them.
    for column in numeric_columns:
        df[column] = pd.to_numeric(df[column], errors="coerce")

    df[numeric_columns] = SimpleImputer(strategy="median").fit_transform(df[numeric_columns])
    return df


In [5]:
def createBins(df):
    """Add binned and integer-coded versions of income, household size and vehicles.

    Columns added (the frame is modified in place and also returned):

    * ``hhinc_bin``     - tertiles of ``hhinc_group_median`` labelled
      "Low" / "Medium" / "High".
    * ``hhsize_bin``    - ``hhsize`` in (0,1], (1,2], (2,4], (4,inf) labelled
      "1", "2", "3-4", "5+".
    * ``totalvehs_bin`` - ``totalvehs`` in (-inf,0], (0,1], (1,2], (2,inf)
      labelled "0", "1", "2", "3+".
    * ``<name>_bin_le`` - integer code of each bin, following the bin order
      (lowest bin is 0). Rows that fall outside every bin get -1.

    Notes
    -----
    * When many rows share the same income, some tertile edges coincide and
      ``duplicates="drop"`` leaves fewer than three bins. Passing three labels
      to ``pd.qcut`` then raises ``ValueError``. The bins are therefore
      computed first and only as many labels as there are bins are used,
      taken from the low end.
    * The integer codes come from the ordered categorical rather than from
      ``LabelEncoder``. ``LabelEncoder`` sorts class names alphabetically,
      which encoded income as High=0, Low=1, Medium=2 and lost the ordering.
    """
    income_labels = ["Low", "Medium", "High"]

    # labels=False returns the bin number per row; retbins exposes how many
    # distinct edges survived duplicates="drop".
    income_codes, income_edges = pd.qcut(
        df["hhinc_group_median"],
        q=3,
        labels=False,
        retbins=True,
        duplicates="drop",
    )
    n_income_bins = max(len(income_edges) - 1, 0)
    df["hhinc_bin"] = pd.Categorical.from_codes(
        # NaN (no bin) becomes the categorical "missing" code -1.
        income_codes.fillna(-1).astype(int).to_numpy(),
        categories=income_labels[:n_income_bins],
        ordered=True,
    )

    df["hhsize_bin"] = pd.cut(
        df["hhsize"],
        bins=[0, 1, 2, 4, np.inf],
        labels=["1", "2", "3-4", "5+"],
    )

    df["totalvehs_bin"] = pd.cut(
        df["totalvehs"],
        bins=[-np.inf, 0, 1, 2, np.inf],
        labels=["0", "1", "2", "3+"],
    )

    # Ordered-categorical codes keep the natural bin order.
    for bin_column in ("hhinc_bin", "hhsize_bin", "totalvehs_bin"):
        df[f"{bin_column}_le"] = df[bin_column].cat.codes.astype(int)

    return df

In [6]:
def labelEncode(df):
    """Add an integer-coded ``<column>_le`` copy of each categorical column.

    A fresh ``LabelEncoder`` is fitted per column for ``dwelltype``,
    ``owndwell`` and ``homesubregion_ASGS``. The original columns are kept.
    The frame passed in is modified in place and also returned.
    """
    for column in ("dwelltype", "owndwell", "homesubregion_ASGS"):
        encoded = LabelEncoder().fit_transform(df[column])
        df[column + "_le"] = encoded

    return df

In [7]:
def oneHotEncode(df):
    """Append 0/1 indicator columns for the three categorical features.

    For ``dwelltype``, ``owndwell`` and ``homesubregion_ASGS`` (in that
    order) one integer column per distinct value is added, named
    ``<column>_<value>``. No level is dropped and the source columns are
    kept. A new DataFrame is returned.
    """
    categorical_columns = ("dwelltype", "owndwell", "homesubregion_ASGS")

    indicator_frames = [
        pd.get_dummies(df[column], prefix=column, drop_first=False, dtype=int)
        for column in categorical_columns
    ]

    # Original columns first, then each indicator group in the order above.
    return pd.concat([df, *indicator_frames], axis=1)

In [8]:
# Run the household-data cleaning pipeline defined in the cells above.
processData()