In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [20]:
# Column names supplied by hand: both CSVs are read with header=None,
# so pandas takes no names from the files themselves.
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

# skipinitialspace=True drops the blank that follows each comma, so string
# fields come back without leading whitespace.
train_df = pd.read_csv(
    "../data/adult_train.csv", names=columns, header=None, skipinitialspace=True
)

# IMPORTANT: the UCI version of the test file starts with one extra
# non-data line, hence skiprows=1.
test_df = pd.read_csv(
    "../data/adult_test.csv",
    names=columns,
    header=None,
    skiprows=1,
    skipinitialspace=True,
)



In [21]:
# Drop the textual "education" column; "education-num" is kept as a numeric
# feature (presumably it encodes the same information -- confirm against the
# dataset description).
#
# errors="ignore" keeps this cell safe to re-run: the frames are rebound to
# the same names, so without it a second execution raises KeyError because
# the column is already gone.
train_df = train_df.drop(columns=["education"], errors="ignore")
test_df = test_df.drop(columns=["education"], errors="ignore")


In [22]:
# Single definition of the binary target encoding, shared by both splits.
income_to_label = {">50K": 1, "<=50K": 0}

y_train = train_df["income"].str.strip().map(income_to_label)

# Any "." in the test labels (e.g. ">50K.") is removed before mapping so
# both splits use the same two label strings.
y_test = (
    test_df["income"]
    .str.strip()
    .str.replace(".", "", regex=False)
    .map(income_to_label)
)

# Series.map yields NaN for every value that is not a key of the mapping
# (including missing values). Fail here rather than letting NaN targets flow
# silently into the saved CSVs.
if y_train.isna().any() or y_test.isna().any():
    raise ValueError(
        "income column contains values other than '>50K' / '<=50K' "
        f"(train: {int(y_train.isna().sum())} bad rows, "
        f"test: {int(y_test.isna().sum())} bad rows)"
    )

# Feature frames: everything except the target column.
X_train = train_df.drop(columns=["income"])
X_test = test_df.drop(columns=["income"])


In [23]:
# Columns routed to the numeric branch of the preprocessor.
numeric_cols = [
    "age", "fnlwgt", "education-num",
    "capital-gain", "capital-loss", "hours-per-week",
]

# Columns routed to the categorical branch of the preprocessor.
categorical_cols = [
    "workclass", "marital-status", "occupation", "relationship",
    "race", "sex", "native-country",
]


In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


In [25]:
# Numeric branch: fill gaps with the column median, then standardise.
num_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: "?" is treated as the missing-value marker and replaced
# by the most frequent category; the result is one-hot encoded into a dense
# array. Categories not seen during fit are ignored at transform time.
cat_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent", missing_values="?")),
        ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)

# Route each column group through its own branch; columns listed in neither
# group are dropped.
preprocessor = ColumnTransformer(
    [
        ("num", num_pipe, numeric_cols),
        ("cat", cat_pipe, categorical_cols),
    ],
    remainder="drop",
)


In [26]:
# Fit the preprocessor on the training features only, and transform them.
X_train_p = preprocessor.fit_transform(X_train)

# Apply the already-fitted preprocessor to the test features (no refit).
X_test_p = preprocessor.transform(X_test)


In [None]:
# Output column names as reported by the fitted preprocessor.
feature_names = preprocessor.get_feature_names_out()

# Wrap the transformed data in DataFrames that keep the original row index.
X_train_p = pd.DataFrame(X_train_p, index=X_train.index, columns=feature_names)
X_test_p = pd.DataFrame(X_test_p, index=X_test.index, columns=feature_names)

# Append the target as a trailing "y" column. .values attaches the labels
# by position rather than by index alignment; assign() returns a new frame,
# leaving X_train_p / X_test_p untouched.
train_out = X_train_p.assign(y=y_train.values)
test_out = X_test_p.assign(y=y_test.values)

# Persist both splits without the index column.
train_out.to_csv("../data/adult_preprocessed_train.csv", index=False)
test_out.to_csv("../data/adult_preprocessed_test.csv", index=False)
