In [1]:
import pandas as pd

# Column names are passed explicitly to read_csv via `names=`.
cols = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

# skipinitialspace=True drops the whitespace that follows each delimiter;
# a bare "?" is read as a missing value.
df = pd.read_csv("data/adult.data", names=cols, sep=",", skipinitialspace=True, na_values="?")

# Sanity checks: frame size, first rows, label balance, and the ten columns
# with the most missing entries.
missing_per_col = df.isna().sum().sort_values(ascending=False)

print("df shape:", df.shape)
print(df.head(3))
print("\nincome counts:\n", df["income"].value_counts())
print("\nmissing by col:\n", missing_per_col.head(10))


df shape: (32561, 15)
   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   

       marital-status         occupation   relationship   race   sex  \
0       Never-married       Adm-clerical  Not-in-family  White  Male   
1  Married-civ-spouse    Exec-managerial        Husband  White  Male   
2            Divorced  Handlers-cleaners  Not-in-family  White  Male   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0              13  United-States  <=50K  
2             0             0              40  United-States  <=50K  

income counts:
 income
<=50K    24720
>50K      7841
Name: count, dtype: int64

missing by col:
 occupation        1843
workclass         1836
native-country  

In [3]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# =========================
# 1. Load raw Adult data
# =========================
cols = [
    "age", "workclass", "fnlwgt",
    "education", "education-num", "marital-status",
    "occupation", "relationship", "race",
    "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income",
]

# "?" entries are parsed as NaN; whitespace after each comma is skipped.
df = pd.read_csv(
    "data/adult.data", names=cols, sep=",", skipinitialspace=True, na_values="?",
)

# =========================
# 2. Define tasks (labels)
# =========================
# Task A: 1.0 where the income string contains ">50K", otherwise 0.0.
income_text = df["income"].astype(str)
yA = income_text.str.contains(">50K").astype(np.float32).values

# Task B: 1.0 where marital-status starts with "Married", otherwise 0.0.
# Missing entries are replaced by "__MISSING__" first, so they end up as 0.0.
marital_text = df["marital-status"].fillna("__MISSING__").astype(str)
yB = marital_text.str.startswith("Married").astype(np.float32).values

In [6]:
# Inspect the Task B label vector.
yB

array([0., 1., 0., ..., 0., 0., 1.], shape=(32561,), dtype=float32)

In [7]:
# =========================
# 3. Feature columns
# =========================
# Columns that get standard-scaled later on.
num_cols = [
    "age",
    "fnlwgt",
    "education-num",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]

# Columns that are turned into integer codes later on.
# NOTE(review): "marital-status" is also the column the Task B label (yB) is
# derived from, so Task B can be read straight off this feature -- confirm
# that this is intended.
cat_cols = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

# =========================
# 4. Missing value handling
# =========================
# Numeric gaps become 0.0; categorical gaps get the explicit "__MISSING__" token.
df[num_cols] = df[num_cols].fillna(0.0)
df[cat_cols] = df[cat_cols].fillna("__MISSING__")

# Feature frame: numeric columns first, categorical columns after; taken as a copy of df.
feature_cols = num_cols + cat_cols
X = df[feature_cols].copy()


In [12]:
# =========================
# 5. Train / Val / Test split
# =========================
# First cut: 70% train, 30% held out. Second cut: the held-out part is halved
# into validation and test. Both cuts are stratified on the Task A label.
(
    X_train, X_tmp,
    yA_train, yA_tmp,
    yB_train, yB_tmp,
) = train_test_split(X, yA, yB, test_size=0.30, random_state=42, stratify=yA)

(
    X_val, X_test,
    yA_val, yA_test,
    yB_val, yB_test,
) = train_test_split(
    X_tmp, yA_tmp, yB_tmp, test_size=0.50, random_state=42, stratify=yA_tmp
)

# =========================
# 6. Numeric preprocessing
# =========================
# The scaler is fitted on the training rows only and then applied, unchanged,
# to all three splits; results are stored as float32.
scaler = StandardScaler()
scaler.fit(X_train[num_cols].values)

X_train_num, X_val_num, X_test_num = (
    scaler.transform(split[num_cols].values).astype(np.float32)
    for split in (X_train, X_val, X_test)
)

In [14]:
# Inspect the Task A labels of the held-out 30% (before it is split into val/test).
yA_tmp

array([1., 0., 0., ..., 0., 1., 0.], shape=(9769,), dtype=float32)

In [34]:
# =========================
# 7. Categorical encoding
# =========================
def build_mapping(series: pd.Series):
    """Number the distinct values of ``series`` in first-appearance order.

    Returns a dict ``{value: index}`` where indices run 0, 1, 2, ... following
    the order in which ``series.unique()`` reports the values.
    """
    distinct = pd.Index(series.unique())
    return dict(zip(distinct, range(len(distinct))))

# Empty starting values for the per-split categorical code arrays and for the
# list of per-column vocabulary sizes.
X_train_cat, X_val_cat, X_test_cat = [], [], []
cat_sizes = []


In [35]:
# ---- inference-time preprocessing info ----
# Everything needed to repeat the numeric scaling and the categorical encoding
# on new rows: column order, scaler statistics, and per-column vocabularies.
preproc = {
    "num_cols": num_cols,
    "cat_cols": cat_cols,
    "scaler_mean": scaler.mean_.astype(np.float32),
    "scaler_scale": scaler.scale_.astype(np.float32),
    "cat_mappings": {},     # col -> {value: index}
    "cat_oov_index": {},    # col -> oov index
}


def encode_column(series, mapping, oov):
    """Translate one categorical column into int64 codes.

    Parameters
    ----------
    series : pd.Series
        Raw category values.
    mapping : dict
        ``{value: index}`` vocabulary for this column.
    oov : int
        Index used for any value that is not a key of ``mapping``.

    Returns
    -------
    np.ndarray
        int64 array with one code per row of ``series``.
    """
    return series.map(lambda value: mapping.get(value, oov)).astype(np.int64).values


# The accumulators are created here, inside the cell, so that re-running it
# always starts from scratch: appending to lists owned by another cell would
# duplicate the entries of cat_sizes and would fail outright once X_*_cat
# have been rebound to arrays below.
cat_sizes = []
train_codes, val_codes, test_codes = [], [], []

for c in cat_cols:
    # The vocabulary is built from the training split only; values that appear
    # solely in val/test fall into the out-of-vocabulary bucket.
    mapping = build_mapping(X_train[c])
    oov = len(mapping)  # first index not used by a known value

    preproc["cat_mappings"][c] = mapping
    preproc["cat_oov_index"][c] = oov
    cat_sizes.append(oov + 1)  # known values + one OOV slot

    train_codes.append(encode_column(X_train[c], mapping, oov))
    val_codes.append(encode_column(X_val[c], mapping, oov))
    test_codes.append(encode_column(X_test[c], mapping, oov))

# One row per sample, one column per entry of cat_cols (same order).
X_train_cat = np.stack(train_codes, axis=1)
X_val_cat   = np.stack(val_codes, axis=1)
X_test_cat  = np.stack(test_codes, axis=1)

In [38]:
# Inspect the stacked categorical codes of the training split
# (one column per entry of cat_cols).
X_train_cat

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 1, 0],
       [2, 1, 1, ..., 0, 1, 0],
       ...,
       [2, 4, 0, ..., 2, 0, 0],
       [2, 7, 0, ..., 0, 0, 0],
       [2, 2, 2, ..., 0, 1, 0]], shape=(22792, 8))

In [37]:
# =========================
# 8. Save training arrays
# =========================
# All splits go into a single .npz archive; each key equals the name of the
# variable it stores. cat_sizes is converted to an int64 array for saving.
stage1_arrays = {
    "X_train_num": X_train_num,
    "X_val_num": X_val_num,
    "X_test_num": X_test_num,
    "X_train_cat": X_train_cat,
    "X_val_cat": X_val_cat,
    "X_test_cat": X_test_cat,
    "yA_train": yA_train,
    "yA_val": yA_val,
    "yA_test": yA_test,
    "yB_train": yB_train,
    "yB_val": yB_val,
    "yB_test": yB_test,
    "cat_sizes": np.array(cat_sizes, dtype=np.int64),
}
np.savez("adult_stage1.npz", **stage1_arrays)


In [39]:
# =========================
# 9. Save inference preproc
# =========================
# Persist the preproc dict (column lists, scaler mean/scale, categorical
# mappings and OOV indices) so the same transformations can be replayed later.
# The file is a pickle: only unpickle copies that come from a trusted source.
with open("adult_preproc.pkl", "wb") as f:
    pickle.dump(preproc, f)
