# Adult data analysis

In this notebook we carry out some basic analysis of the Adult dataset to understand the biases already present in the data before training a model.

In [None]:
from pathlib import Path

import pandas as pd
from helpers.finance import bin_hours_per_week
from helpers.plot import bar_chart

Directory where all artifacts, including the processed data, are stored.

In [None]:
# Root of the artifacts folder. The path is relative, so it is resolved against
# the notebook's current working directory when files are read.
artifacts_dir = Path("../../artifacts")

Load the preprocessed data. Check out the preprocessing notebook for details on how this data was obtained.

In [None]:
# Everything for the Adult dataset lives under <artifacts>/data/adult; the
# preprocessed CSV splits sit in its "processed" subfolder.
data_dir = artifacts_dir / "data" / "adult"
processed_dir = data_dir / "processed"

# Train / validation / test splits.
train, val, test = (
    pd.read_csv(processed_dir / filename)
    for filename in ("train.csv", "val.csv", "test.csv")
)

# The same three splits in their one-hot variant (going by the file names).
train_oh, val_oh, test_oh = (
    pd.read_csv(processed_dir / filename)
    for filename in ("train-one-hot.csv", "val-one-hot.csv", "test-one-hot.csv")
)

## Bias in data

Visualise biases present in the training data: the proportion of high earners by sex and by race.

In [None]:
# Mean of `salary` within each `sex` group of the training split. The chart
# presents this as the proportion of high earners, which presumably relies on
# `salary` being a 0/1 indicator — TODO confirm against the preprocessing notebook.
salary_by_sex = train.groupby("sex")[["salary"]].mean()

# NOTE(review): the tick labels are hard-coded and presumably paired with the
# grouped values by position, i.e. they assume the groups come out as Female,
# then Male — verify against the encoding of `sex`.
fig_salary_by_sex = bar_chart(
    title="Proportion of high earners by sex",
    xlabel="Sex",
    ylabel="Proportion of high earners",
    x=["Female", "Male"],
    y=salary_by_sex["salary"],
)
fig_salary_by_sex

In [None]:
# Mean of `salary` within each `race` group of the training split. The chart
# presents this as the proportion of high earners, which presumably relies on
# `salary` being a 0/1 indicator — TODO confirm against the preprocessing notebook.
salary_by_race = train.groupby("race")[["salary"]].mean()

# NOTE(review): the tick labels are hard-coded and presumably paired with the
# grouped values by position, so they assume the groups come out in exactly
# this order — verify against the encoding of `race`.
fig_salary_by_race = bar_chart(
    title="Proportion of high earners by race",
    xlabel="Race",
    ylabel="Proportion of high earners",
    x=[
        "American Indian / Eskimo",
        "Asian / Pacific Islander",
        "Black",
        "Other",
        "White",
    ],
    y=salary_by_race["salary"],
)
fig_salary_by_race

### Possible resolving variables

Let's look at the relationship between hours per week and salary.

In [None]:
# Bucket every row's hours-per-week value with `bin_hours_per_week`, then
# compute the mean of `salary` and the row count for each bucket. Aggregating
# with a list of functions yields two-level columns: ("salary", "mean") and
# ("salary", "count").
# NOTE(review): this cell reads the validation split (`val`), whereas the
# sex/race charts read `train` — confirm that this is intentional.
salary_by_hours_per_week = (
    val.assign(hpw=lambda frame: frame.hours_per_week.map(bin_hours_per_week))
    .groupby("hpw")[["salary"]]
    .agg(["mean", "count"])
    .reset_index()
)

# NOTE(review): the tick values/labels assume the buckets are encoded as the
# integers 0..3 in increasing order of hours — verify against
# `bin_hours_per_week`.
fig_salary_by_hours_per_week = bar_chart(
    title="Proportion of high earners by hours worked per week",
    xlabel="Hours worked per week",
    ylabel="Proportion of high earners",
    x=salary_by_hours_per_week["hpw"],
    y=salary_by_hours_per_week.salary["mean"],
    xticks={
        "tickvals": [0, 1, 2, 3],
        "ticktext": ["0-30", "30-40", "40-50", "50+"],
    },
)
fig_salary_by_hours_per_week