# Adult data analysis

In this notebook we carry out some basic analysis of the Adult dataset to understand the biases already present in the data before training a model.

In [1]:
from pathlib import Path

import pandas as pd
import plotly.graph_objs as go
from helpers.finance import bin_hours_per_week

In [2]:
from helpers import export_plot

Directory under which all processed data is saved; the data used in this notebook is loaded from here.

In [3]:
# Root of the artifacts tree, given as a path relative to the current working
# directory; the data directory used below is derived from it.
artifacts_dir = Path("../../artifacts")

In [4]:
# override data_dir in source notebook
# this is stripped out for the hosted notebooks
# NOTE(review): the name actually overridden here is `artifacts_dir`, not
# `data_dir`; the new value has one more leading ".." than the value assigned
# in the previous cell.
artifacts_dir = Path("../../../artifacts")

Load the preprocessed data. Check out the preprocessing notebook for details on how this data was obtained.

In [5]:
data_dir = artifacts_dir / "data" / "adult"
# Every file read below lives in the same "processed" sub-directory.
processed_dir = data_dir / "processed"

# Train / validation / test splits.
train = pd.read_csv(processed_dir / "train.csv")
val = pd.read_csv(processed_dir / "val.csv")
test = pd.read_csv(processed_dir / "test.csv")

# Files named "*-one-hot.csv" -- presumably one-hot encoded versions of the
# same three splits; confirm against the preprocessing notebook.
train_oh = pd.read_csv(processed_dir / "train-one-hot.csv")
val_oh = pd.read_csv(processed_dir / "val-one-hot.csv")
test_oh = pd.read_csv(processed_dir / "test-one-hot.csv")

## Bias in data

Visualise the biases present in the data by comparing the proportion of high earners across sex and across race.

In [8]:
# Mean of the `salary` column within each `sex` group of the training split;
# the figure presents this mean as the proportion of high earners.
salary_by_sex = train.groupby("sex")[["salary"]].mean()

# NOTE(review): the x labels are positional. groupby sorts the group keys by
# default, so this assumes the sorted `sex` values correspond to
# Female, Male in that order -- confirm against the preprocessing step.
sex_bars = go.Bar(x=["Female", "Male"], y=salary_by_sex["salary"])
sex_layout = go.Layout(
    title="Proportion of high earners by sex",
    xaxis={"title": "Sex"},
    # Fix the y axis to the full [0, 1] range.
    yaxis={"range": [0, 1], "title": "Proportion of high earners"},
)

fig_salary_by_sex = go.Figure(data=[sex_bars], layout=sex_layout)
fig_salary_by_sex

In [None]:
# Pass the figure to the project's `export_plot` helper under this file name
# (presumably serialised as JSON, judging by the extension -- see `helpers`).
export_plot(fig_salary_by_sex, "salary_by_sex.json")

In [9]:
# Mean of the `salary` column within each `race` group of the training split;
# the figure presents this mean as the proportion of high earners.
salary_by_race = train.groupby("race")[["salary"]].mean()

# NOTE(review): these labels are positional. groupby sorts the group keys by
# default, so this assumes the sorted `race` values correspond to the labels
# in exactly this order, and that all five groups are present -- confirm
# against the preprocessing step.
race_labels = [
    "American Indian / Eskimo",
    "Asian / Pacific Islander",
    "Black",
    "Other",
    "White",
]
race_bars = go.Bar(x=race_labels, y=salary_by_race["salary"])
race_layout = go.Layout(
    title="Proportion of high earners by race",
    xaxis={"title": "Race"},
    # Fix the y axis to the full [0, 1] range.
    yaxis={"range": [0, 1], "title": "Proportion of high earners"},
)

fig_salary_by_race = go.Figure(data=[race_bars], layout=race_layout)
fig_salary_by_race

In [None]:
# Pass the figure to the project's `export_plot` helper under this file name
# (presumably serialised as JSON, judging by the extension -- see `helpers`).
export_plot(fig_salary_by_race, "salary_by_race.json")

### Possible resolving variables

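A resolving variable is one that may legitimately explain part of the difference in outcomes between groups. As one candidate, let's look at the relationship between hours worked per week and salary.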

In [11]:
# Bin each row's hours-per-week value with the project helper, then compute the
# mean and row count of `salary` within each bin.
# NOTE(review): this cell aggregates the validation split (`val`), whereas the
# sex and race plots above use `train` -- confirm that this is intentional.
hours_binned = val.assign(hpw=val["hours_per_week"].map(bin_hours_per_week))
salary_by_hours_per_week = (
    hours_binned[["hpw", "salary"]]
    .groupby("hpw")
    .agg(["mean", "count"])
    .reset_index()
)

# Only the per-bin mean is plotted; the count column is kept in the frame.
hours_bars = go.Bar(
    x=salary_by_hours_per_week["hpw"],
    y=salary_by_hours_per_week["salary"]["mean"],
)
hours_layout = {
    "title": "Proportion of high earners by hours worked per week",
    "yaxis": {"range": [0, 1], "title": "Proportion of high earners"},
    "xaxis": {
        # NOTE(review): assumes `bin_hours_per_week` yields the bin codes
        # 0, 1, 2, 3 matching these four labels -- confirm in helpers.finance.
        "tickvals": [0, 1, 2, 3],
        "ticktext": ["0-30", "30-40", "40-50", "50+"],
        "title": "Hours worked per week",
    },
}

fig_salary_by_hours_per_week = go.Figure(data=[hours_bars], layout=hours_layout)
fig_salary_by_hours_per_week

In [None]:
# Pass the figure to the project's `export_plot` helper under this file name.
export_plot(fig_salary_by_hours_per_week, "salary_by_hours_per_week.json")