# Adult analysis

This notebook contains preliminary analysis of the Adult dataset.

In [None]:
import joblib
import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from cdei_helpers.adult_preprocess import preprocess
from cdei_helpers.finance import bin_hours_per_week
from cdei_helpers.plot import group_box_plots, group_roc_curves
from sklearn.neural_network import MLPClassifier

In [None]:
from cdei_helpers.export_plot import export_plot

In [None]:
# Root folder of the Adult dataset, relative to the notebook's working directory.
data_dir = Path("./data/adult")
# Run the project's preprocessing helper on that folder. The cells below read
# CSVs from its "processed" sub-folder -- presumably written by this call
# (TODO confirm in cdei_helpers.adult_preprocess).
preprocess(data_dir)

Load data.

In [None]:
# Plain train / validation / test splits, read in that order.
train, val, test = (
    pd.read_csv(data_dir / "processed" / filename)
    for filename in ("train.csv", "val.csv", "test.csv")
)

# One-hot encoded counterparts of the same three splits (used for model fitting).
train_oh, val_oh, test_oh = (
    pd.read_csv(data_dir / "processed" / filename)
    for filename in ("train-one-hot.csv", "val-one-hot.csv", "test-one-hot.csv")
)

## Bias in data

Visualise biases present in the training data.

In [None]:
# Mean of the `salary` column per value of `sex` in the training split.
salary_by_sex = train.groupby("sex")[["salary"]].mean()

# NOTE(review): the bar labels are positional; they assume the sorted groupby
# index puts the "Female" encoding before the "Male" one -- confirm against
# the preprocessing step.
fig_salary_by_sex = go.Figure(
    data=[go.Bar(x=["Female", "Male"], y=salary_by_sex.salary)],
    layout=go.Layout(yaxis=dict(range=[0, 1])),  # fixed 0-1 axis
)
fig_salary_by_sex.show()

In [None]:
# Hand the figure to the project's export helper under the given file name
# (output location and format are decided by export_plot).
export_plot(fig_salary_by_sex, "finance_salary_by_sex.json")

In [None]:
# Mean of the `salary` column per value of `race` in the training split.
salary_by_race = train.groupby("race")[["salary"]].mean()

# NOTE(review): these labels are positional; they assume the sorted groupby
# index lines up with this order -- confirm against the preprocessing step.
race_labels = [
    "American Indian / Eskimo",
    "Asian / Pacific Islander",
    "Black",
    "Other",
    "White",
]

fig_salary_by_race = go.Figure(
    data=[go.Bar(x=race_labels, y=salary_by_race.salary)],
    layout=go.Layout(yaxis=dict(range=[0, 1])),  # fixed 0-1 axis
)
fig_salary_by_race.show()

In [None]:
# Hand the figure to the project's export helper under the given file name
# (output location and format are decided by export_plot).
export_plot(fig_salary_by_race, "finance_salary_by_race.json")

### Possible resolving variables

Let's look at the relationship between hours per week and salary.

In [None]:
# Two-column frame: binned hours per week next to the salary column,
# both taken from the validation split.
hpw_and_salary = pd.DataFrame(
    {
        "hpw": val["hours_per_week"].map(bin_hours_per_week),
        "salary": val["salary"],
    }
)

# Mean and row count of `salary` per bin. Aggregating with a list keeps
# two-level columns, i.e. ("salary", "mean") and ("salary", "count").
salary_by_hours_per_week = (
    hpw_and_salary.groupby("hpw").agg(["mean", "count"]).reset_index()
)

In [None]:
# One bar per hours-per-week bin; bar height is the mean of `salary` in that bin.
hpw_bars = go.Bar(
    x=salary_by_hours_per_week.hpw,
    y=salary_by_hours_per_week["salary"]["mean"],
)

# Tick values 0-3 are relabelled with human-readable ranges.
# NOTE(review): assumes bin_hours_per_week yields exactly the codes 0..3 -- confirm.
hpw_layout = dict(
    title="Proportion of salary > $50k by hours worked per week",
    yaxis=dict(range=[0, 1], title="Proportion salary > $50k"),
    xaxis=dict(
        tickvals=[0, 1, 2, 3],
        ticktext=["<30hrs", "30-40hrs", "40-50hrs", ">50hrs"],
    ),
)

fig_salary_by_hours_per_week = go.Figure(data=[hpw_bars], layout=hpw_layout)
fig_salary_by_hours_per_week.show()

In [None]:
# Hand the figure to the project's export helper under the given file name.
export_plot(fig_salary_by_hours_per_week, "finance_salary_by_hours_per_week.json")

## Training a model to predict salary

In [None]:
# Fixed seed so that a Restart & Run All reproduces the same model: per the
# scikit-learn docs, MLPClassifier's random_state governs weight initialisation,
# the internal hold-out split used by early stopping, and batch sampling.
# Without it the accuracy, every fairness plot below and the saved baseline
# change from run to run.
RANDOM_STATE = 0

# Feature matrices: every one-hot column except the `salary` target.
# Built once and reused instead of re-dropping the column for each call.
train_features = train_oh.drop(columns="salary")
val_features = val_oh.drop(columns="salary")

model = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    early_stopping=True,
    random_state=RANDOM_STATE,
)
model.fit(train_features, train_oh.salary)

# Column 1 of predict_proba is the probability of the second entry in
# model.classes_ (the positive salary class for a 0/1 target).
val_prob = model.predict_proba(val_features)[:, 1]
val_accuracy = model.score(val_features, val_oh.salary)
print(f"Validation accuracy: {val_accuracy * 100:.2f}%")

## Demographic parity

Distribution of outcomes for different sexes.

In [None]:
# Human-readable sex label per validation row (truthy -> "Male", falsy -> "Female").
sex_labels = val["sex"].map(lambda is_male: "Male" if is_male else "Female")

# The second argument is all zeros, so every row carries the same group value
# and the single group gets an empty display name.
# NOTE(review): argument semantics inferred from usage -- confirm against
# cdei_helpers.plot.group_box_plots.
fig_dp_by_sex = group_box_plots(
    val_prob, np.zeros_like(val_prob), sex_labels, group_names=[""]
)
fig_dp_by_sex.show()

In [None]:
# Hand the figure to the project's export helper under the given file name.
export_plot(fig_dp_by_sex, "finance_dp_by_sex.json")

Distribution of outcomes for different races.

In [None]:
# Predicted probabilities split by race. The all-zero second argument gives
# every row the same group value, displayed under the name "Race".
fig_dp_by_race = group_box_plots(
    val_prob,
    np.zeros_like(val_prob),
    val["race"],
    group_names=["Race"],
)
fig_dp_by_race.show()

In [None]:
# Hand the figure to the project's export helper under the given file name.
export_plot(fig_dp_by_race, "finance_dp_by_race.json")

## Conditional demographic parity

Distribution by sex and hours worked per week.

In [None]:
# Hours-per-week bin for each validation row (also reused by the race plot below).
val_hpw_enum = val["hours_per_week"].map(bin_hours_per_week)

# Human-readable sex label per validation row (truthy -> "Male", falsy -> "Female").
sex_labels = val["sex"].map(lambda is_male: "Male" if is_male else "Female")

# Predicted probabilities grouped by hours bin and split by sex.
# NOTE(review): the four group names presumably correspond to bin codes 0..3
# in order -- confirm against bin_hours_per_week / group_box_plots.
hpw_group_names = ["<30", "30-40", "40-50", ">50"]
fig_cdp_by_sex = group_box_plots(
    val_prob, val_hpw_enum, sex_labels, group_names=hpw_group_names
)
fig_cdp_by_sex.show()

In [None]:
# Hand the figure to the project's export helper under the given file name.
export_plot(fig_cdp_by_sex, "finance_cdp_by_sex.json")

Distribution by race and hours worked per week.

In [None]:
# Predicted probabilities grouped by hours bin and split by race.
# NOTE(review): the four group names presumably correspond to bin codes 0..3
# in order -- confirm against bin_hours_per_week / group_box_plots.
hpw_group_names = ["<30", "30-40", "40-50", ">50"]
fig_cdp_by_race = group_box_plots(
    val_prob, val_hpw_enum, val["race"], group_names=hpw_group_names
)
fig_cdp_by_race.show()

In [None]:
# Hand the figure to the project's export helper under the given file name.
export_plot(fig_cdp_by_race, "finance_cdp_by_race.json")

## Equalised odds

ROC curves by sex.

In [None]:
# Human-readable sex label per validation row (truthy -> "Male", falsy -> "Female").
sex_labels = val["sex"].map(lambda is_male: "Male" if is_male else "Female")

# ROC curves from the true salary labels and predicted probabilities,
# one per sex label.
fig_eo_by_sex = group_roc_curves(val["salary"], val_prob, sex_labels)
fig_eo_by_sex.show()

In [None]:
# Hand the figure to the project's export helper under the given file name
# (output location and format are decided by export_plot).
export_plot(fig_eo_by_sex, "finance_eo_by_sex.json")

ROC curves by race.

In [None]:
# ROC curves from the true salary labels and predicted probabilities,
# one per value of `race`.
fig_eo_by_race = group_roc_curves(
    val["salary"],
    val_prob,
    val["race"],
)
fig_eo_by_race.show()

In [None]:
# Hand the figure to the project's export helper under the given file name
# (output location and format are decided by export_plot).
export_plot(fig_eo_by_race, "finance_eo_by_race.json")

## Save model for later comparison

In [None]:
# Create the output directory if it is missing. exist_ok=True makes the cell
# safe to re-run and replaces the hand-written try/except (whose bare
# `except: raise` branch was a no-op). Unlike the old blanket swallow of
# FileExistsError, this still raises if "../artifacts" exists but is a
# regular file, so the problem surfaces here rather than inside joblib.dump.
os.makedirs("../artifacts", exist_ok=True)

# Persist the fitted baseline classifier so later work can load and compare it.
joblib.dump(model, "../artifacts/adult_baseline.pkl")