# Adult analysis

This notebook contains preliminary analysis of the Adult dataset.

In [None]:
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from cdei_helpers.plot import group_box_plots, group_roc_curves
from sklearn.neural_network import MLPClassifier

In [None]:
# Root directory of the Adult dataset.
# The default is the original platform-specific location; set the
# ADULT_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
data_dir = Path(os.environ.get("ADULT_DATA_DIR", "/project/data/adult"))

Load data.

In [None]:
processed_dir = data_dir / "processed"

# Processed splits as stored in train.csv / val.csv / test.csv.
train, val, test = (
    pd.read_csv(processed_dir / f"{split}.csv") for split in ("train", "val", "test")
)

# The `*-one-hot.csv` variants; these are the frames the model is fitted and
# scored on further down.
train_oh, val_oh, test_oh = (
    pd.read_csv(processed_dir / f"{split}-one-hot.csv")
    for split in ("train", "val", "test")
)

## Bias in data

Visualise biases present in the training data.

In [None]:
# Mean of `salary` within each `sex` group of the training split.
salary_by_sex = train.groupby("sex")[["salary"]].mean()

# Bar labels are given by hand in the order the groups come out of groupby;
# the y-axis is pinned to [0, 1].
sex_bar = go.Bar(x=["Female", "Male"], y=salary_by_sex["salary"])
go.Figure(data=[sex_bar], layout=go.Layout(yaxis={"range": [0, 1]}))

In [None]:
# Mean of `salary` within each `race` group of the training split.
salary_by_race = train.groupby("race")[["salary"]].mean()

# Display names for the bars, supplied by hand.
# NOTE(review): this assumes the groupby result has exactly these five groups
# in this order - confirm against the encoding of `race`.
race_labels = [
    "American Indian / Eskimo",
    "Asian / Pacific Islander",
    "Black",
    "Other",
    "White",
]

race_bar = go.Bar(x=race_labels, y=salary_by_race["salary"])
go.Figure(data=[race_bar], layout=go.Layout(yaxis={"range": [0, 1]}))

### Possible resolving variables

Let's look at the relationship between hours per week and salary.

In [None]:
# For every distinct hours_per_week value in the validation split, compute the
# mean of `salary` and the number of rows. The aggregated columns are
# two-level: ("salary", "mean") and ("salary", "count").
salary_by_hours_per_week = (
    val.groupby("hours_per_week")[["salary"]].agg(["mean", "count"]).reset_index()
)

In [None]:
hours = salary_by_hours_per_week["hours_per_week"]
salary_stats = salary_by_hours_per_week["salary"]

# One marker per hours-per-week value; marker size is the row count divided
# by 50.
go.Figure(
    data=go.Scatter(
        x=hours,
        y=salary_stats["mean"],
        mode="markers",
        marker={"size": salary_stats["count"] / 50},
    )
)

## Training a model to predict salary

In [None]:
# Seed the classifier so that weight initialisation, batch sampling and the
# internal early-stopping hold-out split are the same on every run; without
# it, the accuracy and every fairness plot below change between kernel
# restarts.
RANDOM_STATE = 42

model = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    early_stopping=True,
    random_state=RANDOM_STATE,
)
model.fit(train_oh.drop(columns="salary"), train_oh.salary)

# Build the validation feature frame once and reuse it for both calls.
val_features = val_oh.drop(columns="salary")

# Column 1 of predict_proba is the probability of the second entry of
# `model.classes_`.
val_prob = model.predict_proba(val_features)[:, 1]
val_accuracy = model.score(val_features, val_oh.salary)
print(f"Validation accuracy: {val_accuracy * 100:.2f}%")

## Demographic parity

Distribution of outcomes for different sexes.

In [None]:
# One box of predicted probabilities per value of `sex` (0 and 1).
sex_boxes = []
for sex, label in enumerate(("Female", "Male")):
    sex_boxes.append(
        go.Box(
            x=[sex] * (val.sex == sex).sum(),
            y=val_prob[val.sex == sex],
            name=label,
        )
    )

go.Figure(data=sex_boxes)

Distribution of outcomes for different races.

In [None]:
# One box of predicted probabilities per race group.
# The groups are sorted because iterating a bare set has no guaranteed order
# (and for string labels it can differ between kernel sessions), which would
# shuffle the boxes and legend from run to run.
race_boxes = []
for race in sorted(set(val.race)):
    in_group = val.race == race  # boolean mask, computed once per group
    race_boxes.append(
        go.Box(
            x=[race] * in_group.sum(),
            y=val_prob[in_group],
            name=race,
        )
    )

go.Figure(data=race_boxes)

## Conditional demographic parity

Distribution by sex and hours worked per week.

In [None]:
def enumerate_hours_per_week(hpw):
    """Map an hours-per-week value to an ordinal bucket index.

    Buckets, by inclusive upper bound:
        0: hpw <= 30
        1: 30 < hpw <= 40
        2: 40 < hpw <= 50
        3: everything else (including values that compare False to every
           bound, such as NaN)
    """
    upper_bounds = (30, 40, 50)
    for bucket, upper in enumerate(upper_bounds):
        if hpw <= upper:
            return bucket
    return len(upper_bounds)


# Bucket index (0-3) of hours worked per week for every validation row.
val_hpw_enum = val["hours_per_week"].map(enumerate_hours_per_week)

# Readable group labels: truthy `sex` values become "Male", falsy "Female".
sex_labels = val["sex"].map(lambda is_male: "Male" if is_male else "Female")

group_box_plots(val_prob, val_hpw_enum, sex_labels)

Distribution by race and hours worked per week.

In [None]:
# Same hours-per-week buckets as the plot by sex, split by `race` instead.
group_box_plots(val_prob, val_hpw_enum, val["race"])

## Equalised odds

ROC curves by sex

In [None]:
# Readable group labels: truthy `sex` values become "Male", falsy "Female".
sex_labels = val["sex"].map(lambda is_male: "Male" if is_male else "Female")

# Validation labels and predicted probabilities, grouped by sex.
group_roc_curves(val["salary"], val_prob, sex_labels)

ROC curves by race

In [None]:
# Validation labels and predicted probabilities, grouped by race.
group_roc_curves(val["salary"], val_prob, val["race"])

## Save model for later comparison

In [None]:
# Persist the fitted baseline inside the dataset directory. The path is built
# from `data_dir` rather than repeated as a literal, so the save location
# follows the data location if that ever changes.
joblib.dump(model, data_dir / "baseline.pkl")