# Baseline model for recruiting data

In this notebook we train a simple model on the synthetic recruiting data that can serve as a counterfactual for what would have happened if we hadn't made any kind of fairness intervention.

In [None]:
import joblib
from pathlib import Path

import numpy as np
import pandas as pd
from helpers.finance import bin_hours_per_week
from helpers.plot import group_box_plots
from sklearn.neural_network import MLPClassifier  # noqa

In [None]:
from helpers import export_plot

Root directory for project artifacts; both the preprocessed data and the pretrained model are read from beneath it.

In [None]:
# Root of the artifacts tree; the data and model paths used later are built from it.
artifacts_dir = Path("../../artifacts")

In [None]:
# Override artifacts_dir for the source notebook, which needs one extra "..".
# This cell is stripped out for the hosted notebooks.
artifacts_dir = Path("../../../artifacts")

Load the preprocessed data. Check out the preprocessing notebook for details on how this data was obtained.

In [None]:
# NOTE(review): the data is read from the "recruiting" folder while the model
# loaded later comes from "models/finance" — confirm these belong together.
data_dir = artifacts_dir / "data" / "recruiting"
processed_dir = data_dir / "processed"

# Read the three preprocessed splits in a fixed order: train, validation, test.
train, val, test = (
    pd.read_csv(processed_dir / filename)
    for filename in ("train.csv", "val.csv", "test.csv")
)

## Training a model to predict salary

We load a pretrained model from disk so that results are reproducible. The code we used to train it is shown, commented out, in the next cell.

In [None]:
# Training code kept for reference only; it is not executed in this notebook.
# NOTE(review): the features drop `employed_yes` but the target is `train.salary`,
# while the validation cell drops `salary` — confirm which column is the label.
# model = MLPClassifier(hidden_layer_sizes=(100, 100), early_stopping=True)

# model.fit(train.drop(columns="employed_yes"), train.salary)

Load the pretrained model

In [None]:
# Location of the serialized baseline classifier inside the artifacts tree.
model_path = artifacts_dir / "models" / "finance" / "baseline.pkl"

# joblib files are pickle-based, so only load artifacts from a trusted source.
model = joblib.load(model_path)

Model accuracy on the validation set.

In [None]:
# `val_oh` was never defined in this notebook, so this cell raised a NameError on
# a fresh kernel. Use the `val` frame loaded from disk, mirroring the `train`
# frame used in the training snippet.
# NOTE(review): assumes `val` holds exactly the feature columns the pretrained
# model was fit on — confirm, or load the matching encoded frame instead.
val_features = val.drop(columns="salary")

# Column 1 of predict_proba is the probability of the second class in model.classes_.
val_prob = model.predict_proba(val_features)[:, 1]
val_accuracy = model.score(val_features, val.salary)
print(f"Validation accuracy: {val_accuracy * 100:.2f}%")

## Demographic parity

Distribution of outcomes for different sexes.

In [None]:
# Truthy `sex` values are labelled "Male", everything else "Female".
sex_labels = val.sex.map(lambda is_male: "Male" if is_male else "Female")

# Every row gets the same group value (0); `group_names` carries one empty label.
single_group = np.zeros_like(val_prob)

fig_dp_by_sex = group_box_plots(val_prob, single_group, sex_labels, group_names=[""])
fig_dp_by_sex

In [None]:
# Hand the figure to the project's export helper under a fixed file name.
export_plot(fig_dp_by_sex, "bl_dp_by_sex.json")

Distribution of outcomes for different races.

In [None]:
# Distinct race values present in the validation split.
set(val["race"])

In [None]:
# Display names for the raw race codes; reused by the later race plots.
race_names = {
    "amer_indian_eskimo": "American Indian / Eskimo",
    "asian_pac_islander": "Asian / Pacific Islander",
    "black": "Black",
    "other": "Other",
    "white": "White",
}

# Codes missing from `race_names` become NaN under Series.map.
race_labels = val.race.map(race_names)

# Every row gets the same group value (0), labelled "Race".
single_group = np.zeros_like(val_prob)

fig_dp_by_race = group_box_plots(
    val_prob, single_group, race_labels, group_names=["Race"]
)
fig_dp_by_race

In [None]:
# Hand the figure to the project's export helper under a fixed file name.
export_plot(fig_dp_by_race, "bl_dp_by_race.json")

## Conditional demographic parity

Distribution by sex and hours worked per week.

In [None]:
# Bin each row's hours-per-week with the project helper; reused by the race plot.
val_hpw_enum = val["hours_per_week"].map(bin_hours_per_week)

# Truthy `sex` values are labelled "Male", everything else "Female".
sex_labels = val.sex.map(lambda is_male: "Male" if is_male else "Female")

# One label per hours-per-week bin.
hpw_bin_names = ["<30", "30-40", "40-50", ">50"]

fig_cdp_by_sex = group_box_plots(
    val_prob, val_hpw_enum, sex_labels, group_names=hpw_bin_names
)
fig_cdp_by_sex

In [None]:
# Hand the figure to the project's export helper under a fixed file name.
export_plot(fig_cdp_by_sex, "bl_cdp_by_sex.json")

Distribution by race and hours worked per week.

In [None]:
# Same hours-per-week bins as the sex plot, split by race display name.
race_labels = val.race.map(race_names)
hpw_bin_names = ["<30", "30-40", "40-50", ">50"]

fig_cdp_by_race = group_box_plots(
    val_prob, val_hpw_enum, race_labels, group_names=hpw_bin_names
)
fig_cdp_by_race

In [None]:
# Hand the figure to the project's export helper under a fixed file name.
export_plot(fig_cdp_by_race, "bl_cdp_by_race.json")

## Equalised odds

To assess equalised odds we compare scores across the outcome classes.

In [None]:
# Truthy `sex` values are labelled "Male", everything else "Female".
sex_labels = val.sex.map(lambda is_male: "Male" if is_male else "Female")

# Group by the observed `salary` column; one label per outcome class.
outcome_names = ["<=$50k", ">$50k"]

fig_eo_by_sex = group_box_plots(
    val_prob, val.salary, sex_labels, group_names=outcome_names
)
fig_eo_by_sex

In [None]:
# Hand the figure to the project's export helper under a fixed file name.
export_plot(fig_eo_by_sex, "bl_eo_by_sex.json")

We do the same, comparing races.

In [None]:
# Group by the observed `salary` column and split by race display name.
race_labels = val.race.map(race_names)
outcome_names = ["<=$50k", ">$50k"]

fig_eo_by_race = group_box_plots(
    val_prob, val.salary, race_labels, group_names=outcome_names
)
fig_eo_by_race

In [None]:
# Hand the figure to the project's export helper under a fixed file name.
export_plot(fig_eo_by_race, "bl_eo_by_race.json")