# Recruiting data analysis

In this notebook we do some basic analysis of the synthetic recruiting data, to understand existing biases before training a model.

In [None]:
from pathlib import Path

import pandas as pd
import plotly.graph_objs as go
from helpers.recruiting import bin_years_experience

Directory for saving all processed data.

In [None]:
# Root of the artifacts tree, given relative to the current working directory
# (two levels up, then into "artifacts").
artifacts_dir = Path("..", "..", "artifacts")

Load the preprocessed data. Check out the preprocessing notebook for details on how this data was obtained.

In [None]:
# All recruiting data lives under <artifacts>/data/recruiting.
data_dir = artifacts_dir.joinpath("data", "recruiting")

# Load the test split twice: once after preprocessing and once in its raw
# form, both from a file called "test.csv" in the matching sub-directory.
test, test_raw = (
    pd.read_csv(data_dir / variant / "test.csv")
    for variant in ("processed", "raw")
)

## Bias in data

Visualise the biases present in the data by comparing the proportion of applicants employed across sex and race.

In [None]:
# Mean of the employed_yes column within each sex_male group, i.e. the
# proportion of applicants employed per group.
employed_by_sex = test[["sex_male", "employed_yes"]].groupby("sex_male").mean()

# Derive the bar labels from the group keys rather than from their position,
# so a bar can never be paired with the wrong label (e.g. if one group is
# absent from the data).
# NOTE(review): assumes sex_male is a 0/1 (or boolean) indicator - confirm
# against the preprocessing notebook.
sex_labels = {0: "Female", 1: "Male"}

fig_employed_by_sex = go.Figure(
    [
        go.Bar(
            x=[sex_labels[int(flag)] for flag in employed_by_sex.index],
            y=employed_by_sex.employed_yes,
        )
    ],
    go.Layout(
        # Fix the y-axis to the full [0, 1] range of a proportion.
        yaxis={"range": [0, 1], "title": "Proportion"},
        title="Proportion of applicants employed by sex",
    ),
)
fig_employed_by_sex

In [None]:
# Mean of the employed_yes column within each race_white group, i.e. the
# proportion of applicants employed per group.
employed_by_race = (
    test[["race_white", "employed_yes"]].groupby("race_white").mean()
)

# Derive the bar labels from the group keys rather than from their position,
# so a bar can never be paired with the wrong label (e.g. if one group is
# absent from the data).
# NOTE(review): assumes race_white is a 0/1 (or boolean) indicator and that
# 0 corresponds to "Black" - confirm against the preprocessing notebook.
race_labels = {0: "Black", 1: "White"}

fig_employed_by_race = go.Figure(
    [
        go.Bar(
            x=[race_labels[int(flag)] for flag in employed_by_race.index],
            y=employed_by_race.employed_yes,
        )
    ],
    go.Layout(
        # Fix the y-axis to the full [0, 1] range of a proportion.
        yaxis={"range": [0, 1], "title": "Proportion"},
        title="Proportion of applicants employed by race",
    ),
)
fig_employed_by_race

### Possible resolving variables

Let's look at the relationship between years of experience and whether the applicant was employed.

In [None]:
# `assign` aligns the new column on the row index, so the processed and raw
# test sets need to describe the same rows in the same order. A length
# mismatch would otherwise silently yield missing or misattributed bins.
assert len(test) == len(test_raw), (
    "processed and raw test sets differ in length; "
    "cannot align years_experience with employed_yes"
)

# Bin the raw years of experience, then compute for each bin the proportion
# employed (mean of employed_yes) and the number of applicants (count).
employed_by_experience = (
    test.assign(exp=test_raw.years_experience.map(bin_years_experience))
    .loc[:, ["exp", "employed_yes"]]
    .groupby("exp")
    .aggregate(["mean", "count"])
    .reset_index()
)

fig_employed_by_experience = go.Figure(
    data=[
        go.Bar(
            x=employed_by_experience.exp,
            # Aggregating with a list of functions produces two-level
            # (column, statistic) headers; select with the full key instead
            # of chained indexing.
            y=employed_by_experience[("employed_yes", "mean")],
        )
    ],
    layout={
        "title": "Proportion of applicants employed by years of experience",
        "yaxis": {"range": [0, 1], "title": "Proportion"},
        # NOTE(review): assumes bin_years_experience returns the codes 0-3 in
        # increasing order of experience - confirm against helpers.recruiting.
        "xaxis": {
            "tickvals": [0, 1, 2, 3],
            "ticktext": ["0-2 years", "3-5 years", "6-9 years", "10+ years"],
        },
    },
)
fig_employed_by_experience