# Synthetic recruiting data

In [None]:
import itertools
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import tensorflow as tf
from helpers import fairness_measures
from scipy import stats
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## High level: What shall we include?

- referred or not
- number of career years relevant for the job
- whether they went to a Russell Group university
- honours degree
- GCSE scores
- A-levels
- existing income
- sex
- race
- quality of written cv
- years of volunteering experience
- years of holes in cv
- level of IT skills
- whether employed or not

### Number of required data points

In [None]:
# Number of synthetic instances (rows) to sample.
n = 25_000

### Sample sex and race by coin flipping

Sample each of the two features independently for every instance from a fair Bernoulli trial (p = 0.5).

In [None]:
# Start from an empty frame; the cells below add one sampled column at a time.
df = pd.DataFrame()

In [None]:
# Seed NumPy's global generator so that the sampled data set (and therefore
# the CSV files written further down) is identical on every run. The
# train/val/test split already uses a fixed random_state of 42.
np.random.seed(42)

# Protected attributes: one fair coin flip per instance and attribute.
df["sex_male"] = np.random.binomial(1, 0.5, n)
df["race_white"] = np.random.binomial(1, 0.5, n)

### Sample labels given sex and race

We sample labels via Bernoulli distributions, thereby introducing unfairness, i.e., success probability is biased with respect to sex and race.

In [None]:
# Success probability of the Bernoulli label draw for each
# (sex_male, race_white) group. The probabilities differ between groups on
# purpose: this is where the bias enters the labels.
employment_prob = {
    (1, 1): 0.7,  # White male
    (0, 1): 0.5,  # White female
    (1, 0): 0.45,  # Black male
    (0, 0): 0.25,  # Black female
}

# Integer placeholder; every row is overwritten in the loop because the four
# groups cover all combinations of the two binary attributes. An empty-string
# placeholder would create a text column that then has to hold integers.
df = df.assign(employed_yes=0)

for i in range(2):
    for j in range(2):
        indices = (df["sex_male"] == i) & (df["race_white"] == j)
        p = employment_prob[(i, j)]
        # indices.sum() counts the group members without looping in Python.
        df.loc[indices, "employed_yes"] = np.random.binomial(
            1, p, indices.sum()
        )

### Sample number of years of experience 
Given the binary label and sex, sample the years of relevant working experience from a Poisson distribution.

In [None]:
# Visual check: probability mass function of a Poisson distribution with
# mean 0.5, evaluated at the integers 0..16.
support = np.arange(17)
pmf_values = stats.poisson(0.5).pmf(support)
plt.plot(support, pmf_values, "bo", ms=5)

In [None]:
# Integer placeholder (every row is overwritten by the sampling loop below);
# an empty-string placeholder would create a text column that then has to
# hold counts.
df = df.assign(years_experience=0)

In [None]:
# Poisson mean of the years of relevant experience per employment label
# (0: not employed, 1: employed); men get an extra half year on top.
# Iteration order (employed_yes outer, sex_male inner) fixes the order of the
# random draws.
base_mean = {0: 2.0, 1: 4.35}

for employed, male in itertools.product(range(2), repeat=2):
    mask = (df["employed_yes"] == employed) & (df["sex_male"] == male)
    mu = base_mean[employed] + 0.5 * male
    df.loc[mask, "years_experience"] = np.random.poisson(mu, mask.sum())

### Experience categories

In [None]:
# Bucket the years of experience into half-open two-year bands
# ("0-2", "2-4", "4-6") plus an open-ended top band (">=6").
df = df.assign(experience_category="")
years = df["years_experience"]

for lower in (0, 2, 4):
    upper = lower + 2
    in_band = (years >= lower) & (years < upper)
    df.loc[in_band, "experience_category"] = f"{lower}-{upper}"

df.loc[years >= 6.0, "experience_category"] = ">=6"

### Referred 
Binary variable stating whether the applicant has been referred or not

In [None]:
# Integer placeholder (every row is overwritten by the sampling loop below);
# an empty-string placeholder would create a text column that then has to
# hold 0/1 flags.
df = df.assign(referred=0)

In [None]:
# Referral probability: a base rate per employment label (0: not employed,
# 1: employed) plus fixed increments for sex_male and race_white.
base_prob = {0: 0.1, 1: 0.52}

for employed, male, white in itertools.product(range(2), repeat=3):
    mask = (
        (df["employed_yes"] == employed)
        & (df["sex_male"] == male)
        & (df["race_white"] == white)
    )
    p = base_prob[employed] + 0.3 * male + 0.15 * white
    df.loc[mask, "referred"] = np.random.binomial(1, p, mask.sum())

### A levels

This feature depends on the employment label and on race. The GCSE scores sampled in the next section depend on the outcome of this feature.

In [None]:
# Integer placeholder (every row is overwritten by the sampling loop below);
# an empty-string placeholder would create a text column that then has to
# hold counts.
df = df.assign(a_levels=0)

In [None]:
# Number of A-levels: Binomial(4, p) where p has a base value per employment
# label (0: not employed, 1: employed) and is raised by 0.1 for race_white.
base_prob = {0: 0.4, 1: 0.625}

for employed, white in itertools.product(range(2), repeat=2):
    mask = (df["employed_yes"] == employed) & (df["race_white"] == white)
    p = base_prob[employed] + 0.1 * white
    df.loc[mask, "a_levels"] = np.random.binomial(4, p, mask.sum())

### Number of GCSE better than a C

Sample integer between 0 and 10 using a binomial distribution, reflecting the number of GCSEs better than a C. 

In [None]:
# Integer placeholder (every row is overwritten by the sampling loop below);
# an empty-string placeholder would create a text column that then has to
# hold counts.
df = df.assign(gcse=0)

In [None]:
# GCSE count: Binomial(10, p), with p growing by 0.1 per A-level
# (0.4 for no A-levels up to 0.8 for four).
for n_a_levels in range(5):
    mask = df["a_levels"] == n_a_levels
    p = 0.4 + 0.1 * n_a_levels
    df.loc[mask, "gcse"] = np.random.binomial(10, p, mask.sum())

# Disabled alternative: overwrite a random 10% of rows with Binomial(10, 0.5).
# indices = pd.Series(np.random.choice([True, False], size=n, p=[0.1, 0.9]))
# df.loc[indices, 'gcse'] = np.random.binomial(10, 0.5, sum(indices.values))

### Russell Group

Sample a binary variable indicating whether the applicant went to a Russell Group university. Only applicants with four A-levels can receive a 1, with a probability that depends on the employment label; everyone else gets 0.

In [None]:
# Integer placeholder (the next cell sets every row to 0 and then samples the
# flag for a subset); an empty-string placeholder would create a text column
# that then has to hold 0/1 flags.
df = df.assign(russel_group=0)

In [None]:
# Default: nobody attended a Russell Group university.
df.loc[range(n), "russel_group"] = 0

# Only rows with four A-levels get a Bernoulli draw; the success probability
# depends on the employment label (0: not employed, 1: employed).
for employed, p in ((0, 0.4), (1, 0.95)):
    mask = (df["a_levels"] == 4) & (df["employed_yes"] == employed)
    df.loc[mask, "russel_group"] = np.random.binomial(1, p, mask.sum())

### Honours degree

In [None]:
# Integer placeholder (every row is overwritten by the sampling loop below);
# an empty-string placeholder would create a text column that then has to
# hold 0/1 flags.
df = df.assign(honours=0)

In [None]:
# Honours degree: Bernoulli draw whose probability rises by 0.05 per A-level
# and is doubled for rows labelled employed.
for n_a_levels, employed in itertools.product(range(5), range(2)):
    mask = (df["a_levels"] == n_a_levels) & (df["employed_yes"] == employed)
    p = (0.05 + 0.05 * n_a_levels) * (1.0 + employed)
    df.loc[mask, "honours"] = np.random.binomial(1, p, mask.sum())

### Years of voluntary experience

In [None]:
# Integer placeholder (every row is overwritten by the sampling loop below);
# an empty-string placeholder would create a text column that then has to
# hold counts.
df = df.assign(years_volunteer=0)

In [None]:
# Years of volunteering: Poisson draw with a mean that depends only on the
# employment label (0: not employed, 1: employed).
for employed, mu in ((0, 0.5), (1, 0.75)):
    mask = df["employed_yes"] == employed
    df.loc[mask, "years_volunteer"] = np.random.poisson(mu, mask.sum())

In [None]:
# Square root of the sample size. Uses n rather than repeating the literal
# 25000 so the value follows the configured number of data points.
np.sqrt(n)

### Existing income

In [None]:
# Float placeholder (every row is overwritten by the sampling loop below);
# an empty-string placeholder would create a text column that then has to
# hold real-valued incomes.
df = df.assign(income=0.0)

In [None]:
# Income is drawn from a normal distribution with standard deviation 15000.
# Its mean is `scale` times a level made of a base value per employment label
# (0: not employed, 1: employed) plus 0.1 for each of sex_male, race_white,
# russel_group and honours.
scale = 2.5 * 1e4
base_level = {0: 1.0, 1: 1.4}

for employed, male, white, russell, honours in itertools.product(
    range(2), repeat=5
):
    mask = (
        (df["employed_yes"] == employed)
        & (df["sex_male"] == male)
        & (df["race_white"] == white)
        & (df["russel_group"] == russell)
        & (df["honours"] == honours)
    )
    mu = (
        base_level[employed]
        + 0.1 * male
        + 0.1 * white
        + 0.1 * russell
        + 0.1 * honours
    )
    df.loc[mask, "income"] = np.random.normal(mu * scale, 15000, mask.sum())

In [None]:
# The normal draw can go below zero; floor such incomes at 0.
df.loc[df["income"] < 0, "income"] = 0

### Level of IT-skills

In [None]:
# Integer placeholder (every row is overwritten by the sampling loop below);
# an empty-string placeholder would create a text column that then has to
# hold skill levels.
df = df.assign(it_skills=0)

In [None]:
# IT skill level: Binomial(3, p) where p has a base value per employment
# label (0: not employed, 1: employed) and is raised by 0.325 for sex_male.
base_prob = {0: 0.25, 1: 0.53}

for employed, male in itertools.product(range(2), repeat=2):
    mask = (df["employed_yes"] == employed) & (df["sex_male"] == male)
    p = base_prob[employed] + 0.325 * male
    df.loc[mask, "it_skills"] = np.random.binomial(3, p, mask.sum())

### Years of holes in cv

In [None]:
# Integer placeholder (every row is overwritten by the sampling loop below);
# an empty-string placeholder would create a text column that then has to
# hold counts.
df = df.assign(years_holes=0)

In [None]:
# Years of CV gaps: Poisson draw whose mean starts from a base value per
# employment label (0: not employed, 1: employed) and is reduced by 0.5 for
# sex_male and by 0.25 for race_white.
base_mean = {0: 1.0, 1: 0.75}

for employed, male, white in itertools.product(range(2), repeat=3):
    mask = (
        (df["employed_yes"] == employed)
        & (df["sex_male"] == male)
        & (df["race_white"] == white)
    )
    mu = base_mean[employed] - 0.5 * male - 0.25 * white
    df.loc[mask, "years_holes"] = np.random.poisson(mu, mask.sum())

### Quality of written cv

In [None]:
# Integer placeholder (every row is overwritten by the sampling loop below);
# an empty-string placeholder would create a text column that then has to
# hold quality levels.
df = df.assign(quality_cv=0)

In [None]:
# CV quality: Binomial(3, p) with p depending only on the employment label
# (0: not employed, 1: employed).
for employed, p in ((0, 0.5), (1, 0.7)):
    mask = df["employed_yes"] == employed
    df.loc[mask, "quality_cv"] = np.random.binomial(3, p, mask.sum())

### Relabel some of the disadvantaged groups from y=0 to y=1
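This is to introduce discrepancy in the ROC curves. The labels of women and of non-white applicants are re-drawn: a y=0 label becomes 1 with probability 0.1 (women) or 0.08 (non-white), and a y=1 label is kept with probability 0.98.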

In [None]:
# Women currently labelled 0: re-draw the label as Bernoulli(0.1).
women_label_0 = (df["sex_male"] == 0) & (df["employed_yes"] == 0)
df.loc[women_label_0, "employed_yes"] = np.random.binomial(
    1, 0.1, women_label_0.sum()
)

In [None]:
# Women currently labelled 1: re-draw the label as Bernoulli(0.98).
women_label_1 = (df["sex_male"] == 0) & (df["employed_yes"] == 1)
df.loc[women_label_1, "employed_yes"] = np.random.binomial(
    1, 0.98, women_label_1.sum()
)

In [None]:
# Rows with race_white == 0 currently labelled 0: re-draw as Bernoulli(0.08).
nonwhite_label_0 = (df["race_white"] == 0) & (df["employed_yes"] == 0)
df.loc[nonwhite_label_0, "employed_yes"] = np.random.binomial(
    1, 0.08, nonwhite_label_0.sum()
)

In [None]:
# Rows with race_white == 0 currently labelled 1: re-draw as Bernoulli(0.98).
nonwhite_label_1 = (df["race_white"] == 0) & (df["employed_yes"] == 1)
df.loc[nonwhite_label_1, "employed_yes"] = np.random.binomial(
    1, 0.98, nonwhite_label_1.sum()
)

## High-level checks of data set

In [None]:
# Fraction of rows whose label is 0 (one minus the share of positive labels).
labels = df["employed_yes"]
positive_share = labels.sum() / len(labels)
label_balance = 1.0 - positive_share
print("Label Balance =", label_balance)

## Train, val and test data

In [None]:
# 60/20/20 split: hold out 20% of the rows for testing, then a quarter of
# the remaining 80% for validation. Both splits use the same fixed seed.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(
    train_val_df, test_size=0.25, random_state=42
)

In [None]:
# Give every split a fresh 0..len-1 index. The validation split is reset as
# well so that all three frames are handled alike; reset_index returns a new
# frame, so each split is a standalone object before columns are reassigned
# during scaling.
test_df = test_df.reset_index(drop=True)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

## Save to file

In [None]:
# Root folder for generated artifacts, given relative to the working directory.
artifacts_dir = Path("../../artifacts")

In [None]:
# Folder for this data set inside the artifacts directory; the "raw" and
# "processed" CSV files are written beneath it.
data_dir = artifacts_dir / "data" / "recruiting"

In [None]:
# Write the unscaled splits. The target folder is created first (including
# missing parents) because to_csv does not create directories.
raw_dir = data_dir / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(raw_dir / "train.csv", index=False)
test_df.to_csv(raw_dir / "test.csv", index=False)
val_df.to_csv(raw_dir / "val.csv", index=False)

## Quick data pre-processing

In [None]:
# Keep the test set's experience categories: the column is dropped from the
# frames in the next cell but is used again for the resolving-variable plots.
test_exp_cat = test_df.experience_category

In [None]:
# Remove the string-valued category column from every frame; only numeric
# columns remain afterwards.
train_df, test_df, val_df, df = (
    frame.drop(columns="experience_category")
    for frame in (train_df, test_df, val_df, df)
)

### Scaling data

In [None]:
# Columns that get standardised; the remaining columns are 0/1 flags and are
# left as they are.
cts_features = [
    "a_levels",
    "gcse",
    "years_experience",
    "years_volunteer",
    "income",
    "it_skills",
    "years_holes",
    "quality_cv",
]

# Fit the scaler on the training split only, then apply the same
# transformation to all three splits.
ss = StandardScaler().fit(train_df[cts_features])
for split in (train_df, val_df, test_df):
    split[cts_features] = ss.transform(split[cts_features])

In [None]:
# Cast every remaining column of every split to float.
train_df, test_df, val_df = (
    split.astype(float) for split in (train_df, test_df, val_df)
)

## Model

In [None]:
# Feed-forward binary classifier: two ReLU hidden layers (64 and 32 units)
# and a single sigmoid output, trained with binary cross-entropy.
train_features = train_df.drop("employed_yes", axis=1).values
train_labels = train_df["employed_yes"].values
val_features = val_df.drop("employed_yes", axis=1).values
val_labels = val_df["employed_yes"].values

model = tf.keras.Sequential(
    [
        # The input width is taken from the data instead of a hard-coded 13,
        # and the shape is passed as a tuple through the `shape` keyword,
        # which is the documented form.
        tf.keras.Input(shape=(train_features.shape[1],), name="x"),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)
# Keyword arguments: the position of `metrics` in compile() differs between
# Keras versions, so passing it positionally can bind it to `loss_weights`.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss="binary_crossentropy",
    metrics=["binary_accuracy"],
)

# Train for at most 500 epochs; early stopping ends training once the
# monitored quantity has not improved by at least 1e-4 for 4 epochs.
history = model.fit(
    train_features,
    train_labels,
    epochs=500,
    batch_size=512,
    validation_data=(val_features, val_labels),
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=4, min_delta=1e-4)],
)

In [None]:
# Score the test split with the trained network and report the accuracy
# computed by the project's fairness helper.
test_features = test_df.drop(["employed_yes"], axis=1)
test_labels = test_df["employed_yes"].values
y_pred = model.predict(test_features).flatten()
print("Test Accuracy = ", fairness_measures.accuracy(y_pred, test_labels))

### Demographic parity

In [None]:
# Disparate-impact measure from the project's fairness helper, called with
# the race_white column. The printed label names the attribute so this output
# can be told apart from the sex_male one, in line with the "White/black" and
# "Male/female" labels of the later fairness cells.
ind_p = fairness_measures.disparate_impact_p(y_pred, test_df["race_white"])
print("White/black independence in probability =", ind_p)

In [None]:
# Disparate-impact measure from the project's fairness helper, called with
# the sex_male column. The printed label names the attribute so this output
# can be told apart from the race_white one, in line with the "White/black"
# and "Male/female" labels of the later fairness cells.
ind_p = fairness_measures.disparate_impact_p(y_pred, test_df["sex_male"])
print("Male/female independence in probability =", ind_p)

### Equal opportunity

In [None]:
# Equal-opportunity measure from the project's fairness helper, called with
# the scores, the race_white column and the true labels.
race_attr = test_df["race_white"]
true_labels = test_df["employed_yes"]
eop_p = fairness_measures.equal_opportunity_p(y_pred, race_attr, true_labels)
print("White/black equal opportunity in probability =", eop_p)

In [None]:
# Equal-opportunity measure from the project's fairness helper, called with
# the scores, the sex_male column and the true labels.
sex_attr = test_df["sex_male"]
true_labels = test_df["employed_yes"]
eop_p = fairness_measures.equal_opportunity_p(y_pred, sex_attr, true_labels)
print("Male/female equal opportunity in probability =", eop_p)

### Equalised odds

In [None]:
# Score distribution among test rows labelled 1, split by sex
# (first histogram: sex_male == 0, second: sex_male == 1).
label_is_1 = test_df["employed_yes"] == 1
hist_style = {"alpha": 0.5, "bins": 20}
plt.hist(y_pred[label_is_1 & (test_df["sex_male"] == 0)], **hist_style)
plt.hist(y_pred[label_is_1 & (test_df["sex_male"] == 1)], **hist_style)

In [None]:
# Score distribution among test rows labelled 0, split by sex
# (first histogram: sex_male == 0, second: sex_male == 1).
label_is_0 = test_df["employed_yes"] == 0
hist_style = {"alpha": 0.5, "bins": 20}
plt.hist(y_pred[label_is_0 & (test_df["sex_male"] == 0)], **hist_style)
plt.hist(y_pred[label_is_0 & (test_df["sex_male"] == 1)], **hist_style)

In [None]:
# Equalised-odds measure from the project's fairness helper, called with the
# scores, the race_white column and the true labels.
race_attr = test_df["race_white"]
true_labels = test_df["employed_yes"]
sep_p = fairness_measures.equalised_odds_p(y_pred, race_attr, true_labels)
print("White/black separation in probability =", sep_p)

In [None]:
# Equalised-odds measure from the project's fairness helper, called with the
# scores, the sex_male column and the true labels.
sex_attr = test_df["sex_male"]
true_labels = test_df["employed_yes"]
sep_p = fairness_measures.equalised_odds_p(y_pred, sex_attr, true_labels)
print("Male/female separation in probability =", sep_p)

### Feature importance via sklearn

In [None]:
# Fit an extra-trees ensemble on the training split and use its
# impurity-based feature importances to rank the input columns.
feature_names = train_df.columns.drop(["employed_yes"])
n_features = train_df.shape[1] - 1

# Earlier configuration, kept for reference:
# forest = ExtraTreesClassifier(n_estimators=250, random_state=42)
forest = ExtraTreesClassifier(n_estimators=1000, max_depth=15)
forest.fit(train_df.drop(["employed_yes"], axis=1), train_df.employed_yes)

# Accuracy of the forest on the test split, based on the predicted
# probability of class 1.
# NOTE(review): this rebinds y_pred, so every later cell that reads y_pred
# sees the forest's scores rather than the neural network's. Confirm that
# this is intended.
y_pred = forest.predict_proba(test_df.drop(["employed_yes"], axis=1))[:, 1]
test_labels = test_df["employed_yes"].values
print("Test Accuracy = ", fairness_measures.accuracy(y_pred, test_labels))

# Mean importance per feature, its spread across the individual trees, and
# the feature order from most to least important.
importances = forest.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in forest.estimators_]
std = np.std(per_tree_importances, axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
for rank, feature_idx in enumerate(indices, start=1):
    print(
        f"{rank}. feature {feature_names[feature_idx]} "
        f"({importances[feature_idx]:f})"
    )

# Bar chart of the importances in ranked order, with the per-tree standard
# deviation as error bars.
bar_positions = range(n_features)
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Feature importances")
ax.bar(
    bar_positions,
    importances[indices],
    color="r",
    yerr=std[indices],
    align="center",
)
ax.set_xticks(bar_positions)
ax.set_xticklabels([feature_names[i] for i in indices], rotation=45)

### Resolving variable fairness
Consider fairness with the years of relevant career experience as the resolving variable.

In [None]:
# Per-row table for the conditional box plots: experience band, score and
# the two protected attributes of each test row.
# NOTE(review): y_pred was reassigned in the feature-importance cell, so the
# scores here are the forest's predicted probabilities, not the neural
# network's. Confirm that this is intended.
cond_columns = {
    "experience_category": test_exp_cat,
    "scores": y_pred,
    "sex_male": test_df.sex_male,
    "race_white": test_df.race_white,
}
cond_test_df = pd.DataFrame(cond_columns)

In [None]:
# Score distribution per experience band, coloured by sex_male; the bands
# are shown in ascending order.
experience_order = ["0-2", "2-4", "4-6", ">=6"]
box_options = {
    "x": "experience_category",
    "y": "scores",
    "color": "sex_male",
    "category_orders": {"experience_category": experience_order},
}
fig = px.box(cond_test_df, **box_options)
fig

In [None]:
# Score distribution per experience band, coloured by race_white; the bands
# are shown in ascending order.
experience_order = ["0-2", "2-4", "4-6", ">=6"]
box_options = {
    "x": "experience_category",
    "y": "scores",
    "color": "race_white",
    "category_orders": {"experience_category": experience_order},
}
fig = px.box(cond_test_df, **box_options)
fig

## Save data to file

In [None]:
# Write the scaled, all-float splits. The target folder is created first
# (including missing parents) because to_csv does not create directories.
processed_dir = data_dir / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(processed_dir / "train.csv", index=False)
test_df.to_csv(processed_dir / "test.csv", index=False)
val_df.to_csv(processed_dir / "val.csv", index=False)