# Kamiran Calders

In [None]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from aif360.datasets import StandardDataset
from aif360.algorithms.preprocessing.reweighing import Reweighing
from helpers.fairness_measures import accuracy
from helpers.finance import preprocess
from helpers.plot import group_box_plots
from sklearn.linear_model import LogisticRegression

## Load data

The `preprocess` function checks whether the Adult data is already available and, if it is not, downloads and saves it.

In [None]:
# Root folder of the stored artifacts, given as a relative path
# (three directory levels up, then "artifacts").
artifacts_dir = Path("..", "..", "..", "artifacts")

Location of the data

In [None]:
# Folder of the Adult dataset inside the artifacts tree. It is handed to
# `preprocess` before any of the processed files are read.
data_dir = artifacts_dir.joinpath("data", "adult")
preprocess(data_dir)

In [None]:
# Read the one-hot encoded train / validation / test splits from the
# "processed" sub-folder, in that order.
processed_dir = data_dir / "processed"
train, val, test = (
    pd.read_csv(processed_dir / f"{split}-one-hot.csv")
    for split in ("train", "val", "test")
)

In [None]:
def _as_standard_dataset(frame):
    """Wrap a split in an AIF360 ``StandardDataset``.

    Every split uses the same configuration: ``salary`` is the label with 1
    as the favourable class, and ``sex`` is the protected attribute with 1 as
    the privileged class.
    """
    return StandardDataset(
        frame,
        label_name="salary",
        favorable_classes=[1],
        protected_attribute_names=["sex"],
        privileged_classes=[[1]],
    )


# Built in the same order as before: train, test, validation.
train_sds = _as_standard_dataset(train)
test_sds = _as_standard_dataset(test)
val_sds = _as_standard_dataset(val)

In [None]:
# Group definitions keyed by the protected attribute name: sex == 1.0 is the
# privileged group, sex == 0.0 the unprivileged one.
privileged_groups = [dict(sex=1.0)]
unprivileged_groups = [dict(sex=0.0)]

## Load original (baseline) model

In [None]:
# Load the stored baseline classifier.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
baseline_path = artifacts_dir / "models" / "finance" / "baseline.pkl"
bl_model = joblib.load(baseline_path)

# Score the test set on every column except the label and keep column 1 of
# the probability matrix (presumably the salary == 1 class -- confirm against
# the baseline model's class order).
test_features = test.drop(columns="salary")
bl_test_probs = bl_model.predict_proba(test_features)[:, 1]

In [None]:
# Report baseline accuracy and mean score, overall and split by sex.
# `mask` selects the rows with sex == 1 (reported as male); its negation
# selects the rows reported as female.
mask = test.sex == 1

print("Original model accuracy =", accuracy(bl_test_probs, test.salary))

bl_female_probs = bl_test_probs[~mask]
print("Female accuracy =", accuracy(bl_female_probs, test.salary[~mask]))

bl_male_probs = bl_test_probs[mask]
print("Male accuracy =", accuracy(bl_male_probs, test.salary[mask]))

print("Mean female score =", bl_female_probs.mean())
print("Mean male score =", bl_male_probs.mean())

## Perform intervention

### Fit the reweighing on the original training data and transform it

In [None]:
# Fit the reweighing pre-processor on the training set, then apply it to
# that same set. The transformed dataset carries the instance weights used
# when the fair model is trained.
RW = Reweighing(
    privileged_groups=privileged_groups,
    unprivileged_groups=unprivileged_groups,
)
train_sds_transf = RW.fit(train_sds).transform(train_sds)

### Train model with transformed training data

In [None]:
# Features, flattened labels and per-row weights from the reweighed data.
X_train = train_sds_transf.features
y_train = train_sds_transf.labels.flatten()
train_weights = train_sds_transf.instance_weights

# Logistic regression trained with the reweighing instance weights as
# sample weights; the high iteration cap is kept from the original setup.
model_fair = LogisticRegression(max_iter=10000)
model_fair.fit(X_train, y_train, sample_weight=train_weights)

### Predict fairly on test set
Note that no separate pre-processing step is applied to the test data. The intervention takes effect at prediction time, because the model was trained with the instance weights that reweighing computed on the training data.

In [None]:
# Work on a deep copy so the original test dataset is left untouched.
test_sds_pred = test_sds.copy(deepcopy=True)
X_test = test_sds_pred.features
y_test = test_sds.labels

# Store column 1 of the predicted probabilities as an (n, 1) score column.
fair_test_probs = model_fair.predict_proba(X_test)
test_sds_pred.scores = fair_test_probs[:, 1].reshape(-1, 1)

## Analyse fairness and accuracy

In [None]:
# Report accuracy and mean score of the reweighed model, overall and split
# by sex. `mask` selects the rows with sex == 1 (reported as male).
mask = test.sex == 1

print("Accuracy =", accuracy(test_sds_pred.scores.flatten(), test.salary))

fair_female_scores = test_sds_pred.scores.flatten()[~mask]
print("Female accuracy =", accuracy(fair_female_scores, test.salary[~mask]))

fair_male_scores = test_sds_pred.scores.flatten()[mask]
print("Male accuracy =", accuracy(fair_male_scores, test.salary[mask]))

print("Mean female score =", fair_female_scores.mean())
print("Mean male score =", fair_male_scores.mean())

### Plots

In [None]:
# Stack baseline and reweighed scores into one long array, with a parallel
# 0/1 indicator (0 = baseline, 1 = reweighed model) and the sex label of
# each row repeated once per model.
fair_scores = test_sds_pred.scores.flatten()
all_scores = np.concatenate([bl_test_probs, fair_scores])
model_indicator = np.concatenate(
    [np.zeros_like(bl_test_probs), np.ones_like(fair_scores)]
)
sex_labels = np.tile(test.sex.map(lambda x: "Male" if x else "Female"), 2)

dp_box = group_box_plots(
    all_scores,
    model_indicator,
    sex_labels,
    group_names=["Baseline", "Kamiran-Calders"],
)
dp_box