In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import shap
from collections import Counter

# Models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Bias

import aif360
from aif360.algorithms.preprocessing import DisparateImpactRemover, Reweighing
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.datasets import StandardDataset 

pd.options.mode.chained_assignment = None 
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', lambda x: '%.7f' % x)

In [3]:
df_original = pd.read_csv(r"C:\Users\bassa\Desktop\Natwest\Dataset\adult.csv")

In [4]:
# Rescale the elements of the column to a number
def rescale_elements(df, feature):
    x=df[feature].value_counts()
    item_type_mapping = {}
    item_list = x.index
    for i in range(0,len(item_list)):
        item_type_mapping[item_list[i]] = i
    df[feature]=df[feature].map(lambda x:item_type_mapping[x])
    return df

In [5]:
# rescale race (white = 0, black = 1) after dropping: Asian-Pac-Islander, Amer-Indian-Eskimo, Other
df = df_original[df_original['race'].isin(['White', 'Black'])]
df['race'][df['race'] == 'White'] = 0
df['race'][df['race'] == 'Black'] = 1

# rescale sex (sex : male = 0 , female = 1)
df['gender'][df['gender'] == 'Male'] = 0
df['gender'][df['gender'] == 'Female'] = 1

# rescale marital status (Married-civ-spouse = 0, Never-married = 1) after dropping 'others'
df['marital-status'][df['marital-status'].isin(['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'])] = 0
df['marital-status'][df['marital-status'].isin(['Never-married', 'Divorced', 'Separated', 'Widowed'])] = 1

# rescale income (>50K = 0, <=50K = 1)
df['income'][df['income'] == '>50K'] = 0
df['income'][df['income'] == '<=50K'] = 1

# rescale native-country (United-States = 0, any other country = 1)
df['native-country'][df['native-country'] == 'United-States'] = 0
df['native-country'][df['native-country'] != 0] = 1

# rescale education 
df = rescale_elements(df, 'education')

# rescale relationship
df = rescale_elements(df, 'relationship')

# rescale occupation
df = rescale_elements(df, 'occupation')

# rescale workclass
df = rescale_elements(df, 'workclass')

# Transform the object in integers and reset the index
df = df.apply(pd.to_numeric).reset_index(drop=True)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46447 entries, 0 to 46446
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   age              46447 non-null  int64
 1   workclass        46447 non-null  int64
 2   fnlwgt           46447 non-null  int64
 3   education        46447 non-null  int64
 4   educational-num  46447 non-null  int64
 5   marital-status   46447 non-null  int64
 6   occupation       46447 non-null  int64
 7   relationship     46447 non-null  int64
 8   race             46447 non-null  int64
 9   gender           46447 non-null  int64
 10  capital-gain     46447 non-null  int64
 11  capital-loss     46447 non-null  int64
 12  hours-per-week   46447 non-null  int64
 13  native-country   46447 non-null  int64
 14  income           46447 non-null  int64
dtypes: int64(15)
memory usage: 5.3 MB


### Metrics

In [6]:
def fair_metrics(data, y_pred, target, protected_attribute_names) :
    dataset = StandardDataset(data, 
                          label_name=target, 
                          favorable_classes=[0], 
                          protected_attribute_names=protected_attribute_names, 
                          privileged_classes=[[0]])

    dataset_pred = dataset.copy()
    dataset_pred.labels = y_pred
        
    attr = dataset_pred.protected_attribute_names[0]
    
    idx = dataset_pred.protected_attribute_names.index(attr)
    privileged_groups =  [{attr:dataset_pred.privileged_protected_attributes[idx][0]}] 
    unprivileged_groups = [{attr:dataset_pred.unprivileged_protected_attributes[idx][0]}] 

    classified_metric = ClassificationMetric(dataset, dataset_pred, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)

    metric_pred = BinaryLabelDatasetMetric(dataset_pred, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)

    result = {f'disparate_impact_{protected_attribute_names[0]}': metric_pred.disparate_impact(),
              f'statistical_parity_difference_{protected_attribute_names[0]}': metric_pred.statistical_parity_difference(),
              f'equal_opportunity_difference_{protected_attribute_names[0]}': classified_metric.equal_opportunity_difference()}
        
    return result

### Mitigation

In [7]:
def Reweighing_1(X, y, A):
    # X: independent variables (2-d pd.DataFrame)
    # y: the dependent variable (1-d np.array)
    # A: the name of the sensitive attributes (list of string)
    groups_class = {}
    group_weight = {}
    for i in range(len(y)):
        key_class = tuple([X[a][i] for a in A]+[y[i]])
        key = key_class[:-1]
        if key not in group_weight:
            group_weight[key]=0
        group_weight[key]+=1
        if key_class not in groups_class:
            groups_class[key_class]=[]
        groups_class[key_class].append(i)
    class_weight = Counter(y)
    sample_weight = np.array([1.0]*len(y))
    for key in groups_class:
        weight = class_weight[key[-1]]*group_weight[key[:-1]]/len(groups_class[key])
        for i in groups_class[key]:
            sample_weight[i] = weight
    # Rescale the total weights to len(y)
    sample_weight = sample_weight * len(y) / sum(sample_weight)
    return sample_weight

def FairBalance(X, y, A):
    # X: independent variables (2-d pd.DataFrame)
    # y: the dependent variable (1-d np.array)
    # A: the name of the sensitive attributes (list of string)
    groups_class = {}
    group_weight = {}
    for i in range(len(y)):
        key_class = tuple([X[a][i] for a in A] + [y[i]])
        key = key_class[:-1]
        if key not in group_weight:
            group_weight[key] = 0
        group_weight[key] += 1
        if key_class not in groups_class:
            groups_class[key_class] = []
        groups_class[key_class].append(i)
    sample_weight = np.array([1.0]*len(y))
    for key in groups_class:
        weight = group_weight[key[:-1]]/len(groups_class[key])
        for i in groups_class[key]:
            sample_weight[i] = weight
    # Rescale the total weights to len(y)
    sample_weight = sample_weight * len(y) / sum(sample_weight)
    return sample_weight

def FairBalanceVariant(X, y, A):
    # X: independent variables (2-d pd.DataFrame)
    # y: the dependent variable (1-d np.array)
    # A: the name of the sensitive attributes (list of string)
    groups_class = {}
    for i in range(len(y)):
        key_class = tuple([X[a][i] for a in A] + [y[i]])
        if key_class not in groups_class:
            groups_class[key_class] = []
        groups_class[key_class].append(i)
    sample_weight = np.array([1.0]*len(y))
    for key in groups_class:
        weight = 1.0/len(groups_class[key])
        for i in groups_class[key]:
            sample_weight[i] = weight
    # Rescale the total weights to len(y)
    sample_weight = sample_weight * len(y) / sum(sample_weight)
    return sample_weight

In [26]:
x = df.drop(['income'], axis = 1)
y = df['income'] 
default_seed = 1
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=default_seed)

sample_weight = FairBalanceVariant(x_train, y_train, ["gender", "race"])

KeyError: 2

In [None]:
def __init__(self, data, treatment="None", inject = None):
    #  Load data
    self.data, self.A = load(data)
    # Separate independent variables and dependent variables
    independent = self.data.keys().tolist()
    dependent = independent.pop(-1)
    self.X = self.data[independent]
    self.y = np.array(self.data[dependent])
    self.treatment = treatment
    self.inject = inject
    if treatment == "FERMI":
        self.clf = FERMI()
    else:
        self.clf = LogisticRegression(max_iter=100000)

def one_exp(self):
    X_train, X_test, y_train, y_test = self.train_test_split(test_size=0.5)
    #########################################
    self.data_preprocess(X_train)
    sample_weight = self.treat(X_train, y_train)
    self.fit(X_train, y_train, sample_weight)
    m_train = Metrics(self.clf, X_train, y_train, self.A, self.preprocessor)
    m_test = Metrics(self.clf, X_test, y_test, self.A, self.preprocessor)
    return m_train, m_test

def fit(self, X, y, sample_weight=None):
    X_train_processed = self.preprocessor.fit_transform(X)
    if type(self.clf) == FERMI:
        S = []
        groups = {}
        count = 0
        for i in range(len(y)):
            group = tuple([X[a][i] for a in self.A])
            if group not in groups:
                groups[group] = count
                count += 1
            S.append(groups[group])
        S = np.array(S)
        self.clf.fit(X_train_processed, y, S, sample_weight=sample_weight)
    else:
        self.clf.fit(X_train_processed, y, sample_weight=sample_weight)

def data_preprocess(self, X):
    numerical_columns_selector = selector(dtype_exclude=object)
    categorical_columns_selector = selector(dtype_include=object)

    numerical_columns = numerical_columns_selector(X)
    categorical_columns = categorical_columns_selector(X)

    categorical_preprocessor = OneHotEncoder(handle_unknown = 'ignore')
    numerical_preprocessor = StandardScaler()
    self.preprocessor = ColumnTransformer([
        ('OneHotEncoder', categorical_preprocessor, categorical_columns),
        ('StandardScaler', numerical_preprocessor, numerical_columns)])

def treat(self, X_train, y_train):
    if self.treatment == "Reweighing":
        sample_weight = Reweighing(X_train, y_train, self.A)
    elif self.treatment == "FairBalanceVariant":
        sample_weight = FairBalanceVariant(X_train, y_train, self.A)
    elif self.treatment == "FairBalance":
        sample_weight = FairBalance(X_train, y_train, self.A)
    else:
        sample_weight = None
    return sample_weight

def train_test_split(self, test_size=0.5):
    # Split training and testing data proportionally across each group
    groups = {}
    for i in range(len(self.y)):
        key = tuple([self.X[a][i] for a in self.A] + [self.y[i]])
        if key not in groups:
            groups[key] = []
        groups[key].append(i)
    train = []
    test = []
    for key in groups:
        testing = list(np.random.choice(groups[key], int(len(groups[key])*test_size), replace=False))
        training = list(set(groups[key]) - set(testing))
        test.extend(testing)
        train.extend(training)
    X_train = self.X.iloc[train]
    X_test = self.X.iloc[test]
    y_train = self.y[train]
    y_test = self.y[test]
    X_train.index = range(len(X_train))
    X_test.index = range(len(X_test))
    return X_train, X_test, y_train, y_test