# Intersectional bias analysis

In [1]:
import pandas as pd
from etiq_core import *;
import opendatasets as od
import numpy as np
import warnings
import pprint
from copy import deepcopy
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import re
import datetime
import math
from IPython.core.interactiveshell import InteractiveShell
from pandas.api.types import is_string_dtype
import warnings
import logging

Thanks for trying out the ETIQ.ai toolkit!

Visit our getting started documentation at https://docs.etiq.ai/

Visit our Slack channel at https://etiqcore.slack.com/ for support or feedback.



In [None]:


# Silence every Python warning and raise the "etiq_core" logger threshold to
# CRITICAL so that warning/info messages do not clutter the notebook output.
warnings.filterwarnings("ignore")
logger = logging.getLogger("etiq_core")
logger.setLevel(level = logging.CRITICAL)


def get_debias_params(protected, privileged, unprivileged, positive_label, negative_label):
    """Build a ``BiasParams`` object from the given bias settings.

    Args:
        protected: forwarded as ``protected``.
        privileged: forwarded as ``privileged``.
        unprivileged: forwarded as ``unprivileged``.
        positive_label: forwarded as ``positive_outcome_label``.
        negative_label: forwarded as ``negative_outcome_label``.

    Returns:
        The ``BiasParams`` instance created from these values.
    """
    settings = {
        'protected': protected,
        'privileged': privileged,
        'unprivileged': unprivileged,
        'positive_outcome_label': positive_label,
        'negative_outcome_label': negative_label,
    }
    return BiasParams(**settings)


# Transforms handed to every DatasetLoader built in this notebook.
# NOTE(review): Dropna / EncodeLabels and the metric callables below are
# presumably provided by the `from etiq_core import *` star import - confirm.
transforms = [Dropna, EncodeLabels]
# Metric sets (lists of metric callables), from the most complete to the shortest.
metrics_bonus = [accuracy,  equal_opportunity,demographic_parity, equal_odds_tpr, equal_odds_tnr, true_neg_rate, true_pos_rate, individual_fairness, individual_fairness_cf]
metrics_initial = [accuracy,  equal_opportunity,demographic_parity, equal_odds_tnr, individual_fairness]
metrics_short = [accuracy, demographic_parity, equal_opportunity, individual_fairness]
metrics_sshort = [accuracy, demographic_parity, equal_opportunity]
# Metric names (strings) used to select rows when building disparity tables.
metrics_list = ['accuracy','demographic_parity','equal_opportunity', 'individual_fairness']
metrics_list_short = ['accuracy','demographic_parity','equal_opportunity']
def etiq_wrapper_run(data, debias_params, cont_val, cat_val, p_feature, metrics):
    """Train a default XGBoost classifier on ``data`` and return the protected metrics.

    Wraps the classifier / dataset loader / pipeline boilerplate so callers
    only have to provide the data and the configuration.

    Args:
        data: DataFrame to analyse.
        debias_params: bias parameters passed to the dataset loader.
        cont_val: names of the continuous columns.
        cat_val: names of the categorical columns.
        p_feature: name of the label column.
        metrics: metric callables evaluated by the pipeline.

    Returns:
        Whatever ``DataPipeline.get_protected_metrics()`` returns for the run.
    """
    xgb = DefaultXGBoostClassifier()
    # Use the column lists received as arguments: the previous version ignored
    # them and silently read the module-level ``cat_vars`` / ``cont_vars``.
    dl = DatasetLoader(data=data, label=p_feature, transforms=transforms, bias_params=debias_params,
                       train_valid_test_splits=[0.8, 0.1, 0.1], cat_col=cat_val, cont_col=cont_val,
                       names_col=data.columns.values)
    pipeline_initial = DataPipeline(dataset_loader=dl, model=xgb, metrics=metrics)
    pipeline_initial.run()
    return pipeline_initial.get_protected_metrics()


def get_intersection(data, protected1, protected2, drop = True):
    """Add the column obtained by joining ``protected1`` and ``protected2``.

    The new column is named ``"<protected1>_<protected2>"`` and holds the two
    source values joined by an underscore. It is written into ``data`` itself.

    Args:
        data: DataFrame to extend (modified in place).
        protected1: name of the first column.
        protected2: name of the second column.
        drop: when true, return a frame without the two source columns.

    Returns:
        ``data`` itself when ``drop`` is false, otherwise the result of
        dropping the two source columns from it.
    """
    joined_name = "_".join((protected1, protected2))
    first_values = data[protected1]
    second_values = data[protected2]
    data[joined_name] = first_values + "_" + second_values
    if not drop:
        return data
    return data.drop(columns=[protected1, protected2])


def get_df_from_metrics(metrics_result):
    """Build a two-row (privileged / unprivileged) DataFrame from pipeline metrics.

    The last item of ``metrics_result`` is read from a deep copy, so the
    argument is left untouched. Its value is iterated as a list of single-entry
    dicts ``{metric_name: values}``; ``values[1]`` is stored in the privileged
    row and ``values[3]`` in the unprivileged row.
    NOTE(review): that layout is inferred from the indexing below - confirm
    against the etiq_core output format.

    The ``class`` column is filled from the module-level ``debias_params``.

    Args:
        metrics_result: dict returned by the pipeline's protected-metrics call.

    Returns:
        A DataFrame with the columns ``privilege``, ``class`` and one column
        per metric, or ``None`` when the metrics entry is ``None``.
    """
    m = deepcopy(metrics_result)
    _, metric_entries = m.popitem()
    if metric_entries is None:
        # The previous version returned the undefined name ``null`` here,
        # which raised NameError instead of signalling "no metrics".
        return None
    d = {
        'privilege': ['privileged', 'unprivileged'],
        'class': [debias_params.privileged, debias_params.unprivileged],
    }
    for entry in metric_entries:
        metric_name, values = entry.popitem()
        d[metric_name] = [values[1], values[3]]
    return pd.DataFrame(data = d)


def get_disparity_df(metrics_result, biasParams_in, metrics_in):
    """Build a disparity table for the metrics listed in ``metrics_in``.

    The result is indexed by ``metric``. The first row (``group``) holds the
    privileged and unprivileged group names taken from ``biasParams_in`` and
    an empty disparity; each following row holds the privileged value, the
    unprivileged value and their ratio (unprivileged / privileged).

    The last item of ``metrics_result`` is read from a deep copy, so the
    argument is left untouched. Its value is iterated as a list of
    single-entry dicts ``{metric_name: values}`` with the privileged value at
    ``values[1]`` and the unprivileged value at ``values[3]``.
    NOTE(review): that layout is inferred from the indexing below - confirm
    against the etiq_core output format.

    Args:
        metrics_result: dict returned by the pipeline's protected-metrics call.
        biasParams_in: object exposing ``privileged`` and ``unprivileged``.
        metrics_in: container of metric names to keep.

    Returns:
        The disparity DataFrame, or ``None`` when the metrics entry is ``None``.
    """
    m = deepcopy(metrics_result)
    _, metric_entries = m.popitem()
    if metric_entries is None:
        # The previous version returned the undefined name ``null`` here,
        # which raised NameError instead of signalling "no metrics".
        return None
    d = {
        'metric': ['group'],
        'privileged': [biasParams_in.privileged],
        'unprivileged': [biasParams_in.unprivileged],
        'disparity': [''],
    }
    for entry in metric_entries:
        metric_name, values = entry.popitem()
        if metric_name not in metrics_in:
            continue
        privileged_value, unprivileged_value = values[1], values[3]
        d['metric'].append(metric_name)
        d['privileged'].append(privileged_value)
        d['unprivileged'].append(unprivileged_value)
        d['disparity'].append(unprivileged_value / privileged_value)
    return pd.DataFrame(data = d).set_index('metric')


def get_edf_df(data, feature1, feature2, attributes1, attributes2, p_feature, in_place = True):
    """Compute the pairwise disparity between intersectional subgroups.

    ``feature1`` and ``feature2`` are joined into the column
    ``"<feature1>_<feature2>"`` (the source columns are kept). The subgroups
    examined are every ``"<a1>_<a2>"`` combination of ``attributes1`` and
    ``attributes2`` with ``a1 != a2``; e.g. joining sex and race with
    ``['Male', 'Female']`` and ``['White', 'Black']`` examines "Male_White",
    "Male_Black", "Female_White" and "Female_Black".

    For every ordered pair of distinct subgroups the disparity is
    ``(sum_i / count_i) * (count_j / sum_j)``, where ``sum`` is the sum of
    ``p_feature`` and ``count`` the number of rows of the subgroup. Pairs with
    a subgroup that does not occur in the data get ``''``.

    Args:
        data: DataFrame containing ``feature1``, ``feature2`` and ``p_feature``.
        feature1: name of the first column to join.
        feature2: name of the second column to join.
        attributes1: values of ``feature1`` to examine.
        attributes2: values of ``feature2`` to examine.
        p_feature: column summed per subgroup (e.g. a 0/1 outcome flag).
        in_place: when true the joined column is added to ``data`` itself,
            otherwise the work is done on a copy and ``data`` is untouched.

    Returns:
        A DataFrame with the columns ``group1``, ``group2`` and ``disparity``.
    """
    # The previous ``data.deepcopy(data)`` raised AttributeError: DataFrame
    # has no such method.
    d = data if in_place else data.copy()
    subg = feature1 + '_' + feature2
    intersections = [a1 + '_' + a2 for a1 in attributes1 for a2 in attributes2 if a1 != a2]
    d[subg] = d[feature1] + "_" + d[feature2]

    # Loop-invariant aggregates, computed once and only on the needed column
    # (summing every column per pair was wasteful and can fail on columns
    # that do not support sum).
    positives = d.groupby(subg)[p_feature].sum()
    sizes = d[subg].value_counts()
    present = set(sizes.index)

    result = {'group1': [], 'group2': [], 'disparity': []}
    for g in intersections:
        for other in intersections:
            if g == other:
                continue
            result['group1'].append(g)
            result['group2'].append(other)
            if g in present and other in present:
                Nys_i, Ns_i = positives.at[g], sizes.at[g]
                Nys_j, Ns_j = positives.at[other], sizes.at[other]
                result['disparity'].append((Nys_i / Ns_i) * (Ns_j / Nys_j))
            else:
                result['disparity'].append('')
    return pd.DataFrame(data = result)


def read_edf(edf, n = 1):
    """Return the ``n`` subgroup pairs with the largest disparity.

    Args:
        edf: DataFrame with the columns ``group1``, ``group2`` and
            ``disparity``, as produced by ``get_edf_df``.
        n: maximum number of pairs to return.

    Returns:
        A list of ``(group1, group2, disparity)`` tuples sorted by decreasing
        disparity. It holds fewer than ``n`` items when ``edf`` has fewer rows
        with a numeric disparity. ``edf`` is not modified.
    """
    # Rows without a computable disparity hold '' (see get_edf_df); coerce
    # them to NaN and drop them so they cannot break the numeric ranking.
    disparity = pd.to_numeric(edf['disparity'], errors='coerce').dropna()
    # The previous loop only wrapped ``idxmax``, so a single pair was returned
    # whatever ``n`` was. nlargest keeps the first occurrence on ties, like
    # repeated idxmax calls would.
    top = disparity.nlargest(n)
    return [(edf.at[idx, 'group1'], edf.at[idx, 'group2'], val) for idx, val in top.items()]


def get_mode_df(data, features, p_feature, positive_outcome, negative_outcome):
    """Tabulate, per feature, the modal value among positive and negative outcomes.

    For each feature and each outcome the table reports the most frequent
    value among the rows with that outcome (``mode_*``), how often it occurs
    among those rows (``mode_occ_*``) and how often it occurs in the whole of
    ``data`` (``total_mode_occ_*``).

    Args:
        data: DataFrame to analyse.
        features: column names to summarise.
        p_feature: name of the outcome column.
        positive_outcome: value of ``p_feature`` counted as positive.
        negative_outcome: value of ``p_feature`` counted as negative.

    Returns:
        A DataFrame indexed by ``feature``.
    """
    def _mode_stats(outcome, feature):
        # Modal value, its count within the outcome and its count overall.
        sizes = data.loc[data[p_feature] == outcome].groupby(feature).size()
        top_value = sizes.idxmax()
        return top_value, sizes.max(), len(data[data[feature] == top_value])

    keys = ('feature',
            'mode_pos', 'mode_occ_pos', 'total_mode_occ_pos',
            'mode_neg', 'mode_occ_neg', 'total_mode_occ_neg')
    table = {key: [] for key in keys}
    for ft in features:
        row = (ft,) + _mode_stats(positive_outcome, ft) + _mode_stats(negative_outcome, ft)
        for key, value in zip(keys, row):
            table[key].append(value)
    return pd.DataFrame(data = table).set_index('feature')


def get_ratio_df(data, features, p_feature, positive_outcome, negative_outcome):
    """Tabulate, per feature, the value with the highest outcome ratio.

    Like ``get_mode_df``, but instead of the modal value it reports the value
    whose occurrences within the outcome, divided by its occurrences in the
    whole of ``data``, is greatest. ``max_ratio_*`` holds the pair
    ``(value, ratio rounded to 2 decimals)``, ``max_ratio_occ_*`` the count
    within the outcome and ``max_ratio_tot_occ_*`` the overall count.

    Args:
        data: DataFrame to analyse.
        features: column names to summarise.
        p_feature: name of the outcome column.
        positive_outcome: value of ``p_feature`` counted as positive.
        negative_outcome: value of ``p_feature`` counted as negative.

    Returns:
        A DataFrame indexed by ``feature``.
    """
    def _top_ratio(outcome, feature):
        sizes = data[data[p_feature] == outcome].groupby(feature).size()
        shares = {}
        for position, occurrences in enumerate(sizes):
            category = sizes.index[position]
            shares[category] = occurrences / len(data[data[feature] == category])
        ranked = sorted(shares.items(), key = lambda pair: pair[1], reverse = True)
        best_category, best_share = ranked[0]
        best = (best_category, round(best_share, 2))
        return best, sizes[best_category], len(data[data[feature] == best_category])

    keys = ('feature',
            'max_ratio_pos', 'max_ratio_occ_pos', 'max_ratio_tot_occ_pos',
            'max_ratio_neg', 'max_ratio_occ_neg', 'max_ratio_tot_occ_neg')
    table = {key: [] for key in keys}
    for ft in features:
        row = (ft,) + _top_ratio(positive_outcome, ft) + _top_ratio(negative_outcome, ft)
        for key, value in zip(keys, row):
            table[key].append(value)
    return pd.DataFrame(data = table).set_index('feature')


def get_mode_ratio_df(data, features, p_feature, positive_outcome, negative_outcome):
    """Tabulate, per feature, both the modal value and the highest-ratio value.

    Combines the columns of ``get_mode_df`` and ``get_ratio_df``: for each
    outcome it reports the most frequent value with its counts (``mode_*``,
    ``mode_occ_*``, ``total_mode_occ_*``) and the value with the greatest
    within-outcome / overall occurrence ratio (``max_ratio_*`` as a
    ``(value, rounded ratio)`` pair, ``max_ratio_occ_*``,
    ``max_ratio_tot_occ_*``).

    Args:
        data: DataFrame to analyse.
        features: column names to summarise.
        p_feature: name of the outcome column.
        positive_outcome: value of ``p_feature`` counted as positive.
        negative_outcome: value of ``p_feature`` counted as negative.

    Returns:
        A DataFrame indexed by ``feature``.
    """
    def _overall(feature, value):
        # Number of rows of the whole dataset holding ``value``.
        return len(data[data[feature] == value])

    def _describe(outcome, feature):
        sizes = data[data[p_feature] == outcome].groupby(feature).size()
        mode_value = sizes.idxmax()
        mode_part = (mode_value, sizes.max(), _overall(feature, mode_value))
        shares = {}
        for position, occurrences in enumerate(sizes):
            category = sizes.index[position]
            shares[category] = occurrences / _overall(feature, category)
        ranked = sorted(shares.items(), key = lambda pair: pair[1], reverse = True)
        best_category, best_share = ranked[0]
        ratio_part = ((best_category, round(best_share, 2)),
                      sizes[best_category],
                      _overall(feature, best_category))
        return mode_part + ratio_part

    keys = ('feature',
            'mode_pos', 'mode_occ_pos', 'total_mode_occ_pos',
            'max_ratio_pos', 'max_ratio_occ_pos', 'max_ratio_tot_occ_pos',
            'mode_neg', 'mode_occ_neg', 'total_mode_occ_neg',
            'max_ratio_neg', 'max_ratio_occ_neg', 'max_ratio_tot_occ_neg')
    table = {key: [] for key in keys}
    for ft in features:
        row = (ft,) + _describe(positive_outcome, ft) + _describe(negative_outcome, ft)
        for key, value in zip(keys, row):
            table[key].append(value)
    return pd.DataFrame(data = table).set_index('feature')


def get_maxOccurrences_in_samples(samples, features, p_feature, positive_outcome, negative_outcome):
    """Count how often each value is the modal one across ``samples``.

    Every sample is summarised with ``get_mode_ratio_df``; for each feature
    the modal value among positive and among negative outcomes is tallied.

    Args:
        samples: sequence of DataFrames.
        features: column names to summarise.
        p_feature: name of the outcome column.
        positive_outcome: value of ``p_feature`` counted as positive.
        negative_outcome: value of ``p_feature`` counted as negative.

    Returns:
        A pair ``(results_pos, results_neg)`` of dicts mapping
        ``feature -> {value: number of samples in which it was modal}``.
    """
    results_pos, results_neg = {}, {}
    tallies = (('mode_pos', results_pos), ('mode_neg', results_neg))
    for sample in samples:
        summary = get_mode_ratio_df(data = sample, features = features, p_feature = p_feature,
                                    positive_outcome = positive_outcome, negative_outcome = negative_outcome)
        for ft in summary.index:
            for column, tally in tallies:
                per_feature = tally.setdefault(ft, {})
                val = summary.at[ft, column]
                per_feature[val] = per_feature.get(val, 0) + 1
    return (results_pos, results_neg)


def get_maxOccurrences_ratio_in_samples(samples, features, p_feature, positive_outcome, negative_outcome):
    """Count how often each value has the highest outcome ratio across ``samples``.

    Same as ``get_maxOccurrences_in_samples`` but based on the ``max_ratio_*``
    columns of ``get_mode_ratio_df`` (ratios) instead of the modal values.

    Args:
        samples: sequence of DataFrames.
        features: column names to summarise.
        p_feature: name of the outcome column.
        positive_outcome: value of ``p_feature`` counted as positive.
        negative_outcome: value of ``p_feature`` counted as negative.

    Returns:
        A pair ``(results_pos, results_neg)`` of dicts mapping
        ``feature -> {value: number of samples in which it had the top ratio}``.
    """
    results_pos, results_neg = {}, {}
    tallies = (('max_ratio_pos', results_pos), ('max_ratio_neg', results_neg))
    for sample in samples:
        summary = get_mode_ratio_df(data = sample, features = features, p_feature = p_feature,
                                    positive_outcome = positive_outcome, negative_outcome = negative_outcome)
        for ft in summary.index:
            for column, tally in tallies:
                per_feature = tally.setdefault(ft, {})
                # Each cell is a (value, rounded ratio) pair; only the value is tallied.
                val, _ratio = summary.at[ft, column]
                per_feature[val] = per_feature.get(val, 0) + 1
    return (results_pos, results_neg)


def get_intersections_count(samples, features1, features2, attributes1, attributes2):
    """Count, over all ``samples``, the rows falling in each intersectional subgroup.

    For every pair of distinct features ``(f1, f2)`` the joined column
    ``"<f1>_<f2>"`` is added to each sample (in place, source columns kept),
    and the rows matching each ``"<a1>_<a2>"`` combination of ``attributes1``
    and ``attributes2`` (with ``a1 != a2``) are counted and summed across
    samples.

    NOTE(review): flagged "Not used(?)" in the original notebook.

    Args:
        samples: iterable of DataFrames (each one gains the joined columns).
        features1: candidate names for the first column.
        features2: candidate names for the second column.
        attributes1: values of the first column to look for.
        attributes2: values of the second column to look for.

    Returns:
        A dict ``{joined column: {subgroup: total rows}}`` holding only the
        subgroups found at least once.
    """
    feature_pairs = [(f1, f2) for f1 in features1 for f2 in features2 if f1 != f2]
    intersections = [a1 + '_' + a2 for a1 in attributes1 for a2 in attributes2 if a1 != a2]
    results = {}
    # The counting used to sit after the sample loop, so only the last sample
    # was counted (and an empty ``samples`` raised NameError).
    for sample in samples:
        for f1, f2 in feature_pairs:
            g = f1 + '_' + f2
            sample[g] = sample[f1] + "_" + sample[f2]
            counts = sample[g].value_counts()
            for t in intersections:
                n = int(counts.get(t, 0))
                if n > 0:
                    per_group = results.setdefault(g, {})
                    per_group[t] = per_group.get(t, 0) + n
    return results


def get_values_of(samples, feature, value, features):
    """Return the modal value of each feature among rows where ``feature == value``.

    Every sample is processed in order; for each feature, the modal value
    found in a later sample replaces the one found in an earlier sample. A
    feature stays ``None`` when no sample has matching rows with a value for
    it. The usage examples later in the notebook show typical calls.

    Args:
        samples: iterable of DataFrames.
        feature: name of the column to filter on.
        value: value of ``feature`` selecting the rows.
        features: column names whose modal value is wanted.

    Returns:
        A dict ``{feature name: modal value or None}``.
    """
    result = {}
    for sample in samples:
        subset = sample[sample[feature] == value]
        # This loop used to sit after the sample loop, so only the last
        # sample was inspected, unlike get_values_of_outcome, and an empty
        # ``samples`` raised NameError.
        for ft in features:
            result.setdefault(ft, None)
            count = subset.groupby(ft).size()
            if len(count) > 0:
                result[ft] = count.idxmax()
    return result


def get_values_of_outcome(samples, feature, value, features, p_feature, outcome):
    """Return the modal value of each feature among rows matching value and outcome.

    Same as ``get_values_of`` but the rows must also have
    ``p_feature == outcome``. Samples are processed in order and a modal value
    found in a later sample replaces the one from an earlier sample; a feature
    stays ``None`` when no sample has matching rows with a value for it.

    Args:
        samples: iterable of DataFrames.
        feature: name of the column to filter on.
        value: value of ``feature`` selecting the rows.
        features: column names whose modal value is wanted.
        p_feature: name of the outcome column.
        outcome: value of ``p_feature`` selecting the rows.

    Returns:
        A dict ``{feature name: modal value or None}``.
    """
    modes = {}
    for frame in samples:
        selected = (frame[feature] == value) & (frame[p_feature] == outcome)
        matching = frame[selected]
        for name in features:
            modes.setdefault(name, None)
            sizes = matching.groupby(name).size()
            if len(sizes) == 0:
                continue
            modes[name] = sizes.idxmax()
    return modes


def remove_max_corr(data, relevant_features, p_feature):
    """Drop the relevant feature most correlated with ``p_feature``.

    The absolute correlation is computed on the numeric and boolean columns
    only; among the columns listed in ``relevant_features`` the one with the
    highest correlation to ``p_feature`` is removed.

    Args:
        data: DataFrame to analyse (not modified).
        relevant_features: column names that may be removed.
        p_feature: name of the reference column; must be numeric or boolean.

    Returns:
        A pair ``(data without the removed column, removed column name)``.

    Raises:
        KeyError: if ``p_feature`` is not among the numeric/boolean columns.
        ValueError: if no relevant numeric column is available.
    """
    # Restrict to numeric/boolean columns explicitly: DataFrame.corr() raises
    # on string columns with pandas >= 2.0 instead of skipping them.
    corr = data.select_dtypes(include=['number', 'bool']).corr().abs()
    candidates = corr[p_feature].drop(labels=[p_feature])
    candidates = candidates[candidates.index.isin(relevant_features)]
    idx = candidates.idxmax()
    return (data.drop([idx], axis = 1), idx)


def iterative_correlation_removal(n, data, relevant_features, p_feature):
    """Apply ``remove_max_corr`` ``n`` times in a row.

    Args:
        n: number of features to remove.
        data: DataFrame to analyse.
        relevant_features: column names that may be removed.
        p_feature: name of the reference column.

    Returns:
        A pair ``(data after the removals, list of removed column names)``,
        the names being in removal order.
    """
    dropped = []
    current = data
    for _ in range(n):
        current, name = remove_max_corr(current, relevant_features, p_feature)
        dropped += [name]
    return current, dropped


def disparity_change(data, n, reinsert, relevant_features, p_feature, debias_params, cont_vars, cat_vars, metrics):
    """Measure how the disparity changes when correlated features are removed.

    A baseline disparity is computed on ``data``. Then, ``n`` times, the
    relevant feature most correlated with ``p_feature`` (among those not yet
    tried) is dropped, the model is retrained and the difference between the
    new disparity and the baseline is stored for every metric.

    With ``reinsert`` false the removals are cumulative; with ``reinsert`` true
    the dropped column is put back after each step, so each step measures the
    removal of a single feature. In both cases every dropped column is back in
    ``data`` on return (re-added as the last columns, so the column order can
    change).

    Args:
        data: DataFrame to analyse (columns are dropped and re-added in place).
        n: number of features to try.
        reinsert: whether each dropped column is restored before the next step.
        relevant_features: column names that may be removed.
        p_feature: name of the label column; must be numeric or boolean.
        debias_params: bias parameters passed to the dataset loader.
        cont_vars: names of the continuous columns.
        cat_vars: names of the categorical columns.
        metrics: metric callables evaluated by the pipeline.

    Returns:
        A dict ``{removed feature: {metric name: disparity change}}``.
    """
    old = {}
    result = {}
    columns = {}
    metrics_list = [m.__name__ for m in metrics]
    metrics_in = get_disparity_df(etiq_wrapper_run(data, debias_params, cont_vars, cat_vars, p_feature, metrics), debias_params, metrics_list)
    for m in metrics_list:
        old[m] = metrics_in.loc[m]['disparity']
    # Restrict to numeric/boolean columns explicitly: DataFrame.corr() raises
    # on string columns with pandas >= 2.0 instead of skipping them.
    corr = data.select_dtypes(include=['number', 'bool']).corr().abs()
    corr = corr.drop([p_feature], axis = 0)
    corr = corr.loc[corr.index.isin(relevant_features)]
    for i in range(n):
        idx = corr[p_feature].idxmax()
        corr.drop([idx], axis = 0, inplace = True)
        column = data[idx]
        columns[idx] = column
        data.drop([idx], axis = 1, inplace = True)
        cont_vars = list(set(cont_vars) - set([idx]))
        # The pipeline is built inline so that the updated ``cont_vars`` list
        # is the one actually passed to the dataset loader.
        xgb = DefaultXGBoostClassifier()
        dl = DatasetLoader(data=data, label=p_feature, transforms=transforms, bias_params=debias_params, train_valid_test_splits=[0.8, 0.1, 0.1], cat_col=cat_vars, cont_col=cont_vars, names_col = data.columns.values)
        pipeline_initial = DataPipeline(dataset_loader=dl, model=xgb, metrics=metrics)
        pipeline_initial.run()
        metr = pipeline_initial.get_protected_metrics()
        metrics_out = get_disparity_df(metr, debias_params, metrics_list)
        if idx not in result: result[idx] = {}
        for m in metrics_list:
            new = metrics_out.loc[m]['disparity']
            result[idx][m] = new - old[m]
        if reinsert:
            cont_vars.append(idx)
            data[idx] = column
    if not reinsert:
        # Cumulative mode: restore all the dropped columns at the end.
        for c in columns:
            data[c] = columns[c]
    return result


def disparity_change_get_max(result):
    """Find, per metric, the features whose removal changes the disparity the most.

    Args:
        result: dict ``{feature: {metric: change}}`` as returned by
            ``disparity_change``.

    Returns:
        A dict mapping each metric to the tuple
        ``(feature with the lowest change, lowest change,
        feature with the highest change, highest change)``. On ties the first
        feature encountered is kept.
    """
    extremes = {}
    for feature, changes in result.items():
        for metric, change in changes.items():
            if metric not in extremes:
                extremes[metric] = (feature, change, feature, change)
                continue
            low_feature, low, high_feature, high = extremes[metric]
            if change < low:
                low_feature, low = feature, change
            if change > high:
                high_feature, high = feature, change
            extremes[metric] = (low_feature, low, high_feature, high)
    return extremes


def to_num(x):
    """Encode an outcome value as 1 or 0.

    Returns 1 when ``x`` equals the first distinct value found in the
    module-level ``dataset[privileged_cols]`` column, and 0 otherwise.

    NOTE(review): the distinct values are recomputed on every call, and the
    value mapped to 1 depends on row order rather than on ``pos_outcome`` -
    confirm this is intended.
    """
    first_value = dataset[privileged_cols].unique().tolist()[0]
    return 1 if x == first_value else 0


def fix_age(x):
    """Return ``x`` shifted by 99 when it is zero or negative, else ``x`` unchanged.

    Used by ``to_age`` to correct ages computed from two-digit birth years
    that were parsed into the wrong century.
    """
    return x + 99 if x <= 0 else x


def to_age(dataset_to_transform, label_to_transform):
    """Add an integer ``Age`` column computed from a date-of-birth column.

    The birth dates are parsed with the ``%m/%d/%y`` format and the age is the
    whole number of years (365.2425 days each) between the birth date and
    today.

    Args:
        dataset_to_transform: DataFrame to extend (modified in place).
        label_to_transform: name of the column holding the birth dates.

    Returns:
        None. The result is written to ``dataset_to_transform["Age"]``.
    """
    today = pd.to_datetime(datetime.date.today())
    birth_dates = pd.to_datetime(dataset_to_transform[label_to_transform], format = '%m/%d/%y')
    # Divide by an explicit duration: np.timedelta64(1, 'Y') is not accepted
    # as a divisor by current pandas/NumPy. 365.2425 days is the length NumPy
    # assigns to that unit, so the resulting ages are unchanged.
    ages = ((today - birth_dates) / pd.Timedelta(days = 365.2425)).astype(int)
    # With %y, two-digit years 00-68 are read as 2000-2068, so older birth
    # years land in the future and yield a zero or negative age; shifting
    # those by 99 compensates (same rule as fix_age, applied to the whole
    # column at once).
    dataset_to_transform["Age"] = ages.where(ages > 0, ages + 99)

# Analysis configuration.
dataset = pd.read_csv("adult.csv")  # input data, read from the working directory
dropnan = 1  # 1 enables the missing-value clean-up step below
biased_cols = ['gender', 'race']  # the two protected columns that get intersected
privileged_cols = 'income'  # name of the outcome column (used as the model label)
pos_outcome = '>50K'  # value of the outcome column treated as the positive outcome

# Checking input: the configured columns and the positive outcome must exist.
dataset_labels = [str(elem) for elem in dataset.columns]
for col in biased_cols:
    if col not in dataset_labels:
        raise Exception("One or both biased columns are not in the dataset.")
if privileged_cols not in dataset_labels:
    raise Exception("The privileged variable is not in the dataset.")
if pos_outcome not in dataset[privileged_cols].unique():
    raise Exception("The positive outcome is not a possible value of the privileged variable.")
# Detecting dataset NaN values: cells holding only one of the placeholder
# characters ? \ / - are treated as missing, then incomplete rows are dropped.
if dropnan == 1:
    valuesToCheck = "?\\/-"
    # replace() and dropna() return new frames; their results used to be
    # discarded, so nothing was actually cleaned.
    dataset = dataset.replace(list(valuesToCheck), np.nan).dropna()


# Derive an "Age" column from a date-of-birth column: first from a column named
# exactly "DOB", then from the first column whose name contains "birth"
# (case-insensitive), if any.
if "DOB" in dataset_labels:
    to_age(dataset, "DOB")
match_birth = [label for label in dataset_labels if re.search("birth", label, re.IGNORECASE) is not None]
if match_birth:
    to_age(dataset, match_birth[0])


# Strip leading and trailing whitespace from the column names. The previous
# loop only rebound its loop variable, leaving the names untouched.
dataset.columns = [elem.strip() if isinstance(elem, str) else elem for elem in dataset.columns]


# Check that the input biased columns are in dataset: count the matches between
# the dataset columns and the configured biased columns.
error_message = 1
count_bias_cols = 0
for elem in dataset.columns:
    for label in biased_cols:
        count_bias_cols += int(label == elem)
_missing_bias_messages = {
    0: "The specified biased columns are not in the dataset",
    1: "One specified biased column is not in the dataset",
}
if count_bias_cols in _missing_bias_messages:
    raise Exception(_missing_bias_messages[count_bias_cols])


# The actual bias analysis starts here.
result = "Sensitive fields were found"
# Check that the input privileged column is in dataset; error_message stays at
# 1 until a matching column name is found.
error_message = 1
for label in dataset.columns:
    if label != privileged_cols:
        continue
    error_message = 0
if error_message:
    raise Exception("The specified privileged column is not in the dataset")


# Number of bins: the square root of the row count, rounded up.
number_of_rows = dataset.shape[0]
number_of_bins = math.ceil(math.sqrt(number_of_rows))


# Calculate EDF metric for every ordered pair of distinct, string-typed biased
# columns, keeping for each pair the three subgroup pairs with the highest
# disparity.
edf_list = []
df_edf_list = []
for ind, first_col in enumerate(biased_cols):
    attribute1_set = dataset[first_col].unique().tolist()
    for i, second_col in enumerate(biased_cols):
        if ind == i or not is_string_dtype(dataset[first_col].dtype) or not is_string_dtype(dataset[second_col].dtype):
            continue
        attribute2_set = dataset[second_col].unique().tolist()
        new_privileged_cols = privileged_cols + "_01"
        # Vectorised 0/1 encoding of the outcome column. It yields the same
        # values as applying to_num row by row, without recomputing the
        # distinct values for every row.
        # NOTE(review): 1 marks the first distinct value found in the outcome
        # column, which is not necessarily pos_outcome - confirm intended.
        dataset[new_privileged_cols] = (dataset[privileged_cols] == dataset[privileged_cols].unique()[0]).astype(int)
        edf = get_edf_df(dataset, first_col, second_col, attribute1_set, attribute2_set, new_privileged_cols)
        edf_list.append(edf)
        df = pd.DataFrame(read_edf(edf, n = 3))
        # A frame without rows has no top pair to compare, skip it.
        if not df.empty:
            df_edf_list.append(df)
# Setting final message
edf_result = 0
if not df_edf_list:
    result += " but compatible columns cannot be found to calculate EDF metric."
    raise Exception(result)
# Keep the frame whose top pair (row 0) has the highest disparity (column 2).
max_edf = 0
max_df_edf = df_edf_list[0]
for elem in df_edf_list:
    if elem.iloc[0, 2] > max_edf:
        max_edf = elem.iloc[0, 2]
        max_df_edf = elem
edf_result = 1


# Final analysis: model metrics, disparity, modal values and frequent patterns
# for the pair of subgroups with the highest EDF (row 0 of max_df_edf).
# NOTE(review): the EDF section above either raises or sets edf_result to 1,
# so the first branch looks unreachable - confirm before removing it.
if edf_result == 0:
    result += " but compatible columns cannot be found to calculate EDF metric."
    raise Exception(result)
else:


    # Calculate intersection, metrics and disparity
    # get_intersection adds the joined column to `dataset` itself and returns a
    # frame without the two source columns; `data_copy` is not used again in
    # this section.
    data_copy = get_intersection(dataset, biased_cols[0], biased_cols[1])
    # Numeric columns are treated as continuous, every other column as categorical.
    cont_vars = []
    for label in dataset.columns:
        if dataset[label].dtype == np.int64 or dataset[label].dtype == np.float64 or dataset[label].dtype == np.complex128 or dataset[label].dtype == np.int32 or dataset[label].dtype == np.float32:
            cont_vars.append(label)
    cat_vars = list(set(dataset.columns.values) - set(cont_vars))
    intersect_var = biased_cols[0] + "_" + biased_cols[1]
    privilege_values = dataset[privileged_cols].unique().tolist()
    # NOTE(review): the positive/negative labels are the first and second
    # distinct outcome values in row order, not pos_outcome - confirm intended.
    debias_params = get_debias_params(intersect_var, max_df_edf.iloc[0, 0], max_df_edf.iloc[0, 1], str(privilege_values[0]), str(privilege_values[1]))
    metrics = etiq_wrapper_run(dataset, debias_params, cont_vars, cat_vars, privileged_cols, metrics_bonus)
    df_metrics = get_df_from_metrics(metrics)
    df_disparity = get_disparity_df(metrics, debias_params, metrics_list)


    # Calculate modal values, ratio between positive and negative outcome, occurrences of associating values to a datum feature
    features = cat_vars
    # The negative outcome is the outcome value that differs from pos_outcome
    # (the last such value when there are several).
    neg_outcome = ""
    for v in privilege_values:
        if str(v) != pos_outcome:
            neg_outcome = str(v)
    if not neg_outcome:
        raise ValueError("The value of the negative outcome could not be found. Please check that the privilege variable contains exactly two values.")
    df_ratio = get_ratio_df(data = dataset, features = features, p_feature = privileged_cols, positive_outcome = pos_outcome, negative_outcome = neg_outcome)
    df_mode = get_mode_df(data = dataset, features = features, p_feature = privileged_cols, positive_outcome = pos_outcome, negative_outcome = neg_outcome)
    # With drop = False get_intersection returns the frame it was given, so
    # df_intersection is `dataset` itself with the joined column.
    df_intersection = get_intersection(dataset, biased_cols[0], biased_cols[1], drop = False)
    # 50 random samples of 1000 rows each, used to count how often each value is modal.
    # NOTE(review): presumably fails when the dataset has fewer than 1000 rows
    # (sampling without replacement) - confirm.
    samples = []
    for i in range(50):
        sample = dataset.sample(n = 1000, ignore_index = True)
        samples.append(sample)
    results_pos, results_neg = get_maxOccurrences_in_samples(samples = samples, features = features, p_feature = privileged_cols, positive_outcome = pos_outcome, negative_outcome = neg_outcome)
    # Modal feature values for the two subgroups of the top EDF pair, first
    # over all their rows, then restricted to positive and to negative outcomes.
    df_values_of_1st = get_values_of(samples = [dataset], feature = intersect_var, value = max_df_edf.iloc[0, 0], features = features)
    df_values_of_2nd = get_values_of(samples = [dataset], feature = intersect_var, value = max_df_edf.iloc[0, 1], features = features)
    df_values_of_outcome_1st = get_values_of_outcome(samples = [dataset], feature = intersect_var, value = max_df_edf.iloc[0, 0], features = features, p_feature = privileged_cols, outcome = pos_outcome)
    df_values_of_outcome_2nd = get_values_of_outcome(samples = [dataset], feature = intersect_var, value = max_df_edf.iloc[0, 1], features = features, p_feature = privileged_cols, outcome = pos_outcome)
    df_outcome_1st_neg = get_values_of_outcome(samples = [dataset], feature = intersect_var, value = max_df_edf.iloc[0, 0], features = features, p_feature = privileged_cols, outcome = neg_outcome)
    df_outcome_2nd_neg = get_values_of_outcome(samples = [dataset], feature = intersect_var, value = max_df_edf.iloc[0, 1], features = features, p_feature = privileged_cols, outcome = neg_outcome)

## EDF
Empirical Differential Fairness (EDF) is the ratio between the positive-outcome rates of two groups, where each rate is the number of positive cases divided by the total number of cases in the group. It is computed directly on the data, without the contribution of a classifier.
EDF of the {{ biased_cols[0] }} and {{ biased_cols[1] }} intersection on the privileged variable {{ privileged_cols }}:

In [None]:
max_df_edf

## Metrics
Equal Opportunity requires that the probability of an individual who actually has the positive outcome being classified as such is the same for every group. In other words, all groups should have similar, or ideally equal, True Positive Rates.
It is a relaxation of Equalized Odds, which requires that groups have the same False Positive Rate in addition to the same True Positive Rate.
Demographic Parity is obtained when all groups have the same Predicted Positive Rate.
Together, these metrics are referred to here as fairness metrics.
Fairness metrics for {{ df_metrics.iloc[0]["class"] }} and {{ df_metrics.iloc[1]["class"] }}:

In [None]:
df_metrics

## Disparity
Disparity is the ratio of a metric's value for the unprivileged group to its value for the privileged group.
Disparity on fairness metrics for {{ df_metrics.iloc[0]["class"] }} and {{ df_metrics.iloc[1]["class"] }}:

In [None]:
df_disparity

## Ratio
For each feature, the value with the highest ratio of positive-outcome (and, separately, negative-outcome) occurrences to its total occurrences:

In [None]:
df_ratio

## Mode

In [None]:
df_mode

## Intersection

In [None]:
df_intersection

## Positive privileged mode
Positive mode for {{ privileged_cols }}:

In [None]:
results_pos

## Negative privileged mode
Negative mode for {{ privileged_cols }}:

In [None]:
results_neg

## Frequent pattern #1
Below is a list that provides the values most frequently associated with {{ max_df_edf.iloc[0, 0] }}, and it's useful to observe any differences in modal values between the privileged and the unprivileged group:

In [None]:
df_values_of_1st

## Frequent pattern #2
Below is a list that provides the values most frequently associated with {{ max_df_edf.iloc[0, 1] }}, and it's useful to observe any differences in modal values between the privileged and the unprivileged group:

In [None]:
df_values_of_2nd

## Frequent pattern positive privileged #1
The list below is like the previous one, but it only considers the observations with a positive outcome of the privilege feature; it is useful for observing the differences between the modal values of the privileged and non-privileged individuals for {{ max_df_edf.iloc[0, 0] }}:

In [None]:
df_values_of_outcome_1st

## Frequent pattern positive privileged #2
The list below is like the previous one, but it only considers the observations with a positive outcome of the privilege feature; it is useful for observing the differences between the modal values of the privileged and non-privileged individuals for {{ max_df_edf.iloc[0, 1] }}:

In [None]:
df_values_of_outcome_2nd

## Frequent pattern negative privileged #1
The list below is like the previous one, but it only considers the observations with a negative outcome of the privilege feature; it is useful for observing the differences between the modal values of the privileged and non-privileged individuals for {{ max_df_edf.iloc[0, 0] }}:

In [None]:
df_outcome_1st_neg

## Frequent pattern negative privileged #2
The list below is like the previous one, but it only considers the observations with a negative outcome of the privilege feature; it is useful for observing the differences between the modal values of the privileged and non-privileged individuals for {{ max_df_edf.iloc[0, 1] }}:

In [None]:
df_outcome_2nd_neg