In [1]:
# Optional one-time setup: uncomment to install the pinned DuckDB release.
# !pip install duckdb==0.10.1

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension, then select mode 2 so imported modules are
# reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import warnings
warnings.filterwarnings('ignore')
os.environ["PYTHONWARNINGS"] = "ignore"

In [4]:
# Make the project root ("fairness-variance") the working directory so that
# paths built from os.getcwd() resolve against it. The notebook is presumably
# run from a direct subfolder of the project root, hence the single step up.
#
# os.path.basename is used instead of splitting on '/' so the folder check also
# works with Windows path separators.
#
# NOTE: if the checkout folder is not named "fairness-variance", every re-run
# of this cell moves one more level up.
cur_folder_name = os.path.basename(os.getcwd())
if cur_folder_name != "fairness-variance":
    os.chdir(os.pardir)

print('Current location: ', os.getcwd())

Current location:  /Users/denys_herasymuk/UCU/4course_2term/Bachelor_Thesis/Code/fairness-variance


# Aggregations for Different Fairness Interventions

In [5]:
import duckdb
import pandas as pd

## Initialize Configs

In [6]:
# Collection name for the stored experiment results. Presumably the DB
# collection the metrics were exported from; it is not referenced in the
# visible cells of this notebook.
DB_COLLECTION_NAME = 'one_repair_lvl_many_models'
# Maps dataset name -> fairness intervention -> UUID string. The GA 'Baseline'
# value matches the Session_Uuid shown in the subgroup metrics preview, so
# these look like experiment session UUIDs.
DATASETS_DB_CONFIG = {
    'Folktables_GA_2018_Income': {
        'Baseline': 'ac796b56-9c83-474a-8e7a-2d95ee5efa7e',
        'LFR': '3855ad0b-4d2e-4baf-a5a8-6e3e92f624ba',
        'DIR': 'f009c98f-b732-44cf-9084-4bf12c11bdcf',
        'AdversarialDebiasing': 'bbba3cc4-760b-4e93-bb97-3a2077202cce',
        'ExponentiatedGradientReduction': '9c5d2ffc-834e-4a7d-abc0-49dd26950c6d',
        'EqOddsPostprocessing': '3976c1bf-99fa-49f4-97ec-44b13cb64ef4',
        'ROC': '426f4b41-5b22-4118-b5c8-15a0ab4aa18f',
    },
    'Folktables_CA_2018_Public_Coverage': {
        'Baseline': 'c36d3f62-2edd-4df4-97db-a9c3cb5be8ed',
        'LFR': '3021b0c7-483d-4682-9dc6-bb4a2cf90020',
        'DIR': 'ac38ec40-c789-4f7e-91be-9ffb55bc3e6f',
        'AdversarialDebiasing': 'bf843ff8-62e9-4aac-83bc-d805e3299fdc',
        'ExponentiatedGradientReduction': 'dffb2122-b870-40d5-92b2-f0f6bfb730f0',
        'EqOddsPostprocessing': 'c1fbe1ae-f5e9-4ca1-8f38-abe1b118f95b',
        'ROC': 'aa7a7273-bf7c-44f9-becc-3b0384b722c5',
    },
    'Law_School': {
        'Baseline': 'b37c9c58-ed69-425c-b3d5-8f1ffd3adcd6',
        'LFR': 'a1345526-1623-4fc7-b1a4-2a7790c4eef5',
        'DIR': '336e4621-fef9-4722-a3c1-b2ecca7e7f45',
        'AdversarialDebiasing': '594152d2-738f-43f3-87ac-e98b98a17b4a',
        'ExponentiatedGradientReduction': '5a766803-decf-487f-bc23-bade50373a6e',
        'EqOddsPostprocessing': '567e7e80-5ed4-4a45-8927-2f7b61ba5647',
        'ROC': 'b1f109f5-3def-4da0-872b-6f0f1a9be953',
    },
    'Student_Performance_Por': {
        'Baseline': '647daccf-7c14-463c-99ab-ef16972dba80',
        'LFR': '638da338-6a90-44a7-a835-17f062b941fb',
        # NOTE(review): identical to the 'Baseline' UUID of this dataset; this
        # looks like a copy-paste slip. Confirm the correct DIR session UUID.
        'DIR': '647daccf-7c14-463c-99ab-ef16972dba80',
        'AdversarialDebiasing': '7bd775d4-0821-4711-8eed-902c990fbc94',
        'ExponentiatedGradientReduction': '65f39387-1ed2-4e6d-9670-ae8c552a0f65',
        'EqOddsPostprocessing': 'b35e31f2-bb03-494f-a24c-b9264e383667',
        'ROC': '0626d80f-e288-4f16-b8ab-260eb34d62d3',
    },
}
# Dataset names, in the insertion order of DATASETS_DB_CONFIG.
DATASET_NAMES = list(DATASETS_DB_CONFIG.keys())
# Per dataset: the sensitive attribute names. Entries joined with '&' appear to
# denote intersectional groups (TODO confirm against the metrics tables).
DATASETS_SENSITIVE_ATTRS = {
    'Folktables_GA_2018_Income': ['SEX', 'RAC1P', 'SEX&RAC1P'],
    'Law_School': ['male', 'race', 'male&race'],
    'Folktables_CA_2018_Public_Coverage': ['SEX', 'RAC1P', 'SEX&RAC1P'],
    'Student_Performance_Por': ['sex'],
}

In [7]:
# Short display labels for the Model_Name column. AdversarialDebiasing and
# ExponentiatedGradientReduction both collapse into the single
# 'In-processing' label.
model_display_names = {
    'LGBMClassifier': 'LGBM',
    'LogisticRegression': 'LR',
    'RandomForestClassifier': 'RF',
    'MLPClassifier': 'MLP',
    'AdversarialDebiasing': 'In-processing',
    'ExponentiatedGradientReduction': 'In-processing',
}
# Nested {column: {old_value: new_value}} form, as accepted by DataFrame.replace.
vals_to_replace = {'Model_Name': model_display_names}

## Group Metrics Composition

In [8]:
# Load the pre-computed metric tables from the "results" folder under the
# current working directory; the first row of each file is the header.
results_dir = os.path.join(os.getcwd(), 'results')
all_subgroup_metrics_df = pd.read_csv(
    os.path.join(results_dir, 'diff_fairness_interventions_subgroup_metrics.csv'),
    header=0,
)
all_group_metrics_df = pd.read_csv(
    os.path.join(results_dir, 'diff_fairness_interventions_group_metrics.csv'),
    header=0,
)

In [9]:
# Apply the `vals_to_replace` substitutions to both metric tables.
# DataFrame.replace returns new frames, so re-running this cell is harmless.
all_subgroup_metrics_df, all_group_metrics_df = (
    all_subgroup_metrics_df.replace(to_replace=vals_to_replace),
    all_group_metrics_df.replace(to_replace=vals_to_replace),
)

In [10]:
# Preview the first 20 rows of the subgroup-level metrics table.
all_subgroup_metrics_df.head(n=20)

Unnamed: 0,Metric,Model_Name,Dataset_Name,Num_Estimators,Record_Create_Date_Time,Session_Uuid,Experiment_Iteration,Dataset_Split_Seed,Fair_Intervention_Params_Lst,Intervention_Param,Subgroup,Metric_Value,Fairness_Intervention
0,IQR,LGBM,Folktables_GA_2018_Income,200,2024-01-06 18:59:47.583,ac796b56-9c83-474a-8e7a-2d95ee5efa7e,Exp_iter_1,100,[0.0],0.0,SEX_priv,0.065426,Baseline
1,Mean_Prediction,LGBM,Folktables_GA_2018_Income,200,2024-01-06 18:59:47.583,ac796b56-9c83-474a-8e7a-2d95ee5efa7e,Exp_iter_1,100,[0.0],0.0,SEX_priv,0.61653,Baseline
2,Overall_Uncertainty,LGBM,Folktables_GA_2018_Income,200,2024-01-06 18:59:47.583,ac796b56-9c83-474a-8e7a-2d95ee5efa7e,Exp_iter_1,100,[0.0],0.0,SEX_priv,0.647023,Baseline
3,Aleatoric_Uncertainty,LGBM,Folktables_GA_2018_Income,200,2024-01-06 18:59:47.583,ac796b56-9c83-474a-8e7a-2d95ee5efa7e,Exp_iter_1,100,[0.0],0.0,SEX_priv,0.634515,Baseline
4,Epistemic_Uncertainty,LGBM,Folktables_GA_2018_Income,200,2024-01-06 18:59:47.583,ac796b56-9c83-474a-8e7a-2d95ee5efa7e,Exp_iter_1,100,[0.0],0.0,SEX_priv,0.012508,Baseline
5,PPV,LGBM,Folktables_GA_2018_Income,200,2024-01-06 18:59:47.583,ac796b56-9c83-474a-8e7a-2d95ee5efa7e,Exp_iter_1,100,[0.0],0.0,SEX_priv,0.758273,Baseline
6,FPR,LGBM,Folktables_GA_2018_Income,200,2024-01-06 18:59:47.583,ac796b56-9c83-474a-8e7a-2d95ee5efa7e,Exp_iter_1,100,[0.0],0.0,SEX_priv,0.145833,Baseline
7,Overall_Uncertainty,LGBM,Folktables_GA_2018_Income,200,2024-01-06 18:59:47.583,ac796b56-9c83-474a-8e7a-2d95ee5efa7e,Exp_iter_1,100,[0.0],0.0,SEX_priv_correct,0.587053,Baseline
8,Epistemic_Uncertainty,LGBM,Folktables_GA_2018_Income,200,2024-01-06 18:59:47.583,ac796b56-9c83-474a-8e7a-2d95ee5efa7e,Exp_iter_1,100,[0.0],0.0,SEX_priv_correct,0.011259,Baseline
9,Label_Stability,LGBM,Folktables_GA_2018_Income,200,2024-01-06 18:59:47.583,ac796b56-9c83-474a-8e7a-2d95ee5efa7e,Exp_iter_1,100,[0.0],0.0,SEX_priv_correct,0.936248,Baseline


In [11]:
# Preview the first 20 rows of the group-level metrics table.
all_group_metrics_df.head(n=20)

Unnamed: 0,Metric,Model_Name,Dataset_Name,Fairness_Intervention,Experiment_Iteration,Subgroup,Metric_Value
0,Accuracy_Difference,LGBM,Folktables_GA_2018_Income,Baseline,Exp_iter_1,SEX,0.046051
1,Aleatoric_Uncertainty_Difference,LGBM,Folktables_GA_2018_Income,Baseline,Exp_iter_1,SEX,-0.039453
2,Aleatoric_Uncertainty_Ratio,LGBM,Folktables_GA_2018_Income,Baseline,Exp_iter_1,SEX,0.937821
3,Epistemic_Uncertainty_Difference,LGBM,Folktables_GA_2018_Income,Baseline,Exp_iter_1,SEX,0.000196
4,Epistemic_Uncertainty_Ratio,LGBM,Folktables_GA_2018_Income,Baseline,Exp_iter_1,SEX,1.015684
5,Equalized_Odds_FNR,LGBM,Folktables_GA_2018_Income,Baseline,Exp_iter_1,SEX,0.029783
6,Equalized_Odds_FPR,LGBM,Folktables_GA_2018_Income,Baseline,Exp_iter_1,SEX,-0.038663
7,IQR_Difference,LGBM,Folktables_GA_2018_Income,Baseline,Exp_iter_1,SEX,-0.001807
8,Jitter_Difference,LGBM,Folktables_GA_2018_Income,Baseline,Exp_iter_1,SEX,-0.005488
9,Label_Stability_Ratio,LGBM,Folktables_GA_2018_Income,Baseline,Exp_iter_1,SEX,1.006402


## Metric Aggregations

In [12]:
# Min / max / median of Epistemic_Uncertainty on the 'overall' subgroup for the
# LR model with the LFR intervention on Student_Performance_Por.
# DuckDB resolves `all_subgroup_metrics_df` from the Python variable of that
# name and matches column identifiers case-insensitively, so the lowercase
# names below refer to Metric_Value, Dataset_Name, and so on.
query = """
SELECT MIN(metric_value), MAX(metric_value), MEDIAN(metric_value)
FROM all_subgroup_metrics_df
WHERE dataset_name = 'Student_Performance_Por'
    AND fairness_intervention = 'LFR'
    AND model_name = 'LR'
    AND subgroup = 'overall'
    AND metric = 'Epistemic_Uncertainty'
"""
lfr_lr_epistemic_stats = duckdb.query(query).to_df()
lfr_lr_epistemic_stats

Unnamed: 0,min(metric_value),max(metric_value),median(metric_value)
0,0.003632,0.004774,0.003824


In [15]:
# Min / max / median of Epistemic_Uncertainty_Difference for the 'sex' group
# with the ExponentiatedGradientReduction intervention on
# Student_Performance_Por. The model name is 'In-processing' because of the
# Model_Name substitutions applied to the metric tables.
# DuckDB resolves `all_group_metrics_df` from the Python variable of that name
# and matches column identifiers case-insensitively.
query = """
SELECT MIN(metric_value), MAX(metric_value), MEDIAN(metric_value)
FROM all_group_metrics_df
WHERE dataset_name = 'Student_Performance_Por'
    AND fairness_intervention = 'ExponentiatedGradientReduction'
    AND model_name = 'In-processing'
    AND subgroup = 'sex'
    AND metric = 'Epistemic_Uncertainty_Difference'
"""
egr_sex_epistemic_diff_stats = duckdb.query(query).to_df()
egr_sex_epistemic_diff_stats

Unnamed: 0,min(metric_value),max(metric_value),median(metric_value)
0,-0.078203,-0.00749,-0.047835
