In [15]:
# !pip install -r ./requirements.txt

In [3]:
# !pip uninstall virny -y

In [5]:
# Install using an HTTP link
# !pip install git+https://github.com/DataResponsibly/Virny.git@development

# Install using an SSH link
# !pip install git+ssh://git@github.com/DataResponsibly/Virny.git@development

In [6]:
# !pip install aif360

In [7]:
# !pip install BlackBoxAuditing==0.1.54

In [5]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [6]:
import os
import warnings
warnings.filterwarnings('ignore')
os.environ["PYTHONWARNINGS"] = "ignore"

In [7]:
cur_folder_name = os.getcwd().split('/')[-1]
if cur_folder_name != "fairness-variance":
    os.chdir("../..")

print('Current location: ', os.getcwd())

Current location:  /home/denys_herasymuk/UCU/4course_2term/Bachelor_Thesis/Code/fairness-variance


## Import dependencies

In [8]:
import os
import copy

from virny.utils.custom_initializers import create_config_obj
from virny.datasets import ACSIncomeDataset

from configs.constants import TEST_SET_FRACTION, EXPERIMENT_SEEDS
from configs.models_config_for_tuning import get_folktables_employment_models_params_for_tuning

from source.experiment_interface import run_exp_iter_with_disparate_impact_and_mult_sets

pip install 'aif360[LawSchoolGPA]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'


## Define Input Variables

In [9]:
# ROOT_DIR = os.path.join(os.getcwd(), "..", "..")
ROOT_DIR = os.getcwd()
EXPERIMENT_NAME = 'one_repair_lvl_many_models_income_GA'
DB_COLLECTION_NAME = 'one_repair_lvl_many_models'
SAVE_RESULTS_DIR_PATH = os.path.join(ROOT_DIR, 'results', EXPERIMENT_NAME)
FAIR_INTERVENTION_PARAMS_LST = [0.0, 0.7]

config_yaml_path = os.path.join(ROOT_DIR, 'notebooks', EXPERIMENT_NAME, 'folk_GA_2018_config.yaml')
metrics_computation_config = create_config_obj(config_yaml_path=config_yaml_path)

## Define a db writer and custom fields to insert into your database

In [10]:
import os
from dotenv import load_dotenv

load_dotenv('./configs/secrets.env')
os.getenv("DB_NAME")

'fairness_variance'

In [11]:
from source.utils.db_functions import connect_to_mongodb

client, collection_obj, db_writer_func = connect_to_mongodb(DB_COLLECTION_NAME)

In [13]:
import uuid

custom_table_fields_dct = {
    # 'session_uuid': str(uuid.uuid4()),
    'session_uuid': '78049b4f-0ef4-41d7-b3c5-9c3ab8a019f8',
}
print('Current session uuid: ', custom_table_fields_dct['session_uuid'])

Current session uuid:  78049b4f-0ef4-41d7-b3c5-9c3ab8a019f8


## Initialize custom objects

In [48]:
data_loader = ACSIncomeDataset(state=['GA'], year=2018, with_nulls=False,
                               subsample_size=15_000, subsample_seed=42)
data_loader.X_data.head()

Unnamed: 0,SCHL,COW,MAR,OCCP,POBP,RELP,SEX,RAC1P,AGEP,WKHP
0,23,7,3,230,36,0,1,1,55,55.0
1,16,1,5,4110,13,2,2,1,20,35.0
2,16,4,3,4130,51,0,2,1,59,30.0
3,18,4,1,4020,13,0,1,2,43,40.0
4,14,1,1,8300,20,1,2,2,33,20.0


In [49]:
data_loader.X_data.shape

(15000, 10)

In [50]:
data_loader2 = ACSIncomeDataset(state=['GA'], year=2017, with_nulls=False,
                               subsample_size=15_000, subsample_seed=42)
data_loader2.X_data.head()

Unnamed: 0,SCHL,COW,MAR,OCCP,POBP,RELP,SEX,RAC1P,AGEP,WKHP
0,16,1,1,6260,104,1,1,1,31,40.0
1,20,1,1,5160,13,1,2,1,47,40.0
2,16,1,1,530,13,0,1,1,43,40.0
3,22,2,5,800,17,0,2,1,32,43.0
4,22,3,1,3800,51,0,1,1,71,30.0


In [51]:
data_loader2.X_data.shape

(15000, 10)

In [52]:
data_loader3 = ACSIncomeDataset(state=['NY'], year=2018, with_nulls=False,
                                subsample_size=15_000, subsample_seed=42)
data_loader3.X_data.head()

Unnamed: 0,SCHL,COW,MAR,OCCP,POBP,RELP,SEX,RAC1P,AGEP,WKHP
0,15,1,1,4220,207,0,1,6,61,40.0
1,21,3,5,3870,36,2,1,1,30,40.0
2,16,1,5,3930,36,3,1,2,30,40.0
3,17,2,5,4965,329,6,2,8,51,28.0
4,16,1,1,4000,231,0,1,6,54,15.0


In [53]:
data_loader3.X_data.shape

(15000, 10)

In [54]:
data_loader4 = ACSIncomeDataset(state=['NY'], year=2017, with_nulls=False,
                                subsample_size=15_000, subsample_seed=42)
data_loader4.X_data.head()

Unnamed: 0,SCHL,COW,MAR,OCCP,POBP,RELP,SEX,RAC1P,AGEP,WKHP
0,16,2,1,630,36,0,1,1,48,50.0
1,20,7,1,9130,330,0,1,2,51,40.0
2,21,1,1,430,36,1,1,1,44,40.0
3,16,1,1,4760,36,0,2,1,59,28.0
4,19,7,1,5860,36,1,2,1,51,40.0


In [55]:
data_loader4.X_data.shape

(15000, 10)

In [56]:
extra_data_loaders = [data_loader2, data_loader3, data_loader4]

In [57]:
from source.preprocessing import preprocess_mult_data_loaders_for_disp_imp, remove_disparate_impact_with_mult_sets

init_base_flow_dataset, extra_base_flow_datasets = \
    preprocess_mult_data_loaders_for_disp_imp(data_loaders=[data_loader] + extra_data_loaders,
                                              test_set_fraction=TEST_SET_FRACTION,
                                              experiment_seed=100)

In [58]:
# Fair preprocessing
cur_base_flow_dataset, cur_extra_test_sets = \
    remove_disparate_impact_with_mult_sets(init_base_flow_dataset, alpha=0.7,
                                           init_extra_base_flow_datasets=extra_base_flow_datasets)

In [59]:
init_base_flow_dataset.X_test.head(5)

Unnamed: 0,cat__SCHL_1,cat__SCHL_10,cat__SCHL_11,cat__SCHL_12,cat__SCHL_13,cat__SCHL_14,cat__SCHL_15,cat__SCHL_16,cat__SCHL_17,cat__SCHL_18,...,cat__RELP_3,cat__RELP_4,cat__RELP_5,cat__RELP_6,cat__RELP_7,cat__RELP_8,cat__RELP_9,num__AGEP,num__WKHP,RACE
10155,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.160915,0.453605,0
11689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.455527,1.621426,0
12599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.824156,0.064331,1
12193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.756804,0.842879,1
8678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.92699,0.064331,0


In [60]:
cur_base_flow_dataset.X_test.head(5)

Unnamed: 0,cat__SCHL_1,cat__SCHL_10,cat__SCHL_11,cat__SCHL_12,cat__SCHL_13,cat__SCHL_14,cat__SCHL_15,cat__SCHL_16,cat__SCHL_17,cat__SCHL_18,...,cat__RELP_2,cat__RELP_3,cat__RELP_4,cat__RELP_5,cat__RELP_6,cat__RELP_7,cat__RELP_8,cat__RELP_9,num__AGEP,num__WKHP
10155,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.160915,0.453605
11689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.455527,1.543572
12599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.689453,-0.091378
12193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.622101,0.687169
8678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.92699,0.064331


In [61]:
data_loader.X_data.loc[init_base_flow_dataset.X_test.index, :].head(5)

Unnamed: 0,SCHL,COW,MAR,OCCP,POBP,RELP,AGEP,WKHP,RACE
10155,15,2,1,4120,423,0,60,45.0,0
11689,22,3,1,2310,13,0,36,60.0,0
12599,19,1,3,4720,13,0,55,40.0,1
12193,23,1,1,2100,17,1,54,50.0,1
8678,16,1,5,7810,13,13,29,40.0,0


In [62]:
extra_base_flow_datasets[0].X_test.head(5)

Unnamed: 0,cat__SCHL_1,cat__SCHL_10,cat__SCHL_11,cat__SCHL_12,cat__SCHL_13,cat__SCHL_14,cat__SCHL_15,cat__SCHL_16,cat__SCHL_17,cat__SCHL_18,...,cat__RELP_3,cat__RELP_4,cat__RELP_5,cat__RELP_6,cat__RELP_7,cat__RELP_8,cat__RELP_9,num__AGEP,num__WKHP,RACE
10155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.399185,-0.305913,1
11689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.602286,0.085971,1
12599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.684017,0.085971,0
12193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142416,1.653506,1
8678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.602286,0.869738,1


In [63]:
cur_extra_test_sets[0][0].head(5)

Unnamed: 0,cat__SCHL_1,cat__SCHL_10,cat__SCHL_11,cat__SCHL_12,cat__SCHL_13,cat__SCHL_14,cat__SCHL_15,cat__SCHL_16,cat__SCHL_17,cat__SCHL_18,...,cat__RELP_2,cat__RELP_3,cat__RELP_4,cat__RELP_5,cat__RELP_6,cat__RELP_7,cat__RELP_8,cat__RELP_9,num__AGEP,num__WKHP
10155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.534585,-0.305913
11689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.669986,0.085971
12599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.684017,0.085971
12193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.060684,1.653506
8678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.669986,0.712985


In [64]:
extra_base_flow_datasets[2].X_test.head(5)

Unnamed: 0,cat__SCHL_1,cat__SCHL_10,cat__SCHL_11,cat__SCHL_12,cat__SCHL_13,cat__SCHL_14,cat__SCHL_15,cat__SCHL_16,cat__SCHL_17,cat__SCHL_18,...,cat__RELP_3,cat__RELP_4,cat__RELP_5,cat__RELP_6,cat__RELP_7,cat__RELP_8,cat__RELP_9,num__AGEP,num__WKHP,RACE
10155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.095199,-1.523256,1
11689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75866,1.723586,1
12599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.095199,1.723586,1
12193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.035851,1.723586,1
8678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.095199,-0.209058,1


In [65]:
cur_extra_test_sets[2][0].head(5)

Unnamed: 0,cat__SCHL_1,cat__SCHL_10,cat__SCHL_11,cat__SCHL_12,cat__SCHL_13,cat__SCHL_14,cat__SCHL_15,cat__SCHL_16,cat__SCHL_17,cat__SCHL_18,...,cat__RELP_2,cat__RELP_3,cat__RELP_4,cat__RELP_5,cat__RELP_6,cat__RELP_7,cat__RELP_8,cat__RELP_9,num__AGEP,num__WKHP
10155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.161409,-1.523256
11689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.427614,1.723586
12599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.161409,1.723586
12193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.300688,1.723586
8678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.161409,-0.209058


In [66]:
extra_data_loaders[2].X_data.loc[extra_base_flow_datasets[2].X_test.index, :].head(5)

Unnamed: 0,SCHL,COW,MAR,OCCP,POBP,RELP,AGEP,WKHP,RACE
10155,18,1,5,4510,36,13,27,18.0,1
11689,22,1,1,710,36,1,55,60.0,1
12599,19,1,5,310,26,11,27,60.0,1
12193,22,1,1,136,119,0,43,60.0,1
8678,20,1,5,4110,36,0,27,35.0,1


## Run experiment iterations

### Experiment iteration 1

In [23]:
# tuned_params_filenames = ['tuning_results_Folktables_NY_2018_Employment_alpha_0.8_20230706__115508.csv']
# tuned_params_df_paths = [os.path.join(ROOT_DIR, 'results', EXPERIMENT_NAME, tuned_params_filename)
#                          for tuned_params_filename in tuned_params_filenames]

In [24]:
# Configs for an experiment iteration
exp_iter_num = 1
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'

exp_iter_data_loader = copy.deepcopy(data_loader)  # Add deepcopy to avoid data leakage
exp_extra_data_loaders = copy.deepcopy(extra_data_loaders)  # Add deepcopy to avoid data leakage
models_params_for_tuning = get_folktables_employment_models_params_for_tuning(experiment_seed)

In [25]:
run_exp_iter_with_disparate_impact_and_mult_sets(data_loader=exp_iter_data_loader,
                                                 extra_data_loaders=exp_extra_data_loaders,
                                                 experiment_seed=experiment_seed,
                                                 test_set_fraction=TEST_SET_FRACTION,
                                                 db_writer_func=db_writer_func,
                                                 fair_intervention_params_lst=FAIR_INTERVENTION_PARAMS_LST,
                                                 models_params_for_tuning=models_params_for_tuning,
                                                 metrics_computation_config=metrics_computation_config,
                                                 custom_table_fields_dct=custom_table_fields_dct,
                                                 with_tuning=True,
                                                 # with_tuning=False,
                                                 # tuned_params_df_paths=tuned_params_df_paths,
                                                 save_results_dir_path=SAVE_RESULTS_DIR_PATH,
                                                 verbose=True)

2023-07-27 00:16:14 experiment_interface.py INFO    : Start an experiment iteration for the following custom params:
INFO:root:Start an experiment iteration for the following custom params:


{'dataset_split_seed': 100,
 'experiment_iteration': 'Exp_iter_1',
 'fair_intervention_params_lst': '[0.0, 0.7]',
 'model_init_seed': 100,
 'session_uuid': '78049b4f-0ef4-41d7-b3c5-9c3ab8a019f8'}




2023-07-27 00:16:15 experiment_interface.py INFO    : The dataset is preprocessed
INFO:root:The dataset is preprocessed


Top indexes of an X_test in a base flow dataset:  Int64Index([10155, 11689, 12599, 12193,  8678,  8217,  4670, 12087,  5235,
             4189,  7278, 10642,  5284,  7002, 14642, 10594,  7701,  8686,
             8665,  6253],
           dtype='int64')
Top indexes of an y_test in a base flow dataset:  Int64Index([10155, 11689, 12599, 12193,  8678,  8217,  4670, 12087,  5235,
             4189,  7278, 10642,  5284,  7002, 14642, 10594,  7701,  8686,
             8665,  6253],
           dtype='int64')


Multiple alphas:   0%|          | 0/2 [00:00<?, ?it/s]

intervention_param:  0.0


KeyboardInterrupt: 

### Experiment iteration 2

In [18]:
# Configs for an experiment iteration
exp_iter_num = 2
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
tuned_params_filenames = [
    'tuning_results_Folktables_GA_2018_Income_alpha_0.0_20230723__005125.csv',
    'tuning_results_Folktables_GA_2018_Income_alpha_0.1_20230723__013608.csv',
    'tuning_results_Folktables_GA_2018_Income_alpha_0.2_20230723__021138.csv',
    'tuning_results_Folktables_GA_2018_Income_alpha_0.3_20230723__040607.csv',
    'tuning_results_Folktables_GA_2018_Income_alpha_0.4_20230723__005850.csv',
    'tuning_results_Folktables_GA_2018_Income_alpha_0.5_20230723__014556.csv',
]
tuned_params_df_paths = [os.path.join(ROOT_DIR, 'results', EXPERIMENT_NAME, tuned_params_filename)
                         for tuned_params_filename in tuned_params_filenames]
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'

exp_iter_data_loader = copy.deepcopy(data_loader)  # Add deepcopy to avoid data leakage
models_params_for_tuning = get_folktables_employment_models_params_for_tuning(experiment_seed)

In [4]:
# run_exp_iter_with_disparate_impact(data_loader=exp_iter_data_loader,
#                                    experiment_seed=experiment_seed,
#                                    test_set_fraction=TEST_SET_FRACTION,
#                                    db_writer_func=db_writer_func,
#                                    fair_intervention_params_lst=FAIR_INTERVENTION_PARAMS_LST,
#                                    models_params_for_tuning=models_params_for_tuning,
#                                    metrics_computation_config=metrics_computation_config,
#                                    custom_table_fields_dct=custom_table_fields_dct,
# #                                    with_tuning=True,
#                                    with_tuning=False,
#                                    tuned_params_df_paths=tuned_params_df_paths,
#                                    save_results_dir_path=SAVE_RESULTS_DIR_PATH,
#                                    verbose=True)

### Experiment iteration 3

### Experiment iteration 4

### Experiment iteration 5

### Experiment iteration 6