In [1]:
# !pip install -r ./requirements.txt

In [2]:
# !pip uninstall virny -y

In [3]:
# Install using an HTTP link
# !pip install git+https://github.com/DataResponsibly/Virny.git@feature/add_meta_learner_computation_mode

# Install using an SSH link
# !pip install git+ssh://git@github.com/DataResponsibly/Virny.git@feature/add_meta_learner_computation_mode

In [4]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [5]:
import os
import warnings
warnings.filterwarnings('ignore')
os.environ["PYTHONWARNINGS"] = "ignore"

In [6]:
cur_folder_name = os.getcwd().split('/')[-1]
if cur_folder_name != "fairness-variance":
    os.chdir("../../..")

print('Current location: ', os.getcwd())

Current location:  /Users/denys_herasymuk/UCU/4course_2term/Bachelor_Thesis/Code/fairness-variance


## Import dependencies

In [7]:
import copy

from virny.utils.custom_initializers import create_config_obj
from virny.datasets import ACSIncomeDataset

from configs.constants import TEST_SET_FRACTION, EXPERIMENT_SEEDS
from configs.models_config_for_tuning import get_folktables_employment_models_params_for_tuning

from source.experiment_interface import run_exp_iter_with_disparate_impact

pip install 'aif360[LawSchoolGPA]'


## Define Input Variables

In [8]:
ROOT_DIR = os.getcwd()
EXPERIMENT_NAME = 'acs_income'
DB_COLLECTION_NAME = 'one_repair_lvl_many_models'
FAIR_INTERVENTION_PARAMS_LST = [0.0]
SAVE_RESULTS_DIR_PATH = os.path.join(ROOT_DIR, 'results', 'test_meta_learner', EXPERIMENT_NAME)

config_yaml_path = os.path.join(ROOT_DIR, 'notebooks', 'test_meta_learner', EXPERIMENT_NAME, 'folk_GA_2018_config.yaml')
metrics_computation_config = create_config_obj(config_yaml_path=config_yaml_path)

## Define a db writer and custom fields to insert into your database

In [9]:
import os
from dotenv import load_dotenv

load_dotenv('./configs/secrets.env')
os.getenv("DB_NAME")

'fairness_variance'

In [10]:
from source.utils.db_functions import connect_to_mongodb

client, collection_obj, db_writer_func = connect_to_mongodb(DB_COLLECTION_NAME)

In [11]:
import uuid

custom_table_fields_dct = {
    # 'session_uuid': str(uuid.uuid4()),
    'session_uuid': '8c9cf23a-9b8f-49fd-a7e2-270c2c3db69c',
}
print('Current session uuid: ', custom_table_fields_dct['session_uuid'])

Current session uuid:  8c9cf23a-9b8f-49fd-a7e2-270c2c3db69c


## Initialize custom objects

In [12]:
data_loader = ACSIncomeDataset(state=['GA'], year=2018, with_nulls=False,
                               subsample_size=15_000, subsample_seed=42)
data_loader.X_data.head()

Unnamed: 0,SCHL,COW,MAR,OCCP,POBP,RELP,SEX,RAC1P,AGEP,WKHP
0,23,7,3,230,36,0,1,1,55,55.0
1,16,1,5,4110,13,2,2,1,20,35.0
2,16,4,3,4130,51,0,2,1,59,30.0
3,18,4,1,4020,13,0,1,2,43,40.0
4,14,1,1,8300,20,1,2,2,33,20.0


In [13]:
data_loader.X_data.shape

(15000, 10)

## Run experiment iterations

### Experiment iteration 1

In [14]:
tuned_params_filenames = [
    'tuning_results_Folktables_GA_2018_Income_alpha_0.0.csv'
]
tuned_params_df_paths = [os.path.join(ROOT_DIR, 'results', 'test_meta_learner', EXPERIMENT_NAME, tuned_params_filename)
                         for tuned_params_filename in tuned_params_filenames]

In [15]:
# Configs for an experiment iteration
exp_iter_num = 1
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'

exp_iter_data_loader = copy.deepcopy(data_loader)  # Add deepcopy to avoid data leakage
models_params_for_tuning = get_folktables_employment_models_params_for_tuning(experiment_seed)
models_params_for_tuning = {
    'RandomForestClassifier': models_params_for_tuning['RandomForestClassifier']
}

In [16]:
run_exp_iter_with_disparate_impact(data_loader=exp_iter_data_loader,
                                   experiment_seed=experiment_seed,
                                   test_set_fraction=TEST_SET_FRACTION,
                                   db_writer_func=db_writer_func,
                                   fair_intervention_params_lst=FAIR_INTERVENTION_PARAMS_LST,
                                   models_params_for_tuning=models_params_for_tuning,
                                   metrics_computation_config=metrics_computation_config,
                                   custom_table_fields_dct=custom_table_fields_dct,
                                   # with_tuning=True,
                                   with_tuning=False,
                                   tuned_params_df_paths=tuned_params_df_paths,
                                   save_results_dir_path=SAVE_RESULTS_DIR_PATH,
                                   dataset_name='ACSIncomeDataset',
                                   verbose=True)

2024-03-05 02:03:36 experiment_interface.py INFO    : Start an experiment iteration for the following custom params:


{'dataset_split_seed': 100,
 'experiment_iteration': 'Exp_iter_1',
 'fair_intervention_params_lst': '[0.0]',
 'model_init_seed': 100,
 'session_uuid': '8c9cf23a-9b8f-49fd-a7e2-270c2c3db69c'}


Multiple alphas:   0%|          | 0/1 [00:00<?, ?it/s]

intervention_param:  0.0


2024-03-05 02:03:36 experiment_interface.py INFO    : The dataset is preprocessed
2024-03-05 02:03:36 experiment_interface.py INFO    : Models config is loaded from the input file


Skip preprocessing
cur_base_flow_dataset.X_train_val.columns:  Index(['cat__SCHL_1', 'cat__SCHL_10', 'cat__SCHL_11', 'cat__SCHL_12',
       'cat__SCHL_13', 'cat__SCHL_14', 'cat__SCHL_15', 'cat__SCHL_16',
       'cat__SCHL_17', 'cat__SCHL_18',
       ...
       'cat__RELP_2', 'cat__RELP_3', 'cat__RELP_4', 'cat__RELP_5',
       'cat__RELP_6', 'cat__RELP_7', 'cat__RELP_8', 'cat__RELP_9', 'num__AGEP',
       'num__WKHP'],
      dtype='object', length=723)
Top indexes of an X_test in the current base flow dataset:  Int64Index([10155, 11689, 12599, 12193,  8678,  8217,  4670, 12087,  5235,
             4189,  7278, 10642,  5284,  7002, 14642, 10594,  7701,  8686,
             8665,  6253],
           dtype='int64')
Top indexes of an y_test in the current base flow dataset:  Int64Index([10155, 11689, 12599, 12193,  8678,  8217,  4670, 12087,  5235,
             4189,  7278, 10642,  5284,  7002, 14642, 10594,  7701,  8686,
             8665,  6253],
           dtype='int64')
Path for tuned par

Analyze multiple models:   0%|          | 0/1 [00:00<?, ?it/s]

Enabled a 'meta_learner' mode
self.y_pred_test.index[:10] --  Int64Index([10155, 11689, 12599, 12193, 8678, 8217, 4670, 12087, 5235, 4189], dtype='int64')
self.error_test.index[:10] --  Int64Index([10155, 11689, 12599, 12193, 8678, 8217, 4670, 12087, 5235, 4189], dtype='int64')
[Voting] ................... (1 of 10) Processing gbt-1, total= 7.4min
[Voting] ................... (2 of 10) Processing gbt-2, total= 7.0min
[Voting] ................... (3 of 10) Processing gbt-3, total= 6.0min
[Voting] ................... (4 of 10) Processing gbt-4, total= 7.0min
[Voting] ................... (5 of 10) Processing gbt-5, total= 6.7min
[Voting] ................... (6 of 10) Processing gbt-6, total= 5.0min
[Voting] ................... (7 of 10) Processing gbt-7, total= 6.6min
[Voting] ................... (8 of 10) Processing gbt-8, total= 6.6min
[Voting] ................... (9 of 10) Processing gbt-9, total= 7.0min
[Voting] ................. (10 of 10) Processing gbt-10, total= 6.6min


Best para

### Experiment iteration 2

In [17]:
# Configs for an experiment iteration
exp_iter_num = 2
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
tuned_params_filenames = [
    'tuning_results_Folktables_GA_2018_Income_alpha_0.0.csv'
]
tuned_params_df_paths = [os.path.join(ROOT_DIR, 'results', 'test_meta_learner', EXPERIMENT_NAME, tuned_params_filename)
                         for tuned_params_filename in tuned_params_filenames]
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'

exp_iter_data_loader = copy.deepcopy(data_loader)  # Add deepcopy to avoid data leakage
models_params_for_tuning = get_folktables_employment_models_params_for_tuning(experiment_seed)
models_params_for_tuning = {
    'RandomForestClassifier': models_params_for_tuning['RandomForestClassifier']
}

In [18]:
run_exp_iter_with_disparate_impact(data_loader=exp_iter_data_loader,
                                   experiment_seed=experiment_seed,
                                   test_set_fraction=TEST_SET_FRACTION,
                                   db_writer_func=db_writer_func,
                                   fair_intervention_params_lst=FAIR_INTERVENTION_PARAMS_LST,
                                   models_params_for_tuning=models_params_for_tuning,
                                   metrics_computation_config=metrics_computation_config,
                                   custom_table_fields_dct=custom_table_fields_dct,
                                   # with_tuning=True,
                                   with_tuning=False,
                                   tuned_params_df_paths=tuned_params_df_paths,
                                   save_results_dir_path=SAVE_RESULTS_DIR_PATH,
                                   dataset_name='ACSIncomeDataset',
                                   verbose=True)

2024-03-05 03:09:40 experiment_interface.py INFO    : Start an experiment iteration for the following custom params:


{'dataset_split_seed': 200,
 'experiment_iteration': 'Exp_iter_2',
 'fair_intervention_params_lst': '[0.0]',
 'intervention_param': 0.0,
 'model_init_seed': 200,
 'run_start_date_time': datetime.datetime(2024, 3, 5, 0, 3, 36, 875294, tzinfo=datetime.timezone.utc),
 'session_uuid': '8c9cf23a-9b8f-49fd-a7e2-270c2c3db69c'}


Multiple alphas:   0%|          | 0/1 [00:00<?, ?it/s]

2024-03-05 03:09:40 experiment_interface.py INFO    : The dataset is preprocessed
2024-03-05 03:09:40 experiment_interface.py INFO    : Models config is loaded from the input file


intervention_param:  0.0
Skip preprocessing
cur_base_flow_dataset.X_train_val.columns:  Index(['cat__SCHL_1', 'cat__SCHL_10', 'cat__SCHL_11', 'cat__SCHL_12',
       'cat__SCHL_13', 'cat__SCHL_14', 'cat__SCHL_15', 'cat__SCHL_16',
       'cat__SCHL_17', 'cat__SCHL_18',
       ...
       'cat__RELP_2', 'cat__RELP_3', 'cat__RELP_4', 'cat__RELP_5',
       'cat__RELP_6', 'cat__RELP_7', 'cat__RELP_8', 'cat__RELP_9', 'num__AGEP',
       'num__WKHP'],
      dtype='object', length=729)
Top indexes of an X_test in the current base flow dataset:  Int64Index([ 6043,  3745,  5159,  7241,  7820,  3695, 11501, 11432,  1163,
             8994,  7972,  2554,  9884,  2008,  6884, 11995,  5200,  4649,
            10244, 13775],
           dtype='int64')
Top indexes of an y_test in the current base flow dataset:  Int64Index([ 6043,  3745,  5159,  7241,  7820,  3695, 11501, 11432,  1163,
             8994,  7972,  2554,  9884,  2008,  6884, 11995,  5200,  4649,
            10244, 13775],
           dtype='i

Analyze multiple models:   0%|          | 0/1 [00:00<?, ?it/s]

Enabled a 'meta_learner' mode
self.y_pred_test.index[:10] --  Int64Index([6043, 3745, 5159, 7241, 7820, 3695, 11501, 11432, 1163, 8994], dtype='int64')
self.error_test.index[:10] --  Int64Index([6043, 3745, 5159, 7241, 7820, 3695, 11501, 11432, 1163, 8994], dtype='int64')
[Voting] ................... (1 of 10) Processing gbt-1, total= 6.0min
[Voting] ................... (2 of 10) Processing gbt-2, total= 7.2min
[Voting] ................... (3 of 10) Processing gbt-3, total= 4.9min
[Voting] ................... (4 of 10) Processing gbt-4, total= 4.9min
[Voting] ................... (5 of 10) Processing gbt-5, total= 4.9min
[Voting] ................... (6 of 10) Processing gbt-6, total= 4.9min
[Voting] ................... (7 of 10) Processing gbt-7, total= 7.2min
[Voting] ................... (8 of 10) Processing gbt-8, total= 6.5min
[Voting] ................... (9 of 10) Processing gbt-9, total= 6.2min
[Voting] ................. (10 of 10) Processing gbt-10, total= 7.0min


Best params for

### Experiment iteration 3

In [19]:
# Configs for an experiment iteration
exp_iter_num = 3
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
tuned_params_filenames = [
    'tuning_results_Folktables_GA_2018_Income_alpha_0.0.csv'
]
tuned_params_df_paths = [os.path.join(ROOT_DIR, 'results', 'test_meta_learner', EXPERIMENT_NAME, tuned_params_filename)
                         for tuned_params_filename in tuned_params_filenames]
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'

exp_iter_data_loader = copy.deepcopy(data_loader)  # Add deepcopy to avoid data leakage
models_params_for_tuning = get_folktables_employment_models_params_for_tuning(experiment_seed)
models_params_for_tuning = {
    'RandomForestClassifier': models_params_for_tuning['RandomForestClassifier']
}

In [20]:
run_exp_iter_with_disparate_impact(data_loader=exp_iter_data_loader,
                                   experiment_seed=experiment_seed,
                                   test_set_fraction=TEST_SET_FRACTION,
                                   db_writer_func=db_writer_func,
                                   fair_intervention_params_lst=FAIR_INTERVENTION_PARAMS_LST,
                                   models_params_for_tuning=models_params_for_tuning,
                                   metrics_computation_config=metrics_computation_config,
                                   custom_table_fields_dct=custom_table_fields_dct,
                                   # with_tuning=True,
                                   with_tuning=False,
                                   tuned_params_df_paths=tuned_params_df_paths,
                                   save_results_dir_path=SAVE_RESULTS_DIR_PATH,
                                   dataset_name='ACSIncomeDataset',
                                   verbose=True)

2024-03-05 04:09:24 experiment_interface.py INFO    : Start an experiment iteration for the following custom params:


{'dataset_split_seed': 300,
 'experiment_iteration': 'Exp_iter_3',
 'fair_intervention_params_lst': '[0.0]',
 'intervention_param': 0.0,
 'model_init_seed': 300,
 'run_start_date_time': datetime.datetime(2024, 3, 5, 1, 9, 40, 684959, tzinfo=datetime.timezone.utc),
 'session_uuid': '8c9cf23a-9b8f-49fd-a7e2-270c2c3db69c'}


Multiple alphas:   0%|          | 0/1 [00:00<?, ?it/s]

2024-03-05 04:09:25 experiment_interface.py INFO    : The dataset is preprocessed
2024-03-05 04:09:25 experiment_interface.py INFO    : Models config is loaded from the input file


intervention_param:  0.0
Skip preprocessing
cur_base_flow_dataset.X_train_val.columns:  Index(['cat__SCHL_1', 'cat__SCHL_10', 'cat__SCHL_11', 'cat__SCHL_12',
       'cat__SCHL_13', 'cat__SCHL_14', 'cat__SCHL_15', 'cat__SCHL_16',
       'cat__SCHL_17', 'cat__SCHL_18',
       ...
       'cat__RELP_2', 'cat__RELP_3', 'cat__RELP_4', 'cat__RELP_5',
       'cat__RELP_6', 'cat__RELP_7', 'cat__RELP_8', 'cat__RELP_9', 'num__AGEP',
       'num__WKHP'],
      dtype='object', length=727)
Top indexes of an X_test in the current base flow dataset:  Int64Index([ 4329,   723,  8157,  8934,  5880,  1877,  9980,  9115, 13634,
            12168,  6957,  9083, 14260,  5349,  7879,  3078, 10032, 13643,
            12025,  7489],
           dtype='int64')
Top indexes of an y_test in the current base flow dataset:  Int64Index([ 4329,   723,  8157,  8934,  5880,  1877,  9980,  9115, 13634,
            12168,  6957,  9083, 14260,  5349,  7879,  3078, 10032, 13643,
            12025,  7489],
           dtype='i

Analyze multiple models:   0%|          | 0/1 [00:00<?, ?it/s]

Enabled a 'meta_learner' mode
self.y_pred_test.index[:10] --  Int64Index([4329, 723, 8157, 8934, 5880, 1877, 9980, 9115, 13634, 12168], dtype='int64')
self.error_test.index[:10] --  Int64Index([4329, 723, 8157, 8934, 5880, 1877, 9980, 9115, 13634, 12168], dtype='int64')
[Voting] ................... (1 of 10) Processing gbt-1, total= 4.9min
[Voting] ................... (2 of 10) Processing gbt-2, total= 7.1min
[Voting] ................... (3 of 10) Processing gbt-3, total= 6.7min
[Voting] ................... (4 of 10) Processing gbt-4, total= 6.1min
[Voting] ................... (5 of 10) Processing gbt-5, total= 7.1min
[Voting] ................... (6 of 10) Processing gbt-6, total= 4.9min
[Voting] ................... (7 of 10) Processing gbt-7, total= 6.9min
[Voting] ................... (8 of 10) Processing gbt-8, total= 6.6min
[Voting] ................... (9 of 10) Processing gbt-9, total= 4.9min
[Voting] ................. (10 of 10) Processing gbt-10, total= 5.0min


Best params for e

### Experiment iteration 4

In [21]:
# Configs for an experiment iteration
exp_iter_num = 4
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
tuned_params_filenames = [
    'tuning_results_Folktables_GA_2018_Income_alpha_0.0.csv'
]
tuned_params_df_paths = [os.path.join(ROOT_DIR, 'results', 'test_meta_learner', EXPERIMENT_NAME, tuned_params_filename)
                         for tuned_params_filename in tuned_params_filenames]
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'

exp_iter_data_loader = copy.deepcopy(data_loader)  # Add deepcopy to avoid data leakage
models_params_for_tuning = get_folktables_employment_models_params_for_tuning(experiment_seed)
models_params_for_tuning = {
    'RandomForestClassifier': models_params_for_tuning['RandomForestClassifier']
}

In [22]:
run_exp_iter_with_disparate_impact(data_loader=exp_iter_data_loader,
                                   experiment_seed=experiment_seed,
                                   test_set_fraction=TEST_SET_FRACTION,
                                   db_writer_func=db_writer_func,
                                   fair_intervention_params_lst=FAIR_INTERVENTION_PARAMS_LST,
                                   models_params_for_tuning=models_params_for_tuning,
                                   metrics_computation_config=metrics_computation_config,
                                   custom_table_fields_dct=custom_table_fields_dct,
                                   # with_tuning=True,
                                   with_tuning=False,
                                   tuned_params_df_paths=tuned_params_df_paths,
                                   save_results_dir_path=SAVE_RESULTS_DIR_PATH,
                                   dataset_name='ACSIncomeDataset',
                                   verbose=True)

2024-03-05 05:09:49 experiment_interface.py INFO    : Start an experiment iteration for the following custom params:


{'dataset_split_seed': 400,
 'experiment_iteration': 'Exp_iter_4',
 'fair_intervention_params_lst': '[0.0]',
 'intervention_param': 0.0,
 'model_init_seed': 400,
 'run_start_date_time': datetime.datetime(2024, 3, 5, 2, 9, 25, 412182, tzinfo=datetime.timezone.utc),
 'session_uuid': '8c9cf23a-9b8f-49fd-a7e2-270c2c3db69c'}


Multiple alphas:   0%|          | 0/1 [00:00<?, ?it/s]

2024-03-05 05:09:50 experiment_interface.py INFO    : The dataset is preprocessed
2024-03-05 05:09:50 experiment_interface.py INFO    : Models config is loaded from the input file


intervention_param:  0.0
Skip preprocessing
cur_base_flow_dataset.X_train_val.columns:  Index(['cat__SCHL_1', 'cat__SCHL_10', 'cat__SCHL_11', 'cat__SCHL_12',
       'cat__SCHL_13', 'cat__SCHL_14', 'cat__SCHL_15', 'cat__SCHL_16',
       'cat__SCHL_17', 'cat__SCHL_18',
       ...
       'cat__RELP_2', 'cat__RELP_3', 'cat__RELP_4', 'cat__RELP_5',
       'cat__RELP_6', 'cat__RELP_7', 'cat__RELP_8', 'cat__RELP_9', 'num__AGEP',
       'num__WKHP'],
      dtype='object', length=718)
Top indexes of an X_test in the current base flow dataset:  Int64Index([ 6425, 11811, 14319,  3284,  9129,  5763,  7549,  1393, 13879,
            14802,  8634, 10336,  1486, 14287,  8890,  5961, 10137, 14550,
            14981, 11017],
           dtype='int64')
Top indexes of an y_test in the current base flow dataset:  Int64Index([ 6425, 11811, 14319,  3284,  9129,  5763,  7549,  1393, 13879,
            14802,  8634, 10336,  1486, 14287,  8890,  5961, 10137, 14550,
            14981, 11017],
           dtype='i

Analyze multiple models:   0%|          | 0/1 [00:00<?, ?it/s]

Enabled a 'meta_learner' mode
self.y_pred_test.index[:10] --  Int64Index([6425, 11811, 14319, 3284, 9129, 5763, 7549, 1393, 13879, 14802], dtype='int64')
self.error_test.index[:10] --  Int64Index([6425, 11811, 14319, 3284, 9129, 5763, 7549, 1393, 13879, 14802], dtype='int64')
[Voting] ................... (1 of 10) Processing gbt-1, total= 5.8min
[Voting] ................... (2 of 10) Processing gbt-2, total= 4.7min
[Voting] ................... (3 of 10) Processing gbt-3, total= 6.6min
[Voting] ................... (4 of 10) Processing gbt-4, total= 6.5min
[Voting] ................... (5 of 10) Processing gbt-5, total= 7.0min
[Voting] ................... (6 of 10) Processing gbt-6, total= 5.4min
[Voting] ................... (7 of 10) Processing gbt-7, total= 6.8min
[Voting] ................... (8 of 10) Processing gbt-8, total= 4.9min
[Voting] ................... (9 of 10) Processing gbt-9, total= 3.3min
[Voting] ................. (10 of 10) Processing gbt-10, total= 3.0min


Best params

### Experiment iteration 5

In [23]:
# Configs for an experiment iteration
exp_iter_num = 5
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
tuned_params_filenames = [
    'tuning_results_Folktables_GA_2018_Income_alpha_0.0.csv'
]
tuned_params_df_paths = [os.path.join(ROOT_DIR, 'results', 'test_meta_learner', EXPERIMENT_NAME, tuned_params_filename)
                         for tuned_params_filename in tuned_params_filenames]
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'

exp_iter_data_loader = copy.deepcopy(data_loader)  # Add deepcopy to avoid data leakage
models_params_for_tuning = get_folktables_employment_models_params_for_tuning(experiment_seed)
models_params_for_tuning = {
    'RandomForestClassifier': models_params_for_tuning['RandomForestClassifier']
}

In [24]:
run_exp_iter_with_disparate_impact(data_loader=exp_iter_data_loader,
                                   experiment_seed=experiment_seed,
                                   test_set_fraction=TEST_SET_FRACTION,
                                   db_writer_func=db_writer_func,
                                   fair_intervention_params_lst=FAIR_INTERVENTION_PARAMS_LST,
                                   models_params_for_tuning=models_params_for_tuning,
                                   metrics_computation_config=metrics_computation_config,
                                   custom_table_fields_dct=custom_table_fields_dct,
                                   # with_tuning=True,
                                   with_tuning=False,
                                   tuned_params_df_paths=tuned_params_df_paths,
                                   save_results_dir_path=SAVE_RESULTS_DIR_PATH,
                                   dataset_name='ACSIncomeDataset',
                                   verbose=True)

2024-03-05 06:04:05 experiment_interface.py INFO    : Start an experiment iteration for the following custom params:


{'dataset_split_seed': 500,
 'experiment_iteration': 'Exp_iter_5',
 'fair_intervention_params_lst': '[0.0]',
 'intervention_param': 0.0,
 'model_init_seed': 500,
 'run_start_date_time': datetime.datetime(2024, 3, 5, 3, 9, 50, 379641, tzinfo=datetime.timezone.utc),
 'session_uuid': '8c9cf23a-9b8f-49fd-a7e2-270c2c3db69c'}


Multiple alphas:   0%|          | 0/1 [00:00<?, ?it/s]

intervention_param:  0.0
Skip preprocessing


2024-03-05 06:04:05 experiment_interface.py INFO    : The dataset is preprocessed
2024-03-05 06:04:05 experiment_interface.py INFO    : Models config is loaded from the input file


cur_base_flow_dataset.X_train_val.columns:  Index(['cat__SCHL_1', 'cat__SCHL_10', 'cat__SCHL_11', 'cat__SCHL_12',
       'cat__SCHL_13', 'cat__SCHL_14', 'cat__SCHL_15', 'cat__SCHL_16',
       'cat__SCHL_17', 'cat__SCHL_18',
       ...
       'cat__RELP_2', 'cat__RELP_3', 'cat__RELP_4', 'cat__RELP_5',
       'cat__RELP_6', 'cat__RELP_7', 'cat__RELP_8', 'cat__RELP_9', 'num__AGEP',
       'num__WKHP'],
      dtype='object', length=729)
Top indexes of an X_test in the current base flow dataset:  Int64Index([ 8104,   580,  8954,  6500,  5436,  3490,  1621, 11610,  7067,
            10637,   767, 10997,  6202,  4468,  8440, 10810, 11409,  3682,
            12856, 14104],
           dtype='int64')
Top indexes of an y_test in the current base flow dataset:  Int64Index([ 8104,   580,  8954,  6500,  5436,  3490,  1621, 11610,  7067,
            10637,   767, 10997,  6202,  4468,  8440, 10810, 11409,  3682,
            12856, 14104],
           dtype='int64')
Path for tuned params:  /Users/denys_

Analyze multiple models:   0%|          | 0/1 [00:00<?, ?it/s]

Enabled a 'meta_learner' mode
self.y_pred_test.index[:10] --  Int64Index([8104, 580, 8954, 6500, 5436, 3490, 1621, 11610, 7067, 10637], dtype='int64')
self.error_test.index[:10] --  Int64Index([8104, 580, 8954, 6500, 5436, 3490, 1621, 11610, 7067, 10637], dtype='int64')
[Voting] ................... (1 of 10) Processing gbt-1, total= 2.4min
[Voting] ................... (2 of 10) Processing gbt-2, total= 2.4min
[Voting] ................... (3 of 10) Processing gbt-3, total= 2.5min
[Voting] ................... (4 of 10) Processing gbt-4, total= 2.4min
[Voting] ................... (5 of 10) Processing gbt-5, total= 2.4min
[Voting] ................... (6 of 10) Processing gbt-6, total= 3.1min
[Voting] ................... (7 of 10) Processing gbt-7, total= 3.1min
[Voting] ................... (8 of 10) Processing gbt-8, total= 2.5min
[Voting] ................... (9 of 10) Processing gbt-9, total= 2.4min
[Voting] ................. (10 of 10) Processing gbt-10, total= 2.5min


Best params for e

### Experiment iteration 6

In [25]:
# Configs for an experiment iteration
exp_iter_num = 6
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
tuned_params_filenames = [
    'tuning_results_Folktables_GA_2018_Income_alpha_0.0.csv'
]
tuned_params_df_paths = [os.path.join(ROOT_DIR, 'results', 'test_meta_learner', EXPERIMENT_NAME, tuned_params_filename)
                         for tuned_params_filename in tuned_params_filenames]
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'

exp_iter_data_loader = copy.deepcopy(data_loader)  # Add deepcopy to avoid data leakage
models_params_for_tuning = get_folktables_employment_models_params_for_tuning(experiment_seed)
models_params_for_tuning = {
    'RandomForestClassifier': models_params_for_tuning['RandomForestClassifier']
}

In [26]:
run_exp_iter_with_disparate_impact(data_loader=exp_iter_data_loader,
                                   experiment_seed=experiment_seed,
                                   test_set_fraction=TEST_SET_FRACTION,
                                   db_writer_func=db_writer_func,
                                   fair_intervention_params_lst=FAIR_INTERVENTION_PARAMS_LST,
                                   models_params_for_tuning=models_params_for_tuning,
                                   metrics_computation_config=metrics_computation_config,
                                   custom_table_fields_dct=custom_table_fields_dct,
                                   # with_tuning=True,
                                   with_tuning=False,
                                   tuned_params_df_paths=tuned_params_df_paths,
                                   save_results_dir_path=SAVE_RESULTS_DIR_PATH,
                                   dataset_name='ACSIncomeDataset',
                                   verbose=True)

2024-03-05 06:29:48 experiment_interface.py INFO    : Start an experiment iteration for the following custom params:


{'dataset_split_seed': 600,
 'experiment_iteration': 'Exp_iter_6',
 'fair_intervention_params_lst': '[0.0]',
 'intervention_param': 0.0,
 'model_init_seed': 600,
 'run_start_date_time': datetime.datetime(2024, 3, 5, 4, 4, 5, 704373, tzinfo=datetime.timezone.utc),
 'session_uuid': '8c9cf23a-9b8f-49fd-a7e2-270c2c3db69c'}


Multiple alphas:   0%|          | 0/1 [00:00<?, ?it/s]

2024-03-05 06:29:48 experiment_interface.py INFO    : The dataset is preprocessed


intervention_param:  0.0
Skip preprocessing


2024-03-05 06:29:48 experiment_interface.py INFO    : Models config is loaded from the input file


cur_base_flow_dataset.X_train_val.columns:  Index(['cat__SCHL_1', 'cat__SCHL_10', 'cat__SCHL_11', 'cat__SCHL_12',
       'cat__SCHL_13', 'cat__SCHL_14', 'cat__SCHL_15', 'cat__SCHL_16',
       'cat__SCHL_17', 'cat__SCHL_18',
       ...
       'cat__RELP_2', 'cat__RELP_3', 'cat__RELP_4', 'cat__RELP_5',
       'cat__RELP_6', 'cat__RELP_7', 'cat__RELP_8', 'cat__RELP_9', 'num__AGEP',
       'num__WKHP'],
      dtype='object', length=730)
Top indexes of an X_test in the current base flow dataset:  Int64Index([ 2649,  1964, 14464,  4377, 14152,  5747,  1529, 10243, 12578,
             9794,  7142, 14347,  6387,  2917,   317,  9085,  5821, 11007,
             8142,  3719],
           dtype='int64')
Top indexes of an y_test in the current base flow dataset:  Int64Index([ 2649,  1964, 14464,  4377, 14152,  5747,  1529, 10243, 12578,
             9794,  7142, 14347,  6387,  2917,   317,  9085,  5821, 11007,
             8142,  3719],
           dtype='int64')
Path for tuned params:  /Users/denys_

Analyze multiple models:   0%|          | 0/1 [00:00<?, ?it/s]

Enabled a 'meta_learner' mode
self.y_pred_test.index[:10] --  Int64Index([2649, 1964, 14464, 4377, 14152, 5747, 1529, 10243, 12578, 9794], dtype='int64')
self.error_test.index[:10] --  Int64Index([2649, 1964, 14464, 4377, 14152, 5747, 1529, 10243, 12578, 9794], dtype='int64')
[Voting] ................... (1 of 10) Processing gbt-1, total= 3.0min
[Voting] ................... (2 of 10) Processing gbt-2, total= 3.1min
[Voting] ................... (3 of 10) Processing gbt-3, total= 3.1min
[Voting] ................... (4 of 10) Processing gbt-4, total= 3.1min
[Voting] ................... (5 of 10) Processing gbt-5, total= 3.1min
[Voting] ................... (6 of 10) Processing gbt-6, total= 3.0min
[Voting] ................... (7 of 10) Processing gbt-7, total= 3.1min
[Voting] ................... (8 of 10) Processing gbt-8, total= 3.1min
[Voting] ................... (9 of 10) Processing gbt-9, total= 3.1min
[Voting] ................. (10 of 10) Processing gbt-10, total= 3.1min


Best params