In [15]:
# !pip install -r ./requirements.txt

In [2]:
# !pip uninstall virny -y

In [4]:
# Install using an HTTP link
# !pip install git+https://github.com/DataResponsibly/Virny.git@development

# Install using an SSH link
# !pip install git+ssh://git@github.com/DataResponsibly/Virny.git@development

In [5]:
# !pip install lightgbm==3.3.5

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings
warnings.filterwarnings('ignore')
os.environ["PYTHONWARNINGS"] = "ignore"

In [3]:
cur_folder_name = os.getcwd().split('/')[-1]
if cur_folder_name != "fairness-variance":
    os.chdir("../..")

print('Current location: ', os.getcwd())

Current location:  /home/dh3553/projects/fairness-variance


## Import dependencies

In [4]:
import os
import copy

from virny.utils.custom_initializers import create_config_obj
from virny.datasets import ACSIncomeDataset

from configs.constants import TEST_SET_FRACTION, EXPERIMENT_SEEDS
from configs.models_config_for_tuning import get_folktables_employment_models_params_for_tuning

from source.preprocessing import get_simple_preprocessor
from source.experiment_interface import run_exp_iter_with_preprocessing_intervention

## Define Input Variables

In [5]:
# ROOT_DIR = os.path.join(os.getcwd(), "..", "..")
ROOT_DIR = os.getcwd()
EXPERIMENT_NAME = 'mult_repair_levels_income_GA'
DB_COLLECTION_NAME = 'exp_mult_repair_levels'
SAVE_RESULTS_DIR_PATH = os.path.join(ROOT_DIR, 'results', EXPERIMENT_NAME)
FAIR_INTERVENTION_PARAMS_LST = [0.3]

config_yaml_path = os.path.join(ROOT_DIR, 'notebooks', EXPERIMENT_NAME, 'folk_GA_2018_config.yaml')
metrics_computation_config = create_config_obj(config_yaml_path=config_yaml_path)

## Define a db writer and custom fields to insert into your database

In [6]:
import os
from dotenv import load_dotenv

load_dotenv('./configs/secrets.env')
os.getenv("DB_NAME")

'fairness_variance'

In [7]:
from source.utils.db_functions import connect_to_mongodb

client, collection_obj, db_writer_func = connect_to_mongodb(DB_COLLECTION_NAME)

In [8]:
import uuid

custom_table_fields_dct = {
#     'session_uuid': str(uuid.uuid4()),
    'session_uuid': '78c3da49-e095-43f3-8cc1-fa34bab0357c',
}
print('Current session uuid: ', custom_table_fields_dct['session_uuid'])

Current session uuid:  78c3da49-e095-43f3-8cc1-fa34bab0357c


## Initialize custom objects

In [9]:
data_loader = ACSIncomeDataset(state=['GA'], year=2018, with_nulls=False,
                               subsample_size=20_000, subsample_seed=42)
data_loader.X_data.head()

Unnamed: 0,SCHL,COW,MAR,OCCP,POBP,RELP,SEX,RAC1P,AGEP,WKHP
0,23,7,3,230,36,0,1,1,55,55.0
1,16,1,5,4110,13,2,2,1,20,35.0
2,16,4,3,4130,51,0,2,1,59,30.0
3,18,4,1,4020,13,0,1,2,43,40.0
4,14,1,1,8300,20,1,2,2,33,20.0


In [10]:
data_loader.X_data.shape

(20000, 10)

In [11]:
# Check if RAC1P 4 is in the X_data
data_loader.X_data.RAC1P.unique()

array(['1', '2', '6', '9', '8', '7', '3', '5'], dtype=object)

## Run experiment iterations

### Experiment iteration 1

In [12]:
# tuned_params_filenames = ['tuning_results_Folktables_GA_2018_Income_alpha_0.3_20230717__223109.csv']
# tuned_params_df_paths = [os.path.join(ROOT_DIR, 'results', EXPERIMENT_NAME, tuned_params_filename)
#                          for tuned_params_filename in tuned_params_filenames]

In [13]:
# Configs for an experiment iteration
exp_iter_num = 1
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'

exp_iter_data_loader = copy.deepcopy(data_loader)  # Add deepcopy to avoid data leakage
models_params_for_tuning = get_folktables_employment_models_params_for_tuning(experiment_seed)
preprocessor = get_simple_preprocessor(exp_iter_data_loader)

In [None]:
run_exp_iter_with_preprocessing_intervention(data_loader=exp_iter_data_loader,
                                             experiment_seed=experiment_seed,
                                             test_set_fraction=TEST_SET_FRACTION,
                                             db_writer_func=db_writer_func,
                                             fair_intervention_params_lst=FAIR_INTERVENTION_PARAMS_LST,
                                             column_transformer=preprocessor,
                                             models_params_for_tuning=models_params_for_tuning,
                                             metrics_computation_config=metrics_computation_config,
                                             custom_table_fields_dct=custom_table_fields_dct,
                                             with_tuning=True,
#                                              with_tuning=False,
#                                              tuned_params_df_paths=tuned_params_df_paths,
                                             save_results_dir_path=SAVE_RESULTS_DIR_PATH,
                                             verbose=True)

2023-07-18 09:38:38 experiment_interface.py INFO    : Start an experiment iteration for the following custom params:


{'dataset_split_seed': 100,
 'experiment_iteration': 'Exp_iter_1',
 'fair_intervention_params_lst': '[0.3]',
 'model_init_seed': 100,
 'session_uuid': '78c3da49-e095-43f3-8cc1-fa34bab0357c'}




2023-07-18 09:38:38 experiment_interface.py INFO    : The dataset is preprocessed


Top indexes of an X_test in a base flow dataset:  Int64Index([ 2917,  2234, 14396,  1781, 11102,   732,  7692, 10589, 16098,
             4920,  6601,  7611,  3825, 18755,  6862,  3847,  7256, 13711,
            12389,  9772],
           dtype='int64')
Top indexes of an y_test in a base flow dataset:  Int64Index([ 2917,  2234, 14396,  1781, 11102,   732,  7692, 10589, 16098,
             4920,  6601,  7611,  3825, 18755,  6862,  3847,  7256, 13711,
            12389,  9772],
           dtype='int64')


Multiple alphas:   0%|          | 0/1 [00:00<?, ?it/s]

intervention_param:  0.3
2023/07/18, 09:38:39: Tuning RandomForestClassifier...


2023-07-18 10:14:38 experiment_interface.py INFO    : Models are tuned and saved to a file


2023/07/18, 10:14:38: Tuning for RandomForestClassifier is finished [F1 score = 0.7844860930095351, Accuracy = 0.8095]



Multiple runs progress:   0%|          | 0/1 [00:00<?, ?it/s]

Analyze models in one run:   0%|          | 0/1 [00:00<?, ?it/s]

Classifiers testing by bootstrap:   0%|          | 0/200 [00:00<?, ?it/s]

### Experiment iteration 2

In [53]:
# Configs for an experiment iteration
exp_iter_num = 2
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
tuned_params_filenames = [
    'tuning_results_Folktables_GA_2018_Income_alpha_0.0_20230717__173414.csv',
    'tuning_results_Folktables_GA_2018_Income_alpha_0.2_20230717__173705.csv',
    'tuning_results_Folktables_GA_2018_Income_alpha_0.5_20230717__174154.csv',
]
tuned_params_df_paths = [os.path.join(ROOT_DIR, 'results', EXPERIMENT_NAME, tuned_params_filename)
                         for tuned_params_filename in tuned_params_filenames]
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'

exp_iter_data_loader = copy.deepcopy(data_loader)  # Add deepcopy to avoid data leakage
models_params_for_tuning = get_folktables_employment_models_params_for_tuning(experiment_seed)
preprocessor = get_simple_preprocessor(exp_iter_data_loader)

In [54]:
run_exp_iter_with_preprocessing_intervention(data_loader=exp_iter_data_loader,
                                             experiment_seed=experiment_seed,
                                             test_set_fraction=TEST_SET_FRACTION,
                                             db_writer_func=db_writer_func,
                                             fair_intervention_params_lst=FAIR_INTERVENTION_PARAMS_LST,
                                             column_transformer=preprocessor,
                                             models_params_for_tuning=models_params_for_tuning,
                                             metrics_computation_config=metrics_computation_config,
                                             custom_table_fields_dct=custom_table_fields_dct,
                                             # with_tuning=True,
                                             with_tuning=False,
                                             tuned_params_df_paths=tuned_params_df_paths,
                                             save_results_dir_path=SAVE_RESULTS_DIR_PATH,
                                             verbose=True)

2023-07-17 20:46:56 experiment_interface.py INFO    : Start an experiment iteration for the following custom params:


{'dataset_split_seed': 200,
 'experiment_iteration': 'Exp_iter_2',
 'fair_intervention_params_lst': '[0.0, 0.2, 0.5]',
 'intervention_param': 0.5,
 'model_init_seed': 200,
 'session_uuid': 'da196782-b063-488c-9ad7-bc52ccf8e6ed'}




2023-07-17 20:46:56 experiment_interface.py INFO    : The dataset is preprocessed


Top indexes of an X_test in a base flow dataset:  Int64Index([ 4298,  9201,   679, 18780,  6955,  8571,  9405, 13699,  3930,
            13573,  3774,  9085, 12646, 16782, 13102, 18789,  8042, 19647,
            13861,  8780],
           dtype='int64')
Top indexes of an y_test in a base flow dataset:  Int64Index([ 4298,  9201,   679, 18780,  6955,  8571,  9405, 13699,  3930,
            13573,  3774,  9085, 12646, 16782, 13102, 18789,  8042, 19647,
            13861,  8780],
           dtype='int64')


Multiple alphas:   0%|          | 0/3 [00:00<?, ?it/s]

intervention_param:  0.0


2023-07-17 20:46:56 experiment_interface.py INFO    : Models config is loaded from the input file


Path for tuned params:  /home/denys_herasymuk/UCU/4course_2term/Bachelor_Thesis/Code/fairness-variance/results/mult_repair_levels_income_GA/tuning_results_Folktables_GA_2018_Income_alpha_0.0_20230717__173414.csv
RandomForestClassifier:  {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 200, 'verbose': 0, 'warm_start': False}


Multiple runs progress:   0%|          | 0/1 [00:00<?, ?it/s]

Analyze models in one run:   0%|          | 0/1 [00:00<?, ?it/s]

Classifiers testing by bootstrap:   0%|          | 0/20 [00:00<?, ?it/s]

intervention_param:  0.2
Path for tuned params:  /home/denys_herasymuk/UCU/4course_2term/Bachelor_Thesis/Code/fairness-variance/results/mult_repair_levels_income_GA/tuning_results_Folktables_GA_2018_Income_alpha_0.2_20230717__173705.csv
RandomForestClassifier:  {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': 201, 'verbose': 0, 'warm_start': False}


Multiple runs progress:   0%|          | 0/1 [00:00<?, ?it/s]

Analyze models in one run:   0%|          | 0/1 [00:00<?, ?it/s]

Classifiers testing by bootstrap:   0%|          | 0/20 [00:00<?, ?it/s]

intervention_param:  0.5
Path for tuned params:  /home/denys_herasymuk/UCU/4course_2term/Bachelor_Thesis/Code/fairness-variance/results/mult_repair_levels_income_GA/tuning_results_Folktables_GA_2018_Income_alpha_0.5_20230717__174154.csv
RandomForestClassifier:  {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': 201, 'verbose': 0, 'warm_start': False}


Multiple runs progress:   0%|          | 0/1 [00:00<?, ?it/s]

Analyze models in one run:   0%|          | 0/1 [00:00<?, ?it/s]

Classifiers testing by bootstrap:   0%|          | 0/20 [00:00<?, ?it/s]

### Experiment iteration 3

In [47]:
# Configs for an experiment iteration
exp_iter_num = 3
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
tuned_params_filenames = [
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.0_20230705__174859.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.1_20230705__182108.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.2_20230705__185830.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.30000000000000004_20230705__194739.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.4_20230705__202544.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.5_20230705__210307.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.6000000000000001_20230705__214028.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.7_20230706__095236.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.8_20230706__115508.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.9_20230706__161158.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_1.0_20230706__175511.csv'
]
tuned_params_df_paths = [os.path.join(ROOT_DIR, 'results', EXPERIMENT_NAME, tuned_params_filename)
                         for tuned_params_filename in tuned_params_filenames]
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'

exp_iter_data_loader = copy.deepcopy(data_loader)  # Add deepcopy to avoid data leakage
models_params_for_tuning = get_folktables_employment_models_params_for_tuning(experiment_seed)
preprocessor = get_simple_preprocessor(exp_iter_data_loader)

In [5]:
run_exp_iter_with_preprocessing_intervention(data_loader=exp_iter_data_loader,
                                             experiment_seed=experiment_seed,
                                             test_set_fraction=TEST_SET_FRACTION,
                                             db_writer_func=db_writer_func,
                                             fair_intervention_params_lst=FAIR_INTERVENTION_PARAMS_LST,
                                             column_transformer=preprocessor,
                                             models_params_for_tuning=models_params_for_tuning,
                                             metrics_computation_config=metrics_computation_config,
                                             custom_table_fields_dct=custom_table_fields_dct,
                                             # with_tuning=True,
                                             with_tuning=False,
                                             tuned_params_df_paths=tuned_params_df_paths,
                                             save_results_dir_path=SAVE_RESULTS_DIR_PATH,
                                             verbose=True)

### Experiment iteration 4

In [64]:
# Configs for an experiment iteration
exp_iter_num = 4
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
tuned_params_filenames = [
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.0_20230705__174859.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.1_20230705__182108.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.2_20230705__185830.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.30000000000000004_20230705__194739.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.4_20230705__202544.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.5_20230705__210307.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.6000000000000001_20230705__214028.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.7_20230706__095236.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.8_20230706__115508.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.9_20230706__161158.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_1.0_20230706__175511.csv'
]
tuned_params_df_paths = [os.path.join(ROOT_DIR, 'results', EXPERIMENT_NAME, tuned_params_filename)
                         for tuned_params_filename in tuned_params_filenames]
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'

exp_iter_data_loader = copy.deepcopy(data_loader)  # Add deepcopy to avoid data leakage
models_params_for_tuning = get_folktables_employment_models_params_for_tuning(experiment_seed)
preprocessor = get_simple_preprocessor(exp_iter_data_loader)

In [6]:
run_exp_iter_with_preprocessing_intervention(data_loader=exp_iter_data_loader,
                                             experiment_seed=experiment_seed,
                                             test_set_fraction=TEST_SET_FRACTION,
                                             db_writer_func=db_writer_func,
                                             fair_intervention_params_lst=FAIR_INTERVENTION_PARAMS_LST,
                                             column_transformer=preprocessor,
                                             models_params_for_tuning=models_params_for_tuning,
                                             metrics_computation_config=metrics_computation_config,
                                             custom_table_fields_dct=custom_table_fields_dct,
                                             # with_tuning=True,
                                             with_tuning=False,
                                             tuned_params_df_paths=tuned_params_df_paths,
                                             save_results_dir_path=SAVE_RESULTS_DIR_PATH,
                                             verbose=True)

### Experiment iteration 5

In [None]:
# Configs for an experiment iteration
exp_iter_num = 5
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
tuned_params_filenames = [
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.0_20230705__174859.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.1_20230705__182108.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.2_20230705__185830.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.30000000000000004_20230705__194739.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.4_20230705__202544.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.5_20230705__210307.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.6000000000000001_20230705__214028.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.7_20230706__095236.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.8_20230706__115508.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.9_20230706__161158.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_1.0_20230706__175511.csv'
]
tuned_params_df_paths = [os.path.join(ROOT_DIR, 'results', EXPERIMENT_NAME, tuned_params_filename)
                         for tuned_params_filename in tuned_params_filenames]
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'

exp_iter_data_loader = copy.deepcopy(data_loader)  # Add deepcopy to avoid data leakage
models_params_for_tuning = get_folktables_employment_models_params_for_tuning(experiment_seed)
preprocessor = get_simple_preprocessor(exp_iter_data_loader)

In [None]:
run_exp_iter_with_preprocessing_intervention(data_loader=exp_iter_data_loader,
                                             experiment_seed=experiment_seed,
                                             test_set_fraction=TEST_SET_FRACTION,
                                             db_writer_func=db_writer_func,
                                             fair_intervention_params_lst=FAIR_INTERVENTION_PARAMS_LST,
                                             column_transformer=preprocessor,
                                             models_params_for_tuning=models_params_for_tuning,
                                             metrics_computation_config=metrics_computation_config,
                                             custom_table_fields_dct=custom_table_fields_dct,
                                             # with_tuning=True,
                                             with_tuning=False,
                                             tuned_params_df_paths=tuned_params_df_paths,
                                             save_results_dir_path=SAVE_RESULTS_DIR_PATH,
                                             verbose=True)

### Experiment iteration 6

In [None]:
# Configs for an experiment iteration
exp_iter_num = 6
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
tuned_params_filenames = [
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.0_20230705__174859.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.1_20230705__182108.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.2_20230705__185830.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.30000000000000004_20230705__194739.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.4_20230705__202544.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.5_20230705__210307.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.6000000000000001_20230705__214028.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.7_20230706__095236.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.8_20230706__115508.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_0.9_20230706__161158.csv',
    'tuning_results_Folktables_NY_2018_Employment_alpha_1.0_20230706__175511.csv'
]
tuned_params_df_paths = [os.path.join(ROOT_DIR, 'results', EXPERIMENT_NAME, tuned_params_filename)
                         for tuned_params_filename in tuned_params_filenames]
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'

exp_iter_data_loader = copy.deepcopy(data_loader)  # Add deepcopy to avoid data leakage
models_params_for_tuning = get_folktables_employment_models_params_for_tuning(experiment_seed)
preprocessor = get_simple_preprocessor(exp_iter_data_loader)

In [None]:
run_exp_iter_with_preprocessing_intervention(data_loader=exp_iter_data_loader,
                                             experiment_seed=experiment_seed,
                                             test_set_fraction=TEST_SET_FRACTION,
                                             db_writer_func=db_writer_func,
                                             fair_intervention_params_lst=FAIR_INTERVENTION_PARAMS_LST,
                                             column_transformer=preprocessor,
                                             models_params_for_tuning=models_params_for_tuning,
                                             metrics_computation_config=metrics_computation_config,
                                             custom_table_fields_dct=custom_table_fields_dct,
                                             # with_tuning=True,
                                             with_tuning=False,
                                             tuned_params_df_paths=tuned_params_df_paths,
                                             save_results_dir_path=SAVE_RESULTS_DIR_PATH,
                                             verbose=True)