In [15]:
# !pip install -r ./requirements.txt

In [50]:
# !pip uninstall virny -y

In [52]:
# Install using an HTTP link
# !pip install git+https://github.com/DataResponsibly/Virny.git@development

# Install using an SSH link
# !pip install git+ssh://git@github.com/DataResponsibly/Virny.git@development

In [53]:
# !pip install lightgbm==3.3.5

In [49]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and use mode 2, which reloads imported modules
# before each cell runs, so edits to project source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
import os
import warnings
warnings.filterwarnings('ignore')
os.environ["PYTHONWARNINGS"] = "ignore"

In [51]:
# Make sure the kernel runs from the repository root ("fairness-variance").
# When the notebook is started from elsewhere, the code moves two directory
# levels up from the current working directory.
# os.path.basename is used instead of splitting on '/' so that the folder name
# is extracted correctly on platforms whose path separator is not '/'.
cur_folder_name = os.path.basename(os.getcwd())
if cur_folder_name != "fairness-variance":
    os.chdir("../..")

print('Current location: ', os.getcwd())

Current location:  /home/denys_herasymuk/UCU/4course_2term/Bachelor_Thesis/Code/fairness-variance


## Import dependencies

In [52]:
import os
import copy

from virny.utils.custom_initializers import create_config_obj
from virny.datasets import ACSEmploymentDataset

from configs.constants import TEST_SET_FRACTION, EXPERIMENT_SEEDS
from configs.models_config_for_tuning import get_folktables_employment_models_params_for_tuning

from source.preprocessing import get_simple_preprocessor, create_extra_test_sets
from source.experiment_interface import run_exp_iter_with_mult_set_and_preprocessing_intervention

## Define Input Variables

In [53]:
# Experiment identity: the name is reused for the DB collection name,
# the results folder and the folder that holds this experiment's YAML config.
EXPERIMENT_NAME = 'one_repair_lvl_many_models'
DB_COLLECTION_NAME = 'exp_' + EXPERIMENT_NAME
FAIR_INTERVENTION_PARAMS_LST = [0.0, 0.6]

# The current working directory is treated as the project root.
# Alternative kept for running with the notebook's own folder as the cwd:
# ROOT_DIR = os.path.join(os.getcwd(), "..", "..")
ROOT_DIR = os.getcwd()
SAVE_RESULTS_DIR_PATH = os.path.join(ROOT_DIR, 'results', EXPERIMENT_NAME)

# Build the metrics computation config from the experiment's YAML file.
config_yaml_path = os.path.join(
    ROOT_DIR, 'notebooks', EXPERIMENT_NAME, 'folk_ny_2018_config.yaml'
)
metrics_computation_config = create_config_obj(config_yaml_path=config_yaml_path)

## Define a db writer and custom fields to insert into your database

In [54]:
import os
from dotenv import load_dotenv

# Load environment variables from the project's secrets file.
load_dotenv('./configs/secrets.env')
# Display DB_NAME as the cell output: a quick check that the secrets file was read.
# NOTE(review): the value ends up in the saved notebook output — confirm it is not sensitive.
os.getenv("DB_NAME")

'fairness_variance'

In [55]:
from source.utils.db_functions import connect_to_mongodb

# Connect using this experiment's collection name. The helper returns three objects,
# unpacked here as the client, the collection object and a writer function;
# the writer function is later handed to the experiment runner.
client, collection_obj, db_writer_func = connect_to_mongodb(DB_COLLECTION_NAME)

In [56]:
import uuid

# Extra fields for the experiment runner (passed as custom_table_fields_dct).
# The session id is pinned to a fixed value — presumably so that re-runs are
# recorded under the same session (confirm). Switch to the commented-out line
# to generate a fresh id for a new session.
custom_table_fields_dct = dict(
    # session_uuid=str(uuid.uuid4()),
    session_uuid='b3b23062-272d-4ab7-8b3f-57ed27011abc',
)
print('Current session uuid: ', custom_table_fields_dct['session_uuid'])

Current session uuid:  b3b23062-272d-4ab7-8b3f-57ed27011abc


## Initialize custom objects

In [57]:
# Base dataset for the experiment: ACS Employment, state NY, year 2018,
# loaded with with_nulls=False and subsampled to 20,000 rows with a fixed seed.
data_loader = ACSEmploymentDataset(state=['NY'], year=2018, with_nulls=False,
                                   subsample_size=20_000, subsample_seed=42)
# Preview the first rows of the feature frame.
data_loader.X_data.head()

Unnamed: 0,MAR,MIL,ESP,MIG,DREM,NATIVITY,DIS,DEAR,DEYE,SEX,RAC1P,RELP,CIT,ANC,SCHL,AGEP
0,5,4,0,1,2,1,2,2,2,1,1,2,1,1,15,42
1,1,4,0,1,2,1,2,2,2,2,1,0,1,2,21,59
2,2,4,0,1,2,2,2,2,2,2,1,0,4,4,19,78
3,5,4,0,1,2,1,2,2,2,1,1,10,1,4,14,19
4,1,2,0,1,2,1,2,2,2,1,1,0,1,1,17,87


In [58]:
data_loader.X_data.shape

(20000, 16)

In [59]:
# Extra dataset: NY, 2017 — same state as the base loader (NY 2018), different year.
data_loader2 = ACSEmploymentDataset(state=['NY'], year=2017, with_nulls=False,
                                    subsample_size=20_000, subsample_seed=42)
# Preview the first rows of the feature frame.
data_loader2.X_data.head()

Unnamed: 0,MAR,MIL,ESP,MIG,DREM,NATIVITY,DIS,DEAR,DEYE,SEX,RAC1P,RELP,CIT,ANC,SCHL,AGEP
0,1,4,0,1,2,2,2,2,2,1,1,1,5,1,19,45
1,1,4,0,3,2,2,2,2,2,2,6,0,4,1,22,38
2,5,4,0,1,2,1,2,2,2,1,1,2,1,1,21,26
3,5,4,0,1,2,2,2,2,2,2,6,0,5,1,16,36
4,1,4,0,1,2,1,2,2,2,2,1,2,1,1,16,42


In [60]:
data_loader2.X_data.shape

(20000, 16)

In [61]:
# Extra dataset: CA, 2018 — same year as the base loader (NY 2018), different state.
data_loader3 = ACSEmploymentDataset(state=['CA'], year=2018, with_nulls=False,
                                    subsample_size=20_000, subsample_seed=42)
# Preview the first rows of the feature frame.
data_loader3.X_data.head()

Unnamed: 0,MAR,MIL,ESP,MIG,DREM,NATIVITY,DIS,DEAR,DEYE,SEX,RAC1P,RELP,CIT,ANC,SCHL,AGEP
0,5,4,0,3,2,1,2,2,2,2,1,13,1,1,16,28
1,1,4,0,3,2,1,2,2,2,2,1,1,1,1,19,78
2,1,4,0,1,2,2,2,2,2,1,1,1,4,1,14,45
3,1,4,0,1,2,1,2,2,2,2,1,1,1,1,19,71
4,4,4,0,1,2,2,2,2,2,1,1,0,5,1,21,44


In [62]:
data_loader3.X_data.shape

(20000, 16)

In [63]:
# Extra dataset: CA, 2017 — both state and year differ from the base loader (NY 2018).
data_loader4 = ACSEmploymentDataset(state=['CA'], year=2017, with_nulls=False,
                                    subsample_size=20_000, subsample_seed=42)
# Preview the first rows of the feature frame.
data_loader4.X_data.head()

Unnamed: 0,MAR,MIL,ESP,MIG,DREM,NATIVITY,DIS,DEAR,DEYE,SEX,RAC1P,RELP,CIT,ANC,SCHL,AGEP
0,1,4,0,1,2,2,2,2,2,2,6,0,4,1,21,34
1,3,4,0,1,2,1,2,2,2,1,1,6,1,1,21,61
2,1,4,0,1,2,2,2,2,2,1,9,0,4,1,1,66
3,1,4,0,1,2,1,2,2,2,1,1,1,1,1,19,64
4,1,4,0,1,2,2,1,2,2,2,1,1,5,1,9,62


In [64]:
data_loader4.X_data.shape

(20000, 16)

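Create out-of-domain test sets from the NY 2017, CA 2018 and CA 2017 data loaders, reusing the preprocessor obtained for the NY 2018 base dataset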

In [65]:
# Preprocessor obtained for the base data loader (NY 2018).
column_transformer = get_simple_preprocessor(data_loader)
# The three other state/year loaders are the sources of the extra test sets.
extra_data_loaders = [data_loader2, data_loader3, data_loader4]
# Build the extra test sets with the base preprocessor, the shared test fraction and a fixed seed.
extra_test_sets = create_extra_test_sets(extra_data_loaders, column_transformer, TEST_SET_FRACTION, seed=42)

## Run experiment iterations

### Experiment iteration 1

In [66]:
# Configs for an experiment iteration
exp_iter_num = 1
experiment_seed = EXPERIMENT_SEEDS[exp_iter_num - 1]
custom_table_fields_dct['experiment_iteration'] = f'Exp_iter_{exp_iter_num}'

exp_iter_data_loader = copy.deepcopy(data_loader)  # Add deepcopy to avoid data leakage
exp_iter_extra_test_sets = copy.deepcopy(extra_test_sets)  # Add deepcopy to avoid data leakage
models_params_for_tuning = get_folktables_employment_models_params_for_tuning(experiment_seed)
preprocessor = get_simple_preprocessor(exp_iter_data_loader)

In [67]:
# Run one experiment iteration with the objects prepared in the previous cells:
# the copied base loader and extra test sets, the iteration seed, the DB writer,
# the list of fair intervention parameters and the metrics computation config.
run_exp_iter_with_mult_set_and_preprocessing_intervention(
    data_loader=exp_iter_data_loader,
    experiment_seed=experiment_seed,
    test_set_fraction=TEST_SET_FRACTION,
    db_writer_func=db_writer_func,
    fair_intervention_params_lst=FAIR_INTERVENTION_PARAMS_LST,
    extra_test_sets=exp_iter_extra_test_sets,
    column_transformer=preprocessor,
    models_params_for_tuning=models_params_for_tuning,
    metrics_computation_config=metrics_computation_config,
    custom_table_fields_dct=custom_table_fields_dct,
    # Tuning is enabled for this run. The two commented-out lines below are the
    # alternative configuration: tuning disabled, with tuned_params_df_paths
    # supplied instead (that variable is not defined in this notebook).
    # NOTE(review): the logged tuning output reports nan scores for the
    # `penalty=l1, solver=lbfgs` candidates — check the LogisticRegression grid
    # returned by get_folktables_employment_models_params_for_tuning.
    with_tuning=True,
    # with_tuning=False,
    # tuned_params_df_paths=tuned_params_df_paths,
    save_results_dir_path=SAVE_RESULTS_DIR_PATH,
    verbose=True
)

2023-07-11 01:45:21 experiment_interface.py INFO    : Start an experiment iteration for the following custom params:


{'dataset_split_seed': 100,
 'experiment_iteration': 'Exp_iter_1',
 'fair_intervention_params_lst': '[0.0, 0.6]',
 'model_init_seed': 100,
 'session_uuid': 'b3b23062-272d-4ab7-8b3f-57ed27011abc'}




2023-07-11 01:45:21 experiment_interface.py INFO    : The dataset is preprocessed


Top indexes of an X_test in a base flow dataset:  Int64Index([ 2917,  2234, 14396,  1781, 11102,   732,  7692, 10589, 16098,
             4920,  6601,  7611,  3825, 18755,  6862,  3847,  7256, 13711,
            12389,  9772],
           dtype='int64')
Top indexes of an y_test in a base flow dataset:  Int64Index([ 2917,  2234, 14396,  1781, 11102,   732,  7692, 10589, 16098,
             4920,  6601,  7611,  3825, 18755,  6862,  3847,  7256, 13711,
            12389,  9772],
           dtype='int64')


Multiple alphas:   0%|          | 0/2 [00:00<?, ?it/s]

intervention_param:  0.0
2023/07/11, 01:45:21: Tuning LogisticRegression...
Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV 3/3; 2/32] START C=0.001, penalty=l1, solver=lbfgs..........................
[CV 3/3; 2/32] END C=0.001, penalty=l1, solver=lbfgs; Accuracy_Score: (test=nan) F1_Score: (test=nan) total time=   0.0s
[CV 1/3; 4/32] START C=0.001, penalty=l1, solver=saga...........................
[CV 1/3; 4/32] END C=0.001, penalty=l1, solver=saga; Accuracy_Score: (test=0.720) F1_Score: (test=0.675) total time=   0.8s
[CV 3/3; 10/32] START C=0.01, penalty=l1, solver=lbfgs..........................
[CV 3/3; 10/32] END C=0.01, penalty=l1, solver=lbfgs; Accuracy_Score: (test=nan) F1_Score: (test=nan) total time=   0.0s
[CV 1/3; 12/32] START C=0.01, penalty=l1, solver=saga...........................
[CV 1/3; 12/32] END C=0.01, penalty=l1, solver=saga; Accuracy_Score: (test=0.738) F1_Score: (test=0.712) total time=   1.6s
[CV 2/3; 21/32] START C=0.1, penalty=l2, solver=n

KeyboardInterrupt: 

### Experiment iteration 2

### Experiment iteration 3

### Experiment iteration 4

### Experiment iteration 5

### Experiment iteration 6

### Experiment iteration 7

### Experiment iteration 8

### Experiment iteration 9

### Experiment iteration 10