In [31]:
# Render matplotlib figures inline in the notebook output.
# (Comments are kept on their own lines: text after a line magic is passed to the magic as an argument.)
%matplotlib inline
# Load the autoreload extension and switch it to mode 2, so edits to the imported
# project modules are picked up without restarting the kernel.
# NOTE: re-running this cell in a live kernel prints the "already loaded" notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import os
import warnings
# Silence every warning raised in this kernel process.
warnings.simplefilter('ignore')
# Export the same policy through the environment.
# NOTE(review): presumably so that child/worker processes inherit it - confirm.
os.environ.update(PYTHONWARNINGS="ignore")

## Import dependencies

In [33]:
import os
import pandas as pd
from IPython.display import display
from tqdm import tqdm

from configs import config
from configs.constants import ModelSetting
from utils.analyzers.subgroups_variance_analyzer import SubgroupsVarianceAnalyzer
from utils.common_helpers import create_tuned_base_model, save_metrics_to_file
from utils.custom_classes.data_loader import CompasDataset
from utils.custom_classes.generic_pipeline import GenericPipeline
from utils.analyzers.subgroups_statistical_bias_analyzer import SubgroupsStatisticalBiasAnalyzer

## Configs

In [34]:
# --- Experiment identity: both names become part of the saved metrics file name ---
EXPERIMENT_NAME = 'Hypothesis_Space'
DATASET_NAME = 'COMPAS'

# --- Number of estimators handed to the variance analyzer ---
N_ESTIMATORS = 200

# --- Sensitive attributes and their privileged values ---
# The two lists are paired by position: PRIV_VALUES[i] belongs to SENSITIVE_ATTRIBUTES[i].
SEX_priv = 1
RACE_priv = 'Caucasian'
SENSITIVE_ATTRIBUTES = ['sex', 'race']
PRIV_VALUES = [SEX_priv, RACE_priv]

# --- CSV with the tuned hyper-parameters of each model (path relative to this notebook) ---
TUNED_PARAMS_FILE_PATH = os.path.join(
    '..', '..', 'results', 'models_tuning',
    'tuning_results_COMPAS_20230115__142510.csv',
)

## Models tuned hyper-parameters

In [35]:
# Load the tuned hyper-parameters of each model from the CSV configured above
# (presumably written by a separate tuning run - see the 'models_tuning' folder in the path).
# run_experiment() reads this frame as a global when it builds each tuned base model.
models_tuned_params_df = pd.read_csv(TUNED_PARAMS_FILE_PATH)
models_tuned_params_df

Unnamed: 0,Dataset_Name,Model_Name,F1_Score,Accuracy_Score,Model_Best_Params
0,COMPAS,LogisticRegression,0.6785,0.6837,"{'max_iter': 50, 'penalty': 'none', 'solver': ..."
1,COMPAS,DecisionTreeClassifier,0.6835,0.6856,"{'criterion': 'entropy', 'max_depth': 5, 'max_..."
2,COMPAS,RandomForestClassifier,0.6921,0.6989,"{'max_depth': 3, 'max_features': 'auto', 'min_..."
3,COMPAS,XGBClassifier,0.7012,0.7064,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti..."
4,COMPAS,KNeighborsClassifier,0.6941,0.696,"{'metric': 'minkowski', 'n_neighbors': 15, 'we..."


## Load dataset

In [36]:
# Load COMPAS through the project's CompasDataset class; the path is relative,
# so it resolves against the notebook's working directory.
dataset = CompasDataset(dataset_path='../../data/COMPAS.csv')
# Preview the first rows of the dataset's X_data frame.
dataset.X_data.head()

Unnamed: 0,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,race,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,c_charge_degree_F,c_charge_degree_M,sex
0,25,0.0,-2.340451,1.0,-15.010999,African-American,1,0,0,0,1,1
1,26,0.0,0.0,0.0,0.0,Caucasian,1,0,0,1,0,0
2,21,0.0,0.0,0.0,0.0,Caucasian,0,0,1,1,0,1
3,29,0.0,0.0,0.0,6.0,African-American,1,0,0,0,1,1
4,40,0.0,0.0,0.0,7.513697,Caucasian,1,0,0,1,0,1


## Run experiments

In [37]:
def create_base_pipeline(dataset, protected_groups, priv_values, model_seed):
    """
    Build a GenericPipeline for the dataset and create its preprocessed train/test split.

    The split uses config.TEST_SET_FRACTION as the test fraction and ``model_seed`` as the seed.
    The shape of every test group of the pipeline is printed for inspection.

    :param dataset: dataset object passed to GenericPipeline and to the split
    :param protected_groups: names of the protected (sensitive) attributes
    :param priv_values: privileged values, one per protected attribute
    :param model_seed: seed used for the train/test split
    :return: the GenericPipeline instance after the split was created
    """
    pipeline = GenericPipeline(dataset, protected_groups, priv_values)
    # Only the side effect on the pipeline is needed; the return value of the split is discarded
    pipeline.create_preprocessed_train_test_split(dataset, config.TEST_SET_FRACTION, seed=model_seed)

    print('\nProtected groups splits:')
    for group_name, group_frame in pipeline.test_groups.items():
        print(group_name, group_frame.shape)

    return pipeline


def get_model_metrics(base_model, n_estimators, dataset, protected_groups, priv_values, model_seed,
                      dataset_name, base_model_name, exp_num=1):
    """
    Compute variance and bias metrics of one base model for all subgroups and save them to a file.

    A fresh pipeline (train/test split seeded with ``model_seed``) is created first. The variance
    analyzer produces the predictions and the variance metrics; the bias analyzer then derives
    the bias metrics from those predictions. Both frames are concatenated and written under
    ``../../results/hypothesis_space``.

    :param base_model: model instance to analyze
    :param n_estimators: number of estimators passed to the variance analyzer
    :param dataset: dataset object used to build the pipeline
    :param protected_groups: names of the protected (sensitive) attributes
    :param priv_values: privileged values, one per protected attribute
    :param model_seed: seed for the train/test split
    :param dataset_name: dataset name; passed to the variance analyzer and used in the result file name
    :param base_model_name: model name; passed to the variance analyzer and used in the result file name
    :param exp_num: the number of experiment; is used to name the result file with metrics
    :return: dataframe with the variance metrics followed by the bias metrics
    """
    pipeline = create_base_pipeline(dataset, protected_groups, priv_values, model_seed)
    print('\n\nX train and validation set: ')
    display(pipeline.X_train_val.head(10))

    # The analyzers must not persist anything themselves; the combined frame is saved once below
    no_persist = dict(save_results=False, result_filename=None, save_dir_path=None)

    # Variance metrics for subgroups (this step also yields the predictions)
    variance_analyzer = SubgroupsVarianceAnalyzer(
        ModelSetting.BATCH, n_estimators, base_model, base_model_name,
        pipeline.X_train_val, pipeline.y_train_val,
        pipeline.X_test, pipeline.y_test,
        pipeline.protected_groups, pipeline.priv_values, pipeline.test_groups,
        pipeline.target, dataset_name,
    )
    y_preds, variance_metrics_df = variance_analyzer.compute_metrics(**no_persist, make_plots=False)

    # Bias metrics for subgroups, derived from the predictions obtained above
    bias_analyzer = SubgroupsStatisticalBiasAnalyzer(
        pipeline.X_test, pipeline.y_test,
        pipeline.protected_groups, pipeline.priv_values,
        pipeline.test_groups,
    )
    bias_metrics_df = pd.DataFrame(bias_analyzer.compute_subgroups_metrics(y_preds, **no_persist))

    # Save variance and bias metrics together in one file
    metrics_df = pd.concat([variance_metrics_df, bias_metrics_df])
    output_dir = os.path.join('..', '..', 'results', 'hypothesis_space')
    output_name = f'{EXPERIMENT_NAME}_Metrics_{dataset_name}_Experiment_{exp_num}_{base_model_name}'
    save_metrics_to_file(metrics_df, output_name, output_dir)

    return metrics_df


In [38]:
def run_experiment(dataset, exp_num: int, model_seed: int):
    """
    Find variance and bias metrics for each model in config.MODELS_CONFIG.

    Every model gets its own seed: the n-th model (counting from 1) is analyzed with
    ``model_seed + n``. The metrics of each model are saved by get_model_metrics()
    (under results/hypothesis_space) and displayed. An error raised while building or
    analyzing a model is printed and the loop continues with the next model.

    :param dataset: dataset object handed to get_model_metrics() for every model
    :param exp_num: the number of experiment; is used to name the result file with metrics
    :param model_seed: base seed; the seed actually used for a model is this value plus the
        1-based position of the model in config.MODELS_CONFIG
    """
    banner = '#' * 30
    for position, model_config in enumerate(tqdm(config.MODELS_CONFIG), start=1):
        model_name = model_config["model_name"]
        print(banner, f' [Experiment {exp_num}] Analyze {model_name} ', banner)

        try:
            tuned_model = create_tuned_base_model(model_config['model'], model_name, models_tuned_params_df)
            results_df = get_model_metrics(tuned_model, N_ESTIMATORS, dataset, SENSITIVE_ATTRIBUTES, PRIV_VALUES,
                                           model_seed=model_seed + position,
                                           dataset_name=DATASET_NAME,
                                           base_model_name=model_name,
                                           exp_num=exp_num)
        except Exception as err:
            # Best effort: report the failure and keep going with the remaining models
            print(f'ERROR with {model_name}: ', err)
        else:
            try:
                print(f'\n[Experiment {exp_num}] Metrics confusion matrix:')
                display(results_df)
            except Exception as err:
                print(f'ERROR with {model_name}: ', err)

        print('\n\n\n')


### Experiment 1

In [39]:
# Experiment 1. model_seed is the base seed: run_experiment() increments it before each model,
# so the first model is analyzed with seed 101, the second with 102, and so on.
run_experiment(dataset, exp_num=1, model_seed=100)

  0%|          | 0/2 [00:00<?, ?it/s]

##############################  [Experiment 1] Analyze RandomForestClassifier  ##############################
Baseline X_train shape:  (4222, 12)
Baseline X_test shape:  (1056, 12)

Protected groups splits:
sex_race_priv (339, 12)
sex_race_dis (111, 12)
sex_priv (857, 12)
sex_dis (199, 12)
race_priv (427, 12)
race_dis (629, 12)


X train and validation set: 


Unnamed: 0,sex_1,juv_misd_count,age_cat_Greater than 45_1,age,c_charge_degree_F_0,priors_count,race_Caucasian,sex_0,age_cat_Greater than 45_0,age_cat_Less than 25_1,c_charge_degree_F_1,race_African-American,age_cat_25 - 45_1,age_cat_Less than 25_0,juv_fel_count,c_charge_degree_M_1,age_cat_25 - 45_0,juv_other_count,c_charge_degree_M_0
470,0,-0.12724,0,-0.525686,0,0.106154,0,1,1,0,1,1,1,1,-0.098697,0,0,-0.148179,1
1328,1,0.348324,0,0.232609,0,2.409132,0,0,1,0,1,1,1,1,-0.098697,0,0,-0.148179,1
3551,1,-0.12724,0,-0.441431,1,0.620185,1,0,1,0,0,0,1,1,-0.098697,1,0,-0.148179,0
1628,1,-0.12724,0,-0.272921,1,-0.085761,1,0,1,0,0,0,1,1,-0.098697,1,0,-0.148179,0
4138,1,-0.12724,0,-0.778452,0,-0.280851,0,0,1,0,1,1,1,1,-0.098697,0,0,1.111816,1
5030,0,-0.12724,0,-0.778452,0,0.693041,1,1,1,0,1,0,1,1,-0.098697,0,0,-0.148179,1
4510,0,-0.12724,0,-0.525686,1,-0.085761,0,1,1,0,0,1,1,1,-0.098697,1,0,-0.148179,0
1676,1,-0.12724,0,-1.199727,0,-0.661506,0,0,1,1,1,1,0,0,-0.098697,0,1,-0.148179,1
5047,1,-0.12724,0,-0.357176,1,-0.469591,1,0,1,0,0,0,1,1,-0.098697,1,0,-0.148179,0
1554,1,-0.12724,1,1.07516,1,-0.668317,0,0,0,0,0,1,0,1,-0.098697,1,1,-0.148179,0


2023-01-15 19:32:25 abstract_stability_analyzer.py INFO    : Start testing of classifier 1 / 200
2023-01-15 19:32:26 abstract_stability_analyzer.py INFO    : Classifier 1 / 200 was tested
2023-01-15 19:32:26 abstract_stability_analyzer.py INFO    : Start testing of classifier 2 / 200
2023-01-15 19:32:26 abstract_stability_analyzer.py INFO    : Classifier 2 / 200 was tested
2023-01-15 19:32:26 abstract_stability_analyzer.py INFO    : Start testing of classifier 3 / 200
2023-01-15 19:32:26 abstract_stability_analyzer.py INFO    : Classifier 3 / 200 was tested
2023-01-15 19:32:26 abstract_stability_analyzer.py INFO    : Start testing of classifier 4 / 200
2023-01-15 19:32:26 abstract_stability_analyzer.py INFO    : Classifier 4 / 200 was tested
2023-01-15 19:32:26 abstract_stability_analyzer.py INFO    : Start testing of classifier 5 / 200
2023-01-15 19:32:26 abstract_stability_analyzer.py INFO    : Classifier 5 / 200 was tested
2023-01-15 19:32:26 abstract_stability_analyzer.py INFO    :



##############################  Stability metrics  ##############################
General Ensemble Accuracy: 0.6657
Mean: 0.5295
Std: 0.0257
IQR: 0.0347
Entropy: 0.0
Jitter: 0.098
Per sample accuracy: 0.6676
Label stability: 0.8664



[Experiment 1] Metrics confusion matrix:


Unnamed: 0,overall,sex_race_priv,sex_race_dis,sex_priv,sex_dis,race_priv,race_dis
General_Ensemble_Accuracy,0.66572,0.657817,0.711712,0.655776,0.708543,0.667447,0.664547
Mean,0.52953,0.578704,0.552545,0.518016,0.579116,0.585696,0.491401
Std,0.02568,0.025871,0.027821,0.025381,0.026963,0.025873,0.025548
IQR,0.034651,0.035358,0.037442,0.034196,0.036612,0.0354,0.034142
Entropy,0.0,0.0,0.0,0.0,0.137103,0.0,0.178576
Jitter,0.097991,0.079559,0.10803,0.101176,0.084275,0.074356,0.114036
Per_Sample_Accuracy,0.667566,0.660074,0.709054,0.658804,0.705302,0.668419,0.666987
Label_Stability,0.866383,0.892478,0.856306,0.861085,0.889196,0.900351,0.843323
TPR,0.581109,0.365672,0.488889,0.611511,0.4,0.345912,0.695122
TNR,0.738137,0.84878,0.863636,0.697727,0.875969,0.858209,0.631229


 50%|█████     | 1/2 [01:04<01:04, 64.23s/it]





##############################  [Experiment 1] Analyze XGBClassifier  ##############################
Baseline X_train shape:  (4222, 12)
Baseline X_test shape:  (1056, 12)

Protected groups splits:
sex_race_priv (331, 12)
sex_race_dis (115, 12)
sex_priv (842, 12)
sex_dis (214, 12)
race_priv (430, 12)
race_dis (626, 12)


X train and validation set: 


Unnamed: 0,sex_1,juv_misd_count,age_cat_Greater than 45_1,age,c_charge_degree_F_0,priors_count,race_Caucasian,sex_0,age_cat_Greater than 45_0,age_cat_Less than 25_1,c_charge_degree_F_1,race_African-American,age_cat_25 - 45_1,age_cat_Less than 25_0,juv_fel_count,c_charge_degree_M_1,age_cat_25 - 45_0,juv_other_count,c_charge_degree_M_0
2997,1,-0.124987,0,-0.353654,0,1.257834,0,0,1,0,1,1,1,1,-0.091229,0,0,-0.173871,1
3785,0,-0.124987,0,-0.691293,1,0.104958,1,1,1,0,0,0,1,1,-0.091229,1,0,-0.173871,0
3913,1,-0.124987,0,-0.522474,1,0.873936,0,0,1,0,0,1,1,1,-0.091229,1,0,-0.173871,0
5233,1,-0.124987,1,1.165724,1,2.143334,1,0,0,0,0,0,0,1,-0.091229,1,1,-0.173871,0
44,1,-0.124987,0,-0.775703,0,-0.661654,0,0,1,0,1,1,1,1,-0.091229,0,0,-0.173871,1
3223,1,-0.124987,1,0.912494,0,0.106141,0,0,0,0,1,1,0,1,-0.091229,0,1,-0.173871,1
4307,1,-0.124987,0,0.659265,1,-0.661654,0,0,1,0,0,1,1,1,-0.091229,1,0,-0.173871,0
4665,0,-0.124987,0,0.659265,1,-0.661654,1,1,1,0,0,0,1,1,-0.091229,1,0,-0.173871,0
851,1,-0.124987,1,1.672183,1,-0.661654,1,0,0,0,0,0,0,1,-0.091229,1,1,-0.173871,0
379,1,-0.124987,1,2.009823,1,0.582694,1,0,0,0,0,0,0,1,-0.091229,1,1,-0.173871,0


2023-01-15 19:33:30 abstract_stability_analyzer.py INFO    : Start testing of classifier 1 / 200
2023-01-15 19:33:30 abstract_stability_analyzer.py INFO    : Classifier 1 / 200 was tested
2023-01-15 19:33:30 abstract_stability_analyzer.py INFO    : Start testing of classifier 2 / 200
2023-01-15 19:33:30 abstract_stability_analyzer.py INFO    : Classifier 2 / 200 was tested
2023-01-15 19:33:30 abstract_stability_analyzer.py INFO    : Start testing of classifier 3 / 200
2023-01-15 19:33:30 abstract_stability_analyzer.py INFO    : Classifier 3 / 200 was tested
2023-01-15 19:33:30 abstract_stability_analyzer.py INFO    : Start testing of classifier 4 / 200
2023-01-15 19:33:31 abstract_stability_analyzer.py INFO    : Classifier 4 / 200 was tested
2023-01-15 19:33:31 abstract_stability_analyzer.py INFO    : Start testing of classifier 5 / 200
2023-01-15 19:33:31 abstract_stability_analyzer.py INFO    : Classifier 5 / 200 was tested
2023-01-15 19:33:31 abstract_stability_analyzer.py INFO    :



##############################  Stability metrics  ##############################
General Ensemble Accuracy: 0.6894
Mean: 0.5389000177383423
Std: 0.09139999747276306
IQR: 0.123
Entropy: 0.2367
Jitter: 0.1506
Per sample accuracy: 0.6725
Label stability: 0.7897



[Experiment 1] Metrics confusion matrix:


Unnamed: 0,overall,sex_race_priv,sex_race_dis,sex_priv,sex_dis,race_priv,race_dis
General_Ensemble_Accuracy,0.689394,0.694864,0.66087,0.684086,0.71028,0.711628,0.674121
Mean,0.538875,0.602856,0.616434,0.512035,0.644478,0.619938,0.483192
Std,0.091413,0.085586,0.102336,0.089041,0.100747,0.088651,0.09331
IQR,0.122999,0.114289,0.14006,0.119282,0.137623,0.119009,0.12574
Entropy,0.236674,0.0,0.256909,0.0,0.250103,0.217079,0.0
Jitter,0.150575,0.132365,0.163839,0.14781,0.161456,0.138425,0.158921
Per_Sample_Accuracy,0.672453,0.675589,0.67213,0.665517,0.699743,0.688535,0.661406
Label_Stability,0.789678,0.811843,0.77713,0.793789,0.773505,0.802047,0.781182
TPR,0.60521,0.466165,0.42,0.637681,0.447059,0.470238,0.673716
TNR,0.764811,0.848485,0.846154,0.728972,0.883721,0.866412,0.674576


100%|██████████| 2/2 [02:38<00:00, 79.10s/it]










### Experiment 2

In [40]:
# Not run yet. To enable: run_experiment(dataset, exp_num=2, model_seed=200)