In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings
# Suppress every Python warning raised in this kernel to keep cell output readable.
# NOTE(review): this is a blanket filter — deprecation and convergence warnings are hidden too.
warnings.filterwarnings('ignore')
# Also export the setting via the PYTHONWARNINGS environment variable
# (presumably so that subprocesses spawned from this kernel inherit it — confirm).
os.environ["PYTHONWARNINGS"] = "ignore"

## Import dependencies

In [3]:
import os
import pandas as pd
from IPython.display import display
from tqdm import tqdm

from configs import config
from configs.constants import ModelSetting
from utils.analyzers.subgroups_variance_analyzer import SubgroupsVarianceAnalyzer
from utils.common_helpers import create_tuned_base_model, save_metrics_to_file
from utils.custom_classes.data_loader import CompasWithoutSensitiveAttrsDataset
from utils.custom_classes.generic_pipeline import GenericPipeline
from utils.analyzers.subgroups_statistical_bias_analyzer import SubgroupsStatisticalBiasAnalyzer

## Configs

In [4]:
# Names used to build the tuned-parameters file path and the result file names.
DATASET_NAME = "COMPAS_Without_Sensitive_Attributes"
EXPERIMENT_NAME = 'Hypothesis_Space'

# Privileged values, one per sensitive attribute.
SEX_priv = 1
RACE_priv = 'Caucasian'
# Passed as `n_estimators` to SubgroupsVarianceAnalyzer in get_model_metrics.
N_ESTIMATORS = 200
# PRIV_VALUES is positionally aligned with SENSITIVE_ATTRIBUTES ('sex' -> SEX_priv, 'race' -> RACE_priv).
SENSITIVE_ATTRIBUTES = ['sex', 'race']
PRIV_VALUES = [SEX_priv, RACE_priv]
# CSV with tuned hyper-parameters per model; the timestamped file name pins one specific tuning run.
TUNED_PARAMS_FILE_PATH = os.path.join('..', '..', 'results', 'models_tuning', f'tuning_results_{DATASET_NAME}_20230116__155240.csv')

## Models tuned hyper-parameters

In [5]:
# Load the tuned hyper-parameters table; run_experiment reads this frame as a global
# and passes it to create_tuned_base_model.
models_tuned_params_df = pd.read_csv(TUNED_PARAMS_FILE_PATH)
models_tuned_params_df

Unnamed: 0,Dataset_Name,Model_Name,F1_Score,Accuracy_Score,Model_Best_Params
0,COMPAS_Without_Sensitive_Attributes,LogisticRegression,0.6785,0.6837,"{'max_iter': 50, 'penalty': 'l2', 'solver': 'n..."
1,COMPAS_Without_Sensitive_Attributes,DecisionTreeClassifier,0.6135,0.6165,"{'criterion': 'gini', 'max_depth': 5, 'max_fea..."
2,COMPAS_Without_Sensitive_Attributes,RandomForestClassifier,0.6944,0.6989,"{'max_depth': 4, 'max_features': 0.6, 'min_sam..."
3,COMPAS_Without_Sensitive_Attributes,XGBClassifier,0.6984,0.7027,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti..."
4,COMPAS_Without_Sensitive_Attributes,KNeighborsClassifier,0.6946,0.696,"{'metric': 'manhattan', 'n_neighbors': 15, 'we..."


## Load dataset

In [6]:
# Load COMPAS through the project's dataset wrapper and preview the feature frame.
dataset = CompasWithoutSensitiveAttrsDataset(dataset_path='../../data/COMPAS.csv')
dataset.X_data.head()

Unnamed: 0,juv_fel_count,juv_misd_count,juv_other_count,priors_count,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,c_charge_degree_F,c_charge_degree_M
0,0.0,-2.340451,1.0,-15.010999,1,0,0,0,1
1,0.0,0.0,0.0,0.0,1,0,0,1,0
2,0.0,0.0,0.0,0.0,0,0,1,1,0
3,0.0,0.0,0.0,6.0,1,0,0,0,1
4,0.0,0.0,0.0,7.513697,1,0,0,1,0


## Run experiments

In [7]:
def create_base_pipeline(dataset, sensitive_attributes, priv_values, model_seed):
    """
    Build a GenericPipeline for the dataset and split it into train and test parts.

    The split uses config.TEST_SET_FRACTION as the test fraction and `model_seed` as the seed.
    The shape of every protected test group is printed for inspection.

    :param dataset: dataset object handed to GenericPipeline and to the split method
    :param sensitive_attributes: sensitive attribute names handed to GenericPipeline
    :param priv_values: privileged values handed to GenericPipeline
    :param model_seed: seed used for the train/test split
    :return: the GenericPipeline instance after the split has been created
    """
    pipeline = GenericPipeline(dataset, sensitive_attributes, priv_values)
    # The value returned by the split call is not used here.
    pipeline.create_train_test_split_without_sensitive_attrs(dataset, config.TEST_SET_FRACTION, seed=model_seed)

    print('\nProtected groups splits:')
    for group_name in pipeline.test_groups.keys():
        group_shape = pipeline.test_groups[group_name].shape
        print(group_name, group_shape)

    return pipeline


def get_model_metrics(base_model, n_estimators, dataset, sensitive_attributes, priv_values, model_seed,
                      dataset_name, base_model_name, exp_num=1):
    """
    Compute variance and bias metrics for one base model and save them to a file.

    A fresh pipeline (train/test split seeded with `model_seed`) is created, the variance
    metrics are computed with SubgroupsVarianceAnalyzer, the bias metrics with
    SubgroupsStatisticalBiasAnalyzer on the predictions returned by the variance analyzer,
    and both tables are concatenated row-wise.

    :param base_model: model instance handed to SubgroupsVarianceAnalyzer
    :param n_estimators: number of estimators handed to SubgroupsVarianceAnalyzer
    :param dataset: dataset object used to build the pipeline
    :param sensitive_attributes: sensitive attribute names used to build the pipeline
    :param priv_values: privileged values used to build the pipeline
    :param model_seed: seed for the train/test split
    :param dataset_name: dataset name; part of the result file name
    :param base_model_name: model name; part of the result file name
    :param exp_num: experiment number; part of the result file name
    :return: DataFrame with the variance metrics followed by the bias metrics
    """
    pipeline = create_base_pipeline(dataset, sensitive_attributes, priv_values, model_seed)
    print('\n\nX train and validation set: ')
    display(pipeline.X_train_val.head(10))

    # Variance metrics for subgroups; the analyzers themselves save nothing,
    # the combined table is written once at the end of this function.
    variance_analyzer = SubgroupsVarianceAnalyzer(
        ModelSetting.BATCH, n_estimators, base_model, base_model_name,
        pipeline.X_train_val, pipeline.y_train_val,
        pipeline.X_test, pipeline.y_test,
        pipeline.sensitive_attributes, pipeline.priv_values, pipeline.test_groups,
        pipeline.target, dataset_name,
    )
    y_preds, variance_metrics_df = variance_analyzer.compute_metrics(
        save_results=False,
        result_filename=None,
        save_dir_path=None,
        make_plots=False,
    )

    # Bias metrics for subgroups, computed from the predictions obtained above
    bias_analyzer = SubgroupsStatisticalBiasAnalyzer(
        pipeline.X_test, pipeline.y_test,
        pipeline.sensitive_attributes, pipeline.priv_values,
        pipeline.test_groups,
    )
    subgroups_bias_metrics = bias_analyzer.compute_subgroups_metrics(
        y_preds,
        save_results=False,
        result_filename=None,
        save_dir_path=None,
    )

    # Stack both metric tables and save them in one file
    metrics_df = pd.concat([variance_metrics_df, pd.DataFrame(subgroups_bias_metrics)])
    save_metrics_to_file(
        metrics_df,
        f'{EXPERIMENT_NAME}_Metrics_{dataset_name}_Experiment_{exp_num}_{base_model_name}',
        os.path.join('..', '..', 'results', 'hypothesis_space'),
    )

    return metrics_df


In [8]:
def run_experiment(dataset, exp_num: int, model_seed: int):
    """
    Find variance and bias metrics for each model in config.MODELS_CONFIG.
    The metrics of each model are saved by get_model_metrics (results/hypothesis_space folder)
    and displayed in the cell output.

    A failure for one model is reported together with its traceback and does not stop
    the analysis of the remaining models.

    :param dataset: dataset object passed through to get_model_metrics
    :param exp_num: the number of experiment; is used to name the result file with metrics
    :param model_seed: base seed; it is incremented by one before each model is analyzed,
        so every model runs with its own seed (model_seed + 1, model_seed + 2, ...)
    """
    import traceback  # local import so that this cell does not depend on the imports cell

    for model_config in tqdm(config.MODELS_CONFIG):
        model_name = model_config['model_name']
        print('#' * 30, f' [Experiment {exp_num}] Analyze {model_name} ', '#' * 30)
        # NOTE(review): the seed also drives the train/test split in create_base_pipeline,
        # so each model is evaluated on a different split — confirm this is intended.
        model_seed += 1
        try:
            base_model = create_tuned_base_model(model_config['model'],
                                                 model_name,
                                                 models_tuned_params_df)
            results_df = get_model_metrics(base_model, N_ESTIMATORS, dataset, SENSITIVE_ATTRIBUTES, PRIV_VALUES,
                                           model_seed=model_seed,
                                           dataset_name=DATASET_NAME,
                                           base_model_name=model_name,
                                           exp_num=exp_num)
            print(f'\n[Experiment {exp_num}] Metrics confusion matrix:')
            display(results_df)
        except Exception as err:
            # Best effort: keep going with the next model, but show where the failure
            # happened instead of only the error message.
            print(f'ERROR with {model_name}: ', err)
            traceback.print_exc()

        print('\n\n\n')


### Experiment 1

In [9]:
# TODO: add dataset as a parameter
# NOTE(review): run_experiment already takes `dataset` as its first argument — this TODO looks stale; confirm and remove.
run_experiment(dataset, exp_num=1, model_seed=100)

  0%|          | 0/5 [00:00<?, ?it/s]

##############################  [Experiment 1] Analyze LogisticRegression  ##############################
Baseline X_train shape:  (4222, 9)
Baseline X_test shape:  (1056, 9)

Protected groups splits:
sex_race_priv (339, 11)
sex_race_dis (111, 11)
sex_priv (857, 11)
sex_dis (199, 11)
race_priv (427, 11)
race_dis (629, 11)


X train and validation set: 


Unnamed: 0,age_cat_Greater than 45_0,c_charge_degree_F_1,c_charge_degree_M_0,priors_count,age_cat_Greater than 45_1,age_cat_25 - 45_1,age_cat_Less than 25_0,age_cat_Less than 25_1,juv_misd_count,age_cat_25 - 45_0,juv_other_count,juv_fel_count,c_charge_degree_F_0,c_charge_degree_M_1
470,1,1,1,0.106154,0,1,1,0,-0.12724,0,-0.148179,-0.098697,0,0
1328,1,1,1,2.409132,0,1,1,0,0.348324,0,-0.148179,-0.098697,0,0
3551,1,0,0,0.620185,0,1,1,0,-0.12724,0,-0.148179,-0.098697,1,1
1628,1,0,0,-0.085761,0,1,1,0,-0.12724,0,-0.148179,-0.098697,1,1
4138,1,1,1,-0.280851,0,1,1,0,-0.12724,0,1.111816,-0.098697,0,0
5030,1,1,1,0.693041,0,1,1,0,-0.12724,0,-0.148179,-0.098697,0,0
4510,1,0,0,-0.085761,0,1,1,0,-0.12724,0,-0.148179,-0.098697,1,1
1676,1,1,1,-0.661506,0,0,0,1,-0.12724,1,-0.148179,-0.098697,0,0
5047,1,0,0,-0.469591,0,1,1,0,-0.12724,0,-0.148179,-0.098697,1,1
1554,0,0,0,-0.668317,1,0,1,0,-0.12724,1,-0.148179,-0.098697,1,1


2023-01-16 18:08:13 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 1 / 200
2023-01-16 18:08:13 abstract_overall_variance_analyzer.py INFO    : Classifier 1 / 200 was tested
2023-01-16 18:08:13 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 2 / 200
2023-01-16 18:08:13 abstract_overall_variance_analyzer.py INFO    : Classifier 2 / 200 was tested
2023-01-16 18:08:13 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 3 / 200
2023-01-16 18:08:13 abstract_overall_variance_analyzer.py INFO    : Classifier 3 / 200 was tested
2023-01-16 18:08:13 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 4 / 200
2023-01-16 18:08:14 abstract_overall_variance_analyzer.py INFO    : Classifier 4 / 200 was tested
2023-01-16 18:08:14 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 5 / 200
2023-01-16 18:08:14 abstract_overall_variance_analyzer.py INFO    : Classifier 5 / 200 w



##############################  Stability metrics  ##############################
General Ensemble Accuracy: 0.643
Mean: 0.5254
Std: 0.0226
IQR: 0.0302
Entropy: 0.0
Jitter: 0.0364
Per sample accuracy: 0.6426
Label stability: 0.9543



[Experiment 1] Metrics confusion matrix:


Unnamed: 0,overall,sex_race_priv,sex_race_dis,sex_priv,sex_dis,race_priv,race_dis
General_Ensemble_Accuracy,0.642992,0.654867,0.657658,0.647608,0.623116,0.639344,0.645469
Mean,0.525411,0.588531,0.530116,0.516933,0.561921,0.591315,0.480671
Std,0.0226,0.02121,0.020623,0.023063,0.020606,0.021081,0.023631
IQR,0.03019,0.028079,0.027884,0.030828,0.027441,0.027832,0.03179
Entropy,0.0,0.0,0.0,0.0,0.059222,0.0,0.071021
Jitter,0.036394,0.028761,0.035182,0.037083,0.033427,0.029267,0.041232
Per_Sample_Accuracy,0.642609,0.652758,0.661306,0.647071,0.623392,0.63685,0.646518
Label_Stability,0.95429,0.963982,0.957027,0.953326,0.958442,0.963208,0.948235
TPR,0.581109,0.432836,0.577778,0.601918,0.457143,0.402516,0.667683
TNR,0.695958,0.8,0.712121,0.690909,0.713178,0.779851,0.621262


 20%|██        | 1/5 [00:50<03:22, 50.58s/it]





##############################  [Experiment 1] Analyze DecisionTreeClassifier  ##############################
Baseline X_train shape:  (4222, 9)
Baseline X_test shape:  (1056, 9)

Protected groups splits:
sex_race_priv (331, 11)
sex_race_dis (115, 11)
sex_priv (842, 11)
sex_dis (214, 11)
race_priv (430, 11)
race_dis (626, 11)


X train and validation set: 


Unnamed: 0,age_cat_Greater than 45_0,c_charge_degree_F_1,c_charge_degree_M_0,priors_count,age_cat_Greater than 45_1,age_cat_25 - 45_1,age_cat_Less than 25_0,age_cat_Less than 25_1,juv_misd_count,age_cat_25 - 45_0,juv_other_count,juv_fel_count,c_charge_degree_F_0,c_charge_degree_M_1
2997,1,1,1,1.257834,0,1,1,0,-0.124987,0,-0.173871,-0.091229,0,0
3785,1,0,0,0.104958,0,1,1,0,-0.124987,0,-0.173871,-0.091229,1,1
3913,1,0,0,0.873936,0,1,1,0,-0.124987,0,-0.173871,-0.091229,1,1
5233,0,0,0,2.143334,1,0,1,0,-0.124987,1,-0.173871,-0.091229,1,1
44,1,1,1,-0.661654,0,1,1,0,-0.124987,0,-0.173871,-0.091229,0,0
3223,0,1,1,0.106141,1,0,1,0,-0.124987,1,-0.173871,-0.091229,0,0
4307,1,0,0,-0.661654,0,1,1,0,-0.124987,0,-0.173871,-0.091229,1,1
4665,1,0,0,-0.661654,0,1,1,0,-0.124987,0,-0.173871,-0.091229,1,1
851,0,0,0,-0.661654,1,0,1,0,-0.124987,1,-0.173871,-0.091229,1,1
379,0,0,0,0.582694,1,0,1,0,-0.124987,1,-0.173871,-0.091229,1,1


2023-01-16 18:09:04 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 1 / 200
2023-01-16 18:09:04 abstract_overall_variance_analyzer.py INFO    : Classifier 1 / 200 was tested
2023-01-16 18:09:04 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 2 / 200
2023-01-16 18:09:04 abstract_overall_variance_analyzer.py INFO    : Classifier 2 / 200 was tested
2023-01-16 18:09:04 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 3 / 200
2023-01-16 18:09:04 abstract_overall_variance_analyzer.py INFO    : Classifier 3 / 200 was tested
2023-01-16 18:09:04 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 4 / 200
2023-01-16 18:09:04 abstract_overall_variance_analyzer.py INFO    : Classifier 4 / 200 was tested
2023-01-16 18:09:04 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 5 / 200
2023-01-16 18:09:04 abstract_overall_variance_analyzer.py INFO    : Classifier 5 / 200 w



##############################  Stability metrics  ##############################
General Ensemble Accuracy: 0.6951
Mean: 0.5379
Std: 0.0831
IQR: 0.1056
Entropy: 0.204
Jitter: 0.1205
Per sample accuracy: 0.6701
Label stability: 0.8446



[Experiment 1] Metrics confusion matrix:


Unnamed: 0,overall,sex_race_priv,sex_race_dis,sex_priv,sex_dis,race_priv,race_dis
General_Ensemble_Accuracy,0.695076,0.688822,0.686957,0.691211,0.71028,0.7,0.691693
Mean,0.537914,0.597559,0.535291,0.527466,0.579026,0.604989,0.491841
Std,0.083093,0.078371,0.081106,0.083846,0.080129,0.078514,0.086238
IQR,0.105571,0.100866,0.102885,0.106945,0.100164,0.099977,0.109414
Entropy,0.203968,0.183521,0.229767,0.0,0.203723,0.181207,0.0
Jitter,0.12051,0.108013,0.138567,0.120072,0.122236,0.10692,0.129846
Per_Sample_Accuracy,0.670109,0.668369,0.666261,0.665558,0.688014,0.678709,0.664201
Label_Stability,0.844555,0.861692,0.816,0.846983,0.835,0.860628,0.833514
TPR,0.629259,0.473684,0.62,0.647343,0.541176,0.464286,0.712991
TNR,0.754039,0.833333,0.738462,0.733645,0.821705,0.851145,0.667797


 40%|████      | 2/5 [01:31<02:14, 44.70s/it]





##############################  [Experiment 1] Analyze RandomForestClassifier  ##############################
Baseline X_train shape:  (4222, 9)
Baseline X_test shape:  (1056, 9)

Protected groups splits:
sex_race_priv (332, 11)
sex_race_dis (113, 11)
sex_priv (848, 11)
sex_dis (208, 11)
race_priv (427, 11)
race_dis (629, 11)


X train and validation set: 


Unnamed: 0,age_cat_Greater than 45_0,c_charge_degree_F_1,c_charge_degree_M_0,priors_count,age_cat_Greater than 45_1,age_cat_25 - 45_1,age_cat_Less than 25_0,age_cat_Less than 25_1,juv_misd_count,age_cat_25 - 45_0,juv_other_count,juv_fel_count,c_charge_degree_F_0,c_charge_degree_M_1
1825,1,0,0,-0.461578,0,1,1,0,-0.126846,0,-0.164579,-0.09486,1,1
650,1,0,0,-0.656833,0,1,1,0,-0.126846,0,-0.164579,-0.09486,1,1
500,1,1,1,-0.656833,0,1,1,0,-0.126846,0,-0.164579,-0.09486,0,0
1888,1,1,1,0.514698,0,0,0,1,-0.126846,1,-0.164579,0.94811,0,0
3503,1,1,1,-0.656833,0,0,0,1,-0.126846,1,-0.164579,-0.09486,0,0
4670,1,1,1,-0.266322,0,1,1,0,-0.126846,0,-0.164579,-0.09486,0,0
4930,1,1,1,0.738725,0,1,1,0,-0.126846,0,-0.164579,1.046237,0,0
1393,1,1,1,-0.656833,0,1,1,0,-0.126846,0,-0.164579,-0.09486,0,0
2123,1,0,0,-0.656833,0,1,1,0,-0.126846,0,-0.164579,-0.09486,1,1
3699,1,0,0,-0.461578,0,0,0,1,-0.126846,1,-0.164579,-0.09486,1,1


2023-01-16 18:09:44 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 1 / 200
2023-01-16 18:09:45 abstract_overall_variance_analyzer.py INFO    : Classifier 1 / 200 was tested
2023-01-16 18:09:45 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 2 / 200
2023-01-16 18:09:45 abstract_overall_variance_analyzer.py INFO    : Classifier 2 / 200 was tested
2023-01-16 18:09:45 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 3 / 200
2023-01-16 18:09:45 abstract_overall_variance_analyzer.py INFO    : Classifier 3 / 200 was tested
2023-01-16 18:09:45 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 4 / 200
2023-01-16 18:09:45 abstract_overall_variance_analyzer.py INFO    : Classifier 4 / 200 was tested
2023-01-16 18:09:45 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 5 / 200
2023-01-16 18:09:45 abstract_overall_variance_analyzer.py INFO    : Classifier 5 / 200 w



##############################  Stability metrics  ##############################
General Ensemble Accuracy: 0.6809
Mean: 0.5315
Std: 0.0401
IQR: 0.0535
Entropy: 0.0
Jitter: 0.0625
Per sample accuracy: 0.6745
Label stability: 0.9175



[Experiment 1] Metrics confusion matrix:


Unnamed: 0,overall,sex_race_priv,sex_race_dis,sex_priv,sex_dis,race_priv,race_dis
General_Ensemble_Accuracy,0.680871,0.668675,0.681416,0.675708,0.701923,0.681499,0.680445
Mean,0.531464,0.577999,0.549369,0.521029,0.574005,0.58363,0.496051
Std,0.040051,0.039012,0.037418,0.040556,0.03799,0.038936,0.040808
IQR,0.053482,0.052063,0.050017,0.054176,0.050655,0.051919,0.054544
Entropy,0.0,0.096778,0.0,0.100552,0.0,0.098506,0.0
Jitter,0.062457,0.058079,0.074723,0.060986,0.068454,0.058728,0.064988
Per_Sample_Accuracy,0.674474,0.659187,0.680708,0.668685,0.698077,0.672436,0.675859
Label_Stability,0.917528,0.924157,0.898053,0.919351,0.910096,0.924215,0.912989
TPR,0.591255,0.493056,0.5,0.608889,0.486842,0.488764,0.643678
TNR,0.769811,0.803191,0.788732,0.751256,0.825758,0.819277,0.725979


 60%|██████    | 3/5 [02:29<01:42, 51.09s/it]





##############################  [Experiment 1] Analyze XGBClassifier  ##############################
Baseline X_train shape:  (4222, 9)
Baseline X_test shape:  (1056, 9)

Protected groups splits:
sex_race_priv (340, 11)
sex_race_dis (123, 11)
sex_priv (837, 11)
sex_dis (219, 11)
race_priv (436, 11)
race_dis (620, 11)


X train and validation set: 


Unnamed: 0,age_cat_Greater than 45_0,c_charge_degree_F_1,c_charge_degree_M_0,priors_count,age_cat_Greater than 45_1,age_cat_25 - 45_1,age_cat_Less than 25_0,age_cat_Less than 25_1,juv_misd_count,age_cat_25 - 45_0,juv_other_count,juv_fel_count,c_charge_degree_F_0,c_charge_degree_M_1
1421,1,1,1,2.244689,0,1,1,0,-0.132706,0,-0.153069,-0.090917,0,0
43,0,0,0,0.022412,1,0,1,0,-0.132706,1,-0.153069,-0.090917,1,1
963,1,0,0,-0.660704,0,1,1,0,-0.132706,0,-0.153069,-0.090917,1,1
1863,1,0,0,-0.660704,0,0,0,1,-0.132706,1,-0.153069,-0.090917,1,1
2737,1,1,1,-0.660704,0,1,1,0,-0.132706,0,-0.153069,-0.090917,0,0
5050,0,1,1,-0.660704,1,0,1,0,-0.132706,1,-0.153069,-0.090917,0,0
480,1,1,1,0.890481,0,1,1,0,-0.132706,0,-0.7609,2.774501,0,0
3876,0,1,1,0.310137,1,0,1,0,-0.132706,1,-0.153069,-0.090917,0,0
3777,1,1,1,-0.272908,0,0,0,1,-0.132706,1,-0.153069,-0.090917,0,0
1052,1,1,1,0.308786,0,1,1,0,-0.132706,0,-0.153069,-0.090917,0,0


2023-01-16 18:10:43 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 1 / 200
2023-01-16 18:10:43 abstract_overall_variance_analyzer.py INFO    : Classifier 1 / 200 was tested
2023-01-16 18:10:43 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 2 / 200
2023-01-16 18:10:44 abstract_overall_variance_analyzer.py INFO    : Classifier 2 / 200 was tested
2023-01-16 18:10:44 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 3 / 200
2023-01-16 18:10:44 abstract_overall_variance_analyzer.py INFO    : Classifier 3 / 200 was tested
2023-01-16 18:10:44 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 4 / 200
2023-01-16 18:10:44 abstract_overall_variance_analyzer.py INFO    : Classifier 4 / 200 was tested
2023-01-16 18:10:44 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 5 / 200
2023-01-16 18:10:44 abstract_overall_variance_analyzer.py INFO    : Classifier 5 / 200 w



##############################  Stability metrics  ##############################
General Ensemble Accuracy: 0.6998
Mean: 0.5260000228881836
Std: 0.061000000685453415
IQR: 0.0813
Entropy: 0.1598
Jitter: 0.1012
Per sample accuracy: 0.6904
Label stability: 0.8582



[Experiment 1] Metrics confusion matrix:


Unnamed: 0,overall,sex_race_priv,sex_race_dis,sex_priv,sex_dis,race_priv,race_dis
General_Ensemble_Accuracy,0.699811,0.676471,0.731707,0.691756,0.730594,0.688073,0.708065
Mean,0.525984,0.596286,0.556639,0.51692,0.560622,0.589557,0.481277
Std,0.061039,0.055427,0.062925,0.059454,0.067096,0.059173,0.062351
IQR,0.081264,0.073869,0.086301,0.078469,0.091946,0.079442,0.082545
Entropy,0.159849,0.0,0.20119,0.146471,0.210976,0.0,0.162017
Jitter,0.10119,0.087277,0.129193,0.092119,0.135858,0.099854,0.10213
Per_Sample_Accuracy,0.69035,0.680691,0.726504,0.680532,0.727877,0.691468,0.689565
Label_Stability,0.858182,0.877853,0.819675,0.871935,0.805616,0.857982,0.858323
TPR,0.646586,0.485075,0.642857,0.650943,0.621622,0.506024,0.716867
TNR,0.747312,0.800971,0.777778,0.733656,0.786207,0.8,0.697917


 80%|████████  | 4/5 [03:47<01:01, 61.67s/it]





##############################  [Experiment 1] Analyze KNeighborsClassifier  ##############################
Baseline X_train shape:  (4222, 9)
Baseline X_test shape:  (1056, 9)

Protected groups splits:
sex_race_priv (330, 11)
sex_race_dis (94, 11)
sex_priv (858, 11)
sex_dis (198, 11)
race_priv (434, 11)
race_dis (622, 11)


X train and validation set: 


Unnamed: 0,age_cat_Greater than 45_0,c_charge_degree_F_1,c_charge_degree_M_0,priors_count,age_cat_Greater than 45_1,age_cat_25 - 45_1,age_cat_Less than 25_0,age_cat_Less than 25_1,juv_misd_count,age_cat_25 - 45_0,juv_other_count,juv_fel_count,c_charge_degree_F_0,c_charge_degree_M_1
597,1,0,0,0.295811,0,1,1,0,-0.133157,0,-0.146069,-0.100065,1,1
3079,0,0,0,1.86039,1,0,1,0,-0.133157,1,-0.146069,-0.100065,1,1
414,1,1,1,-0.083087,0,1,1,0,-0.133157,0,-0.146069,-0.100065,0,0
936,1,0,0,2.287894,0,0,0,1,-0.133157,1,-0.146069,-0.100065,1,1
391,0,1,1,-3.115974,1,0,1,0,-0.133157,1,-0.146069,-0.100065,0,0
2494,0,0,0,-0.651433,1,0,1,0,-0.133157,1,-0.146069,-0.100065,1,1
3709,1,1,1,-0.651433,0,0,0,1,-0.133157,1,-0.146069,-0.100065,0,0
1102,1,1,1,0.106362,0,1,1,0,-0.133157,0,-0.146069,-0.100065,0,0
3241,0,1,1,-0.461984,1,0,1,0,-0.133157,1,-0.146069,-0.100065,0,0
124,1,1,1,-0.272535,0,1,1,0,-0.133157,0,-0.146069,-0.100065,0,0


2023-01-16 18:12:01 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 1 / 200
2023-01-16 18:12:01 abstract_overall_variance_analyzer.py INFO    : Classifier 1 / 200 was tested
2023-01-16 18:12:01 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 2 / 200
2023-01-16 18:12:01 abstract_overall_variance_analyzer.py INFO    : Classifier 2 / 200 was tested
2023-01-16 18:12:01 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 3 / 200
2023-01-16 18:12:01 abstract_overall_variance_analyzer.py INFO    : Classifier 3 / 200 was tested
2023-01-16 18:12:01 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 4 / 200
2023-01-16 18:12:01 abstract_overall_variance_analyzer.py INFO    : Classifier 4 / 200 was tested
2023-01-16 18:12:01 abstract_overall_variance_analyzer.py INFO    : Start testing of classifier 5 / 200
2023-01-16 18:12:01 abstract_overall_variance_analyzer.py INFO    : Classifier 5 / 200 w



##############################  Stability metrics  ##############################
General Ensemble Accuracy: 0.6884
Mean: 0.5387
Std: 0.1171
IQR: 0.162
Entropy: 0.3003
Jitter: 0.1914
Per sample accuracy: 0.6526
Label stability: 0.7306



[Experiment 1] Metrics confusion matrix:


Unnamed: 0,overall,sex_race_priv,sex_race_dis,sex_priv,sex_dis,race_priv,race_dis
General_Ensemble_Accuracy,0.688447,0.687879,0.680851,0.688811,0.686869,0.68894,0.688103
Mean,0.538719,0.602068,0.536376,0.529157,0.580152,0.606297,0.491566
Std,0.117054,0.113966,0.119373,0.117021,0.1172,0.11427,0.118997
IQR,0.162042,0.156869,0.167376,0.160878,0.167088,0.159255,0.163987
Entropy,0.300261,0.262233,0.0,0.296586,0.0,0.271286,0.320478
Jitter,0.191448,0.164676,0.217273,0.188377,0.204754,0.171569,0.205318
Per_Sample_Accuracy,0.652637,0.654818,0.628404,0.653054,0.650833,0.658721,0.648392
Label_Stability,0.730559,0.769697,0.685745,0.735921,0.707323,0.759424,0.710418
TPR,0.610656,0.496296,0.551724,0.630952,0.485294,0.482759,0.681529
TNR,0.755282,0.820513,0.738462,0.744292,0.792308,0.826923,0.694805


100%|██████████| 5/5 [04:39<00:00, 55.89s/it]








