In [21]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
import os
import warnings
# Silence every warning raised inside this kernel process.
warnings.filterwarnings('ignore')
# Also publish the same policy through the environment.
# NOTE(review): presumably so that child Python processes inherit it — confirm this is needed.
os.environ["PYTHONWARNINGS"] = "ignore"

## Import dependencies

In [23]:
import os
import pandas as pd
from IPython.display import display
from tqdm import tqdm

from configs import config
from configs.constants import ModelSetting
from utils.analyzers.stability_fairness_analyzer import StabilityFairnessAnalyzer
from utils.common_helpers import create_tuned_base_model, save_metrics_to_file
from utils.custom_classes.data_loader import CompasDataset
from utils.custom_classes.generic_pipeline import GenericPipeline
from utils.analyzers.bias_analyzer import BiasAnalyzer

## Configs

In [24]:
# --- Experiment identity (both values are embedded in the name of the saved metrics file) ---
EXPERIMENT_NAME = 'Hypothesis_Space'
DATASET_NAME = "COMPAS"

# --- Value handed to get_model_metrics as `n_estimators` ---
N_ESTIMATORS = 200

# --- Protected attributes and the privileged value of each one ---
# PRIV_VALUES is listed in the same order as PROTECTED_GROUPS (sex first, then race).
SEX_priv, RACE_priv = 1, 'Caucasian'
PROTECTED_GROUPS = ['sex', 'race']
PRIV_VALUES = [SEX_priv, RACE_priv]

# --- CSV with the tuned hyper-parameters of the models ---
TUNED_PARAMS_FILE_PATH = os.path.join(
    '..', '..', 'results', 'models_tuning',
    'tuning_results_COMPAS_20230115__142510.csv',
)

## Models tuned hyper-parameters

In [25]:
# Read the tuning-results CSV located at TUNED_PARAMS_FILE_PATH.
# The resulting frame is read later as a module-level variable by run_experiment.
models_tuned_params_df = pd.read_csv(TUNED_PARAMS_FILE_PATH)
# Bare last expression: renders the whole table as the cell output.
models_tuned_params_df

Unnamed: 0,Dataset_Name,Model_Name,F1_Score,Accuracy_Score,Model_Best_Params
0,COMPAS,LogisticRegression,0.6785,0.6837,"{'max_iter': 50, 'penalty': 'none', 'solver': ..."
1,COMPAS,DecisionTreeClassifier,0.6835,0.6856,"{'criterion': 'entropy', 'max_depth': 5, 'max_..."
2,COMPAS,RandomForestClassifier,0.6921,0.6989,"{'max_depth': 3, 'max_features': 'auto', 'min_..."
3,COMPAS,XGBClassifier,0.7012,0.7064,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti..."
4,COMPAS,KNeighborsClassifier,0.6941,0.696,"{'metric': 'minkowski', 'n_neighbors': 15, 'we..."


## Load dataset

In [26]:
# Build the project's CompasDataset from the CSV file; the relative path is resolved
# against the working directory of the kernel.
dataset = CompasDataset(dataset_path='../../data/COMPAS.csv')
# Preview the first rows of the dataset's X_data frame.
dataset.X_data.head()

Unnamed: 0,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,race,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,c_charge_degree_F,c_charge_degree_M,sex
0,25,0.0,-2.340451,1.0,-15.010999,African-American,1,0,0,0,1,1
1,26,0.0,0.0,0.0,0.0,Caucasian,1,0,0,1,0,0
2,21,0.0,0.0,0.0,0.0,Caucasian,0,0,1,1,0,1
3,29,0.0,0.0,0.0,6.0,African-American,1,0,0,0,1,1
4,40,0.0,0.0,0.0,7.513697,Caucasian,1,0,0,1,0,1


## Run experiments

In [27]:
def create_base_pipeline(dataset, protected_groups, priv_values, model_seed):
    """
    Build a GenericPipeline for the dataset and run its preprocessed train/test split.

    After the split, the shape of every entry in the pipeline's ``test_groups`` is printed.

    :param dataset: dataset object passed both to GenericPipeline and to the split
    :param protected_groups: protected attributes forwarded to GenericPipeline
    :param priv_values: privileged values forwarded to GenericPipeline
    :param model_seed: seed forwarded to the train/test split
    :return: the GenericPipeline instance after the split was performed
    """
    pipeline = GenericPipeline(dataset, protected_groups, priv_values)
    # The return value of the split is not used here; the pipeline attributes are read instead.
    pipeline.create_preprocessed_train_test_split(
        dataset,
        config.TEST_SET_FRACTION,
        seed=model_seed,
    )

    print('\nProtected groups splits:')
    test_groups = pipeline.test_groups
    for group_name in test_groups.keys():
        print(group_name, test_groups[group_name].shape)

    return pipeline


def get_model_metrics(base_model, n_estimators, dataset, protected_groups, priv_values, model_seed,
                      dataset_name, base_model_name, exp_num=1):
    """
    Compute variance and bias metrics of one base model and save them to a single file.

    The metrics file is written with save_metrics_to_file into ../../results/hypothesis_space;
    its name is built from EXPERIMENT_NAME, the dataset name, the experiment number and
    the base model name.

    :param base_model: model instance passed to StabilityFairnessAnalyzer
    :param n_estimators: number of estimators passed to StabilityFairnessAnalyzer
    :param dataset: dataset object used to build the base pipeline
    :param protected_groups: protected attributes used to build the base pipeline
    :param priv_values: privileged values used to build the base pipeline
    :param model_seed: seed used for the train/test split of the base pipeline
    :param dataset_name: dataset name; passed to the analyzer and used in the result filename
    :param base_model_name: model name; passed to the analyzer and used in the result filename
    :param exp_num: the number of experiment; is used to name the result file with metrics
    :return: data frame with the variance metrics rows followed by the bias metrics rows
    """
    pipeline = create_base_pipeline(dataset, protected_groups, priv_values, model_seed)

    # Variance metrics for subgroups; the analyzer itself is told not to save or plot anything
    stability_analyzer = StabilityFairnessAnalyzer(
        ModelSetting.BATCH, n_estimators, base_model, base_model_name,
        pipeline.X_train_val, pipeline.y_train_val,
        pipeline.X_test, pipeline.y_test,
        pipeline.protected_groups, pipeline.priv_values, pipeline.test_groups,
        pipeline.target, dataset_name,
    )
    y_preds, variance_metrics_df = stability_analyzer.compute_metrics(
        save_results=False,
        result_filename=None,
        save_dir_path=None,
        make_plots=False,
    )

    # Bias metrics for subgroups, computed from the predictions returned above
    bias_analyzer = BiasAnalyzer(
        pipeline.X_test, pipeline.y_test,
        pipeline.protected_groups, pipeline.priv_values,
        pipeline.test_groups,
    )
    subgroup_metrics = bias_analyzer.compute_subgroups_metrics(
        y_preds,
        save_results=False,
        result_filename=None,
        save_dir_path=None,
    )
    bias_metrics_df = pd.DataFrame(subgroup_metrics)

    # Stack both kinds of metrics and save them as one file
    metrics_df = pd.concat([variance_metrics_df, bias_metrics_df])
    save_metrics_to_file(
        metrics_df,
        f'{EXPERIMENT_NAME}_Metrics_{dataset_name}_Experiment_{exp_num}_{base_model_name}',
        os.path.join('..', '..', 'results', 'hypothesis_space'),
    )

    return metrics_df


In [28]:
def run_experiment(dataset, exp_num: int, model_seed: int):
    """
    Find variance and bias metrics for each model in config.MODELS_CONFIG.

    Every model is tuned with the module-level ``models_tuned_params_df`` and evaluated
    with get_model_metrics, which saves the metrics in the results/hypothesis_space folder.
    If a model fails, the error is printed and the loop continues with the next model.

    :param dataset: dataset object forwarded to get_model_metrics
    :param exp_num: the number of experiment; is used to name the result file with metrics
    :param model_seed: base seed; the i-th model (counting from 1) is run with model_seed + i
    """
    banner = '#' * 30
    for position, model_config in enumerate(tqdm(config.MODELS_CONFIG), start=1):
        model_name = model_config["model_name"]
        print(banner, f' [Experiment {exp_num}] Analyze {model_name} ', banner)
        # The seed advances for every model, including the ones that fail below
        current_seed = model_seed + position
        try:
            base_model = create_tuned_base_model(model_config['model'], model_name, models_tuned_params_df)
            results_df = get_model_metrics(
                base_model, N_ESTIMATORS, dataset, PROTECTED_GROUPS, PRIV_VALUES,
                model_seed=current_seed,
                dataset_name=DATASET_NAME,
                base_model_name=model_name,
                exp_num=exp_num,
            )
            print(f'\n[Experiment {exp_num}] Metrics confusion matrix:')
            display(results_df)
        except Exception as err:
            # Best-effort loop: report the failure and move on to the next model
            print(f'ERROR with {model_name}: ', err)

        print('\n\n\n')


### Experiment 1

In [29]:
# TODO: add dataset as a parameter
# NOTE(review): run_experiment already receives `dataset` in the call below — confirm whether
# this TODO is still needed.
run_experiment(dataset, exp_num=1, model_seed=100)

  0%|          | 0/1 [00:00<?, ?it/s]

##############################  [Experiment 1] Analyze DecisionTreeClassifier  ##############################
Baseline X_train shape:  (4222, 12)
Baseline X_test shape:  (1056, 12)


2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Start testing of classifier 1 / 200
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Classifier 1 / 200 was tested
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Start testing of classifier 2 / 200
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Classifier 2 / 200 was tested
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Start testing of classifier 3 / 200
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Classifier 3 / 200 was tested
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Start testing of classifier 4 / 200
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Classifier 4 / 200 was tested
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Start testing of classifier 5 / 200
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Classifier 5 / 200 was tested
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    :


Protected groups splits:
sex_race_priv (339, 12)
sex_race_dis (111, 12)
sex_priv (857, 12)
sex_dis (199, 12)
race_priv (427, 12)
race_dis (629, 12)


2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Start testing of classifier 24 / 200
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Classifier 24 / 200 was tested
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Start testing of classifier 25 / 200
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Classifier 25 / 200 was tested
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Start testing of classifier 26 / 200
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Classifier 26 / 200 was tested
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Start testing of classifier 27 / 200
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Classifier 27 / 200 was tested
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Start testing of classifier 28 / 200
2023-01-15 18:59:36 abstract_stability_analyzer.py INFO    : Classifier 28 / 200 was tested
2023-01-15 18:59:36 abstract_stability_analyzer.py



##############################  Stability metrics  ##############################
General Ensemble Accuracy: 0.679
Mean: 0.53
Std: 0.1049
IQR: 0.1363
Entropy: 0.0
Jitter: 0.1761
Per sample accuracy: 0.6535
Label stability: 0.7482



[Experiment 1] Metrics confusion matrix:


Unnamed: 0,overall,sex_race_priv,sex_race_dis,sex_priv,sex_dis,race_priv,race_dis
General_Ensemble_Accuracy,0.679,0.6667,0.7207,0.6768,0.6884,0.6628,0.69
Mean,0.53,0.5922,0.5694,0.5147,0.5957,0.5998,0.4826
Std,0.1049,0.0974,0.1201,0.1006,0.1235,0.1037,0.1058
IQR,0.1363,0.1273,0.1531,0.1322,0.1541,0.133,0.1385
Entropy,0.0,0.0,0.3138,0.0,0.2976,0.0,0.2914
Jitter,0.1761,0.1553,0.2025,0.1724,0.1917,0.16,0.1869
Per_Sample_Accuracy,0.6535,0.6563,0.6827,0.6508,0.6652,0.6535,0.6534
Label_Stability,0.7482,0.7792,0.7129,0.7532,0.727,0.7721,0.7321
TPR,0.62423,0.455224,0.6,0.645084,0.5,0.433962,0.716463
TNR,0.725835,0.804878,0.80303,0.706818,0.790698,0.798507,0.66113


100%|██████████| 1/1 [00:34<00:00, 34.66s/it]










### Experiment 2

In [30]:
# run_experiment(dataset, exp_num=2, model_seed=200)