In [4]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and use mode 2, which re-imports edited modules
# before each cell execution.
%load_ext autoreload
%autoreload 2

In [5]:
import os
import shutil
import warnings
# Silence every warning category for the rest of this kernel session.
warnings.filterwarnings('ignore')
# Also export the filter through the environment; presumably so that processes
# spawned later inherit it — TODO confirm this is actually needed.
os.environ["PYTHONWARNINGS"] = "ignore"

In [6]:
# Make sure the notebook runs from the repository root: when the kernel was not
# started in the "Virny" folder, step two levels up (assumes the notebook sits
# two directories below the root — TODO confirm).
# os.path.basename honours the platform's path separator, whereas splitting on
# '/' never matches the folder name on Windows.
cur_folder_name = os.path.basename(os.getcwd())
if cur_folder_name != "Virny":
    os.chdir(os.path.join(os.pardir, os.pardir))

print('Current location: ', os.getcwd())

Current location:  /home/denys_herasymuk/UCU/4course_2term/Bachelor_Thesis/Code/Virny


# Benchmark

## Import dependencies

In [7]:
import os
import pandas as pd
from datetime import datetime, timezone

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from virny.user_interfaces.metrics_computation_interfaces import compute_metrics_multiple_runs
from virny.utils.custom_initializers import create_config_obj, read_model_metric_dfs
from virny.custom_classes.base_dataset import BaseDataset

## Initialize Input Variables

### Create a Dataset class

In [8]:
class CompasWithoutSensitiveAttrsDataset(BaseDataset):
    """
    COMPAS dataset wrapper whose feature columns leave out the sensitive attributes,
     so that blind classifiers can be tested

    Parameters
    ----------
    dataset_path
        Path to a dataset file

    """
    def __init__(self, dataset_path: str):
        # Feature columns handed to the base class, split by kind
        numeric_features = ['juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count']
        categorical_features = ['age_cat_25 - 45', 'age_cat_Greater than 45', 'age_cat_Less than 25',
                                'c_charge_degree_F', 'c_charge_degree_M']

        # Columns that are cast to int right after loading
        columns_to_cast = ['recidivism', 'age', 'age_cat_25 - 45', 'age_cat_Greater than 45',
                           'age_cat_Less than 25', 'c_charge_degree_F', 'c_charge_degree_M', 'sex']

        # Load the CSV and apply the initial data type transformation
        compas_df = pd.read_csv(dataset_path).astype(dict.fromkeys(columns_to_cast, "int"))

        super().__init__(
            pandas_df=compas_df,
            features=numeric_features + categorical_features,
            target='recidivism',
            numerical_columns=numeric_features,
            categorical_columns=categorical_features
        )

In [9]:
# Build the dataset from virny/datasets/COMPAS.csv (path is relative to the current working directory).
dataset = CompasWithoutSensitiveAttrsDataset(dataset_path=os.path.join('virny', 'datasets', 'COMPAS.csv'))
# Preview the first rows of the first six columns of X_data.
dataset.X_data[dataset.X_data.columns[:6]].head()

Unnamed: 0,juv_fel_count,juv_misd_count,juv_other_count,priors_count,age_cat_25 - 45,age_cat_Greater than 45
0,0.0,-2.340451,1.0,-15.010999,1,0
1,0.0,0.0,0.0,0.0,1,0
2,0.0,0.0,0.0,0.0,0,0
3,0.0,0.0,0.0,6.0,1,0
4,0.0,0.0,0.0,7.513697,1,0


### Create a config object

In [10]:
# Folder that holds the config file and the results directories used below.
ROOT_DIR = os.path.join('docs', 'examples')
config_yaml_path = os.path.join(ROOT_DIR, 'experiment_compas_config.yaml')
# Experiment settings as YAML text; the file written here is read back by create_config_obj.
config_yaml_content = """
dataset_name: COMPAS_Without_Sensitive_Attributes
test_set_fraction: 0.2
bootstrap_fraction: 0.8
n_estimators: 10
#runs_seed_lst: [100, 200, 300, 400, 500, 600]
runs_seed_lst: [100, 200]
sensitive_attributes_dct: {'sex': 0, 'race': 'Caucasian', 'sex&race': None}
"""

# Mode 'w' overwrites any existing config file at this path on every run of the cell.
with open(config_yaml_path, 'w', encoding='utf-8') as f:
    f.write(config_yaml_content)

In [11]:
# Load the experiment config from the YAML file written above.
config = create_config_obj(config_yaml_path=config_yaml_path)
# Results of this notebook's runs go under benchmark_results/<dataset_name>.
SAVE_RESULTS_DIR_PATH = os.path.join(ROOT_DIR, 'results', 'benchmark', 'benchmark_results', config.dataset_name)
# Reference results are read from standard_results/<dataset_name>; no cell visible here writes to it,
# so those files presumably have to exist beforehand — TODO confirm.
STANDARD_RESULTS_DIR_PATH = os.path.join(ROOT_DIR, 'results', 'benchmark', 'standard_results', config.dataset_name)

In [14]:
def clear_directory(dir_path):
    """
    Delete everything inside a directory while keeping the directory itself.

    Files and symlinks are unlinked, sub-directories are removed recursively.
    A failure on one entry is printed and the remaining entries are still processed.

    Parameters
    ----------
    dir_path
        Path to the directory to empty; nothing is deleted if it does not exist

    """
    if os.path.exists(dir_path):
        for entry_name in os.listdir(dir_path):
            entry_path = os.path.join(dir_path, entry_name)
            try:
                # A symlink that points at a directory is unlinked, never followed
                if os.path.isdir(entry_path) and not os.path.islink(entry_path):
                    shutil.rmtree(entry_path)
                elif os.path.isfile(entry_path) or os.path.islink(entry_path):
                    os.unlink(entry_path)
            except Exception as error:
                print('Failed to delete %s. Reason: %s' % (entry_path, error))

        print('Directory is cleared')
    else:
        print('Directory does not exist --> skip deletion')

In [15]:
# Empty the benchmark results directory before new metrics are computed;
# presumably so stale files are not read back later — TODO confirm.
clear_directory(SAVE_RESULTS_DIR_PATH)

### Create a models config

In [16]:
# Models to benchmark, keyed by name; the keys are reused later to look up each model's metrics.
# The RandomForestClassifier and XGBClassifier entries are commented out, so only two models are run.
models_config = {
    'DecisionTreeClassifier': DecisionTreeClassifier(criterion='gini',
                                                     max_depth=20,
                                                     max_features=0.6,
                                                     min_samples_split=0.1),
    'LogisticRegression': LogisticRegression(C=1,
                                             max_iter=50,
                                             penalty='l2',
                                             solver='newton-cg'),
#     'RandomForestClassifier': RandomForestClassifier(max_depth=4,
#                                                      max_features=0.6,
#                                                      min_samples_leaf=1,
#                                                      n_estimators=50),
#     'XGBClassifier': XGBClassifier(learning_rate=0.1,
#                                    max_depth=5,
#                                    n_estimators=20),
}

## Subgroup Metrics Computation

In [17]:
# Compute metrics for the configured models; the result is indexed by model name in the next cell.
# Results are presumably also saved under SAVE_RESULTS_DIR_PATH, from where they are read back later — TODO confirm.
multiple_run_metrics_dct = compute_metrics_multiple_runs(dataset, config, models_config, SAVE_RESULTS_DIR_PATH, debug_mode=False)

Multiple runs progress:   0%|          | 0/2 [00:00<?, ?it/s]

Analyze models in one run:   0%|          | 0/2 [00:00<?, ?it/s]

##############################  [Model 1 / 2] Analyze DecisionTreeClassifier  ##############################
Model random_state:  101
Baseline X_train shape:  (4222, 9)
Baseline X_test shape:  (1056, 9)




2023-02-03 23:13:31 abstract_overall_variance_analyzer.py INFO    : Start classifiers testing by bootstrap


Classifiers testing by bootstrap:   0%|          | 0/10 [00:00<?, ?it/s]





2023-02-03 23:13:31 abstract_overall_variance_analyzer.py INFO    : Successfully tested classifiers by bootstrap
2023-02-03 23:13:31 abstract_overall_variance_analyzer.py INFO    : Successfully computed predict proba metrics






##############################  [Model 2 / 2] Analyze LogisticRegression  ##############################
Model random_state:  102
Baseline X_train shape:  (4222, 9)
Baseline X_test shape:  (1056, 9)




2023-02-03 23:13:32 abstract_overall_variance_analyzer.py INFO    : Start classifiers testing by bootstrap


Classifiers testing by bootstrap:   0%|          | 0/10 [00:00<?, ?it/s]





2023-02-03 23:13:32 abstract_overall_variance_analyzer.py INFO    : Successfully tested classifiers by bootstrap
2023-02-03 23:13:33 abstract_overall_variance_analyzer.py INFO    : Successfully computed predict proba metrics








Analyze models in one run:   0%|          | 0/2 [00:00<?, ?it/s]

##############################  [Model 1 / 2] Analyze DecisionTreeClassifier  ##############################
Model random_state:  201
Baseline X_train shape:  (4222, 9)
Baseline X_test shape:  (1056, 9)




2023-02-03 23:13:33 abstract_overall_variance_analyzer.py INFO    : Start classifiers testing by bootstrap


Classifiers testing by bootstrap:   0%|          | 0/10 [00:00<?, ?it/s]





2023-02-03 23:13:33 abstract_overall_variance_analyzer.py INFO    : Successfully tested classifiers by bootstrap
2023-02-03 23:13:34 abstract_overall_variance_analyzer.py INFO    : Successfully computed predict proba metrics






##############################  [Model 2 / 2] Analyze LogisticRegression  ##############################
Model random_state:  202
Baseline X_train shape:  (4222, 9)
Baseline X_test shape:  (1056, 9)




2023-02-03 23:13:34 abstract_overall_variance_analyzer.py INFO    : Start classifiers testing by bootstrap


Classifiers testing by bootstrap:   0%|          | 0/10 [00:00<?, ?it/s]





2023-02-03 23:13:35 abstract_overall_variance_analyzer.py INFO    : Successfully tested classifiers by bootstrap
2023-02-03 23:13:35 abstract_overall_variance_analyzer.py INFO    : Successfully computed predict proba metrics








In [18]:
# Use the metrics of the first model listed in models_config as a sample.
sample_model_metrics_df = multiple_run_metrics_dct[list(models_config.keys())[0]]
# Show up to the first 20 rows.
sample_model_metrics_df.head(20)

Unnamed: 0,Metric,overall,sex_priv,sex_dis,race_priv,race_dis,sex&race_priv,sex&race_dis,Model_Seed,Model_Name,Run_Number
0,Mean,0.523945,0.558551,0.515909,0.588504,0.480118,0.588024,0.468318,101,DecisionTreeClassifier,Run_1
1,Std,0.069135,0.075296,0.067704,0.066244,0.071097,0.081376,0.07123,101,DecisionTreeClassifier,Run_1
2,IQR,0.080063,0.083216,0.079331,0.078753,0.080953,0.096722,0.082762,101,DecisionTreeClassifier,Run_1
3,Entropy,0.0,0.240969,0.0,0.0,0.22721,0.227224,0.221926,101,DecisionTreeClassifier,Run_1
4,Jitter,0.154209,0.175209,0.149332,0.139006,0.164529,0.165909,0.160661,101,DecisionTreeClassifier,Run_1
5,Per_Sample_Accuracy,0.657576,0.673869,0.653792,0.660656,0.655485,0.653409,0.648069,101,DecisionTreeClassifier,Run_1
6,Label_Stability,0.791288,0.763819,0.797666,0.806557,0.780922,0.770455,0.785714,101,DecisionTreeClassifier,Run_1
7,TPR,0.603696,0.528571,0.616307,0.446541,0.679878,0.44,0.696113,101,DecisionTreeClassifier,Run_1
8,TNR,0.741652,0.782946,0.729545,0.794776,0.694352,0.730159,0.655319,101,DecisionTreeClassifier,Run_1
9,PPV,0.666667,0.569231,0.683511,0.563492,0.707937,0.392857,0.708633,101,DecisionTreeClassifier,Run_1


## Create a Benchmark Report

In [51]:
def convert_transposed_df_to_df(delta_model_metrics_df, subgroup_names, metric_names):
    """
    Turn a metric-per-row dataframe into a subgroup-per-row dataframe.

    Parameters
    ----------
    delta_model_metrics_df
        Dataframe with a 'Metric' column and one column per subgroup
    subgroup_names
        Subgroup columns to read; each one becomes a row of the result
    metric_names
        Metric names to read; each one becomes a column of the result

    Returns
    -------
    Dataframe with a 'Subgroup' column followed by one column per metric name.
     When a metric occurs in several rows, the first matching row is used

    """
    records = []
    for subgroup_name in subgroup_names:
        record = {'Subgroup': subgroup_name}
        for metric_name in metric_names:
            metric_mask = delta_model_metrics_df['Metric'] == metric_name
            # First matching row wins, as with .values[0] on the filtered frame
            record[metric_name] = delta_model_metrics_df.loc[metric_mask, subgroup_name].values[0]
        records.append(record)

    return pd.DataFrame(records, columns=['Subgroup', *metric_names])


def populate_benchmark_report(report_df, models_metrics_dct, standard_models_metrics_dct, dataset_name, sensitive_attributes_dct):
    """
    Append per-model metric deltas (standard minus benchmark, multiplied by 100) to a report.

    Parameters
    ----------
    report_df
        Report accumulated so far; may be an empty dataframe
    models_metrics_dct
        Dict of benchmark metrics dataframes keyed by model name. Each dataframe has
         a 'Metric' column and one column per subgroup
    standard_models_metrics_dct
        Dict of standard metrics dataframes with the same layout; it must contain
         every model name present in models_metrics_dct
    dataset_name
        Value written to the 'Dataset' column
    sensitive_attributes_dct
        Dict whose keys are sensitive attribute names. Only the keys are used,
         to derive the '<name>_priv' and '<name>_dis' subgroup columns

    Returns
    -------
    Dataframe with the rows of report_df followed by one row per (model, subgroup).
     Metric columns are named 'Delta%_<metric>'. report_df itself is returned
     when models_metrics_dct is empty

    """
    # The subgroup columns depend only on the sensitive attributes, so build them once
    subgroup_names = ['overall']
    for group_name in sensitive_attributes_dct.keys():
        subgroup_names.append(group_name + '_priv')
        subgroup_names.append(group_name + '_dis')

    # Collect the pieces and concatenate once instead of re-copying the report per model
    report_parts = [report_df]
    for model_name, benchmark_model_metrics_df in models_metrics_dct.items():
        standard_model_metrics_df = standard_models_metrics_dct[model_name]

        # Align both frames on the metric name instead of the positional row index, so that
        # a missing or reordered metric in either frame cannot pair values of different metrics.
        # Only the first row per metric is kept, which is the row the conversion below reads.
        standard_by_metric = standard_model_metrics_df.drop_duplicates(subset='Metric') \
            .set_index('Metric')[subgroup_names]
        benchmark_by_metric = benchmark_model_metrics_df.drop_duplicates(subset='Metric') \
            .set_index('Metric')[subgroup_names]
        delta_by_metric = standard_by_metric.sub(benchmark_by_metric, fill_value=0) * 100

        # Report exactly the metrics of the standard frame, in its order
        delta_model_metrics_df = delta_by_metric.reindex(standard_by_metric.index).reset_index()

        metric_names = list(delta_model_metrics_df['Metric'].unique())
        converted_df = convert_transposed_df_to_df(delta_model_metrics_df, subgroup_names, metric_names)
        converted_df['Dataset'] = dataset_name
        converted_df['Model'] = model_name
        converted_df = converted_df.rename(columns={metric_name: 'Delta%_' + metric_name for metric_name in metric_names})
        # Put the identifying columns first
        columns_positions = ['Dataset', 'Model'] + [col for col in converted_df.columns if col not in ['Dataset', 'Model']]
        report_parts.append(converted_df[columns_positions])

    if len(report_parts) == 1:
        return report_df

    return pd.concat(report_parts)

In [41]:
def create_averaged_dfs_dict(models_metrics_dct):
    """
    Average each model's metrics over its runs.

    Parameters
    ----------
    models_metrics_dct
        Dict of metrics dataframes keyed by model name. Each dataframe has 'Metric' and
         'Model_Name' columns; 'Model_Seed' and 'Run_Number' columns are left out before averaging

    Returns
    -------
    Dict with the same keys, where each value holds the mean of the remaining columns
     per ('Metric', 'Model_Name') pair, with the group keys restored as columns

    """
    run_specific_columns = ('Model_Seed', 'Run_Number')
    averaged_metrics_dct = {}
    for model_name, metrics_df in models_metrics_dct.items():
        kept_columns = [col for col in metrics_df.columns if col not in run_specific_columns]
        grouped = metrics_df[kept_columns].groupby(['Metric', 'Model_Name'])
        averaged_metrics_dct[model_name] = grouped.mean().reset_index()

    return averaged_metrics_dct

In [47]:
# Read per-model metrics from the benchmark results directory and from the standard results directory
# (read_model_metric_dfs presumably loads files saved by earlier runs — TODO confirm).
models_metrics_dct = read_model_metric_dfs(SAVE_RESULTS_DIR_PATH, model_names=models_config.keys())
standard_models_metrics_dct = read_model_metric_dfs(STANDARD_RESULTS_DIR_PATH, model_names=models_config.keys())

# Average every model's metrics per Metric and Model_Name, leaving out the seed and run columns.
avg_models_metrics_dct = create_averaged_dfs_dict(models_metrics_dct)
avg_standard_models_metrics_dct = create_averaged_dfs_dict(standard_models_metrics_dct)

In [48]:
# Preview the averaged metrics of the first model in the dict.
avg_models_metrics_dct[list(avg_models_metrics_dct.keys())[0]].head()

Unnamed: 0,Metric,Model_Name,overall,sex_priv,sex_dis,race_priv,race_dis,sex&race_priv,sex&race_dis
0,Accuracy,DecisionTreeClassifier,0.673295,0.679234,0.671919,0.662626,0.680342,0.662944,0.677754
1,Entropy,DecisionTreeClassifier,0.0,0.213263,0.0,0.0,0.113605,0.113612,0.110963
2,F1,DecisionTreeClassifier,0.635651,0.566,0.648309,0.500019,0.693246,0.481741,0.7049
3,FNR,DecisionTreeClassifier,0.373941,0.430519,0.363957,0.538634,0.297256,0.514375,0.284452
4,FPR,DecisionTreeClassifier,0.286661,0.258934,0.294616,0.220933,0.342479,0.253102,0.365568


In [52]:
# Start from an empty report and fill it with per-model deltas
# (standard minus benchmark averaged metrics, multiplied by 100).
report_df = pd.DataFrame()
report_df = populate_benchmark_report(report_df, avg_models_metrics_dct, avg_standard_models_metrics_dct,
                                      config.dataset_name, config.sensitive_attributes_dct)

In [53]:
# Display the assembled benchmark report.
report_df

Unnamed: 0,Dataset,Model,Subgroup,Delta_Accuracy,Delta_Entropy,Delta_F1,Delta_FNR,Delta_FPR,Delta_IQR,Delta_Jitter,Delta_Label_Stability,Delta_Mean,Delta_PPV,Delta_Per_Sample_Accuracy,Delta_Positive-Rate,Delta_Selection-Rate,Delta_Std,Delta_TNR,Delta_TPR
0,COMPAS_Without_Sensitive_Attributes,DecisionTreeClassifier,overall,0.662879,0.0,0.705521,0.18286,-1.425335,1.001948,-2.110786,2.255682,0.500808,1.533041,0.828598,-2.710905,-0.662879,0.477875,1.425335,-0.18286
1,COMPAS_Without_Sensitive_Attributes,DecisionTreeClassifier,sex_priv,0.418342,0.14675,-1.280805,1.233766,-1.087162,1.545537,-1.570699,0.784673,0.984647,-1.508151,1.852136,0.454545,-1.653266,0.757052,1.087162,-1.233766
2,COMPAS_Without_Sensitive_Attributes,DecisionTreeClassifier,sex_dis,0.719405,0.0,1.007574,0.055132,-1.507046,0.875877,-2.235643,2.59655,0.39074,2.059555,0.59047,-3.169023,-0.436142,0.413407,1.507046,-0.055132
3,COMPAS_Without_Sensitive_Attributes,DecisionTreeClassifier,race_priv,0.716318,0.0,0.869194,0.224618,-1.570867,1.841348,-2.093419,2.288967,0.461117,2.341495,0.056366,-4.017456,-0.914437,0.855857,1.570867,-0.224618
4,COMPAS_Without_Sensitive_Attributes,DecisionTreeClassifier,race_dis,0.64625,8.729628,0.781663,-0.152439,-1.025318,0.451637,-2.107766,2.225161,0.37758,1.244062,1.353202,-1.829268,-0.159965,0.235175,1.025318,0.152439
5,COMPAS_Without_Sensitive_Attributes,DecisionTreeClassifier,sex&race_priv,-1.521682,10.183621,-6.664638,4.5625,1.673882,2.208402,-1.192777,0.710293,0.042389,-8.690476,-2.277821,9.125,-1.332288,1.236724,-1.673882,-4.5625
6,COMPAS_Without_Sensitive_Attributes,DecisionTreeClassifier,sex&race_dis,0.371278,8.709648,0.613313,-0.176678,-0.386539,0.329728,-2.157888,2.517664,0.091965,0.956096,0.551617,-1.236749,0.206426,0.205572,0.386539,0.176678
0,COMPAS_Without_Sensitive_Attributes,LogisticRegression,overall,1.183712,0.0,0.745309,-1.124087,-0.935153,0.286156,-0.205406,0.346591,0.194552,0.218274,0.851326,1.381115,-0.284091,0.112051,0.935153,1.124087
1,COMPAS_Without_Sensitive_Attributes,LogisticRegression,sex_priv,0.345886,0.0,4.426281,-6.651584,3.276117,0.187878,-0.171096,0.395914,-0.781939,0.63505,1.106056,10.731523,4.766813,0.100439,-3.276117,6.651584
2,COMPAS_Without_Sensitive_Attributes,LogisticRegression,sex_dis,1.394232,0.0,0.388916,-0.270923,-2.240242,0.311586,-0.219091,0.341574,0.402038,0.506561,0.78881,-0.270923,-1.446087,0.115941,2.240242,0.270923
