# Visualizations for Different Datasets

In [1]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Pick up edits to imported project modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings
# Silence every warning raised inside this kernel so the plot output stays readable.
warnings.filterwarnings(action='ignore')
# PYTHONWARNINGS is read when an interpreter starts, so this setting targets
# child Python processes launched from the notebook rather than this kernel.
os.environ.update({"PYTHONWARNINGS": "ignore"})

In [3]:
# Switch to the repository root when the notebook is started from its own
# folder, presumably so that the project imports (`source.*`, `configs.*`)
# in the following cells resolve.
# os.path.basename is used instead of splitting on '/' so that the folder name
# is also extracted correctly with Windows path separators.
cur_folder_name = os.path.basename(os.getcwd())
if cur_folder_name != "data-cleaning-stability":
    # Assumes the notebook lives four directories below the repository root.
    os.chdir("../../../..")

print('Current location: ', os.getcwd())

Current location:  /Users/denys_herasymuk/Research/NYU/ML_Lifecycle_Project/Code/data-cleaning-stability


In [4]:
from source.visualizations.models_viz import create_box_plots_for_diff_imputers_and_datasets
from configs.constants import (ACS_INCOME_DATASET, ACS_EMPLOYMENT_DATASET, LAW_SCHOOL_DATASET, GERMAN_CREDIT_DATASET,
                               CARDIOVASCULAR_DISEASE_DATASET, BANK_MARKETING_DATASET, DIABETES_DATASET, ErrorRepairMethod)

## Initialize Configs

In [5]:
# Candidate sensitive attributes for each dataset. Entries of the form 'a&b'
# presumably denote the intersection of two attributes — TODO confirm.
# NOTE(review): no cell visible in this notebook references this mapping.
DATASETS_ALL_SENSITIVE_ATTRS = {
    ACS_INCOME_DATASET: ['SEX', 'RAC1P', 'SEX&RAC1P'],
    LAW_SCHOOL_DATASET: ['male', 'race', 'male&race'],
    GERMAN_CREDIT_DATASET: ['sex', 'age', 'sex&age'],
    CARDIOVASCULAR_DISEASE_DATASET: ['gender'],
    BANK_MARKETING_DATASET: ['age'],
    DIABETES_DATASET: ['Gender'],
    ACS_EMPLOYMENT_DATASET: ['SEX', 'RAC1P', 'SEX&RAC1P'],
}
# One sensitive attribute per dataset; this mapping is what the disparity
# plots receive as `dataset_to_group`.
# NOTE(review): German Credit uses 'sex' although 'sex&age' is listed above,
# whereas the ACS and Law School datasets use their 'a&b' entry — confirm
# this is intended.
DATASETS_SENSITIVE_ATTRS = {
    ACS_INCOME_DATASET: 'SEX&RAC1P',
    LAW_SCHOOL_DATASET: 'male&race',
    GERMAN_CREDIT_DATASET: 'sex',
    CARDIOVASCULAR_DISEASE_DATASET: 'gender',
    BANK_MARKETING_DATASET: 'age',
    DIABETES_DATASET: 'Gender',
    ACS_EMPLOYMENT_DATASET: 'SEX&RAC1P',
}

In [17]:
# NOTE(review): this import would conventionally sit with the other imports
# in the imports cell near the top of the notebook.
from source.custom_classes.database_client import DatabaseClient

# Create and connect the client that every plotting cell receives as `db_client`.
db_client = DatabaseClient()
db_client.connect()

## Metric Visualizations

### Overall Metrics

In [7]:
# Box plots of the overall accuracy metric.
# NOTE(review): this is the only plotting cell that trains on MAR3 while
# testing on MCAR3; every other cell uses MCAR3 for both — confirm this is
# intended.
create_box_plots_for_diff_imputers_and_datasets(
    train_injection_scenario='MAR3',
    test_injection_scenario='MCAR3',
    metric_name='accuracy',
    db_client=db_client,
)

query: {'dataset_name': 'german', 'evaluation_scenario': 'exp1_mar3', 'metric': 'Accuracy', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (774, 9)
models_metric_df.shape: (48, 10)
Extracted data for german
query: {'dataset_name': 'bank', 'evaluation_scenario': 'exp1_mar3', 'metric': 'Accuracy', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (774, 9)
models_metric_df.shape: (48, 10)
Extracted data for bank
query: {'dataset_name': 'heart', 'evaluation_scenario': 'exp1_mar3', 'metric': 'Accuracy', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (774, 9)
models_metric_df.shape: (48, 10)
Extracted data for heart
query: {'dataset_name': 'diabetes', 'evaluation_scenario': 'exp1_mar3', 'metric': 'Accuracy', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (774, 9)
models_metric_df.shape: (48, 10)
Extracted data for diabetes
query: {'dataset_name': 'law_school', 'evaluation_scenario': 'exp1_mar3', 'metric': 'Accuracy', 'subgroup': 'overall', 'ta

In [8]:
# Box plots of the overall F1 metric (train and test scenario: MCAR3).
create_box_plots_for_diff_imputers_and_datasets(
    train_injection_scenario='MCAR3',
    test_injection_scenario='MCAR3',
    metric_name='f1',
    db_client=db_client,
)

query: {'dataset_name': 'german', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'F1', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (774, 9)
models_metric_df.shape: (48, 10)
Extracted data for german
query: {'dataset_name': 'bank', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'F1', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (774, 9)
models_metric_df.shape: (48, 10)
Extracted data for bank
query: {'dataset_name': 'heart', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'F1', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (774, 9)
models_metric_df.shape: (48, 10)
Extracted data for heart
query: {'dataset_name': 'diabetes', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'F1', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (774, 9)
models_metric_df.shape: (48, 10)
Extracted data for diabetes
query: {'dataset_name': 'law_school', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'F1', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_d

In [9]:
# Box plots of the overall label stability metric (train and test scenario: MCAR3).
create_box_plots_for_diff_imputers_and_datasets(
    train_injection_scenario='MCAR3',
    test_injection_scenario='MCAR3',
    metric_name='label_stability',
    db_client=db_client,
)

query: {'dataset_name': 'german', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Label_Stability', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (774, 9)
models_metric_df.shape: (48, 10)
Extracted data for german
query: {'dataset_name': 'bank', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Label_Stability', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (774, 9)
models_metric_df.shape: (48, 10)
Extracted data for bank
query: {'dataset_name': 'heart', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Label_Stability', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (774, 9)
models_metric_df.shape: (48, 10)
Extracted data for heart
query: {'dataset_name': 'diabetes', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Label_Stability', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (774, 9)
models_metric_df.shape: (48, 10)
Extracted data for diabetes
query: {'dataset_name': 'law_school', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Labe

In [10]:
# Box plots of the overall aleatoric uncertainty metric (train and test scenario: MCAR3).
create_box_plots_for_diff_imputers_and_datasets(
    train_injection_scenario='MCAR3',
    test_injection_scenario='MCAR3',
    metric_name='aleatoric_uncertainty',
    db_client=db_client,
)

query: {'dataset_name': 'german', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Aleatoric_Uncertainty', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (756, 9)
models_metric_df.shape: (42, 10)
Extracted data for german
query: {'dataset_name': 'bank', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Aleatoric_Uncertainty', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (756, 9)
models_metric_df.shape: (42, 10)
Extracted data for bank
query: {'dataset_name': 'heart', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Aleatoric_Uncertainty', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (756, 9)
models_metric_df.shape: (42, 10)
Extracted data for heart
query: {'dataset_name': 'diabetes', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Aleatoric_Uncertainty', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (756, 9)
models_metric_df.shape: (42, 10)
Extracted data for diabetes
query: {'dataset_name': 'law_school', 'evaluation_scenario': 'exp1

In [11]:
# Box plots of the overall std metric (train and test scenario: MCAR3).
create_box_plots_for_diff_imputers_and_datasets(
    train_injection_scenario='MCAR3',
    test_injection_scenario='MCAR3',
    metric_name='std',
    db_client=db_client,
)

query: {'dataset_name': 'german', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Std', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (756, 9)
models_metric_df.shape: (42, 10)
Extracted data for german
query: {'dataset_name': 'bank', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Std', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (756, 9)
models_metric_df.shape: (42, 10)
Extracted data for bank
query: {'dataset_name': 'heart', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Std', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (756, 9)
models_metric_df.shape: (42, 10)
Extracted data for heart
query: {'dataset_name': 'diabetes', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Std', 'subgroup': 'overall', 'tag': 'OK'}
models_metric_df.shape: (756, 9)
models_metric_df.shape: (42, 10)
Extracted data for diabetes
query: {'dataset_name': 'law_school', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Std', 'subgroup': 'overall', 'tag': 'OK'}
models_met

### Disparity Metrics

In [18]:
# Box plots of the Equalized_Odds_TPR disparity metric, using the sensitive
# attribute configured for each dataset in DATASETS_SENSITIVE_ATTRS.
create_box_plots_for_diff_imputers_and_datasets(
    train_injection_scenario='MCAR3',
    test_injection_scenario='MCAR3',
    dataset_to_group=DATASETS_SENSITIVE_ATTRS,
    metric_name='Equalized_Odds_TPR',
    db_client=db_client,
)

query: {'dataset_name': 'german', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'TPR', 'subgroup': 'sex_dis', 'tag': 'OK'}
query: {'dataset_name': 'german', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'TPR', 'subgroup': 'sex_priv', 'tag': 'OK'}
models_metric_df.shape: (774, 8)
models_metric_df.shape: (48, 9)
Extracted data for german
query: {'dataset_name': 'bank', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'TPR', 'subgroup': 'age_dis', 'tag': 'OK'}
query: {'dataset_name': 'bank', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'TPR', 'subgroup': 'age_priv', 'tag': 'OK'}
models_metric_df.shape: (774, 8)
models_metric_df.shape: (48, 9)
Extracted data for bank
query: {'dataset_name': 'heart', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'TPR', 'subgroup': 'gender_dis', 'tag': 'OK'}
query: {'dataset_name': 'heart', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'TPR', 'subgroup': 'gender_priv', 'tag': 'OK'}
models_metric_df.shape: (774, 8)
models_metric_df.shape: (48, 9)
Extracte

In [13]:
# Box plots of the Equalized_Odds_FNR disparity metric, using the sensitive
# attribute configured for each dataset in DATASETS_SENSITIVE_ATTRS.
create_box_plots_for_diff_imputers_and_datasets(
    train_injection_scenario='MCAR3',
    test_injection_scenario='MCAR3',
    dataset_to_group=DATASETS_SENSITIVE_ATTRS,
    metric_name='Equalized_Odds_FNR',
    db_client=db_client,
)

query: {'dataset_name': 'german', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'FNR', 'subgroup': 'sex_dis', 'tag': 'OK'}
query: {'dataset_name': 'german', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'FNR', 'subgroup': 'sex_priv', 'tag': 'OK'}
models_metric_df.shape: (774, 8)
models_metric_df.shape: (48, 9)
Extracted data for german
query: {'dataset_name': 'bank', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'FNR', 'subgroup': 'age_dis', 'tag': 'OK'}
query: {'dataset_name': 'bank', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'FNR', 'subgroup': 'age_priv', 'tag': 'OK'}
models_metric_df.shape: (774, 8)
models_metric_df.shape: (48, 9)
Extracted data for bank
query: {'dataset_name': 'heart', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'FNR', 'subgroup': 'gender_dis', 'tag': 'OK'}
query: {'dataset_name': 'heart', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'FNR', 'subgroup': 'gender_priv', 'tag': 'OK'}
models_metric_df.shape: (774, 8)
models_metric_df.shape: (48, 9)
Extracte

In [14]:
# Box plots of the Selection_Rate_Difference disparity metric, using the
# sensitive attribute configured for each dataset in DATASETS_SENSITIVE_ATTRS.
create_box_plots_for_diff_imputers_and_datasets(
    train_injection_scenario='MCAR3',
    test_injection_scenario='MCAR3',
    dataset_to_group=DATASETS_SENSITIVE_ATTRS,
    metric_name='Selection_Rate_Difference',
    db_client=db_client,
)

query: {'dataset_name': 'german', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Selection-Rate', 'subgroup': 'sex_dis', 'tag': 'OK'}
query: {'dataset_name': 'german', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Selection-Rate', 'subgroup': 'sex_priv', 'tag': 'OK'}
models_metric_df.shape: (1548, 8)
models_metric_df.shape: (48, 9)
Extracted data for german
query: {'dataset_name': 'bank', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Selection-Rate', 'subgroup': 'age_dis', 'tag': 'OK'}
query: {'dataset_name': 'bank', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Selection-Rate', 'subgroup': 'age_priv', 'tag': 'OK'}
models_metric_df.shape: (1548, 8)
models_metric_df.shape: (48, 9)
Extracted data for bank
query: {'dataset_name': 'heart', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Selection-Rate', 'subgroup': 'gender_dis', 'tag': 'OK'}
query: {'dataset_name': 'heart', 'evaluation_scenario': 'exp1_mcar3', 'metric': 'Selection-Rate', 'subgroup': 'gender_priv', 'tag': 'OK'}
model

In [15]:
# Close the client opened earlier with db_client.connect() once plotting is done.
db_client.close()