# Visualizations for Imputation

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and use mode 2, which reloads all imported
# modules before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings
# Ignore every warning raised in this kernel process.
warnings.filterwarnings('ignore')
# Also export the same setting through the environment.
# NOTE(review): presumably so that processes spawned later inherit it — confirm this is needed.
os.environ["PYTHONWARNINGS"] = "ignore"

In [3]:
# Move the working directory four levels up unless the notebook is already
# running from a folder named "data-cleaning-stability".
# NOTE(review): presumably this is the project root, needed so that the
# project-relative imports in later cells resolve — confirm.
# The name check means re-running the cell from that folder does not climb
# further up the tree.
# os.path.basename is used instead of splitting on '/' so that the check also
# works where the path separator is not '/' (e.g. on Windows).
cur_folder_name = os.path.basename(os.getcwd())
if cur_folder_name != "data-cleaning-stability":
    os.chdir("../../../..")

print('Current location: ', os.getcwd())

Current location:  /Users/denys_herasymuk/Research/NYU/ML_Lifecycle_Project/Code/data-cleaning-stability


In [4]:
from source.visualizations.imputers_viz import create_box_plots_for_mixed_exp
from configs.constants import (ACS_INCOME_DATASET, ACS_EMPLOYMENT_DATASET, LAW_SCHOOL_DATASET, GERMAN_CREDIT_DATASET,
                               CARDIOVASCULAR_DISEASE_DATASET, BANK_MARKETING_DATASET, DIABETES_DATASET)

## Initialize Configs

In [5]:
# Dataset passed as `dataset_name` to every plotting call in this notebook.
DATASET_NAME = ACS_EMPLOYMENT_DATASET
# Sensitive attribute names for each dataset constant.
# Datasets with several attributes also list an 'a&b' entry as the last element.
DATASETS_SENSITIVE_ATTRS = {
    ACS_INCOME_DATASET: ['SEX', 'RAC1P', 'SEX&RAC1P'],
    LAW_SCHOOL_DATASET: ['male', 'race', 'male&race'],
    GERMAN_CREDIT_DATASET: ['sex', 'age', 'sex&age'],
    CARDIOVASCULAR_DISEASE_DATASET: ['gender'],
    BANK_MARKETING_DATASET: ['age'],
    DIABETES_DATASET: ['Gender'],
    ACS_EMPLOYMENT_DATASET: ['SEX', 'RAC1P', 'SEX&RAC1P'],
}
# The disparity plots use the last attribute listed for the selected dataset
# ('SEX&RAC1P' for ACS_EMPLOYMENT_DATASET).
SENSITIVE_ATTR_FOR_DISPARITY_METRICS = DATASETS_SENSITIVE_ATTRS[DATASET_NAME][-1]

In [6]:
from source.custom_classes.database_client import DatabaseClient, get_secrets_path

# Build the database client from the 'secrets_3.env' secrets file and connect;
# the same client instance is passed to every plotting call below.
db_client = DatabaseClient(
    secrets_path=get_secrets_path('secrets_3.env'),
)
db_client.connect()

## Metric Visualizations

### Overall Metrics

In [7]:
# Box plots of the `f1_score` metric for the SCHL, DIS and MIL columns.
# No `group` is passed, so the function's default grouping applies.
create_box_plots_for_mixed_exp(
    dataset_name=DATASET_NAME,
    column_names=['SCHL', 'DIS', 'MIL'],
    metric_name='f1_score',
    db_client=db_client,
    without_dummy=False,
)

In [8]:
# Box plots of the `rmse` metric for the AGEP column.
# No `group` is passed, so the function's default grouping applies.
# NOTE(review): this is the only call in the notebook that does not pass
# `without_dummy=False`, so the function's default is used here — confirm
# that the difference from the other plots is intentional.
create_box_plots_for_mixed_exp(
    dataset_name=DATASET_NAME,
    column_names=['AGEP'],
    metric_name='rmse',
    db_client=db_client,
)

In [9]:
# Box plots of the `kl_divergence_pred` metric for all four columns
# (SCHL, DIS, MIL, AGEP). No `group` is passed, so the default grouping applies.
create_box_plots_for_mixed_exp(
    dataset_name=DATASET_NAME,
    column_names=['SCHL', 'DIS', 'MIL', 'AGEP'],
    metric_name='kl_divergence_pred',
    db_client=db_client,
    without_dummy=False,
)

In [10]:
# Box plots of the `kl_divergence_total` metric for all four columns
# (SCHL, DIS, MIL, AGEP). No `group` is passed, so the default grouping applies.
create_box_plots_for_mixed_exp(
    dataset_name=DATASET_NAME,
    column_names=['SCHL', 'DIS', 'MIL', 'AGEP'],
    metric_name='kl_divergence_total',
    db_client=db_client,
    without_dummy=False,
)

### Disparity Metrics

In [11]:
# Box plots of the `f1_score` metric for the SCHL, DIS and MIL columns, with
# `group` set to the sensitive attribute selected in the config cell.
create_box_plots_for_mixed_exp(
    dataset_name=DATASET_NAME,
    column_names=['SCHL', 'DIS', 'MIL'],
    metric_name='f1_score',
    group=SENSITIVE_ATTR_FOR_DISPARITY_METRICS,
    db_client=db_client,
    without_dummy=False,
)

In [12]:
# Box plots of the `rmse` metric for the AGEP column, with `group` set to the
# sensitive attribute selected in the config cell.
create_box_plots_for_mixed_exp(
    dataset_name=DATASET_NAME,
    column_names=['AGEP'],
    metric_name='rmse',
    group=SENSITIVE_ATTR_FOR_DISPARITY_METRICS,
    db_client=db_client,
    without_dummy=False,
)

In [13]:
# Box plots of the `kl_divergence_pred` metric for all four columns
# (SCHL, DIS, MIL, AGEP), with `group` set to the sensitive attribute selected
# in the config cell.
create_box_plots_for_mixed_exp(
    dataset_name=DATASET_NAME,
    column_names=['SCHL', 'DIS', 'MIL', 'AGEP'],
    metric_name='kl_divergence_pred',
    group=SENSITIVE_ATTR_FOR_DISPARITY_METRICS,
    db_client=db_client,
    without_dummy=False,
)

In [14]:
# Close the database client that was connected earlier in the notebook.
# This only runs if execution reaches this final cell.
db_client.close()