In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Reload imported modules automatically before each cell is executed, so edits to
# library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings
# Silence all Python warnings in this kernel. The environment variable is set too,
# presumably so that processes started from this kernel inherit the same setting.
warnings.filterwarnings('ignore')
os.environ["PYTHONWARNINGS"] = "ignore"

In [3]:
import pandas as pd

from virny.datasets.data_loaders import ACSIncomeDataset
from virny.utils.protected_groups_partitioning import create_test_protected_groups

In [4]:
def get_proportions(protected_groups, X_data):
    """
    Print what fraction of the rows in ``X_data`` each protected group covers.

    One line per entry of ``protected_groups`` is written in the form
    ``<group name>: <fraction>``, with the fraction rounded to three decimals.
    """
    for group_name, group_rows in protected_groups.items():
        n_group, n_total = group_rows.shape[0], X_data.shape[0]
        share = round(n_group / n_total, 3)
        print(f'{group_name}: {share}')


def get_base_rate(protected_groups, y_data):
    """Print the base rate (share of positive labels) per protected group and overall.

    Parameters
    ----------
    protected_groups : dict
        Maps a group name to a pandas object whose ``.index`` holds the row
        labels of the group members.
    y_data : pd.Series
        Target labels; entries equal to ``1`` are counted as positives. Its
        index must contain the labels of every group.

    Each rate is rounded to three decimals and printed as ``<name>: <rate>``,
    followed by an ``overall: <rate>`` line computed over all of ``y_data``.
    An empty selection is reported as ``nan`` instead of raising
    ``ZeroDivisionError``.
    """
    def _positive_rate(labels):
        # Share of entries equal to 1; NaN when there is nothing to average.
        n_rows = labels.shape[0]
        if n_rows == 0:
            return float('nan')
        return int((labels == 1).sum()) / n_rows

    for group_name, group_df in protected_groups.items():
        # The group's index holds row *labels*, so select with .loc. Positional
        # .iloc only agrees with it when y_data has a default 0..n-1 index.
        group_labels = y_data.loc[group_df.index]
        print(f'{group_name}: {round(_positive_rate(group_labels), 3)}')

    print(f'overall: {round(_positive_rate(y_data), 3)}')

In [5]:
# Sensitive attributes handed to create_test_protected_groups: column name -> value(s).
# Judging by the recorded group sizes (SEX_dis matches the SEX == 2 count), the listed
# values mark the disadvantaged ("_dis") group. 'SEX & RAC1P': None requests the
# intersectional group; presumably it is derived from the two single-attribute
# entries -- confirm against the virny documentation.
sensitive_attributes_dct = {'SEX': '2', 'RAC1P': ['2', '3', '4', '5', '6', '7', '8', '9'], 'SEX & RAC1P': None}

In [6]:
# Build the ACS Income loader for seven states (2018), with with_nulls=False and a
# seeded subsample of 100_000 rows, then preview the first rows of the full frame.
# Exact argument semantics are defined by virny's ACSIncomeDataset.
data_loader = ACSIncomeDataset(state=['WV', 'MS', 'AR', 'NM', 'LA', 'AL', 'KY'], year=2018, with_nulls=False,
                               subsample_size=100_000, subsample_seed=42)
data_loader.full_df.head()

Unnamed: 0,SCHL,COW,MAR,OCCP,POBP,RELP,SEX,RAC1P,AGEP,WKHP
0,16,1,3,4230,1,0,2,2,61,15.0
1,23,5,1,3090,134,0,1,1,74,50.0
2,19,1,3,9645,26,2,1,2,59,40.0
3,14,2,5,4251,5,16,1,1,17,18.0
4,21,1,1,1021,217,1,1,6,33,45.0


In [7]:
# Sanity check: (rows, columns) of the loaded frame.
data_loader.full_df.shape

(100000, 10)

In [8]:
# Row count per SEX code; used to cross-check the SEX group sizes computed later.
data_loader.full_df['SEX'].value_counts()

1    51581
2    48419
Name: SEX, dtype: int64

In [9]:
# Partition rows into privileged ("_priv") / disadvantaged ("_dis") groups for each
# sensitive attribute. The full X_data is passed for both frame arguments, so the
# groups cover the whole dataset (argument roles per virny's signature -- confirm).
protected_groups = create_test_protected_groups(data_loader.X_data, data_loader.X_data, sensitive_attributes_dct)

In [10]:
# Print the number of rows in every protected group, one "<name>: <count>" per line.
for col_name in protected_groups.keys():
    print(f'{col_name}: {protected_groups[col_name].shape[0]}')

SEX_priv: 51581
SEX_dis: 48419
RAC1P_priv: 78808
RAC1P_dis: 21192
SEX&RAC1P_priv: 88731
SEX&RAC1P_dis: 11269


In [11]:
# Names of the generated groups.
protected_groups.keys()

dict_keys(['SEX_priv', 'SEX_dis', 'RAC1P_priv', 'RAC1P_dis', 'SEX&RAC1P_priv', 'SEX&RAC1P_dis'])

In [12]:
# Share of the dataset's rows that falls into each protected group.
get_proportions(protected_groups, data_loader.X_data)

SEX_priv: 0.516
SEX_dis: 0.484
RAC1P_priv: 0.788
RAC1P_dis: 0.212
SEX&RAC1P_priv: 0.887
SEX&RAC1P_dis: 0.113


In [13]:
# Share of positive labels (y == 1) within each protected group and overall.
get_base_rate(protected_groups, data_loader.y_data)

SEX_priv: 0.386
SEX_dis: 0.212
RAC1P_priv: 0.333
RAC1P_dis: 0.186
SEX&RAC1P_priv: 0.322
SEX&RAC1P_dis: 0.14
overall: 0.302


In [33]:
# Preview the target labels of the SEX_priv group. The group's index holds row
# labels, so select with .loc; positional .iloc is only equivalent for a default
# 0..n-1 index.
data_loader.y_data.loc[protected_groups['SEX_priv'].index].head()

1    1
2    0
3    0
4    1
5    0
Name: PINCP, dtype: int64

In [34]:
# dtype of the SEX_priv target labels (selected by label with .loc, not by position).
data_loader.y_data.loc[protected_groups['SEX_priv'].index].dtype

dtype('int64')

In [35]:
# Manual re-computation of the SEX_priv base rate as a cross-check of get_base_rate.
# Rows are selected by label (.loc) because the group's index holds labels, not
# positions; the two only coincide for a default 0..n-1 index.
filtered_df = data_loader.y_data.loc[protected_groups['SEX_priv'].index].copy(deep=True)
base_rate = filtered_df[filtered_df == 1].shape[0] / filtered_df.shape[0]

In [36]:
# Unrounded base rate of the SEX_priv group.
base_rate

0.3862274868653186

In [37]:
# Numerator of the base rate: number of SEX_priv rows whose label equals 1.
filtered_df[filtered_df == 1].shape[0]

19922

In [38]:
# Denominator of the base rate: total number of SEX_priv rows.
filtered_df.shape[0]

51581