In [1]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Load the autoreload extension; mode 2 re-imports edited modules before each
# cell runs, so changes to local library code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings
# Silence every warning raised in this kernel.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# from pandas or the dataset loader are hidden as well; narrow it if a warning
# ever needs to be investigated.
warnings.filterwarnings('ignore')
# PYTHONWARNINGS is read at interpreter start-up, so setting it here presumably
# targets Python subprocesses that inherit this environment, not this kernel.
os.environ["PYTHONWARNINGS"] = "ignore"

In [3]:
import pandas as pd

from virny.datasets import ACSEmploymentDataset
from virny.utils.protected_groups_partitioning import create_test_protected_groups

In [4]:
def get_proportions(protected_groups, X_data):
    """Print the fraction of ``X_data`` rows that belongs to each protected group.

    Parameters
    ----------
    protected_groups : dict
        Maps a group name to an object exposing ``.shape``; its row count
        (``shape[0]``) is the numerator.
    X_data
        The full dataset; ``X_data.shape[0]`` is the denominator.

    Returns
    -------
    None
        One ``"<group name>: <proportion>"`` line is printed per group, in
        dictionary order, with the proportion rounded to three decimals.
    """
    for group_name, group_rows in protected_groups.items():
        group_size = group_rows.shape[0]
        share = group_size / X_data.shape[0]
        print(f'{group_name}: {round(share, 3)}')


def get_base_rate(protected_groups, y_data):
    """Print the share of positive labels (``== 1``) per protected group and overall.

    Parameters
    ----------
    protected_groups : dict
        Maps a group name to a pandas object whose ``.index`` holds the row
        labels of the group's members.
    y_data : pandas.Series
        Target labels, indexed by the same labels as the groups.

    Returns
    -------
    None
        One ``"<group name>: <rate>"`` line is printed per group, followed by
        an ``"overall: <rate>"`` line; rates are rounded to three decimals.
        An empty group (or empty ``y_data``) is reported as ``nan``.

    Raises
    ------
    KeyError
        If a group's index contains labels that are absent from ``y_data``.
    """
    def _positive_share(labels):
        # Fraction of entries equal to 1; nan instead of ZeroDivisionError
        # when there is nothing to count.
        total = labels.shape[0]
        if total == 0:
            return float('nan')
        return int((labels == 1).sum()) / total

    for col_name, group in protected_groups.items():
        # group.index holds row *labels*, so select with .loc. Positional
        # .iloc only coincides with it for a default 0..n-1 index and breaks
        # (IndexError or wrong rows) once the data was shuffled or filtered.
        group_labels = y_data.loc[group.index]
        print(f'{col_name}: {round(_positive_share(group_labels), 3)}')

    print(f'overall: {round(_positive_share(y_data), 3)}')

In [5]:
# Sensitive attributes used to partition the data into protected groups.
# Each key is a column name; the value lists the code(s) of the disadvantaged
# group (the SEX_dis size printed further down equals the number of SEX == 2 rows).
# NOTE(review): None for the intersectional 'SEX & RAC1P' entry presumably means
# the group is derived from the two component attributes; confirm in the virny docs.
sensitive_attributes_dct = {
    'SEX': '2',
    'RAC1P': ['2', '3', '4', '5', '6', '7', '8', '9'],
    'SEX & RAC1P': None,
}

In [6]:
# Load the ACS Employment dataset for California, survey year 2018.
# NOTE(review): with_nulls=False presumably yields a frame without missing
# values; confirm against the ACSEmploymentDataset documentation.
data_loader = ACSEmploymentDataset(state=['CA'], year=2018, with_nulls=False)
# Preview the first rows of the combined frame (features plus the ESR target).
data_loader.full_df.head()

Unnamed: 0,MAR,MIL,ESP,MIG,DREM,NATIVITY,DIS,DEAR,DEYE,SEX,RAC1P,RELP,CIT,ANC,SCHL,AGEP,ESR
0,1,4,0,3,2,1,2,2,2,1,8,16,1,1,14,30,0
1,5,4,0,1,2,1,2,2,2,2,1,17,1,1,14,18,0
2,1,2,0,1,2,1,1,2,2,1,9,17,1,2,17,69,0
3,5,4,0,1,1,1,1,1,2,1,1,17,1,1,1,25,0
4,5,4,0,1,2,1,2,2,2,2,1,16,1,1,18,31,0


In [7]:
# (rows, columns) of the full frame.
data_loader.full_df.shape

(302640, 17)

In [8]:
# Row count per SEX code; used below to sanity-check the SEX_priv / SEX_dis group sizes.
data_loader.full_df['SEX'].value_counts()

2    154465
1    148175
Name: SEX, dtype: int64

In [9]:
# Partition the rows into privileged / disadvantaged groups per sensitive attribute.
# NOTE(review): X_data is passed for both dataframe arguments, so the groups span
# the whole dataset rather than a held-out test split; confirm this matches the
# parameters create_test_protected_groups expects.
protected_groups = create_test_protected_groups(data_loader.X_data, data_loader.X_data, sensitive_attributes_dct)

In [10]:
# Number of rows in each protected group (absolute counts; proportions follow below).
for col_name in protected_groups.keys():
    print(f'{col_name}: {protected_groups[col_name].shape[0]}')

SEX_priv: 148175
SEX_dis: 154465
RAC1P_priv: 189095
RAC1P_dis: 113545
SEX&RAC1P_priv: 243951
SEX&RAC1P_dis: 58689


In [11]:
# Names of the generated groups: a <attribute>_priv / <attribute>_dis pair per sensitive attribute.
protected_groups.keys()

dict_keys(['SEX_priv', 'SEX_dis', 'RAC1P_priv', 'RAC1P_dis', 'SEX&RAC1P_priv', 'SEX&RAC1P_dis'])

In [12]:
# Share of the dataset that falls into each protected group.
get_proportions(protected_groups, data_loader.X_data)

SEX_priv: 0.49
SEX_dis: 0.51
RAC1P_priv: 0.625
RAC1P_dis: 0.375
SEX&RAC1P_priv: 0.806
SEX&RAC1P_dis: 0.194


In [13]:
# Fraction of positive labels (ESR == 1) within each protected group and overall.
get_base_rate(protected_groups, data_loader.y_data)

SEX_priv: 0.617
SEX_dis: 0.525
RAC1P_priv: 0.563
RAC1P_dis: 0.582
SEX&RAC1P_priv: 0.577
SEX&RAC1P_dis: 0.541
overall: 0.57


In [14]:
# First labels of the SEX_priv group. The group's .index holds row labels, so
# select with .loc; positional .iloc only matches for a default 0..n-1 index.
data_loader.y_data.loc[protected_groups['SEX_priv'].index].head()

0    0
2    0
3    0
5    0
6    1
Name: ESR, dtype: int64

In [15]:
# dtype of the SEX_priv labels, to confirm the `== 1` comparison is numeric.
# Label-based .loc is used because the group's .index holds row labels, not positions.
data_loader.y_data.loc[protected_groups['SEX_priv'].index].dtype

dtype('int64')

In [16]:
# Recompute the SEX_priv base rate by hand to cross-check get_base_rate.
# The group's .index holds row labels, so the rows are selected with .loc;
# positional .iloc would only be correct for a default 0..n-1 index.
filtered_df = data_loader.y_data.loc[protected_groups['SEX_priv'].index].copy(deep=True)
# Positive labels (== 1) divided by the group size.
base_rate = filtered_df[filtered_df == 1].shape[0] / filtered_df.shape[0]

In [17]:
# Unrounded base rate of SEX_priv; rounds to the 0.617 printed by get_base_rate.
base_rate

0.6170676564872617

In [18]:
# Numerator of the base rate: number of SEX_priv rows with label 1.
filtered_df[filtered_df == 1].shape[0]

91434

In [19]:
# Denominator of the base rate: total number of SEX_priv rows.
filtered_df.shape[0]

148175