In [59]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [60]:
import os
import warnings
warnings.filterwarnings('ignore')
os.environ["PYTHONWARNINGS"] = "ignore"

In [61]:
cur_folder_name = os.getcwd().split('/')[-1]
if cur_folder_name != "fairness-variance":
    os.chdir("../..")

print('Current location: ', os.getcwd())

Current location:  /home/denys_herasymuk


In [62]:
import pandas as pd

from virny.datasets.data_loaders import ACSPublicCoverageDataset
from virny.utils.protected_groups_partitioning import create_test_protected_groups

In [63]:
def get_proportions(protected_groups, X_data):
    for col_name in protected_groups.keys():
        proportion = protected_groups[col_name].shape[0] / X_data.shape[0]
        print(f'{col_name}: {round(proportion, 3)}')


def get_base_rate(protected_groups, y_data):
    for col_name in protected_groups.keys():
        filtered_df = y_data.iloc[protected_groups[col_name].index].copy(deep=True)
        base_rate = filtered_df[filtered_df == 1].shape[0] / filtered_df.shape[0]
        print(f'{col_name}: {round(base_rate, 3)}')

    base_rate = y_data[y_data == 1].shape[0] / y_data.shape[0]
    print(f'overall: {round(base_rate, 3)}')

In [64]:
sensitive_attributes_dct = {'SEX': '2', 'RAC1P': ['2', '3', '4', '5', '6', '7', '8', '9'], 'SEX & RAC1P': None}

In [65]:
data_loader = ACSPublicCoverageDataset(state=['CA'], year=2018, with_nulls=False, subsample_size=15_000, subsample_seed=42)
data_loader.full_df.head()

Unnamed: 0,SCHL,MAR,SEX,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,ESR,ST,FER,RAC1P,AGEP,PINCP
0,19,5,1,2,0,1,3,4,1,1,2,2,2,6,6,0,1,21,3150.0
1,16,5,1,2,0,3,3,4,4,1,2,2,2,1,6,0,9,18,1600.0
2,13,5,2,2,1,1,1,0,2,1,2,2,2,6,6,2,1,16,0.0
3,20,1,2,2,0,4,1,4,1,2,2,2,2,6,6,2,8,43,0.0
4,16,1,2,2,0,4,1,4,1,2,2,2,2,6,6,0,6,54,0.0


In [66]:
data_loader.full_df.shape

(15000, 19)

In [67]:
data_loader.full_df['SEX'].value_counts()

2    8329
1    6671
Name: SEX, dtype: int64

In [68]:
protected_groups = create_test_protected_groups(data_loader.X_data, data_loader.X_data, sensitive_attributes_dct)

In [69]:
for col_name in protected_groups.keys():
    print(f'{col_name}: {protected_groups[col_name].shape[0]}')

SEX_priv: 6671
SEX_dis: 8329
RAC1P_priv: 8478
RAC1P_dis: 6522
SEX&RAC1P_priv: 11415
SEX&RAC1P_dis: 3585


In [70]:
protected_groups.keys()

dict_keys(['SEX_priv', 'SEX_dis', 'RAC1P_priv', 'RAC1P_dis', 'SEX&RAC1P_priv', 'SEX&RAC1P_dis'])

In [71]:
get_proportions(protected_groups, data_loader.X_data)

SEX_priv: 0.445
SEX_dis: 0.555
RAC1P_priv: 0.565
RAC1P_dis: 0.435
SEX&RAC1P_priv: 0.761
SEX&RAC1P_dis: 0.239


In [72]:
get_base_rate(protected_groups, data_loader.y_data)

SEX_priv: 0.395
SEX_dis: 0.357
RAC1P_priv: 0.363
RAC1P_dis: 0.388
SEX&RAC1P_priv: 0.374
SEX&RAC1P_dis: 0.374
overall: 0.374


In [73]:
data_loader.y_data.iloc[protected_groups['SEX_priv'].index].head()

0    0
1    0
5    1
6    1
7    0
Name: PUBCOV, dtype: int64

In [74]:
data_loader.y_data.iloc[protected_groups['SEX_priv'].index].dtype

dtype('int64')

In [75]:
filtered_df = data_loader.y_data.iloc[protected_groups['SEX_priv'].index].copy(deep=True)
base_rate = filtered_df[filtered_df == 1].shape[0] / filtered_df.shape[0]

In [76]:
base_rate

0.3951431569479838

In [77]:
filtered_df[filtered_df == 1].shape[0]

2636

In [78]:
filtered_df.shape[0]

6671