In [79]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and use mode 2, which re-imports modules before each
# cell runs so that edits to imported project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [80]:
import os
import warnings
# Silence every warning raised inside this kernel.
warnings.filterwarnings('ignore')
# Also export the setting through the environment.
# NOTE(review): presumably so that Python subprocesses started later inherit it — confirm.
os.environ["PYTHONWARNINGS"] = "ignore"

In [82]:
import pandas as pd

from virny.datasets.data_loaders import LawSchoolDataset
from virny.utils.protected_groups_partitioning import create_test_protected_groups

In [83]:
def get_proportions(protected_groups, X_data):
    """
    Print, for every protected group, the fraction of rows of ``X_data`` it contains.

    Parameters
    ----------
    protected_groups : dict
        Maps a group name to an object exposing ``.shape`` (one row per group member).
    X_data : object exposing ``.shape``
        The full feature set; its row count is the denominator.

    Returns
    -------
    None
        One ``<name>: <fraction>`` line is printed per group, rounded to 3 decimals.
    """
    for group_name, group_rows in protected_groups.items():
        n_in_group = group_rows.shape[0]
        share = n_in_group / X_data.shape[0]
        rounded_share = round(share, 3)
        print(f'{group_name}: {rounded_share}')


def get_base_rate(protected_groups, y_data):
    """
    Print the base rate (share of labels equal to 1) for each protected group and overall.

    Parameters
    ----------
    protected_groups : dict
        Maps a group name to a pandas object whose ``.index`` holds the row labels
        of the members of that group.
    y_data : pandas.Series
        Target labels, indexed with the same row labels as the groups.

    Returns
    -------
    None
        One ``<name>: <rate>`` line is printed per group, followed by an ``overall``
        line; rates are rounded to 3 decimals. A group (or ``y_data``) with no rows
        is reported as ``nan``.

    Raises
    ------
    KeyError
        If a group's index contains labels that are not present in ``y_data``.
    """
    def _positive_share(labels):
        # A rate over zero rows is undefined; report NaN instead of dividing by zero.
        n_rows = labels.shape[0]
        if n_rows == 0:
            return float('nan')
        return labels[labels == 1].shape[0] / n_rows

    for col_name, group in protected_groups.items():
        # `group.index` carries row *labels*, so the lookup has to be label-based.
        # Positional `.iloc` only agrees with it on a default 0..n-1 index; on any
        # other index it raises or silently selects the wrong rows.
        group_labels = y_data.loc[group.index]
        print(f'{col_name}: {round(_positive_share(group_labels), 3)}')

    print(f'overall: {round(_positive_share(y_data), 3)}')

In [84]:
# Sensitive attributes handed to create_test_protected_groups, keyed by column name.
# NOTE(review): each value looks like the value that marks the disadvantaged group
# (male == 0.0 has 9123 rows, the same as the male_dis group size printed below), and the
# 'male & race' entry with None presumably requests the intersection of the two
# attributes — confirm against the virny documentation.
sensitive_attributes_dct = {'male': '0.0', 'race': 'Non-White', 'male & race': None}

In [85]:
# Instantiate the dataset loader; its `full_df`, `X_data` and `y_data` attributes are
# read by the cells below.
data_loader = LawSchoolDataset()
# Preview the first rows of the full dataset.
data_loader.full_df.head()

Unnamed: 0,decile1b,decile3,lsat,ugpa,zfygpa,zgpa,fulltime,fam_inc,male,tier,race,pass_bar
0,10.0,10.0,44.0,3.5,1.33,1.88,1.0,5.0,0.0,4.0,White,1.0
1,5.0,4.0,29.0,3.5,-0.11,-0.57,1.0,4.0,0.0,2.0,White,1.0
2,8.0,7.0,37.0,3.4,0.63,0.37,1.0,3.0,1.0,4.0,White,1.0
3,8.0,7.0,43.0,3.3,0.67,0.34,1.0,4.0,0.0,4.0,White,1.0
4,3.0,2.0,41.0,3.3,-0.67,-1.3,1.0,4.0,0.0,5.0,White,1.0


In [86]:
# (rows, columns) of the full dataset; the row count is the denominator for group proportions.
data_loader.full_df.shape

(20798, 12)

In [87]:
# Row count per value of the `male` column, as a reference for the male_* group sizes.
data_loader.full_df['male'].value_counts()

1.0    11675
0.0     9123
Name: male, dtype: int64

In [88]:
# Build the `<attribute>_priv` / `<attribute>_dis` partitions for each sensitive attribute.
# The full feature frame is passed for both dataframe arguments, so the groups are drawn
# from the whole dataset rather than from a separate test split.
protected_groups = create_test_protected_groups(data_loader.X_data, data_loader.X_data, sensitive_attributes_dct)

In [89]:
# Report how many rows fall into each protected group.
for group_name, group_rows in protected_groups.items():
    n_rows = group_rows.shape[0]
    print(f'{group_name}: {n_rows}')

male_priv: 11675
male_dis: 9123
race_priv: 17491
race_dis: 3307
male&race_priv: 19068
male&race_dis: 1730


In [90]:
# List the group names produced by the partitioning step.
protected_groups.keys()

dict_keys(['male_priv', 'male_dis', 'race_priv', 'race_dis', 'male&race_priv', 'male&race_dis'])

In [91]:
# Fraction of all rows that belongs to each protected group.
get_proportions(protected_groups, data_loader.X_data)

male_priv: 0.561
male_dis: 0.439
race_priv: 0.841
race_dis: 0.159
male&race_priv: 0.917
male&race_dis: 0.083


In [92]:
# Share of rows with label 1 (pass_bar == 1) within each protected group and overall.
get_base_rate(protected_groups, data_loader.y_data)

male_priv: 0.899
male_dis: 0.878
race_priv: 0.921
race_dis: 0.723
male&race_priv: 0.906
male&race_dis: 0.713
overall: 0.89


In [93]:
# Peek at the labels of the `male_priv` group. The group's index holds row labels, so the
# selection is label-based (`.loc`); positional `.iloc` only matches on a default 0..n-1 index.
data_loader.y_data.loc[protected_groups['male_priv'].index].head()

2     1.0
5     0.0
7     0.0
12    1.0
18    1.0
Name: pass_bar, dtype: float64

In [94]:
# dtype of the selected labels. Label-based `.loc` is used because the group's index holds
# row labels, not positions.
data_loader.y_data.loc[protected_groups['male_priv'].index].dtype

dtype('float64')

In [95]:
# Recompute the `male_priv` base rate by hand to cross-check get_base_rate.
# The group's index holds row labels, so select with label-based `.loc`; positional `.iloc`
# is only equivalent on a default 0..n-1 index.
filtered_df = data_loader.y_data.loc[protected_groups['male_priv'].index].copy(deep=True)
# Rows with label 1 divided by all rows in the group.
base_rate = filtered_df[filtered_df == 1].shape[0] / filtered_df.shape[0]

In [96]:
# Unrounded hand-computed base rate for `male_priv`, to compare with get_base_rate's output.
base_rate

0.8990149892933619

In [97]:
# Numerator of the base rate: number of selected rows whose label equals 1.
filtered_df[filtered_df == 1].shape[0]

10496

In [98]:
# Denominator of the base rate: total number of selected rows.
filtered_df.shape[0]

11675