In [9]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import os
import warnings
warnings.filterwarnings('ignore')
os.environ["PYTHONWARNINGS"] = "ignore"

In [11]:
import pandas as pd

from virny.datasets.base import BaseDataLoader
from virny.datasets.data_loaders import LawSchoolDataset
from virny.utils.protected_groups_partitioning import create_test_protected_groups

In [12]:
def get_proportions(protected_groups, X_data):
    for col_name in protected_groups.keys():
        proportion = protected_groups[col_name].shape[0] / X_data.shape[0]
        print(f'{col_name}: {round(proportion, 3)}')


def get_base_rate(protected_groups, y_data):
    for col_name in protected_groups.keys():
        filtered_df = y_data.iloc[protected_groups[col_name].index].copy(deep=True)
        base_rate = filtered_df[filtered_df == 1].shape[0] / filtered_df.shape[0]
        print(f'{col_name}: {round(base_rate, 3)}')

    base_rate = y_data[y_data == 1].shape[0] / y_data.shape[0]
    print(f'overall: {round(base_rate, 3)}')

In [13]:
# sensitive_attributes_dct = {'male': '0.0', 'race': 'Non-White', 'male & race': None}
sensitive_attributes_dct = {'sex': 'F'}

In [14]:
class StudentPerformanceDataset(BaseDataLoader):
    def __init__(self, dataset_path=None):
        df = pd.read_csv(dataset_path, delimiter=';')

        target = 'G3'
        df[target] = (df[target] >= 10) * 1

        df['G1'] = (df['G1'] >= 10) * 1
        df['G2'] = (df['G2'] >= 10) * 1


        categorical_columns = ['Mjob', 'Fjob', 'reason', 'guardian', 'sex']
        numerical_columns = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures',
                             'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health',
                             'absences', 'G1', 'G2']

        super().__init__(
            full_df=df,
            target=target,
            numerical_columns=numerical_columns,
            categorical_columns=categorical_columns,
        )

In [15]:
data_loader = StudentPerformanceDataset(dataset_path='../data/student-mat.csv')
data_loader.full_df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,0,0,0
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,0,0,0
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,0,0,1
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,1,1,1
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,0,1,1


In [16]:
data_loader.full_df.shape

(395, 33)

In [17]:
data_loader.full_df['sex'].value_counts()

F    208
M    187
Name: sex, dtype: int64

In [18]:
protected_groups = create_test_protected_groups(data_loader.X_data, data_loader.X_data, sensitive_attributes_dct)

In [19]:
for col_name in protected_groups.keys():
    print(f'{col_name}: {protected_groups[col_name].shape[0]}')

sex_priv: 187
sex_dis: 208


In [20]:
protected_groups.keys()

dict_keys(['sex_priv', 'sex_dis'])

In [21]:
get_proportions(protected_groups, data_loader.X_data)

sex_priv: 0.473
sex_dis: 0.527


In [22]:
get_base_rate(protected_groups, data_loader.y_data)

sex_priv: 0.706
sex_dis: 0.639
overall: 0.671


In [23]:
data_loader.y_data.iloc[protected_groups['male_priv'].index].head()

KeyError: 'male_priv'

In [None]:
data_loader.y_data.iloc[protected_groups['male_priv'].index].dtype

In [None]:
filtered_df = data_loader.y_data.iloc[protected_groups['male_priv'].index].copy(deep=True)
base_rate = filtered_df[filtered_df == 1].shape[0] / filtered_df.shape[0]

In [None]:
base_rate

In [None]:
filtered_df[filtered_df == 1].shape[0]

In [None]:
filtered_df.shape[0]