In [1]:
import pandas as pd

# Data source: the bar-passage CSV used by the TensorFlow Fairness Indicators pandas case study
#   https://colab.research.google.com/github/tensorflow/fairness-indicators/blob/master/g3doc/tutorials/Fairness_Indicators_Pandas_Case_Study.ipynb
# Feature descriptions: https://rdrr.io/cran/fairml/man/law.school.admissions.html
# The prediction target is `pass_bar`, as in the ARL paper.
#
# NOTE(review): dropna() is applied before any column selection, so a row is discarded when ANY
# column of the CSV is missing, not only the columns kept later -- confirm this is intended.
df = (
    pd.read_csv(
        'https://storage.googleapis.com/lawschool_dataset/bar_pass_prediction.csv',
        index_col=0,
    )
    .dropna()
)

In [2]:
# Inspect the column names available in the loaded frame.
df.columns

Index(['decile1b', 'decile3', 'ID', 'decile1', 'sex', 'race', 'cluster',
       'lsat', 'ugpa', 'zfygpa', 'DOB_yr', 'grad', 'zgpa', 'bar1', 'bar1_yr',
       'bar2', 'bar2_yr', 'fulltime', 'fam_inc', 'age', 'gender', 'parttime',
       'male', 'race1', 'race2', 'Dropout', 'other', 'asian', 'black', 'hisp',
       'pass_bar', 'bar', 'tier', 'index6040', 'indxgrp', 'indxgrp2',
       'dnn_bar_pass_prediction', 'gpa'],
      dtype='object')

In [3]:
# Column roles used when building the exported dataset.
target_col = 'pass_bar'              # label to predict
protected_attrs = ['male', 'race1']  # protected (sensitive) attributes
cat_cols = ['cluster', 'fulltime']   # categorical feature columns
cont_cols = [                        # continuous feature columns
    'age',
    'decile1',
    'decile3',
    'fam_inc',
    'lsat',
    'ugpa',
]

In [4]:
# Print the absolute pass/fail counts of `pass_bar` for each race group,
# visiting the groups in order of first appearance in the frame.
for race_label in df['race1'].unique():
    in_group = df['race1'] == race_label
    print(race_label)
    print(df.loc[in_group, 'pass_bar'].value_counts(normalize=False))

white
1.0    16579
0.0      560
Name: pass_bar, dtype: int64
hisp
1.0    811
0.0    115
Name: pass_bar, dtype: int64
asian
1.0    730
0.0     60
Name: pass_bar, dtype: int64
black
1.0    932
0.0    266
Name: pass_bar, dtype: int64
other
1.0    337
0.0     37
Name: pass_bar, dtype: int64


In [5]:
# Keep only the feature, protected-attribute and target columns.
# The explicit .copy() makes `df` an independent frame rather than a selection of the
# loaded one, so column assignments on it do not trigger pandas' SettingWithCopyWarning.
df = df[cont_cols + cat_cols + protected_attrs + [target_col]].copy()

In [6]:
# Give every race label an integer code, numbered in order of first appearance in df['race1'].
race_mapping = {
    label: code
    for code, label in enumerate(df['race1'].unique())
}
race_mapping

{'white': 0, 'hisp': 1, 'asian': 2, 'black': 3, 'other': 4}

In [7]:
# Replace the race labels with their integer codes from `race_mapping`.
# Series.map turns every value that is not a key of the mapping into NaN, so running this
# cell a second time (when race1 already holds codes) would silently blank the column.
# Compute the encoded column first and fail loudly instead of overwriting good data.
race_codes = df['race1'].map(race_mapping)
assert race_codes.notna().all(), (
    "race1 contains values that are not keys of race_mapping "
    "(was this cell already run?)"
)
# assign() returns a new frame instead of writing into the existing one in place.
df = df.assign(race1=race_codes)

In [8]:
# Display the prepared frame as a visual check; race1 now holds the integer codes.
df

Unnamed: 0,age,decile1,decile3,fam_inc,lsat,ugpa,cluster,fulltime,male,race1,pass_bar
0,-62.0,10.0,10.0,5.0,44.0,3.5,1.0,1.0,0.0,0,1.0
1,-62.0,5.0,4.0,4.0,29.0,3.5,2.0,1.0,0.0,0,1.0
2,-58.0,3.0,2.0,1.0,36.0,3.5,3.0,1.0,1.0,0,1.0
3,-51.0,7.0,4.0,4.0,39.0,3.5,3.0,1.0,1.0,0,1.0
4,-61.0,9.0,8.0,4.0,48.0,3.5,4.0,1.0,1.0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
22402,-62.0,3.0,1.0,2.0,26.5,1.8,6.0,1.0,1.0,3,0.0
22403,-57.0,3.0,1.0,3.0,19.7,1.8,6.0,1.0,1.0,3,0.0
22404,-59.0,7.0,8.0,3.0,36.0,1.8,3.0,2.0,1.0,3,1.0
22405,-51.0,10.0,10.0,3.0,44.0,1.5,3.0,2.0,1.0,0,1.0


In [9]:
# Write the prepared frame to lsac.csv in the working directory.
# index=True is the pandas default, spelled out here: the row index is written as the
# first CSV column.
df.to_csv('lsac.csv', index=True)