# Data Prep and Loading

In [23]:
import opendp.prelude as dp
dp.enable_features('contrib')
import numpy as np 
import pandas as pd
from race_imputation.race_imputation_model import RaceImputationModel
import data_preprocessing_utils
# force all columns to show on head call
pd.set_option('display.max_columns', None)


In [24]:
voter_data_all = pd.read_csv('./data/combined_nc.csv')
voter_data = data_preprocessing_utils.prep_voter_dataframe(voter_data_all)

imputation_model = RaceImputationModel(["tract_code"],["race"])
imputation_model._fit(voter_data_all)

In [25]:
housing_data_all = pd.read_csv('./data/hmda_nc.csv')
housing_data = data_preprocessing_utils.prep_housing_dataframe(housing_data_all)

print(housing_data['tract_code'].unique())


  housing_data_all = pd.read_csv('./data/hmda_nc.csv')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  housing_data.rename(columns={'derived_race': 'race', "census_tract":"tract_code"}, inplace=True)


['020800' '041100' '042300' ... '010709' '001927' '980300']


In [26]:
counts = housing_data["race"].value_counts()
# get the proportion of each
proportions = counts/counts.sum()
housing_data["race"].unique()

array(['Race Not Available', 'black', 'white', 'asian', 'Joint',
       'Native Hawaiian or Other Pacific Islander',
       'American Indian or Alaska Native', '2 or more minority races',
       'other'], dtype=object)

In [27]:
# Get the true proportions of race in the dataset, ignore Race Not Available
counts["Race Not Available"] = 0
proportions = counts/counts.sum()
proportions

race
white                                        0.720196
Race Not Available                           0.000000
black                                        0.176173
asian                                        0.064581
Joint                                        0.022799
American Indian or Alaska Native             0.010384
2 or more minority races                     0.003474
Native Hawaiian or Other Pacific Islander    0.002079
other                                        0.000313
Name: count, dtype: float64

In [13]:
housing_data_imputated = pd.read_csv('./data/hmda_nc_ri.csv')

# get the accuracies of the imputations
housing_data_imputation_comp = housing_data_imputated[housing_data_imputated["race"]!= "Race Not Available"]
print(housing_data_imputation_comp["pred_ri_argmax"].unique())
accuracy_argmax = sum(housing_data_imputation_comp["pred_ri_argmax"] == housing_data_imputation_comp["race"])/len(housing_data_imputation_comp)
accuracy_sample = sum(housing_data_imputation_comp["pred_ri_sample"] == housing_data_imputation_comp["race"])/len(housing_data_imputation_comp)
accuracy_threshold_25 = sum(housing_data_imputation_comp["pred_ri_threshold_25"] == housing_data_imputation_comp["race"])/len(housing_data_imputation_comp)
accuracy_threshold_50 = sum(housing_data_imputation_comp["pred_ri_threshold_5"] == housing_data_imputation_comp["race"])/len(housing_data_imputation_comp)
accuracy_threshold_75 = sum(housing_data_imputation_comp["pred_ri_threshold_75"] == housing_data_imputation_comp["race"])/len(housing_data_imputation_comp)
accuracy_threshold_95 = sum(housing_data_imputation_comp["pred_ri_threshold_95"] == housing_data_imputation_comp["race"])/len(housing_data_imputation_comp)
print("accuracy_argmax: ", accuracy_argmax)
print("accuracy_sample: ", accuracy_sample)
print("accuracy_threshold_25: ", accuracy_threshold_25)
print("accuracy_threshold_50: ", accuracy_threshold_50)
print("accuracy_threshold_75: ", accuracy_threshold_75)
print("accuracy_threshold_95: ", accuracy_threshold_95)

['white' 'black' 'asian']
accuracy_argmax:  0.7268935079726652
accuracy_sample:  0.6100256264236902
accuracy_threshold_25:  0.7268935079726652
accuracy_threshold_50:  0.7069561503416857
accuracy_threshold_75:  0.49005410022779045
accuracy_threshold_95:  0.009658314350797266


In [28]:

# demographic parity of the ground truth labels for argmax
model = RaceImputationModel(["race"], ["denied"])
model._fit(housing_data_imputation_comp)
print(model.conditional_probs.items())
print(model.col_to_name_map)
print(model.val_to_idx)

dict_items([((5, 0), 0.7089151797258857), ((7, 0), 0.8270134778775406), ((7, 1), 0.1729865221224593), ((4, 0), 0.8709051629116882), ((5, 1), 0.2910848202741143), ((2, 0), 0.8327713250905457), ((3, 0), 0.678082191780822), ((1, 1), 0.33205374280230326), ((2, 1), 0.16722867490945423), ((0, 0), 0.6959016393442623), ((4, 1), 0.1290948370883118), ((1, 0), 0.6679462571976967), ((0, 1), 0.3040983606557377), ((3, 1), 0.3219178082191781), ((6, 1), 0.38181818181818183), ((6, 0), 0.6181818181818182)])
{0: 'race', 1: 'denied'}
{0: {'2 or more minority races': 0, 'American Indian or Alaska Native': 1, 'Joint': 2, 'Native Hawaiian or Other Pacific Islander': 3, 'asian': 4, 'black': 5, 'other': 6, 'white': 7}, 1: {0: 0, 1: 1}}


In [33]:
model.demographic_parity()

AttributeError: 'RaceImputationModel' object has no attribute 'demographic_parity'