# Data Prep and Loading


In [2]:
# import opendp.prelude as dp
# dp.enable_features('contrib')
import numpy as np 
import pandas as pd
from race_imputation.race_imputation_model import RaceImputationModel
import data_preprocessing_utils
# force all columns to show on head call
pd.set_option('display.max_columns', None)


In [3]:
voter_data_all = pd.read_csv('./data/combined_nc.csv')
voter_data = data_preprocessing_utils.prep_voter_dataframe(voter_data_all)

imputation_model = RaceImputationModel(["tract_code"],["race"])
imputation_model._fit(voter_data_all)

In [4]:
housing_data_all = pd.read_csv('./data/hmda_nc.csv')
housing_data = data_preprocessing_utils.prep_housing_dataframe(housing_data_all)

print(housing_data['tract_code'].unique())


  housing_data_all = pd.read_csv('./data/hmda_nc.csv')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  housing_data.rename(columns={'derived_race': 'race', "census_tract":"tract_code"}, inplace=True)


['020800' '041100' '042300' ... '010709' '001927' '980300']


In [5]:
counts = housing_data["race"].value_counts()
# get the proportion of each
proportions = counts/counts.sum()
housing_data["race"].unique()

array(['Race Not Available', 'black', 'white', 'asian', 'Joint',
       'Native Hawaiian or Other Pacific Islander',
       'American Indian or Alaska Native', '2 or more minority races',
       'other'], dtype=object)

In [6]:
# Get the true proportions of race in the dataset, ignore Race Not Available
counts["Race Not Available"] = 0
proportions = counts/counts.sum()
proportions

race
white                                        0.720196
Race Not Available                           0.000000
black                                        0.176173
asian                                        0.064581
Joint                                        0.022799
American Indian or Alaska Native             0.010384
2 or more minority races                     0.003474
Native Hawaiian or Other Pacific Islander    0.002079
other                                        0.000313
Name: count, dtype: float64

In [7]:
housing_data_imputated = pd.read_csv('./data/hmda_nc_ri.csv')

# get the accuracies of the imputations
dp_housing_data_imputation_comp = housing_data_imputated[housing_data_imputated["race"]!= "Race Not Available"]
print(dp_housing_data_imputation_comp["pred_ri_argmax"].unique())
accuracy_argmax = sum(dp_housing_data_imputation_comp["pred_ri_argmax"] == dp_housing_data_imputation_comp["race"])/len(dp_housing_data_imputation_comp)
accuracy_sample = sum(dp_housing_data_imputation_comp["pred_ri_sample"] == dp_housing_data_imputation_comp["race"])/len(dp_housing_data_imputation_comp)
accuracy_threshold_25 = sum(dp_housing_data_imputation_comp["pred_ri_threshold_25"] == dp_housing_data_imputation_comp["race"])/len(dp_housing_data_imputation_comp)
accuracy_threshold_50 = sum(dp_housing_data_imputation_comp["pred_ri_threshold_5"] == dp_housing_data_imputation_comp["race"])/len(dp_housing_data_imputation_comp)
accuracy_threshold_75 = sum(dp_housing_data_imputation_comp["pred_ri_threshold_75"] == dp_housing_data_imputation_comp["race"])/len(dp_housing_data_imputation_comp)
accuracy_threshold_95 = sum(dp_housing_data_imputation_comp["pred_ri_threshold_95"] == dp_housing_data_imputation_comp["race"])/len(dp_housing_data_imputation_comp)
print("accuracy_argmax: ", accuracy_argmax)
print("accuracy_sample: ", accuracy_sample)
print("accuracy_threshold_25: ", accuracy_threshold_25)
print("accuracy_threshold_50: ", accuracy_threshold_50)
print("accuracy_threshold_75: ", accuracy_threshold_75)
print("accuracy_threshold_95: ", accuracy_threshold_95)

['white' 'black' 'asian']
accuracy_argmax:  0.7268935079726652
accuracy_sample:  0.6094504555808656
accuracy_threshold_25:  0.7268935079726652
accuracy_threshold_50:  0.7069561503416857
accuracy_threshold_75:  0.49005410022779045
accuracy_threshold_95:  0.009658314350797266


In [11]:
# Load the imputed housing data
dp_housing_data_imputated = pd.read_csv('./data/hmda_nc_dp_ri_results.csv')

# get the accuracies of the imputations
dp_housing_data_imputation_comp = dp_housing_data_imputated[dp_housing_data_imputated["race"] != "Race Not Available"]

# Define epsilon thresholds
epsilon_thresholds = [1]  # Example epsilon values
accuracies = {}

# Loop through each epsilon threshold
for epsilon in epsilon_thresholds:
    # Calculate accuracies for each threshold
    accuracy_argmax = sum(dp_housing_data_imputation_comp[f"pred_ri_argmax_epsilon_{epsilon}"] == dp_housing_data_imputation_comp["race"]) / len(dp_housing_data_imputation_comp)
    accuracy_sample = sum(dp_housing_data_imputation_comp[f"pred_ri_sample_epsilon_{epsilon}"] == dp_housing_data_imputation_comp["race"]) / len(dp_housing_data_imputation_comp)
    # accuracy_threshold_25 = sum(dp_housing_data_imputation_comp[f"pred_ri_threshold_0.25_epsilon_{epsilon}"] == dp_housing_data_imputation_comp["race"]) / len(dp_housing_data_imputation_comp)
    # accuracy_threshold_50 = sum(dp_housing_data_imputation_comp[f"pred_ri_threshold_0.5_epsilon_{epsilon}"] == dp_housing_data_imputation_comp["race"]) / len(dp_housing_data_imputation_comp)
    # accuracy_threshold_75 = sum(dp_housing_data_imputation_comp[f"pred_ri_threshold_0.75_epsilon_{epsilon}"] == dp_housing_data_imputation_comp["race"]) / len(dp_housing_data_imputation_comp)
    # accuracy_threshold_95 = sum(dp_housing_data_imputation_comp[f"pred_ri_threshold_0.95_epsilon_{epsilon}"] == dp_housing_data_imputation_comp["race"]) / len(dp_housing_data_imputation_comp)

    # Store accuracies for the current epsilon
    accuracies[epsilon] = {
        "accuracy_argmax": accuracy_argmax,
        "accuracy_sample": accuracy_sample,
        # "accuracy_threshold_25": accuracy_threshold_25,
        # "accuracy_threshold_50": accuracy_threshold_50,
        # "accuracy_threshold_75": accuracy_threshold_75,
        # "accuracy_threshold_95": accuracy_threshold_95,
    }

# Print accuracies for each epsilon threshold
for epsilon, accuracy in accuracies.items():
    print(f"Epsilon: {epsilon}")
    print("accuracy_argmax: ", accuracy["accuracy_argmax"])
    print("accuracy_sample: ", accuracy["accuracy_sample"])
    # print("accuracy_threshold_25: ", accuracy["accuracy_threshold_25"])
    # print("accuracy_threshold_50: ", accuracy["accuracy_threshold_50"])
    # print("accuracy_threshold_75: ", accuracy["accuracy_threshold_75"])
    # print("accuracy_threshold_95: ", accuracy["accuracy_threshold_95"])

Epsilon: 1
accuracy_argmax:  0.7268935079726652
accuracy_sample:  0.6103274487471526


In [22]:
housing_data_imputated.columns

Index(['race', 'tract_code', 'derived_sex', 'denial_reason-1', 'denied',
       'pred_ri_argmax', 'pred_ri_sample', 'pred_ri_threshold_25',
       'pred_ri_threshold_5', 'pred_ri_threshold_75', 'pred_ri_threshold_95'],
      dtype='object')

In [26]:
# demographic parity of the ground truth labels for argmax
model = RaceImputationModel(["pred_ri_argmax"], ["denied"])
model._fit(housing_data_imputated)
print(model.conditional_probs.items())
print(model.col_to_name_map)
print(model.val_to_idx)

dict_items([((2, 0), 0.8230275879774548), ((1, 0), 0.7971925760458896), ((2, 1), 0.1769724120225452), ((1, 1), 0.20280742395411047), ((0, 0), 0.5960264900662252), ((0, 1), 0.40397350993377484)])
{0: 'pred_ri_argmax', 1: 'denied'}
{0: {'asian': 0, 'black': 1, 'white': 2}, 1: {0: 0, 1: 1}}


In [27]:
model.demographic_parity()

{'white': 1.0, 'black': 0.9686097862222901, 'asian': 0.7241877414230156}

In [15]:
# demographic parity of the ground truth labels for argmax
model = RaceImputationModel([f"pred_ri_argmax_epsilon_{epsilon}"], ["denied"])
model._fit(dp_housing_data_imputation_comp)
print(model.conditional_probs.items())
print(model.col_to_name_map)
print(model.val_to_idx)
model.demographic_parity()

dict_items([((2, 0), 0.809225440709195), ((2, 1), 0.190774559290805), ((1, 1), 0.22271671826625386), ((1, 0), 0.7772832817337462), ((0, 0), 0.5598455598455598), ((0, 1), 0.44015444015444016)])
{0: 'pred_ri_argmax_epsilon_1', 1: 'denied'}
{0: {'asian': 0, 'black': 1, 'white': 2}, 1: {0: 0, 1: 1}}


{'white': 1.0, 'black': 0.9605274904018649, 'asian': 0.6918289164944175}

In [19]:
dp_housing_data_imputation_comp.columns

Index(['race', 'tract_code', 'derived_sex', 'denial_reason-1', 'denied',
       'pred_ri_argmax_epsilon_1', 'pred_ri_sample_epsilon_1'],
      dtype='object')

In [21]:
dp_housing_data_imputation_comp[f"pred_ri_argmax_epsilon_{epsilon}"].unique()

array(['white', 'black', 'asian'], dtype=object)