# Data Prep and Loading

In [1]:
import opendp.prelude as dp
dp.enable_features('contrib')
import numpy as np 
import pandas as pd
from race_imputation.race_imputation_model import RaceImputationModel
import data_preprocessing_utils
# Lift the column cap on DataFrame display so calls such as .head() show every column.
pd.options.display.max_columns = None


In [None]:
# Read the raw combined voter CSV, then pass it through the project's
# voter preprocessing helper to obtain the working frame.
VOTER_CSV_PATH = './data/combined_nc.csv'
voter_data_all = pd.read_csv(VOTER_CSV_PATH)
voter_data = data_preprocessing_utils.prep_voter_dataframe(voter_data_all)

# Race-imputation model fitting is currently disabled; kept for reference.
# imputation_model = RaceImputationModel(["tract_code"],["race"])
# imputation_model._fit(voter_data_all)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  voter_data["race"][voter_data["race"]=="aian"] = "asian"


In [None]:
# Relabel every voter whose race is recorded as "aian" to "asian" with a single
# .loc assignment (the one-step form pandas recommends over chained assignment).
# NOTE(review): "aian" presumably abbreviates American Indian / Alaska Native;
# folding it into "asian" may be unintended — confirm with the data owner.
voter_data.loc[voter_data["race"] == "aian", "race"] = "asian"

# Display the resulting race distribution so the cell's output reflects the relabel.
voter_data["race"].value_counts()


race
white    3747193
black    1011595
hisp      155719
asian      88419
other      87071
Name: count, dtype: int64


In [25]:
# Read the raw HMDA CSV and run it through the project's housing preprocessing helper.
housing_data_all = pd.read_csv('./data/hmda_nc.csv')
housing_data = data_preprocessing_utils.prep_housing_dataframe(housing_data_all)

# Quick look at which distinct tract codes are present after preprocessing.
tract_codes = housing_data['tract_code'].unique()
print(tract_codes)


SyntaxError: unmatched ']' (2633066039.py, line 5)

In [28]:
# Tally how many housing records carry each race label.
counts = housing_data["race"].value_counts()

# Convert the tallies into shares of the total.
proportions = counts.div(counts.sum())

# Show the distinct race labels present in the housing data.
housing_data["race"].unique()

array(['Race Not Available', 'Black or African American', 'White',
       'Asian', 'Joint', 'Native Hawaiian or Other Pacific Islander',
       'American Indian or Alaska Native', '2 or more minority races',
       'Free Form Text Only'], dtype=object)

In [9]:
# Recompute race shares over records with a reported race only: the
# "Race Not Available" tally is zeroed in place so it contributes nothing to the
# total (it remains in the result with a share of 0).
counts.loc["Race Not Available"] = 0
proportions = counts.div(counts.sum())
proportions

race
White                                        0.720196
Race Not Available                           0.000000
Black or African American                    0.176173
Asian                                        0.064581
Joint                                        0.022799
American Indian or Alaska Native             0.010384
2 or more minority races                     0.003474
Native Hawaiian or Other Pacific Islander    0.002079
Free Form Text Only                          0.000313
Name: count, dtype: float64

In [18]:
def _imputation_accuracy(frame, pred_col, truth_col="race"):
    """Return the fraction of rows in `frame` where `pred_col` equals `truth_col`.

    Uses the mean of the element-wise equality mask, so the result is always a
    proportion in [0, 1] (NaN for an empty frame rather than a ZeroDivisionError).
    """
    return (frame[pred_col] == frame[truth_col]).mean()


housing_data_imputated = pd.read_csv('./data/hmda_nc_ri.csv')

# Score the imputations only on rows that have a reported race to compare against.
housing_data_imputation_comp = housing_data_imputated[housing_data_imputated["race"] != "Race Not Available"]
print(housing_data_imputation_comp["pred_ri_argmax"].unique())

# NOTE(review): the predictions printed above appear to be short lowercase codes
# (e.g. 'white', 'black', 'aian'), while `race` is filtered against the HMDA-style
# label "Race Not Available". If the two columns use different label vocabularies,
# the equality below can never hold and every accuracy will be 0 — confirm and
# normalise the labels before trusting these numbers.

# Every metric is a fraction of matching rows. Previously only the argmax metric
# was divided by the row count and the rest were raw match counts.
accuracy_argmax = _imputation_accuracy(housing_data_imputation_comp, "pred_ri_argmax")
accuracy_sample = _imputation_accuracy(housing_data_imputation_comp, "pred_ri_sample")
accuracy_threshold_25 = _imputation_accuracy(housing_data_imputation_comp, "pred_ri_threshold_25")
# NOTE(review): this column is named "pred_ri_threshold_5" while its siblings use
# two-digit suffixes (25/75/95) — confirm it is the 0.50 threshold column.
accuracy_threshold_50 = _imputation_accuracy(housing_data_imputation_comp, "pred_ri_threshold_5")
accuracy_threshold_75 = _imputation_accuracy(housing_data_imputation_comp, "pred_ri_threshold_75")
accuracy_threshold_95 = _imputation_accuracy(housing_data_imputation_comp, "pred_ri_threshold_95")

print("accuracy_argmax: ", accuracy_argmax)
print("accuracy_sample: ", accuracy_sample)
print("accuracy_threshold_25: ", accuracy_threshold_25)
print("accuracy_threshold_50: ", accuracy_threshold_50)
print("accuracy_threshold_75: ", accuracy_threshold_75)
print("accuracy_threshold_95: ", accuracy_threshold_95)

['white' 'black' 'aian']
accuracy_argmax:  0.0
accuracy_sample:  0
accuracy_threshold_25:  0
accuracy_threshold_50:  0
accuracy_threshold_75:  0
accuracy_threshold_95:  0
