# Testing Race Prediction Methods on Georgia Voter List
Chris Iyer, 1/4/2023

In [1]:
# load libraries
import pandas as pd
import numpy as np
from ethnicolr import census_ln, pred_census_ln, pred_wiki_ln, pred_wiki_name, pred_fl_reg_name, pred_fl_reg_name_five_cat, pred_nc_reg_name

2023-01-18 09:03:01.392989: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the merged 2020 + 2022 voter name/race table (the merge itself is done and saved by other code).
# `fpath` is kept as its own variable because later cells reuse it for the predictions file.
fpath = '/Users/chrisiyer/_Current/alc/_TX voters/'
voter_file = fpath + 'GAlist_merged_nonan.csv'
df = pd.read_csv(voter_file)

  df = pd.read_csv('/Users/chrisiyer/_Current/alc/_TX voters/GAlist_merged_nonan.csv')


In [3]:
# Switch: set to True to keep voters whose recorded race is 'OT'.
include_other = False

if not include_other:
    # Drop every row recorded as 'OT' so only the named race codes remain.
    is_named_race = df['RACE'] != 'OT'
    df = df[is_named_race]

In [4]:
# Distinct recorded race codes left after the filter above.
df['RACE'].unique()

array(['WH', 'HP', 'BH', 'AP', 'AI'], dtype=object)

In [5]:
# Only the two name columns and the recorded race are used from here on.
keep_cols = ['FIRST_NAME', 'LAST_NAME', 'RACE']
df = df[keep_cols]
df

Unnamed: 0,FIRST_NAME,LAST_NAME,RACE
0,SWAYNE,CARLAN,WH
1,RONNIE,CAGLE,WH
2,LINTON,COCHRAN,WH
3,HENRIETTA,DORSEY,WH
4,JOHNNY,LEWALLEN,WH
...,...,...,...
9062728,CAYDEN,CRIDER,WH
9062729,CLARISSA,CRIDER,WH
9062730,MATTHEW,CRIDER,WH
9062731,JOANNE,CUSTANCE-SMITH,WH


In [14]:
# Load the existing predictions table; the cells below attach one column per ethnicolr model.
preds = pd.read_csv(f'{fpath}GA_predictions.csv')
preds

Unnamed: 0,pr_surname,pr_firstname,pr_avg,wru_surname
0,WH,OT,WH,WH
1,WH,WH,WH,WH
2,WH,OT,WH,WH
3,BH,WH,WH,BH
4,WH,WH,WH,WH
...,...,...,...,...
8864087,WH,OT,WH,WH
8864088,WH,WH,WH,WH
8864089,WH,WH,WH,WH
8864090,OT,WH,WH,WH


## Functions to use throughout

In [10]:
# sensitivity (#correct / #truly in race) and PPV (#correct / #predicted as race), per race
def results(preds, truth):
    """Print sensitivity and positive predictive value (PPV) for each predicted race.

    For every distinct label in ``preds`` except 'OT':

    * sensitivity = (# rows predicted as the race AND truly that race) / (# rows truly that race)
    * PPV         = (# rows predicted as the race AND truly that race) / (# rows predicted as the race)

    Parameters
    ----------
    preds : array-like of str
        Predicted race codes (list, NumPy array or pandas Series).
    truth : array-like of str
        Recorded race codes. Compared with ``preds`` position by position;
        a pandas index on either argument is ignored.

    Raises
    ------
    ValueError
        If ``preds`` and ``truth`` do not have the same shape.

    Notes
    -----
    Sensitivity is printed as ``nan`` for a race that is predicted but never
    occurs in ``truth`` (its denominator is zero).
    """
    preds = np.asarray(preds)
    truth = np.asarray(truth)
    if preds.shape != truth.shape:
        raise ValueError(
            f'preds and truth must have the same shape, got {preds.shape} and {truth.shape}'
        )

    for race in np.unique(preds):
        if race == 'OT':
            # 'OT' is a catch-all bucket, so it is not scored.
            continue
        # Vectorised masks: one pass over the data per race instead of Python-level loops.
        pred_mask = preds == race
        truth_mask = truth == race
        corr = np.count_nonzero(pred_mask & truth_mask)
        n_truth = np.count_nonzero(truth_mask)
        n_pred = np.count_nonzero(pred_mask)  # > 0 because race was taken from preds
        sens = corr / n_truth if n_truth else float('nan')
        ppv = corr / n_pred
        print(f'Race: {race}, Sensitivity: {round(sens,4)}, PPV: {round(ppv,4)}')

In [11]:
def reform_census(preds):
    """Translate census-model race labels into the two-letter codes used in the voter file.

    Returns a new list with one code per element of ``preds``, in the same order.
    A label that is not in the lookup table raises ``KeyError``.
    """
    code_for = {
        'white': 'WH', 'black': 'BH', 'api': 'AP',
        'hispanic': 'HP', 'aian': 'AI', '2prace': 'OT',
    }
    codes = []
    for label in preds:
        codes.append(code_for[label])
    return codes

def reform_wiki(preds):
    """Translate wiki-model ethnicity labels into the two-letter codes used in the voter file.

    The lookup is written grouped by target code. Returns a new list with one
    code per element of ``preds``; a label not in the table raises ``KeyError``.
    """
    labels_by_code = {
        "AP": (
            "Asian,GreaterEastAsian,EastAsian",
            "Asian,GreaterEastAsian,Japanese",
            "Asian,IndianSubContinent",
        ),
        "BH": (
            "GreaterAfrican,Africans",
            "GreaterAfrican,Muslim",
        ),
        "WH": (
            "GreaterEuropean,British",
            "GreaterEuropean,EastEuropean",
            "GreaterEuropean,Jewish",
            "GreaterEuropean,WestEuropean,French",
            "GreaterEuropean,WestEuropean,Germanic",
            "GreaterEuropean,WestEuropean,Italian",
            "GreaterEuropean,WestEuropean,Nordic",
        ),
        "HP": (
            "GreaterEuropean,WestEuropean,Hispanic",
        ),
    }
    # Invert the grouping into a flat label -> code table.
    code_for = {label: code for code, labels in labels_by_code.items() for label in labels}
    return list(map(code_for.__getitem__, preds))

def reform_fl(preds):
    """Translate Florida-model race labels into the two-letter codes used in the voter file.

    Returns a new list with one code per element of ``preds``, in the same order.
    A label that is not in the lookup table raises ``KeyError``.
    """
    code_for = {'asian': 'AP', 'hispanic': 'HP', 'nh_black': 'BH', 'nh_white': 'WH', 'other': 'OT'}
    return list(map(code_for.__getitem__, preds))

def reform_nc(preds):
    """Translate North Carolina model labels to voter-file codes (version 1).

    In this version every 'HL+' label becomes 'HP' regardless of its suffix,
    while 'NL+' labels are mapped according to their suffix. Returns a new list
    with one code per element of ``preds``; an unknown label raises ``KeyError``.
    """
    # All six 'HL+' labels share a single target code.
    code_for = dict.fromkeys(('HL+A', 'HL+B', 'HL+I', 'HL+M', 'HL+O', 'HL+W'), 'HP')
    code_for.update({
        'NL+A': 'AP', 'NL+B': 'BH', 'NL+I': 'AI',
        'NL+M': 'OT', 'NL+O': 'OT', 'NL+W': 'WH',
    })
    recoded = []
    for label in preds:
        recoded.append(code_for[label])
    return recoded

def reform_nc2(preds):
    """Translate North Carolina model labels to voter-file codes (version 2).

    In this version the 'HL+' / 'NL+' prefix makes no difference: a label is
    mapped by its suffix only, so no label maps to 'HP'. Returns a new list
    with one code per element of ``preds``; an unknown label raises ``KeyError``.
    """
    labels_by_code = {
        'AP': ('HL+A', 'NL+A'),
        'BH': ('HL+B', 'NL+B'),
        'AI': ('HL+I', 'NL+I'),
        'OT': ('HL+M', 'HL+O', 'NL+M', 'NL+O'),
        'WH': ('HL+W', 'NL+W'),
    }
    # Invert the grouping into a flat label -> code table.
    code_for = {label: code for code, labels in labels_by_code.items() for label in labels}
    return [code_for[label] for label in preds]
    

## Ethnicolr race predictions

1. Census models (pred_census_ln) - predict using 2000 and 2010 census data (last name only). Only gives four labels: white, black, api (Asian / Pacific Islander), hispanic.

In [17]:
# Run both census models over `df` in fixed-size row chunks (presumably to bound memory per
# call -- TODO confirm) and keep only the 'race' column of each result.
# The loop bound is len(df) rather than a hard-coded row count, so no rows are skipped if the
# table grows and no empty frames are sent to the model if it shrinks.
chunk_size = 100000
chunks_2000, chunks_2010 = [], []
for floor in range(0, len(df), chunk_size):
    chunk = df.iloc[floor:floor + chunk_size, :]
    # Each call gets its own copy so the model cannot modify `df` itself.
    chunks_2000.append(np.asarray(pred_census_ln(chunk.copy(), 'LAST_NAME', year=2000, num_iter = 1)['race']))
    chunks_2010.append(np.asarray(pred_census_ln(chunk.copy(), 'LAST_NAME', year=2010, num_iter = 1)['race']))

# Concatenate once at the end instead of re-copying the growing array on every iteration.
preds1a = np.concatenate(chunks_2000) if chunks_2000 else np.array([])
preds1b = np.concatenate(chunks_2010) if chunks_2010 else np.array([])
    

In [18]:
# Recode the raw census-model labels into the voter file's two-letter race codes.
preds1a_alt, preds1b_alt = (reform_census(raw) for raw in (preds1a, preds1b))

In [19]:
# Per-race sensitivity / PPV of each census model against the recorded race.
# NOTE(review): the recorded output for the 2000 and 2010 models is identical to four
# decimals -- confirm that the `year` argument is actually taking effect.
for heading, recoded in (('2000 model:', preds1a_alt), ('2010 model:', preds1b_alt)):
    print(heading)
    results(recoded, df.RACE)

2000 model:
Race: AP, Sensitivity: 0.5918, PPV: 0.7985
Race: BH, Sensitivity: 0.0455, PPV: 0.7334
Race: HP, Sensitivity: 0.7572, PPV: 0.6894
Race: WH, Sensitivity: 0.9703, PPV: 0.6238
2010 model:
Race: AP, Sensitivity: 0.5918, PPV: 0.7985
Race: BH, Sensitivity: 0.0455, PPV: 0.7334
Race: HP, Sensitivity: 0.7572, PPV: 0.6894
Race: WH, Sensitivity: 0.9703, PPV: 0.6238


In [20]:
# Attach the recoded census predictions to the shared predictions table.
for column, recoded in (('et_census_2000', preds1a_alt), ('et_census_2010', preds1b_alt)):
    preds[column] = recoded

2. Wiki models (pred_wiki_ln & pred_wiki_name) - predict using wiki model (a - last name only and b - full name). Gives: "Asian,GreaterEastAsian,EastAsian", 
"Asian,GreaterEastAsian,Japanese", 
"Asian,IndianSubContinent", 
"GreaterAfrican,Africans", 
"GreaterAfrican,Muslim", 
"GreaterEuropean,British",
"GreaterEuropean,EastEuropean", 
"GreaterEuropean,Jewish",
"GreaterEuropean,WestEuropean,French", 
"GreaterEuropean,WestEuropean,Germanic",
"GreaterEuropean,WestEuropean,Hispanic", 
"GreaterEuropean,WestEuropean,Italian",
"GreaterEuropean,WestEuropean,Nordic".



In [21]:
# Run the last-name and full-name wiki models over `df` in fixed-size row chunks (presumably
# to bound memory per call -- TODO confirm) and keep only the 'race' column of each result.
# The loop bound is len(df) rather than a hard-coded row count, so no rows are skipped if the
# table grows and no empty frames are sent to the model if it shrinks.
chunk_size = 100000
chunks_surname, chunks_full = [], []
for floor in range(0, len(df), chunk_size):
    chunk = df.iloc[floor:floor + chunk_size, :]
    # Each call gets its own copy so the model cannot modify `df` itself.
    chunks_surname.append(np.asarray(pred_wiki_ln(chunk.copy(), 'LAST_NAME', num_iter = 1)['race']))
    chunks_full.append(np.asarray(pred_wiki_name(chunk.copy(), 'LAST_NAME', 'FIRST_NAME', num_iter = 1)['race']))

# Concatenate once at the end instead of re-copying the growing array on every iteration.
preds2a = np.concatenate(chunks_surname) if chunks_surname else np.array([])
preds2b = np.concatenate(chunks_full) if chunks_full else np.array([])
    

In [22]:
# Recode the raw wiki-model labels into the voter file's two-letter race codes.
preds2a_alt, preds2b_alt = (reform_wiki(raw) for raw in (preds2a, preds2b))

In [23]:
# Per-race sensitivity / PPV of each wiki model against the recorded race.
for heading, recoded in (
    ('last name wiki model:', preds2a_alt),
    ('full name wiki model:', preds2b_alt),
):
    print(heading)
    results(recoded, df.RACE)

last name wiki model:
Race: AP, Sensitivity: 0.4366, PPV: 0.2752
Race: BH, Sensitivity: 0.0203, PPV: 0.3432
Race: HP, Sensitivity: 0.5234, PPV: 0.6019
Race: WH, Sensitivity: 0.9304, PPV: 0.6076
full name wiki model:
Race: AP, Sensitivity: 0.5492, PPV: 0.5097
Race: BH, Sensitivity: 0.0352, PPV: 0.5171
Race: HP, Sensitivity: 0.5114, PPV: 0.5661
Race: WH, Sensitivity: 0.9608, PPV: 0.6201


In [24]:
# Attach the recoded wiki predictions to the shared predictions table.
for column, recoded in (('et_wiki_surname', preds2a_alt), ('et_wiki_full', preds2b_alt)):
    preds[column] = recoded

3. Florida models (pred_fl_reg_name, pred_fl_reg_name_five_cat) - full name, full name + other category.
First gives ['asian', 'hispanic', 'nh_black', 'nh_white']
Second gives ['asian', 'hispanic', 'nh_black', 'nh_white', 'other']

In [27]:
# Run both Florida full-name models (without and with the 'other' category) over `df` in
# fixed-size row chunks (presumably to bound memory per call -- TODO confirm) and keep only
# the 'race' column of each result.
# The loop bound is len(df) rather than a hard-coded row count, so no rows are skipped if the
# table grows and no empty frames are sent to the model if it shrinks.
chunk_size = 100000
chunks_four_cat, chunks_five_cat = [], []
for floor in range(0, len(df), chunk_size):
    chunk = df.iloc[floor:floor + chunk_size, :]
    # Each call gets its own copy so the model cannot modify `df` itself.
    chunks_four_cat.append(np.asarray(pred_fl_reg_name(chunk.copy(), 'LAST_NAME', 'FIRST_NAME', num_iter = 1)['race']))
    chunks_five_cat.append(np.asarray(pred_fl_reg_name_five_cat(chunk.copy(), 'LAST_NAME', 'FIRST_NAME', num_iter = 1)['race']))

# Concatenate once at the end instead of re-copying the growing array on every iteration.
preds3a = np.concatenate(chunks_four_cat) if chunks_four_cat else np.array([])
preds3b = np.concatenate(chunks_five_cat) if chunks_five_cat else np.array([])
    

Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x7fc85c2e8b80>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/Users/chrisiyer/opt/anaconda3/envs/alc/lib/python3.10/site-packages/keras/backend.py", line 5132, in <genexpr>
    output_ta_t = tuple(  File "/Users/chrisiyer/opt/anaconda3/envs/alc/lib/python3.10/site-packages/tensorflow/python/util/tf_should_use.py", line 243, in wrapped


In [28]:
# Recode the raw Florida-model labels into the voter file's two-letter race codes.
preds3a_alt, preds3b_alt = (reform_fl(raw) for raw in (preds3a, preds3b))

In [29]:
# Per-race sensitivity / PPV of each Florida model against the recorded race.
for heading, recoded in (
    ('full name FL model:', preds3a_alt),
    ('full name FL model with "other" category:', preds3b_alt),
):
    print(heading)
    results(recoded, df.RACE)

full name FL model:
Race: AP, Sensitivity: 0.5938, PPV: 0.8605
Race: BH, Sensitivity: 0.4169, PPV: 0.8451
Race: HP, Sensitivity: 0.8109, PPV: 0.6747
Race: WH, Sensitivity: 0.9423, PPV: 0.724
full name FL model with "other" category:
Race: AP, Sensitivity: 0.8115, PPV: 0.5517
Race: BH, Sensitivity: 0.7189, PPV: 0.6272
Race: HP, Sensitivity: 0.8139, PPV: 0.6584
Race: WH, Sensitivity: 0.7006, PPV: 0.8309


In [30]:
# Attach the recoded Florida predictions to the shared predictions table.
for column, recoded in (('et_fl', preds3a_alt), ('et_fl_other', preds3b_alt)):
    preds[column] = recoded

4. North Carolina model (pred_nc_reg_name) - full name

In [31]:
# Run the North Carolina full-name model over `df` in fixed-size row chunks (presumably to
# bound memory per call -- TODO confirm) and keep only the 'race' column of each result.
# The loop bound is len(df) rather than a hard-coded row count, so no rows are skipped if the
# table grows and no empty frames are sent to the model if it shrinks.
chunk_size = 100000
chunks_nc = []
for floor in range(0, len(df), chunk_size):
    chunk = df.iloc[floor:floor + chunk_size, :]
    # The call gets its own copy so the model cannot modify `df` itself.
    chunks_nc.append(np.asarray(pred_nc_reg_name(chunk.copy(), 'LAST_NAME', 'FIRST_NAME', num_iter = 1)['race']))

# Concatenate once at the end instead of re-copying the growing array on every iteration.
preds4 = np.concatenate(chunks_nc) if chunks_nc else np.array([])
    

In [32]:
# Recode the same raw NC labels twice: version 1 sends every 'HL+' label to 'HP',
# version 2 maps labels by their suffix only.
preds4_alt, preds4_alt2 = (recode(preds4) for recode in (reform_nc, reform_nc2))

In [33]:
# Per-race sensitivity / PPV of both NC recodings against the recorded race.
for heading, recoded in (
    ('full name NC model (version 1):', preds4_alt),
    ('full name NC model (version 2):', preds4_alt2),
):
    print(heading)
    results(recoded, df.RACE)

full name NC model (version 1):
Race: AI, Sensitivity: 0.0892, PPV: 0.0044
Race: AP, Sensitivity: 0.5643, PPV: 0.2864
Race: BH, Sensitivity: 0.3818, PPV: 0.5563
Race: HP, Sensitivity: 0.7988, PPV: 0.2366
Race: WH, Sensitivity: 0.3866, PPV: 0.7614
full name NC model (version 2):
Race: AI, Sensitivity: 0.0908, PPV: 0.0044
Race: AP, Sensitivity: 0.5664, PPV: 0.2837
Race: BH, Sensitivity: 0.4159, PPV: 0.5536
Race: WH, Sensitivity: 0.4442, PPV: 0.7319


In [35]:
# Attach both NC recodings to the shared predictions table.
for column, recoded in (('et_nc_1', preds4_alt), ('et_nc_2', preds4_alt2)):
    preds[column] = recoded

In [36]:
# Display the predictions table, now including the ethnicolr columns assigned in the cells above.
preds

Unnamed: 0,pr_surname,pr_firstname,pr_avg,wru_surname,et_census_2000,et_census_2010,et_wiki_surname,et_wiki_full,et_fl,et_fl_other,et_nc_1,et_nc_2
0,WH,OT,WH,WH,WH,WH,WH,WH,WH,AP,BH,BH
1,WH,WH,WH,WH,WH,WH,WH,WH,WH,WH,WH,WH
2,WH,OT,WH,WH,WH,WH,WH,WH,WH,BH,BH,BH
3,BH,WH,WH,BH,WH,WH,WH,WH,BH,BH,WH,WH
4,WH,WH,WH,WH,WH,WH,WH,WH,WH,BH,WH,WH
...,...,...,...,...,...,...,...,...,...,...,...,...
8864087,WH,OT,WH,WH,WH,WH,WH,WH,WH,WH,WH,WH
8864088,WH,WH,WH,WH,WH,WH,WH,WH,WH,OT,BH,BH
8864089,WH,WH,WH,WH,WH,WH,WH,WH,WH,WH,WH,WH
8864090,OT,WH,WH,WH,WH,WH,WH,WH,WH,BH,WH,WH


In [38]:
# Write the extended table back to the file it was loaded from.
# index=False: the file is read back with a plain pd.read_csv (no index_col), so writing the
# row index would add a spurious 'Unnamed: 0' column on the next load.
preds.to_csv(fpath + 'GA_predictions.csv', index=False)