# Deriving demographically-distinct names

Get names by gender and race/ethnicity from voter records and Census surveys. 

Note: Because the NC voter records have changed since we originally ran this notebook, re-running it will produce a different list of names.

In [1]:
import json
import random
import pandas as pd

Voter registration data downloaded from this [site](https://www.ncsbe.gov/results-data/voter-registration-data). Read documentation [here](https://s3.amazonaws.com/dl.ncsbe.gov/data/layout_ncvoter.txt)

In [2]:
# --- Inputs ---
# Zipped statewide NC voter registration file.
url_nc = 'https://s3.amazonaws.com/dl.ncsbe.gov/data/ncvoter_Statewide.zip'
# Source: https://www.ssa.gov/oact/babynames/decades/century.html
fn_ssa = '../data/input/top_100_baby_names_ssa.tsv'
# Source: https://www.census.gov/topics/population/genealogy/data/2010_surnames.html
fn_census = '../data/input/Names_2010Census_Top1000.csv'

# --- Outputs ---
# JSON files holding the generated full names for men and for women.
fn_men = '../data/input/top_mens_names.json'
fn_women = '../data/input/top_womens_names.json'

In [3]:
# Read the zipped, tab-separated voter file directly from S3.
# NOTE(review): the saved output of this cell echoes the read_csv statement,
# which looks like a pandas warning (possibly a DtypeWarning for mixed-type
# columns) — confirm, and consider passing explicit dtypes if so.
df = pd.read_csv(
    url_nc,
    sep='\t',
    compression='zip',
    encoding='ISO-8859-1',
)

  df = pd.read_csv(url_nc, compression='zip', sep='\t', encoding = "ISO-8859-1")


In [4]:
# Row count of the loaded voter file (sanity check).
len(df)

8562269

The race and ethnicity values are stored as codes; the dictionaries below translate them into readable labels.

In [5]:
# Lookup tables translating the race / ethnicity codes into readable labels.
racecode2legible = {
    'W': 'White',
    'B': 'Black',
    'U': 'Undesignated',
    'O': 'Other',
    'A': 'Asian',
    'I': 'AMERICAN INDIAN OR ALASKAN NATIVE',
    'M': 'TWO or MORE RACES',
    'P': 'NATIVE HAWAIIAN or PACIFIC ISLANDER',
}
# Backward-compatible alias: the original (misspelled) name is kept so that
# any existing reference keeps working. Both names point at the same dict.
racecode2legiable = racecode2legible

ethnicitycode2legible = {
    'NL': 'Not latino',
    'HL': 'Hispanic or Latino',
    'UN': 'Undesignated',
}

In [6]:
# Only voters born after this year are kept.
# NOTE(review): the name suggests a retirement-age cutoff — confirm.
birthyear_retired = 1958
born_after_cutoff = df.birth_year > birthyear_retired

# split by gender
men = df[born_after_cutoff & (df.gender_code == 'M')]
women = df[born_after_cutoff & (df.gender_code == 'F')]

# split each gender by ethnicity code: 'NL' (not Latino) / 'HL' (Hispanic or Latino)
men_nl = men[men.ethnic_code == 'NL']
men_hl = men[men.ethnic_code == 'HL']

women_nl = women[women.ethnic_code == 'NL']
women_hl = women[women.ethnic_code == 'HL']

Manually replace a few generated full names (the exceptions listed below).

In [7]:
# Manual overrides for generated full names that match famous people:
# key = generated full name, value = replacement full name.
manual = {
    'ALAN YANG': 'ALAN ZHANG',
    'JACOB KRUEGER': 'JACOB OCONNELL',
    'ALLYSON SCHWARTZ': 'ALLYSON OCONNELL',
    'JACK BAUER': 'JACK SCHWARTZ',
}

## Get first names from voter records

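Get the most popular first names distinct to each demographic. For Black and White names, keep only first names for which that race accounts for more than 90% of the non-Latino voters with that name. Asian and Hispanic names are simply the most common first names in each group, with no share threshold.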

In [8]:
# Outer key is gender ('M' = men, 'W' = women); inner key is a race/ethnicity
# code ('W', 'B', 'A', 'H'); value is a list of first names.
gender2race2names = {'M' : {}, 'W': {}}
N = 100  # number of first names kept per gender/race group
MIN_SHARE = .9  # a name counts as distinct when one race's share exceeds this


def _race_share_table(voters):
    """Return per-first-name race shares plus the total count in column ``N``.

    Counts unique ``ncid`` values per (first_name, race_code), normalizes each
    row so the race columns sum to 1, and appends the row total as ``N``.
    """
    counts = pd.pivot_table(voters, index='first_name', values='ncid',
                            columns='race_code', aggfunc='nunique')
    totals = counts.sum(axis=1)
    shares = counts.divide(totals, axis=0)
    shares['N'] = totals
    return shares


def _top_by_total(shares, n=N):
    """Return the ``n`` first names with the largest ``N``, most common first."""
    return shares.sort_values(by=['N'], ascending=False).head(n).index.tolist()


def _most_common_first_names(voters, n=N):
    """Return the ``n`` most frequent ``first_name`` values (by row count)."""
    return voters.first_name.value_counts().head(n).index.tolist()


# White, Black and Asian names come from the non-Latino voters of each gender.
for gender, non_latino in (('M', men_nl), ('W', women_nl)):
    shares = _race_share_table(non_latino)
    for race in ['W', 'B']:
        # keep names dominated by this race, then rank them by how common they are
        distinct = shares[shares[race] > MIN_SHARE]
        gender2race2names[gender][race] = _top_by_total(distinct)

    # asian: no share threshold, just the most common names
    gender2race2names[gender]['A'] = _most_common_first_names(
        non_latino[non_latino.race_code == 'A']
    )

# hispanic
# NOTE(review): the two genders are treated differently. Men are restricted to
# race_code 'O' and ranked by row count; women include every race code and are
# ranked by unique ncid. Kept as-is so the output does not change — confirm
# whether the difference is intentional.
gender2race2names['M']['H'] = _most_common_first_names(men_hl[men_hl.race_code == 'O'])
gender2race2names['W']['H'] = _top_by_total(_race_share_table(women_hl))

## get surnames from decennial census

Get the surnames with the greatest share for each racial and ethnic demographic.

In [9]:
# header=2: column names are taken from row index 2; the rows above it are skipped.
census = pd.read_csv(fn_census, header=2)

In [10]:
# Strip thousands separators and convert the counts to float.
# The column is cast to str first so this cell can be re-run safely: after the
# first run the column is already float and would no longer support `.str`.
census['FREQUENCY (COUNT)'] = (
    census['FREQUENCY (COUNT)']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [11]:
# Race/ethnicity code -> name of the census column holding that group's share
# of each surname.
race2col = {
    'W': 'PERCENT NON-HISPANIC OR LATINO WHITE ALONE',
    'B': 'PERCENT NON-HISPANIC OR LATINO BLACK OR AFRICAN AMERICAN ALONE',
    'A': 'PERCENT NON-HISPANIC OR LATINO ASIAN AND NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE',
    'H': 'PERCENT HISPANIC OR LATINO ORIGIN',
}

In [12]:
# Convert every share column used below to float, not just the 'B' column.
# '(S)' entries (presumably the Census marker for suppressed values — confirm)
# become missing values first. Any share column still holding '(S)' strings
# would otherwise be sorted as text when the surnames are ranked.
# Columns that are already numeric pass through unchanged.
for _share_col in race2col.values():
    census[_share_col] = census[_share_col].replace({'(S)': None}).astype(float)

In [13]:
# South Asian surnames are handled separately so that South Asian first names
# are paired with South Asian surnames.
s_asian_surnames = [
    'PATEL',
    'SINGH',
    'KAUR',
]

In [28]:
# Candidate pool, identical for every group: surnames with a count above
# 10,000, excluding the South Asian surnames that are handled separately.
is_common = census['FREQUENCY (COUNT)'] > 10000
is_s_asian = census['SURNAME'].isin(s_asian_surnames)
candidates = census[is_common & ~is_s_asian]

# For each group keep the 20 surnames with the highest share for that group.
race2surnames = {}
for race, col in race2col.items():
    ranked = candidates.sort_values(by=col, ascending=False)
    race2surnames[race] = ranked['SURNAME'].head(20).tolist()

Align South Asian surnames to first names. Note that these first names were gathered manually based on the top names, and are not exhaustive.

In [19]:
# First names that get paired with one of `s_asian_surnames`.
south_asian = {
    'MUHAMMAD', 'MOHAMMED', 'MOHAMMAD', 'SYED', 'OMAR', 'AHMED',
    'VIJAY', 'SANJAY', 'RAHUL', 'AMIT', 'RAVI', 'ROHAN', 'RAJESH',
    'NIKHIL', 'SANDEEP', 'VIVEK', 'SURESH', 'SUNIL', 'SRINIVAS',
    'RAMESH', 'KRISHNA', 'MANOJ', 'ARJUN', 'ANIL', 'ROHIT',
    'AJAY', 'ANAND', 'ARUN', 'RAJ', 'ANISH', 'KIRAN', 'VARUN',
    'VENKATA', 'RISHI', 'ASHISH', 'PRANAV', 'PRASHANT', 'MANISH',
    'NEHA', 'POOJA', 'DIVYA', 'PRIYA', 'ANJALI', 'NISHA',
    'PRIYANKA', 'LAKSHMI', 'SHREYA', 'MEENA',
}

In [30]:
# Fixed seed so the surname draws are repeatable for a given input.
random.seed(303)

# Same nesting as gender2race2names, but each value is a sorted list of
# "FIRST SURNAME" strings.
gender2race2fullnames = {'M': {}, 'W': {}}
for gender, race2first in gender2race2names.items():
    for race, first_names in race2first.items():
        paired = []
        for first in first_names:
            # South Asian first names in the Asian group draw from the
            # South Asian surnames; everything else draws from the group's pool.
            if race == 'A' and first in south_asian:
                pool = s_asian_surnames
            else:
                pool = race2surnames[race]
            paired.append(f"{first} {random.choice(pool)}")
        paired.sort()
        gender2race2fullnames[gender][race] = paired

In [31]:
# Flatten the nested dict into {(gender, race): full_names}.
# NOTE(review): `reform` is not referenced by any other cell shown here —
# confirm it is still needed.
reform = {
    (gender, race): full_names
    for gender, race2full in gender2race2fullnames.items()
    for race, full_names in race2full.items()
}

In [32]:
# These are references to the inner dicts, not copies: assigning into
# race2names_m / race2names_w also changes gender2race2fullnames.
race2names_m = gender2race2fullnames['M']
race2names_w = gender2race2fullnames['W']

## Save the demographically-distinct names

Having trouble reproducing the exact first name–surname combinations? This is likely due to the incremental manual edits we made to remove celebrity names, and to updates to the NC voter records.

In [33]:
# Swap in the manual replacements for the men's names; names without an
# override are kept unchanged.
for race in race2names_m:
    race2names_m[race] = [
        manual.get(full_name, full_name) for full_name in race2names_m[race]
    ]

In [34]:
# Swap in the manual replacements for the women's names; names without an
# override are kept unchanged.
for race in race2names_w:
    race2names_w[race] = [
        manual.get(full_name, full_name) for full_name in race2names_w[race]
    ]

In [35]:
# Preview: the first five Asian men's full names.
race2names_m['A'][:5]

['AARON VANG', 'ADAM TRAN', 'ADITYA WU', 'AJAY PATEL', 'ALAN PHAM']

This is where we save the dictionaries:

In [None]:
# NOTE(review): the save step is commented out, presumably so that re-running
# the notebook does not overwrite the existing name files — confirm before
# re-enabling.
# with open(fn_men, 'w') as f:
#     f.write(json.dumps(race2names_m))

# with open(fn_women, 'w') as f:
#     f.write(json.dumps(race2names_w))