In [32]:
import os

import numpy as np
import pandas as pd

%matplotlib inline

# CSV files to load, one per city, relative to the notebook's working directory.
TARGET_FILES = [
    "data/{}.csv".format(city)
    for city in ("Boston", "Brockton", "Cambridge", "Lynn", "Springfield")
]

In [33]:
# Load each target CSV, keep only the rows whose "Name" is present, and
# remember which file every table came from.
city_infos = []

for filename in TARGET_FILES:
    raw = pd.read_csv(filename)
    data = raw[raw["Name"].notna()]  # drop rows whose Name is NaN
    city_infos.append({
        "data": data,
        "filename": filename
    })

In [34]:
# Preview the first rows of the first loaded table (TARGET_FILES[0]) to show the column layout
city_infos[0]["data"].head()

Unnamed: 0,Annual_Wage,Employer,Job_Title,Monthly_Wage,Name,Year,_type
0,125979,City Of Boston,Police Lieutenant,10498,William Slavin J,2015.0,GovsalariesItem
1,122231,City Of Boston,Police Lieutenant,10186,Kenneth Macmaster A,2015.0,GovsalariesItem
2,117584,City Of Boston,Police Sergeant/Hdq Dispatcher,9799,Joseph Maguire M,2015.0,GovsalariesItem
3,115466,City Of Boston,Police Lieutenant,9622,Charles Kelly G,2015.0,GovsalariesItem
4,114487,City Of Boston,Police Lieutenant,9541,Richard Driscoll J,2015.0,GovsalariesItem


In [35]:
# Report how many rows (after dropping unnamed ones) each file contributed.
for c in city_infos:
    source_file = c["filename"]
    row_count = len(c["data"])
    print("{:s}: {:d}".format(source_file, row_count))

data/Boston.csv: 8585
data/Brockton.csv: 510
data/Cambridge.csv: 835
data/Lynn.csv: 8881
data/Springfield.csv: 2208


In [36]:
import time

from namsorclient.request_objects import *
from namsorclient.country_codes import CountryCodes

def predict(name_batch, api_key):
    """
    Classify every name in `name_batch` with the NamSor API.

    Each item of `name_batch` is unpacked positionally into the batch
    `addItem` calls, so it must be a sequence of name parts.

    Returns a pair of lists `(gender_pred, ethnic_pred)` holding the
    `likely_gender` and `race_ethnicity` attribute of each response item,
    in the order returned by the respective `classify` call.
    """
    genders = GenderBatch()
    ethnicities = US_RaceEthnicityBatch()

    # Queue every name in both batches; the ethnicity batch also takes a country code.
    for parts in name_batch:
        genders.addItem(*parts)
        ethnicities.addItem(*parts, CountryCodes.United_States)

    gender_pred = []
    for response in genders.classify(api_key):
        gender_pred.append(response.likely_gender)

    ethnic_pred = []
    for response in ethnicities.classify(api_key):
        ethnic_pred.append(response.race_ethnicity)

    return gender_pred, ethnic_pred
    
def add(df, api_key):
    """
    Add predicted gender and race ethnicity to the dataframe

    The first two whitespace-separated tokens of every "Name" value are sent
    to `predict`; the predictions are stored in the new columns "Gender" and
    "Race Ethnicity".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Name" column of strings. It is modified in place.
    api_key : str
        API key forwarded unchanged to `predict`.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in, with the two columns added.
    """
    # Pad with empty strings so every item has exactly two parts: `predict`
    # unpacks each item positionally, so a one-word (or blank) name would
    # otherwise shift or drop arguments.
    # NOTE(review): confirm the NamSor API accepts an empty second name part.
    name_list = [(name.split() + ["", ""])[:2] for name in df["Name"].values]

    gender_pred, ethnic_pred = predict(name_list, api_key)

    # Callers may pass a slice of a larger frame. Silence the chained-assignment
    # warning only around these two assignments instead of switching the pandas
    # option off for the rest of the session.
    with pd.option_context("mode.chained_assignment", None):
        df["Gender"] = gender_pred
        df["Race Ethnicity"] = ethnic_pred

    return df

def timeSince(since):
    """
    Format the time elapsed since `since` as "<minutes>m <seconds>s".

    `since` is a timestamp comparable with `time.time()`. The elapsed time
    is truncated to whole seconds before being split into minutes and seconds.
    """
    elapsed = int(time.time() - since)
    whole_minutes = int(elapsed / 60)
    leftover_seconds = elapsed - whole_minutes * 60
    return "{:d}m {:d}s".format(whole_minutes, leftover_seconds)

In [37]:
# start = time.time()

# for i in range(len(city_infos)):
#     print("Current file: {:s}".format(city_infos[i]["filename"]))
#     print("Objects in total: {:d}".format(len(city_infos[i]["data"])))
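#     # add a gender column
#     # NOTE: `addGender` looks like a stale name -- the helper defined in the cell above
#     # is `add(df, api_key)`, which also adds a race-ethnicity column; confirm before re-enabling
#     city_infos[i]["data"] = addGender(city_infos[i]["data"])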
#     print(timeSince(start))
#     # export to csv
#     filename = city_infos[i]["filename"]
#     city_infos[i]["data"].to_csv(filename, index = False)

In [38]:
# A sample of adding predicted gender and race ethnicity
# The API key is read from the environment so the secret is never stored in
# the notebook. NOTE: the key that was previously hardcoded in this cell has
# been exposed and should be revoked.
NAMSOR_API_KEY = os.environ["NAMSOR_API_KEY"]

# Take an independent copy of the first 450 rows of the first table, so the
# new columns are written to a standalone frame rather than to a slice.
df = city_infos[0]["data"][:450].copy()
add(df, NAMSOR_API_KEY).to_csv("data/sample.csv", index = False)

[{'id': 'unassigned', 'firstName': 'William', 'lastName': 'Slavin', 'likelyGender': 'male', 'genderScale': -0.9891137055450081, 'score': 33.24712234454998, 'probabilityCalibrated': 0.994556852772504}, {'id': 'unassigned', 'firstName': 'Kenneth', 'lastName': 'Macmaster', 'likelyGender': 'male', 'genderScale': -0.9568780884646577, 'score': 24.592328463188526, 'probabilityCalibrated': 0.9784390442323289}, {'id': 'unassigned', 'firstName': 'Joseph', 'lastName': 'Maguire', 'likelyGender': 'male', 'genderScale': -0.9910638454215497, 'score': 36.198511323701226, 'probabilityCalibrated': 0.9955319227107748}, {'id': 'unassigned', 'firstName': 'Charles', 'lastName': 'Kelly', 'likelyGender': 'male', 'genderScale': -0.9795463345472473, 'score': 28.341030728192663, 'probabilityCalibrated': 0.9897731672736236}, {'id': 'unassigned', 'firstName': 'Richard', 'lastName': 'Driscoll', 'likelyGender': 'male', 'genderScale': -0.9918105205926329, 'score': 43.20440225250662, 'probabilityCalibrated': 0.9959052