### Pre-process data for name typing

Here, we'll process the raw data acquired [here](https://github.com/philipperemy/name-dataset?tab=readme-ov-file#full-dataset) (a dataset of 500M names) together with a dictionary of words (which can be acquired anywhere) to assist in identifying whether a name belongs to an individual or not.

*Download the names dataset to an EC2 instance*: go to the Google Drive link, copy the network request as cURL, then paste it into a shell and write the output to a file (`> names.zip`).

### Human names dataset
First, let's load the human names dataset.

In [61]:
import pyarrow.csv as pv
import pyarrow as pa

# Restrict the load to the US file. Column names are passed explicitly and
# every column is typed as a string.
name_columns = ["First", "Last", "Gender", "Country"]
df = pv.read_csv(
    "../static/name_dataset/data/US.csv",
    read_options=pv.ReadOptions(column_names=name_columns),
    parse_options=pv.ParseOptions(delimiter=","),
    convert_options=pv.ConvertOptions(
        column_types={column: pa.string() for column in name_columns}
    ),
)

In [62]:
# Convert the table returned by pyarrow's read_csv into a pandas DataFrame.
# NOTE: not re-runnable as-is -- once `df` is a DataFrame it no longer has `to_pandas`.
df = df.to_pandas()

In [63]:
# Spot-check five random rows (unseeded, so the sample differs on every run)
df.sample(5)

Unnamed: 0,First,Last,Gender,Country
17377721,Corey,Forbes,,US
9969609,Nasir,Alom,M,US
4084993,Joshua,Vargas,M,US
22780060,Ian,Anderson,M,US
14882460,Noardilu,Marquez,F,US


In [64]:
# Number of rows loaded from the US file
len(df)

32308973

In [65]:
# Strip every non-ASCII character (such characters are dropped outright, not
# transliterated to an ASCII equivalent), then casefold what is left.
for column in ("First", "Last"):
    ascii_only = df[column].str.encode("ascii", errors="ignore").str.decode("ascii")
    df[column] = ascii_only.str.casefold()

In [82]:
# Build {name: number of rows carrying that name} for first and last names
first_counts, last_counts = (
    df[column].value_counts().to_dict() for column in ("First", "Last")
)

In [83]:
MIN_LENGTH = 2

# Discard names shorter than MIN_LENGTH characters from both tallies
first_counts, last_counts = (
    {name: count for name, count in tally.items() if len(name) >= MIN_LENGTH}
    for tally in (first_counts, last_counts)
)

In [84]:
KEEP_THRESHOLD = 2

# Discard names that occur fewer than KEEP_THRESHOLD times
first_counts, last_counts = (
    {name: count for name, count in tally.items() if count >= KEEP_THRESHOLD}
    for tally in (first_counts, last_counts)
)

In [85]:
import math

# Replace each raw count with its natural logarithm (keys and order unchanged)
first_counts = dict(zip(first_counts, map(math.log, first_counts.values())))
last_counts = dict(zip(last_counts, map(math.log, last_counts.values())))

In [86]:
def _scale_to_int_range(log_scores, upper=100000):
    """Min-max scale the values of ``log_scores`` to integers in ``[0, upper]``.

    Args:
        log_scores: mapping of name -> numeric (log-count) score.
        upper: integer score assigned to the largest value.

    Returns:
        A new dict with the same keys. The smallest input maps to 0 and the
        largest to ``upper``; fractional results are truncated by ``int``.
        An empty mapping yields ``{}``. When every value is identical the
        range is zero, so every score is 0 instead of raising
        ZeroDivisionError.
    """
    if not log_scores:
        return {}
    lowest = min(log_scores.values())
    span = max(log_scores.values()) - lowest
    if span == 0:
        # Every value equals the minimum, which is what maps to 0.
        return dict.fromkeys(log_scores, 0)
    return {name: int((value - lowest) / span * upper) for name, value in log_scores.items()}


# Scale to 0-100000 (integer)
first_counts = _scale_to_int_range(first_counts)
last_counts = _scale_to_int_range(last_counts)

In [87]:
# Persist both score tables as JSON objects ({name: score})
import json

for out_path, scores in (
    ("../static/name_dataset/first_scores.json", first_counts),
    ("../static/name_dataset/last_scores.json", last_counts),
):
    with open(out_path, "w") as out_file:
        json.dump(scores, out_file)

### Word dataset
Next, let's load the word dataset.

In [161]:
import pandas as pd

# Load the unigram frequency table (columns: word, count).
# keep_default_na=False: by default pandas parses tokens such as "null",
# "nan", "NA" and "None" as missing values. In a word list those are ordinary
# words, and as NaN they would never match the corpus filter further down.
df_unigram = pd.read_csv(
    "../static/english_word_freq/unigram_freq.csv",
    dtype={"word": str, "count": int},
    keep_default_na=False,
)
df_unigram.sample(5)

Unnamed: 0,word,count
310042,subperiod,14297
223711,gramling,25758
12417,pumping,3568469
230231,hewing,24529
219653,vanlandingham,26589


In [164]:
# Number of rows currently in the unigram table
len(df_unigram)

333333

In [162]:
# Load in the corpus of English words, one list entry per line of the file.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until garbage collection.
with open("../static/popular.txt") as corpus_file:
    words = corpus_file.read().splitlines()

In [163]:
# Number of lines read from the corpus file
len(words)

25322

In [165]:
# Keep only the rows whose word appears in the corpus
in_corpus = df_unigram["word"].isin(words)
df_unigram = df_unigram.loc[in_corpus]

In [166]:
LENGTH_THRESHOLD = 2

# Drop words shorter than LENGTH_THRESHOLD characters
word_lengths = df_unigram["word"].str.len()
df_unigram = df_unigram.loc[word_lengths.ge(LENGTH_THRESHOLD)]

In [167]:
# Add the natural log of each count as `log_count`.
# `assign` builds a new frame rather than writing a column into the filtered
# slice produced by the cells above, which avoids pandas'
# SettingWithCopyWarning.
df_unigram = df_unigram.assign(log_count=df_unigram["count"].apply(math.log))

In [168]:
# Scale to 0-100000 (integer): min-max normalise the log counts, then truncate.
max_log_count = df_unigram["log_count"].max()
min_log_count = df_unigram["log_count"].min()
scaled_scores = (
    (df_unigram["log_count"] - min_log_count) / (max_log_count - min_log_count) * 100000
).astype(int)
# `assign` returns a new frame instead of writing into a filtered slice, so
# pandas does not emit SettingWithCopyWarning here.
df_unigram = df_unigram.assign(score=scaled_scores)

In [169]:
# Build {word: score} and write it out as JSON
score_by_word = df_unigram.set_index("word")["score"]
word_to_count = score_by_word.to_dict()
with open("../static/english_word_freq/word_to_score.json", "w") as out_file:
    json.dump(word_to_count, out_file)

In [None]:
# Load a second first-name dataset: SSA baby names
# Source: https://www.ssa.gov/OACT/babynames/limits.html

In [149]:
import os

# Read every .txt file in the directory into its own frame. The files have
# no header row, so the columns Name,Gender,Count are named here.
baby_names_dir = "../static/census_baby_names"
dfs = [
    pd.read_csv(
        os.path.join(baby_names_dir, file_name),
        names=["Name", "Gender", "Count"],
        delimiter=",",
    )
    for file_name in os.listdir(baby_names_dir)
    if file_name.endswith(".txt")
]

In [150]:
# Stack the per-file frames into one table (each frame's own row index is kept as-is)
df = pd.concat(dfs)

In [151]:
# Casefold names so differently-cased spellings collapse to one key
df["Name"] = df["Name"].str.casefold()

In [152]:
# Collapse rows that share a name into a single row holding the summed count
df = df.groupby("Name")[["Count"]].sum().reset_index()
df.sample(5)


Unnamed: 0,Name,Count
32360,finna,26
41953,jarail,5
3854,aliciah,21
1353,adelline,18
57634,lawarren,6


In [153]:
# Min-max scale the log counts onto 0-100000 and truncate to integers
log_counts = df["Count"].apply(math.log)
min_log_count, max_log_count = log_counts.min(), log_counts.max()
df["log_count"] = log_counts
df["ScaledCount"] = (
    (log_counts - min_log_count) / (max_log_count - min_log_count) * 100000
).astype(int)

In [154]:
# Order by raw count, most frequent name first. The dict built from this
# frame in the next cell follows row order, so this also fixes the key order
# of the JSON output.
df = df.sort_values(by="Count", ascending=False)

In [155]:
# Build {name: scaled score} and write it out as JSON
scaled_by_name = df.set_index("Name")["ScaledCount"]
name_to_count = scaled_by_name.to_dict()
with open("../static/census_baby_names/first_to_score.json", "w") as out_file:
    json.dump(name_to_count, out_file)

In [None]:
# Load the 2010 Census surnames dataset (last names)
# Source: https://www.census.gov/topics/population/genealogy/data.html

In [140]:
# Load the surname table.
# keep_default_na=False: by default pandas parses tokens such as "NULL", "NA"
# and "None" as missing values. A surname spelled that way would become NaN
# and then be silently dropped by the groupby on `name` further down.
df = pd.read_csv(
    "../static/census_surnames/Names_2010Census.csv",
    dtype={"name": str, "count": int},
    keep_default_na=False,
)

In [141]:
# Spot-check five random rows of the raw surname table (unseeded sample)
df.sample(5)

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
73133,KINAS,73005,266,0.09,85261.73,96.99,(S),(S),0,0,(S)
83036,KECSKES,82908,227,0.08,86086.5,92.95,(S),(S),0,3.08,2.2
92426,CASATELLI,92358,199,0.07,86763.3,91.46,(S),(S),0,(S),6.03
135656,BRAGMAN,135593,124,0.04,89060.28,94.35,(S),(S),0,(S),0
148536,GIARDELLI,148347,111,0.04,89573.92,96.4,0,0,0,(S),(S)


In [142]:
# Casefold surnames so differently-cased spellings collapse to one key
df["name"] = df["name"].str.casefold()

In [143]:
# Collapse rows that share a surname into a single row holding the summed count
df = df.groupby("name")[["count"]].sum().reset_index()

In [144]:
# Spot-check five random rows after aggregation (unseeded sample)
df.sample(5)

Unnamed: 0,name,count
124232,ruschell,124
1746,akpan,732
84328,liamas,157
43375,etienne,7867
149571,valdovino,261


In [145]:
LENGTH_THRESHOLD = 2

# Keep surnames of at least LENGTH_THRESHOLD characters and eliminate the
# "All other names" entry (names were casefolded earlier, hence lowercase).
long_enough = df["name"].str.len() >= LENGTH_THRESHOLD
not_catch_all = df["name"] != "all other names"
df = df[long_enough & not_catch_all]
df.sample(5)

Unnamed: 0,name,count
90291,manier,920
84798,lillibridge,979
26159,chupp,2893
6043,audige,111
3817,angelina,335


In [146]:
# Scale from 0-100000: min-max normalise the log counts, then truncate to int.
log_counts = df["count"].apply(math.log)
max_log_count = log_counts.max()
min_log_count = log_counts.min()
# `df` is a filtered slice at this point; `assign` returns a new frame rather
# than writing columns into that slice, which avoids SettingWithCopyWarning.
df = df.assign(
    log_count=log_counts,
    ScaledCount=(
        (log_counts - min_log_count) / (max_log_count - min_log_count) * 100000
    ).astype(int),
)

# Sort by count, most frequent surname first
df = df.sort_values(by="count", ascending=False)

In [147]:
# Spot-check five random rows with the new log_count / ScaledCount columns (unseeded sample)
df.sample(5)

Unnamed: 0,name,count,log_count,ScaledCount
97168,mikell,3048,8.022241,33820
4647,archundia,400,5.991465,13720
95570,melchin,137,4.919981,3115
16688,bowey,163,5.09375,4835
118247,rashed,623,6.434547,18106


In [148]:
# Build {surname: scaled score} and write it out as JSON
scaled_by_surname = df.set_index("name")["ScaledCount"]
name_to_count = scaled_by_surname.to_dict()
with open("../static/census_surnames/last_to_score.json", "w") as out_file:
    json.dump(name_to_count, out_file)