## Simple Bayes Filter to Predict US Birth States by Name

A simple program that predicts the U.S. state you were most likely born in, given your name (and, optionally, your birth year).
It uses the Social Security Administration's baby-names-by-state dataset, available here: https://www.ssa.gov/oact/babynames/limits.html

In [10]:
# Import necessary libraries

import nltk
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import os

# Setting some visualization parameters
# Use seaborn's "darkgrid" style for plots drawn later in the notebook
sns.set(style="darkgrid")


In [None]:
# Import the list of names by state
# Every data file in ./namesbystate/ is parsed as a headerless CSV with the
# columns State, Gender, Year, Name, Count.
fpath = os.path.join(os.getcwd(), "namesbystate")
# Keep only the .txt data files so that anything else sitting in the folder
# (a read-me, hidden OS files, ...) is not fed to the CSV parser.  Sorting makes
# the row order independent of the platform's directory-listing order.
nameFiles = sorted(nf for nf in os.listdir(fpath) if nf.lower().endswith(".txt"))

# Build the dataframe
columns = ["State", "Gender", "Year", "Name", "Count"]
stateDFs = [pd.read_csv(os.path.join(fpath, nf), names=columns) for nf in nameFiles]

# One long frame; rows stay grouped file by file.
names = pd.concat(stateDFs)


In [26]:
# Analyze the dataframe
#print(names.size)
#print(names.columns)
#pd.isnull(names).any()
#names.nunique()

#g = sns.FacetGrid(tips, row="sex", col="time", margin_titles=True)
#bins = np.linspace(0, 60, 13)
#g.map(plt.hist, "total_bill", color="steelblue", bins=bins)

# Get the overall frequencies, which names are most popular across the dataset?
# Pick the "Count" column *before* summing: summing the whole frame would also
# aggregate State/Gender/Year, which is wasted work and not meaningful.
name_freq = names.groupby('Name')['Count'].sum().sort_values(ascending=False)
name_freq.head(5)
# best = names_freq.nlargest()


Name
James      5001762
John       4875934
Robert     4743843
Michael    4354622
William    3886371
Name: Count, dtype: int64

In [36]:
# Small collection of analysis functions

def mostPopularByYear(year, gender="Both"):
    """Return the most frequent name for a single year.

    Reads the module-level ``names`` dataframe.

    Args:
        year: Value matched exactly against the "Year" column.
        gender: "F" or "M" restricts the search to that gender; any other
            value (the default is "Both") keeps all rows for the year.

    Returns:
        A ``(name, count)`` tuple, where ``count`` is the "Count" column
        summed over every matching row for that name.

    Raises:
        IndexError: if no rows match the filter.
    """
    selected = names['Year'] == year
    # separate by gender if requested
    if gender in ("F", "M"):
        selected = selected & (names["Gender"] == gender)
    yearNames = names.loc[selected]

    # Sum only "Count"; summing the whole frame would also aggregate the
    # State/Gender/Year columns for no benefit.
    freq = yearNames.groupby('Name')['Count'].sum().sort_values(ascending=False)
    # freq is indexed by name, so use .iloc for positional access rather than
    # relying on the deprecated integer-key fallback of freq[0].
    return (freq.index[0], freq.iloc[0])

def mostPopularByYearRange(minYear, maxYear, gender="Both"):
    """Return the most frequent name over an inclusive range of years.

    Reads the module-level ``names`` dataframe.

    Args:
        minYear: Lower bound on the "Year" column (inclusive).
        maxYear: Upper bound on the "Year" column (inclusive).
        gender: "F" or "M" restricts the search to that gender; any other
            value (the default is "Both") keeps all rows in the range.

    Returns:
        A ``(name, count)`` tuple, where ``count`` is the "Count" column
        summed over every matching row for that name.

    Raises:
        IndexError: if no rows match the filter.
    """
    selected = (names['Year'] >= minYear) & (names['Year'] <= maxYear)
    # separate by gender if requested
    if gender in ("F", "M"):
        selected = selected & (names["Gender"] == gender)
    yearNames = names.loc[selected]

    # Sum only "Count"; summing the whole frame would also aggregate the
    # State/Gender/Year columns for no benefit.
    freq = yearNames.groupby('Name')['Count'].sum().sort_values(ascending=False)
    # freq is indexed by name, so use .iloc for positional access rather than
    # relying on the deprecated integer-key fallback of freq[0].
    return (freq.index[0], freq.iloc[0])
    
#def mostPopularByYearAndState(year, state):
    


In [39]:
# Most frequent name among rows with Year == 2010 and Gender == "M";
# the result is a (name, count) tuple.
mostPopularByYear(2010, "M")

#mostPopularByYearRange(1920, 1980, "F")

('Jacob', 22117)

In [41]:
# Build and train the Bayes classifier
rowCount = names.shape[0]

# `names` is a concatenation of per-file frames, so rows are grouped by state.
# Shuffle (with a fixed seed for reproducibility) before splitting; otherwise
# the train and test partitions would each contain only a subset of the states.
shuffled = names.sample(frac=1, random_state=0)

# Each training example is a (explanatory variables, response variable) pair.
# zip over the columns instead of iterrows(): iterrows builds a Series per row
# and is far too slow for a frame of this size.
# NOTE(review): "Count" is kept as a feature to match the original design, but
# it is not available when predicting from a name alone -- confirm whether it
# belongs in the feature set.
featureSet = [
    ({"Name": name, "Count": count}, state)
    for name, count, state in zip(shuffled["Name"], shuffled["Count"], shuffled["State"])
]

# 70% of the examples for training, the remaining 30% for testing.
# List slices need an int index, hence int() rather than np.floor().
splitAt = int(rowCount * 0.7)
train_set, test_set = featureSet[:splitAt], featureSet[splitAt:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

KeyboardInterrupt: 

In [None]:
def predictState(name):
    """Return the label the trained classifier assigns to *name*.

    Uses the module-level ``classifier``.

    Args:
        name: The first name to classify, e.g. "Greg".

    Returns:
        The label returned by ``classifier.classify``; training labels are
        taken from the "State" column.
    """
    # The classifier is trained on feature dicts, so wrap the bare string in a
    # dict using the same "Name" key that the training examples use.
    return classifier.classify({"Name": name})

predictState("Greg")