In [1]:
%matplotlib inline

In [2]:
from __future__ import division
from numpy.random import randn, uniform, normal
import random
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import calendar
from datetime import date, timedelta

In [3]:
# The yearly files read here are names/yob1880.txt .. names/yob2016.txt
# (the end of the range is exclusive).
years = range(1880, 2017)
columns = ['name', 'sex', 'births']

# One frame per yearly file, each tagged with the year it was read for.
pieces = [
    pd.read_csv('names/yob%d.txt' % year, names=columns).assign(year=year)
    for year in years
]

# Stack the yearly frames into one DataFrame with a fresh 0..n-1 index.
names = pd.concat(pieces, ignore_index=True)

In [4]:
# For every (year, sex) group keep the 1000 rows with the most births.
# sort_values is used because DataFrame.sort_index(by=...) is deprecated
# (and absent from later pandas releases).
pieces = [
    group.sort_values(by='births', ascending=False)[:1000]
    for _, group in names.groupby(['year', 'sex'])
]
top1000 = pd.concat(pieces, ignore_index=True)

  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
# Preview the first rows of the combined per-(year, sex) top-1000 table.
top1000.head()

Unnamed: 0,name,sex,births,year
0,Mary,F,7065,1880
1,Anna,F,2604,1880
2,Emma,F,2003,1880
3,Elizabeth,F,1939,1880
4,Minnie,F,1746,1880


In [6]:
def firstName(year,sex):
    """Build the cumulative name-frequency table for one year and sex.

    Selects the rows of the module-level ``top1000`` frame whose ``year`` and
    ``sex`` columns match the arguments, orders them by ``births`` (largest
    first) and adds two columns:

    * ``freq``  -- the row's births divided by the total births selected
    * ``bound`` -- running sum of ``freq``, i.e. the upper edge of the row's
      slice of the unit interval (searched by ``rowName``)

    Returns a new DataFrame. If nothing matches, the result is empty.
    """
    mask = (top1000.year == year) & (top1000.sex == sex)
    df = top1000[mask].sort_values(by=['births'], ascending=False)
    total = float(df.births.sum())

    # Vectorised division gives the same values as a per-row apply/lambda
    # without looping in Python.
    df["freq"] = df.births / total
    df["bound"] = df.freq.cumsum()
    return df

firstName(1968,"M").tail()

Unnamed: 0,name,sex,births,year,freq,bound
177872,Octavio,M,55,1968,3.3e-05,0.999869
177873,Torrance,M,55,1968,3.3e-05,0.999902
177874,Raynard,M,55,1968,3.3e-05,0.999935
177875,Raleigh,M,55,1968,3.3e-05,0.999967
177876,Prince,M,55,1968,3.3e-05,1.0


In [7]:
# Cumulative name-frequency table for year 1973 and sex "F".
bar = firstName(1973,"F")
bar.head()

Unnamed: 0,name,sex,births,year,freq,bound
185877,Jennifer,F,62454,1973,0.048249,0.048249
185878,Amy,F,26965,1973,0.020832,0.069081
185879,Michelle,F,26930,1973,0.020805,0.089886
185880,Kimberly,F,23532,1973,0.01818,0.108066
185881,Lisa,F,22667,1973,0.017512,0.125578


In [8]:
# Position of the first cumulative bound >= each probe value.
# np.searchsorted on the raw array returns a plain integer for a scalar probe,
# so the lookup does not depend on Series.searchsorted returning a 1-element
# array (which only older pandas versions did).
tuple(int(np.searchsorted(bar.bound.values, p)) for p in (0.07, 0.11, 0.01))

(2, 4, 0)

In [9]:
def rowName(df,prob):
    """Return the ``name`` of the row whose cumulative slice contains ``prob``.

    Parameters
    ----------
    df : DataFrame
        Must have a ``name`` column and a ``bound`` column holding ascending
        cumulative frequencies (upper edge of each row's slice).
    prob : float
        Probe value, normally drawn uniformly from [0, 1).

    Returns
    -------
    The ``name`` value of the first row whose ``bound`` is >= ``prob``; the
    last row if ``prob`` lies above every bound.
    """
    bounds = df["bound"].values
    # Searching the raw array yields a scalar index for a scalar probe on every
    # pandas version (Series.searchsorted changed its return type over time).
    i = int(np.searchsorted(bounds, prob))
    # Rounding in the cumulative sum can leave the final bound a hair below 1,
    # so a probe just under 1 could land one past the end; clamp to the last row.
    i = min(i, len(bounds) - 1)
    return df.iloc[i]["name"]

# Look up the name whose cumulative-frequency slice contains 0.11 in `bar`.
rowName(bar,0.11)

'Lisa'

In [10]:
def makeFirstList(year,sex,count):
    """Draw ``count`` random first names for the given year and sex.

    The frequency table is built once with ``firstName``; each name is then
    picked by passing a fresh ``uniform()`` draw to ``rowName``, so names are
    sampled in proportion to their ``freq``. Returns a list of names.
    """
    table = firstName(year, sex)
    return [rowName(table, uniform()) for _ in range(count)]

makeFirstList(1912,"M",10)

['Robert',
 'Manuel',
 'Thomas',
 'Jack',
 'Joseph',
 'Philip',
 'John',
 'Peter',
 'Frank',
 'John']

### Last Names
Surname frequencies come from the [1990 U.S. Census name files](https://www.census.gov/topics/population/genealogy/data/1990_census/1990_census_namefiles.html).

In [11]:
# Read the whitespace-separated surname table; `names=` supplies the column
# labels. sep=r'\s+' is the documented equivalent of the deprecated
# delim_whitespace=True option.
path = 'surnames/1990.txt'
lastName = pd.read_csv(path, sep=r'\s+', names=["uName", "freq", "c", "Rank"])
lastName.head()

Unnamed: 0,uName,freq,c,Rank
0,SMITH,1.006,1.006,1
1,JOHNSON,0.81,1.816,2
2,WILLIAMS,0.699,2.515,3
3,JONES,0.621,3.136,4
4,BROWN,0.621,3.757,5


In [12]:
# Title-case the surnames (e.g. SMITH -> Smith) with the vectorised string
# accessor instead of a per-element lambda.
lastName["name"] = lastName.uName.astype(str).str.title()

# Normalise `freq` so the values sum to 1, then accumulate them into the
# `bound` column that rowName searches.
total = float(lastName.freq.sum())
lastName["freqReal"] = lastName.freq / total
lastName["bound"] = lastName.freqReal.cumsum()
lastName.head()

Unnamed: 0,uName,freq,c,Rank,name,freqReal,bound
0,SMITH,1.006,1.006,1,Smith,0.01264,0.01264
1,JOHNSON,0.81,1.816,2,Johnson,0.010177,0.022817
2,WILLIAMS,0.699,2.515,3,Williams,0.008783,0.031599
3,JONES,0.621,3.136,4,Jones,0.007802,0.039402
4,BROWN,0.621,3.757,5,Brown,0.007802,0.047204


In [13]:
# Look up the surname whose cumulative-frequency slice contains 0.02.
rowName(lastName,0.02)

'Johnson'

In [18]:
def randomFullName(year,gender,count):
    """Generate ``count`` random full-name records for a birth year and gender.

    First and middle names are drawn independently with ``makeFirstList``;
    each surname is drawn from the module-level ``lastName`` table via
    ``rowName`` and a ``uniform()`` probe.

    Returns a list of dicts with the keys ``first``, ``middle``, ``gender``,
    ``last`` and ``name``, where ``name`` is "First M. Last" (middle initial
    only).
    """
    firsts = makeFirstList(year, gender, count)
    middles = makeFirstList(year, gender, count)

    records = []
    for first, middle in zip(firsts, middles):
        last = rowName(lastName, uniform())
        records.append({
            "first": first,
            "middle": middle,
            "gender": gender,
            "last": last,
            "name": first + " " + middle[0] + ". " + last,
        })
    return records

randomFullName(1986,"M",10)

[{'first': 'Albert',
  'gender': 'M',
  'last': 'Hill',
  'middle': 'Joshua',
  'name': 'Albert J. Hill'},
 {'first': 'Christian',
  'gender': 'M',
  'last': 'Favors',
  'middle': 'Kristopher',
  'name': 'Christian K. Favors'},
 {'first': 'Nicholas',
  'gender': 'M',
  'last': 'Beecher',
  'middle': 'Kendall',
  'name': 'Nicholas K. Beecher'},
 {'first': 'Carl',
  'gender': 'M',
  'last': 'Chee',
  'middle': 'Ryan',
  'name': 'Carl R. Chee'},
 {'first': 'Philip',
  'gender': 'M',
  'last': 'Jenkins',
  'middle': 'Jason',
  'name': 'Philip J. Jenkins'},
 {'first': 'Mathew',
  'gender': 'M',
  'last': 'Lewis',
  'middle': 'Adam',
  'name': 'Mathew A. Lewis'},
 {'first': 'Shawn',
  'gender': 'M',
  'last': 'Marin',
  'middle': 'Chad',
  'name': 'Shawn C. Marin'},
 {'first': 'Joseph',
  'gender': 'M',
  'last': 'Fay',
  'middle': 'Robert',
  'name': 'Joseph R. Fay'},
 {'first': 'Jason',
  'gender': 'M',
  'last': 'Burke',
  'middle': 'Marcus',
  'name': 'Jason M. Burke'},
 {'first': 'Eric'

## year

In [19]:
def randomDate(year):
    """Return a random ``datetime.date`` that falls within ``year``.

    A day offset is drawn with ``random.randint`` (both ends inclusive) from
    0 up to the last day of the year, so leap years get 366 possible dates
    and other years 365.
    """
    start = date(year, 1, 1)
    days_in_year = 366 if calendar.isleap(start.year) else 365
    offset = random.randint(0, days_in_year - 1)
    return start + timedelta(days=offset)
    
# Example: one random date in 2015.
randomDate(2015)

datetime.date(2015, 3, 8)

In [21]:
def getYears(yearMean,std,count,gender):
    """Generate ``count`` random people with normally distributed birth years.

    Birth years are sampled from ``normal(yearMean, std)`` and truncated to
    integers with ``astype(int)``. For every distinct year, that many name
    records are produced by ``randomFullName`` and each gets:

    * ``birth``   -- ISO-formatted random date within the year (``randomDate``)
    * ``imageID`` -- an integer from ``0 .. len(result) - 1``, each used once,
      assigned in shuffled order

    Returns the list of record dicts, ordered by ascending birth year.

    NOTE(review): a sampled year with no rows in the name table will make the
    name lookup fail -- confirm callers keep ``yearMean``/``std`` inside the
    range of the loaded data.
    """
    sampled_years = normal(yearMean, std, count).astype(int)
    years, yearcounts = np.unique(sampled_years, return_counts=True)

    ret = []
    # .tolist() converts NumPy integers to plain Python ints before they are
    # passed on or stored in the records.
    for year, ycount in zip(years.tolist(), yearcounts.tolist()):
        for pep in randomFullName(year, gender, ycount):
            pep["birth"] = randomDate(year).isoformat()
            ret.append(pep)

    arr = np.arange(len(ret))
    np.random.shuffle(arr)

    # Store plain ints: the json module on Python 3 refuses NumPy integers.
    for pep, image_id in zip(ret, arr.tolist()):
        pep["imageID"] = image_id

    return ret

getYears(1970,5,5,"M")

[{'birth': '1965-06-29',
  'first': 'Robert',
  'gender': 'M',
  'imageID': 3,
  'last': 'Parker',
  'middle': 'Sidney',
  'name': 'Robert S. Parker'},
 {'birth': '1967-05-17',
  'first': 'Christopher',
  'gender': 'M',
  'imageID': 0,
  'last': 'Lester',
  'middle': 'Scottie',
  'name': 'Christopher S. Lester'},
 {'birth': '1969-12-17',
  'first': 'Clint',
  'gender': 'M',
  'imageID': 1,
  'last': 'Eckard',
  'middle': 'Hector',
  'name': 'Clint H. Eckard'},
 {'birth': '1970-05-21',
  'first': 'Richard',
  'gender': 'M',
  'imageID': 4,
  'last': 'Griffin',
  'middle': 'Herbert',
  'name': 'Richard H. Griffin'},
 {'birth': '1973-08-30',
  'first': 'Robert',
  'gender': 'M',
  'imageID': 2,
  'last': 'Smith',
  'middle': 'Larry',
  'name': 'Robert L. Smith'}]

In [None]:
## output

In [23]:
import json
import uuid

In [27]:
# 50 records with gender "M" (birth years centred on 1965, std 7) followed by
# 50 records with gender "F" (centred on 1970, std 5).
people = getYears(1965,7,50,"M")
people.extend(getYears(1970,5,50,"F"))

# Tag every record with a random UUID and index the records by that id.
lookup = {}
for pep in people:
    key = str(uuid.uuid4())
    pep["id"] = key
    lookup[key] = pep

with open('people.json', 'w') as outfile:
    json.dump(lookup, outfile)

# Preview a few records instead of dumping the whole list into the output.
people[:5]

[{'birth': '1954-09-02',
  'first': 'Anthony',
  'gender': 'M',
  'id': 'e0f73f19-f732-4b18-89b2-6a05732136db',
  'imageID': 28,
  'last': 'Capozzi',
  'middle': 'Gregory',
  'name': 'Anthony G. Capozzi'},
 {'birth': '1957-05-18',
  'first': 'Donald',
  'gender': 'M',
  'id': '73794054-4ad5-4e82-9e2a-854eee2088f0',
  'imageID': 33,
  'last': 'Donoho',
  'middle': 'Ronald',
  'name': 'Donald R. Donoho'},
 {'birth': '1957-06-21',
  'first': 'David',
  'gender': 'M',
  'id': 'e39ca359-9ac8-43a9-ba02-a87d1386df66',
  'imageID': 45,
  'last': 'Birney',
  'middle': 'John',
  'name': 'David J. Birney'},
 {'birth': '1959-02-13',
  'first': 'Patrick',
  'gender': 'M',
  'id': '9e261f4b-525d-4ff3-bf9b-dd8df656952c',
  'imageID': 24,
  'last': 'Solis',
  'middle': 'Paul',
  'name': 'Patrick P. Solis'},
 {'birth': '1959-05-02',
  'first': 'Carlos',
  'gender': 'M',
  'id': '013ac0c4-e305-4fec-a6e9-307fd8fb8b21',
  'imageID': 22,
  'last': 'Halbert',
  'middle': 'Mark',
  'name': 'Carlos M. Halbert