# This notebook creates real birthday distribution from https://github.com/fivethirtyeight/data/tree/master/births


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random 

In [2]:
# creating dataframe based on the .txt file

column_name = ['year', 'month', 'date_of_month', 'day_of_week,births', 'births']
df = pd.read_csv('US_births_1994-2003_CDC_NCHS.txt', header=None, sep = ',', skiprows=1, index_col=None)
df.columns = column_name
df.head()

Unnamed: 0,year,month,date_of_month,"day_of_week,births",births
0,1994,1,1,6,8096
1,1994,1,2,7,7772
2,1994,1,3,1,10142
3,1994,1,4,2,11248
4,1994,1,5,3,11053


In [4]:
# group the 'month', 'date_of_month' columns to produce average number of people born on specific day

temp = df.copy()
month_day_group = temp.groupby(['month', 'date_of_month'], as_index=False).mean()
month_day_group.head() #, len(month_day_group)

Unnamed: 0,month,date_of_month,year,"day_of_week,births",births
0,1,1,1998.5,3.8,8018.5
1,1,2,1998.5,4.1,9184.3
2,1,3,1998.5,3.7,10547.9
3,1,4,1998.5,4.0,10663.0
4,1,5,1998.5,4.3,10493.6


In [5]:
# computing the distribution of the birthday data

sum_births = month_day_group['births'].sum()
month_day_group["prob"] = month_day_group['births'] / sum_births
month_day_group["cdf"] = month_day_group["prob"].cumsum()
month_day_group

Unnamed: 0,month,date_of_month,year,"day_of_week,births",births,prob,cdf
0,1,1,1998.5,3.8,8018.5,0.002014,0.002014
1,1,2,1998.5,4.1,9184.3,0.002307,0.004321
2,1,3,1998.5,3.7,10547.9,0.002649,0.006971
3,1,4,1998.5,4.0,10663.0,0.002678,0.009649
4,1,5,1998.5,4.3,10493.6,0.002636,0.012285
...,...,...,...,...,...,...,...
361,12,27,1998.5,4.2,11197.3,0.002813,0.988828
362,12,28,1998.5,4.5,11108.1,0.002790,0.991618
363,12,29,1998.5,4.1,11437.5,0.002873,0.994491
364,12,30,1998.5,3.7,11600.8,0.002914,0.997405


In [9]:
# defining the function in order to create a random day [1, 366] from the real distribution

def birth_cdf_inverse():
    #random.seed(seed)
    u = np.random.uniform(0, 1)
    for day, cdf in enumerate(month_day_group['cdf']):
        if u <= cdf:
            return (day)