## Overview
This notebook works with a CDC/NCHS dataset of daily U.S. births from 1994 to 2003.  The data contains the following fields:

 - `year` - Year of birth (1994 to 2003)
 - `month` - Month of birth (1-January to 12-December)
 - `date_of_month` - Date number of the month of birth (1 to 31)
 - `day_of_week` - Day of week of birth (1-Monday to 7-Sunday)
 - `births` - Number of births on that day

In [1]:
# open and read the file
# the with-block closes the file handle as soon as its text has been read
with open('US_births_1994-2003_CDC_NCHS.csv', 'r') as f:
    data = f.read()

In [2]:
# split data by new line and print first 10 entries
# (entry 0 is whatever the file's first line is; every entry is still a raw string here)
split_data = data.split('\n')
print(split_data[0:10])

['year,month,date_of_month,day_of_week,births', '1994,1,1,6,8096', '1994,1,2,7,7772', '1994,1,3,1,10142', '1994,1,4,2,11248', '1994,1,5,3,11053', '1994,1,6,4,11406', '1994,1,7,5,11251', '1994,1,8,6,8653', '1994,1,9,7,7910']


In [3]:
# create a function to convert the dataset into a list of lists
# want integers, not strings, in the lists
def read_csv(input_filename):
    """Read a comma-separated file of integers into a list of lists.

    The first line of the file is treated as a header and dropped.  Every
    remaining non-blank line is split on commas and each field is converted
    with ``int``.

    Args:
        input_filename: Path of the file to read.

    Returns:
        A list with one entry per data line; each entry is a list of ints.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field on a non-blank line is not a valid integer.
    """
    # the context manager guarantees the handle is closed, even on error
    with open(input_filename, 'r') as f:
        lines = f.read().split('\n')

    final_list = []
    for line in lines[1:]:
        # a trailing newline leaves an empty last entry; int('') would raise
        if not line.strip():
            continue
        final_list.append([int(field) for field in line.split(',')])
    return final_list

In [4]:
# use function to convert the file
# cdc_list ends up as the list of integer rows returned by read_csv
cdc_list = read_csv('US_births_1994-2003_CDC_NCHS.csv')

In [5]:
# confirm list of lists, all integer values, no header row
# (only the first 10 rows are printed)
print(cdc_list[0:10])

[[1994, 1, 1, 6, 8096], [1994, 1, 2, 7, 7772], [1994, 1, 3, 1, 10142], [1994, 1, 4, 2, 11248], [1994, 1, 5, 3, 11053], [1994, 1, 6, 4, 11406], [1994, 1, 7, 5, 11251], [1994, 1, 8, 6, 8653], [1994, 1, 9, 7, 7910], [1994, 1, 10, 1, 10498]]


In [6]:
# create a function to calculate total number of births by month
def month_births(input_list):
    """Total the births (index 4) of every row, grouped by month (index 1).

    Returns a dict mapping each month value found in *input_list* to the sum
    of the births of the rows carrying that month.
    """
    totals = {}
    for row in input_list:
        month_key, day_births = row[1], row[4]
        try:
            totals[month_key] += day_births
        except KeyError:
            # first row seen for this month starts its running total
            totals[month_key] = day_births
    return totals

In [7]:
# apply the above function to the dataset
# result is a dict keyed by month number with the summed births as values
cdc_month_births = month_births(cdc_list)
print(cdc_month_births)

{1: 3232517, 2: 3018140, 3: 3322069, 4: 3185314, 5: 3350907, 6: 3296530, 7: 3498783, 8: 3525858, 9: 3439698, 10: 3378814, 11: 3171647, 12: 3301860}


In [8]:
# create a function to calculate total number of births by day of the week
def dow_births(input_list):
    """Total the births (index 4) of every row, grouped by day of week (index 3).

    Returns a dict mapping each day-of-week value found in *input_list* to
    the sum of the births of the rows carrying that day.
    """
    totals = {}
    for row in input_list:
        weekday = row[3]
        day_births = row[4]
        if weekday not in totals:
            # first row seen for this weekday starts its running total
            totals[weekday] = day_births
            continue
        totals[weekday] += day_births
    return totals

In [9]:
# apply the above function to the dataset
# result is a dict keyed by day-of-week number with the summed births as values
cdc_day_births = dow_births(cdc_list)
print(cdc_day_births)

{1: 5789166, 2: 6446196, 3: 6322855, 4: 6288429, 5: 6233657, 6: 4562111, 7: 4079723}


In [10]:
# create a more general function to calculate totals for any column
# data is a list of lists
# column is the column number
# (0=year, 1=month, 2=date_of_month, 3=day_of_week)
def calc_counts(data, column):
    """Sum the births (index 4 of each row) grouped by the value at *column*.

    Returns a dict mapping each distinct value found at ``row[column]`` to the
    total of ``row[4]`` over all rows sharing that value.
    """
    totals = {}
    for row in data:
        group, day_births = row[column], row[4]
        # a group not seen before starts from zero
        totals[group] = totals.get(group, 0) + day_births
    return totals

In [11]:
# use the general function to create counts for each column
# one dictionary per grouping column: 0=year, 1=month, 2=date of month, 3=day of week
cdc_year_births = calc_counts(cdc_list, 0)
cdc_month_births = calc_counts(cdc_list, 1)
cdc_dom_births = calc_counts(cdc_list, 2)
cdc_dow_births = calc_counts(cdc_list, 3)

In [13]:
# review the dictionaries created
# (a bare tuple expression: the notebook displays it as the cell's result)
cdc_year_births, cdc_month_births, cdc_dom_births, cdc_dow_births

({1994: 3952767,
  1995: 3899589,
  1996: 3891494,
  1997: 3880894,
  1998: 3941553,
  1999: 3959417,
  2000: 4058814,
  2001: 4025933,
  2002: 4021726,
  2003: 4089950},
 {1: 3232517,
  2: 3018140,
  3: 3322069,
  4: 3185314,
  5: 3350907,
  6: 3296530,
  7: 3498783,
  8: 3525858,
  9: 3439698,
  10: 3378814,
  11: 3171647,
  12: 3301860},
 {1: 1276557,
  2: 1288739,
  3: 1304499,
  4: 1288154,
  5: 1299953,
  6: 1304474,
  7: 1310459,
  8: 1312297,
  9: 1303292,
  10: 1320764,
  11: 1314361,
  12: 1318437,
  13: 1277684,
  14: 1320153,
  15: 1319171,
  16: 1315192,
  17: 1324953,
  18: 1326855,
  19: 1318727,
  20: 1324821,
  21: 1322897,
  22: 1317381,
  23: 1293290,
  24: 1288083,
  25: 1272116,
  26: 1284796,
  27: 1294395,
  28: 1307685,
  29: 1223161,
  30: 1202095,
  31: 746696},
 {1: 5789166,
  2: 6446196,
  3: 6322855,
  4: 6288429,
  5: 6233657,
  6: 4562111,
  7: 4079723})

In [16]:
# create a function to calculate the min and max values in a dictionary
def min_max(mydict):
    """Return ``[smallest, largest]`` of the values stored in *mydict*.

    Args:
        mydict: Dictionary whose values can be compared with each other.

    Returns:
        A two-item list ``[min_value, max_value]``.  An empty dictionary
        yields ``[0, 0]``.
    """
    if not mydict:
        # min()/max() raise on an empty sequence; keep the [0, 0] result
        # callers have always received for an empty dictionary
        return [0, 0]
    values = mydict.values()
    # the builtins need no seed value, so all-negative data is handled
    # correctly (seeding the maximum with 0 would report 0 instead)
    return [min(values), max(values)]

In [17]:
# apply min_max function to number of births by day of week
# min_max returns a two-item list: [min, max]
min_max_dow = min_max(cdc_dow_births)
min_max_dow

[4079723, 6446196]

In [18]:
# determine how the number of births on Saturdays changed year over year
def births_by_year_saturdays(data):
    """Sum births (index 4) per year (index 0) over rows whose day of week is 6.

    Rows whose value at index 3 is not 6 are ignored.  Returns a dict mapping
    each year to the total births of its matching rows.
    """
    per_year = {}
    for row in data:
        # only rows with day_of_week == 6 (Saturday) contribute
        if row[3] != 6:
            continue
        year, day_births = row[0], row[4]
        per_year[year] = per_year[year] + day_births if year in per_year else day_births
    return per_year

In [19]:
# total Saturday births for each year in the dataset
saturday_births = births_by_year_saturdays(cdc_list)
saturday_births

{1994: 474732,
 1995: 459580,
 1996: 456261,
 1997: 450840,
 1998: 453776,
 1999: 449985,
 2000: 469794,
 2001: 453928,
 2002: 445770,
 2003: 447445}