# Project: Explore U.S. Births

> The [dataset](https://raw.githubusercontent.com/fivethirtyeight/data/master/births/US_births_1994-2003_CDC_NCHS.csv) contains the following columns:

> * year: Year (1994 to 2003).

> * month: Month (1 to 12).

> * date_of_month: Day number of the month (1 to 31).

> * day_of_week: Day of week (1 to 7).

> * births: Number of births that day.


## 1. Reading csv

In [2]:
text = open("US_births_1994-2003_CDC_NCHS.csv",'r').read()
text_list = text.split("\n")
text_list[0:10]

['year,month,date_of_month,day_of_week,births',
 '1994,1,1,6,8096',
 '1994,1,2,7,7772',
 '1994,1,3,1,10142',
 '1994,1,4,2,11248',
 '1994,1,5,3,11053',
 '1994,1,6,4,11406',
 '1994,1,7,5,11251',
 '1994,1,8,6,8653',
 '1994,1,9,7,7910']

## 2. Converting Data Into A List Of Lists

In [3]:
def read_csv(name):
    f = open(name).read()
    l = f.split("\n")
    string_list = l[1:len(l)]
    final_list = []
    for st in string_list:
        int_fields = []
        string_fields = st.split(",")
        for sf in string_fields:
            int_fields.append(int(sf))
        final_list.append(int_fields)
    return final_list

In [4]:
cdc_list = read_csv("US_births_1994-2003_CDC_NCHS.csv")
cdc_list[0:10]

[[1994, 1, 1, 6, 8096],
 [1994, 1, 2, 7, 7772],
 [1994, 1, 3, 1, 10142],
 [1994, 1, 4, 2, 11248],
 [1994, 1, 5, 3, 11053],
 [1994, 1, 6, 4, 11406],
 [1994, 1, 7, 5, 11251],
 [1994, 1, 8, 6, 8653],
 [1994, 1, 9, 7, 7910],
 [1994, 1, 10, 1, 10498]]

## 3. Number Of Births Each Month

In [5]:
def month_births(lol):
    births_per_month = {}
    for ls in lol:
        month = str(ls[1])
        births = ls[4]
        if month not in births_per_month:
            births_per_month[month] = births
        else:
            births_per_month[month] = births_per_month[month] + births
    return births_per_month

In [6]:
cdc_month_births = month_births(cdc_list)
cdc_month_births

{'1': 3232517,
 '10': 3378814,
 '11': 3171647,
 '12': 3301860,
 '2': 3018140,
 '3': 3322069,
 '4': 3185314,
 '5': 3350907,
 '6': 3296530,
 '7': 3498783,
 '8': 3525858,
 '9': 3439698}

## 4. Number Of Births Each Day Of Week

In [7]:
def dow_births(list_of_lists):
    final_dict = {}
    for each_list in list_of_lists:
        dow = each_list[3]
        births = each_list[4]
        if dow not in final_dict:
            final_dict[dow] = births
        else:
            final_dict[dow] = final_dict[dow] + births
    return final_dict

In [8]:
cdc_day_births = dow_births(cdc_list)
cdc_day_births

{1: 5789166,
 2: 6446196,
 3: 6322855,
 4: 6288429,
 5: 6233657,
 6: 4562111,
 7: 4079723}

## 5. Function that works for any column

In [9]:
def calc_counts(data,column):
    final_dict = {}
    for each_list in data:
        some_param = each_list[column]
        births = each_list[4]
        if some_param not in final_dict:
            final_dict[some_param] = births
        else:
            final_dict[some_param] = final_dict[some_param] + births
    return final_dict

In [10]:
cdc_year_births = calc_counts(cdc_list,0)
cdc_month_births = calc_counts(cdc_list,1)
cdc_dom_births = calc_counts(cdc_list,2)
cdc_dow_births = calc_counts(cdc_list,3)

In [11]:
cdc_year_births

{1994: 3952767,
 1995: 3899589,
 1996: 3891494,
 1997: 3880894,
 1998: 3941553,
 1999: 3959417,
 2000: 4058814,
 2001: 4025933,
 2002: 4021726,
 2003: 4089950}

## 6. Function that calculates the min and max values

In [12]:
def count_min_max(data):
    key_list = []
    value_list = []
    new_dict = {}
    for key in data:
        key_list.append(key)
        value_list.append(data[key])
    max_value = max(value_list)
    min_value = min(value_list)
    for key in data:
        if data[key] == max_value:
            new_dict[key] = max_value
        if data[key] == min_value:
            new_dict[key] = min_value
    return new_dict  

In [13]:
count_min_max(cdc_year_births)

{1997: 3880894, 2003: 4089950}

## 7. Function that extracts the same values across years

In [14]:
def same_values_across_years(data,day):
    some_day_births_across_years = dict()
    for row in data:
        row_day = row[3]
        row_year = row[0]
        row_births = row[4]
        if row_day == int(day):
            if row_year not in some_day_births_across_years:
                some_day_births_across_years[row_year] = row_births
            else:
                some_day_births_across_years[row_year] = some_day_births_across_years[row_year] + row_births
    return some_day_births_across_years
        
saturday_births_across_years = same_values_across_years(cdc_list,7)
saturday_births_across_years

{1994: 428752,
 1995: 425790,
 1996: 413336,
 1997: 404478,
 1998: 407129,
 1999: 401991,
 2000: 416454,
 2001: 397119,
 2002: 391375,
 2003: 393299}

## 8. Combining CDC data with [SSA data](https://github.com/fivethirtyeight/data/tree/master/births)

In [15]:
ssa_list = read_csv("US_births_2000-2014_SSA.csv")
ssa_list[0:10]

[[2000, 1, 1, 6, 9083],
 [2000, 1, 2, 7, 8006],
 [2000, 1, 3, 1, 11363],
 [2000, 1, 4, 2, 13032],
 [2000, 1, 5, 3, 12558],
 [2000, 1, 6, 4, 12466],
 [2000, 1, 7, 5, 12516],
 [2000, 1, 8, 6, 8934],
 [2000, 1, 9, 7, 7949],
 [2000, 1, 10, 1, 11668]]

## 9. Creating key dimension containing "year-month-dom-dow"

In [16]:
def create_key(data):
    new_list_of_lists = []
    for each_list in data:
        new_list = []
        key = each_list[0:4]
        value = each_list[4]
        new_list.append(key)
        new_list.append(value)
        new_list_of_lists.append(new_list)
    return new_list_of_lists

In [17]:
create_key(ssa_list)[0:5]

[[[2000, 1, 1, 6], 9083],
 [[2000, 1, 2, 7], 8006],
 [[2000, 1, 3, 1], 11363],
 [[2000, 1, 4, 2], 13032],
 [[2000, 1, 5, 3], 12558]]

## 10. Combining two datasets and dealing with overlapping time periods

In [18]:
def combine(data1, data2):
    data1 = create_key(data1)
    data2 = create_key(data2)
    total_list = []
    for row_two in data2:
        key_2 = row_two[0]
        value_2 = row_two[1]
        for row_one in data1:
            key_1 = row_one[0]
            value_1 = row_one[1]
            if key_1 != key_2:
                total_list.append(row_one)
                total_list.append(row_two)
            else:
                new_val = value_1 + value_2
                new_lst = [key_1,new_val]
                total_list.append(new_lst)
    return total_list
mixed_data = combine(cdc_list,ssa_list)

In [None]:
mixed_data[0:5]

[[[1994, 1, 1, 6], 8096],
 [[2000, 1, 1, 6], 9083],
 [[1994, 1, 2, 7], 7772],
 [[2000, 1, 1, 6], 9083],
 [[1994, 1, 3, 1], 10142]]

## 11. Cleaning 'key' and extracting data to normal List of Lists

In [None]:
def clean_key(data):
    total_data = []
    for row in data:
        new_row = []
        key = row[0]
        value = row[1]
        year = key[0]
        month = key[1]
        dom = key[2]
        dow = key[3]
        new_row.append(year)
        new_row.append(month)
        new_row.append(dom)
        new_row.append(dow)
        new_row.append(value)
        total_data.append(new_row)
    return total_data
cleaned_mixed_data = clean_key(mixed_data)
cleaned_mixed_data[0:5]