In [1]:
import pandas as pd
import numpy as np
from read_path_module import read_data_relative_path
from write_path_module import write_data_relative_path

## Importing 2003 - 2006 Data

In [2]:
# Step 1: load the raw 2003-2006 txt export (read with data_type='txt_tab')
df = read_data_relative_path(
    './data/CDC_Natality/2003_2006.txt',
    data_type='txt_tab',
)

# Step 2: save a csv copy for more convenient future use, then release the raw frame
write_data_relative_path(
    df,
    './data/CDC_Natality/2003_2006.csv',
    data_type='csv',
)
del df

# Step 3: load the csv copy back in under its descriptive name and preview 3 random rows
df_2003_2006 = read_data_relative_path(
    './data/CDC_Natality/2003_2006.csv',
    data_type='csv',
)
df_2003_2006.sample(3)

Unnamed: 0,Notes,Year,Year Code,Month,Month Code,State,State Code,County,County Code,Births,Average Age of Mother,Average LMP Gestational Age
20192,,2005.0,2005.0,September,9.0,Louisiana,22.0,"Rapides Parish, LA",22079.0,196.0,24.4,38.06
28212,,2006.0,2006.0,October,10.0,Florida,12.0,"Volusia County, FL",12127.0,453.0,26.92,38.68
10437,,2004.0,2004.0,May,5.0,Oregon,41.0,"Jackson County, OR",41029.0,155.0,26.59,38.95


## Importing 2007 - 2018 Data

In [3]:
# Create path list for all txt files (one file per year, 2007 through 2018 inclusive)
lst_paths = ['./data/CDC_Natality/'+str(i)+'.txt' for i in range(2007,2018+1)]

# Read every yearly txt file and stack them with a single concat.
# Concatenating once avoids re-copying the accumulated rows on every loop
# iteration, and removes the need to read 2007.txt an extra time just to
# build an empty, correctly-columned starting dataframe.
df = pd.concat([read_data_relative_path(path, data_type='txt_tab') for path in lst_paths])

# Write to csv for more convenient future use
write_data_relative_path(df, './data/CDC_Natality/2007_2018.csv', data_type='csv')
del df

# Import in data and rename
df_2007_2018 = read_data_relative_path('./data/CDC_Natality/2007_2018.csv', data_type='csv')
df_2007_2018.sample(5)

Unnamed: 0,Notes,Year,Year Code,Month,Month Code,State,State Code,County,County Code,Births,Average Age of Mother,Average LMP Gestational Age,Average OE Gestational Age
88413,,2017.0,2017.0,October,10.0,Connecticut,9.0,"Fairfield County, CT",9001.0,795,31.73,38.66,38.54
24593,,2009.0,2009.0,December,12.0,Wisconsin,55.0,"Waukesha County, WI",55133.0,278,30.15,38.72,38.67
36649,,2011.0,2011.0,June,6.0,New Jersey,34.0,"Cumberland County, NJ",34011.0,201,26.53,38.42,38.4
33750,,2011.0,2011.0,February,2.0,Illinois,17.0,"Peoria County, IL",17143.0,208,25.98,38.51,38.66
60086,,2014.0,2014.0,April,4.0,Pennsylvania,42.0,"Berks County, PA",42011.0,381,27.35,38.59,38.35


## Combine the two datasets

In [4]:
# Pull the column labels of both frames into plain lists so they can be compared
col_2003 = list(df_2003_2006.columns)
col_2007 = list(df_2007_2018.columns)

def Diff(li1, li2):
    """Return the distinct elements of ``li1`` that do not appear in ``li2``.

    The result is de-duplicated and keeps the order in which elements first
    appear in ``li1``, so the output is the same on every run (a plain
    ``list(set(li1) - set(li2))`` returns the elements in arbitrary order).

    Parameters
    ----------
    li1 : iterable of hashable
        Elements to keep unless they also occur in ``li2``.
    li2 : iterable of hashable
        Elements to exclude.

    Returns
    -------
    list
        Elements present in ``li1`` but not in ``li2``.
    """
    excluded = set(li2)
    # dict.fromkeys de-duplicates while preserving first-seen order
    return [item for item in dict.fromkeys(li1) if item not in excluded]

print('The difference between the two datasets is the column: ', Diff(col_2007, col_2003)) 
# The reverse comparison, Diff(col_2003, col_2007), showed no difference.

# Drop this column from the 2007 DF.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# lets this cell be re-run without a KeyError once the column is already gone.
df_2007_2018 = df_2007_2018.drop(columns=['Average OE Gestational Age'], errors='ignore')

# Concatenate the two dataframes
df = pd.concat([df_2003_2006, df_2007_2018])
df.sample(5)

The difference between the two datasets is the column:  ['Average OE Gestational Age']


Unnamed: 0,Notes,Year,Year Code,Month,Month Code,State,State Code,County,County Code,Births,Average Age of Mother,Average LMP Gestational Age
41816,,2012.0,2012.0,February,2.0,Arkansas,5.0,"Sebastian County, AR",5131.0,136,26.17,38.48
82785,,2017.0,2017.0,January,1.0,Texas,48.0,"Comal County, TX",48091.0,134,29.08,38.45
76789,,2016.0,2016.0,May,5.0,Colorado,8.0,"Pueblo County, CO",8101.0,166,28.16,37.92
96524,,2018.0,2018.0,September,9.0,West Virginia,54.0,"Kanawha County, WV",54039.0,152,27.1,37.51
27674,,2010.0,2010.0,May,5.0,Michigan,26.0,"Berrien County, MI",26021.0,158,26.49,38.84


In [7]:
# Some rows in the combined data are not regular observations. They are either:
#     - footnotes about the dataset (i.e. everything is null besides the Notes column)
#     - totals of each year, month, and state grouping (Notes == 'Total')
# Only rows whose Notes value is null are kept below.
# Before filtering, verify that these three kinds of row account for every row,
# so that nothing unexpected is removed.

# number of rows of full dataset
num_total = df.shape[0]

# number of rows where the last 11 columns are all null (the pesky footnote rows)
# NOTE(review): assumes the 11 trailing columns are every column other than Notes -- confirm if the schema changes
num_all_null = int(df.iloc[:, -11:].isnull().all(axis=1).sum())

# number of rows where note is null (these are the rows that are kept)
num_note_null = int(df['Notes'].isna().sum())

# number of rows where note is 'Total' (aggregate rows, removed below)
num_note_total = int((df['Notes'] == 'Total').sum())

# sum up these row counts and see if they match
num_accounted = num_all_null + num_note_null + num_note_total
print(num_total, num_accounted, num_all_null, num_note_null, num_note_total)

# Enforce the check instead of relying on someone reading the printed numbers
assert num_total == num_accounted, (
    'Row categories do not add up to the dataset size: '
    + str(num_accounted) + ' accounted for out of ' + str(num_total)
)

# keep only the rows with no note, which drops both the footnote rows and the 'Total' rows
df = df[df['Notes'].isna()]

127901 127901 0 117888 10013


In [8]:
# Save the cleaned, combined 2003-2018 frame as csv for more convenient future use,
# then remove the name `df` from the notebook namespace
write_data_relative_path(
    df,
    './data/CDC_Natality/2003_2018.csv',
    data_type='csv',
)
del df