In [1]:
import pandas as pd
import numpy as np
from read_path_module import read_data_relative_path
from write_path_module import write_data_relative_path

## Importing 2003 - 2006 Data

In [2]:
# Locations of the raw tab-delimited export and of its csv copy
path_txt_2003_2006 = './data/CDC_Natality/2003_2006.txt'
path_csv_2003_2006 = './data/CDC_Natality/2003_2006.csv'

# Load the txt export and save it straight away as csv for more convenient future use
write_data_relative_path(
    read_data_relative_path(path_txt_2003_2006, data_type='txt_tab'),
    path_csv_2003_2006,
    data_type='csv',
)

# Reload from the csv under a descriptive name and preview a few random rows
df_2003_2006 = read_data_relative_path(path_csv_2003_2006, data_type='csv')
df_2003_2006.sample(3)

Unnamed: 0,Notes,Year,Year Code,Month,Month Code,State,State Code,County,County Code,Births,Average Age of Mother,Average LMP Gestational Age
10895,,2004.0,2004.0,June,6.0,Minnesota,27.0,"Olmsted County, MN",27109.0,176.0,27.83,39.06
13233,,2004.0,2004.0,October,10.0,Florida,12.0,"Seminole County, FL",12117.0,386.0,28.18,38.86
29486,,2006.0,2006.0,December,12.0,Idaho,16.0,"Kootenai County, ID",16055.0,136.0,27.01,38.68


## Importing 2007 - 2018 Data

In [3]:
# Create path list for all txt files (one file per year, 2007 through 2018 inclusive)
lst_paths = ['./data/CDC_Natality/'+str(i)+'.txt' for i in range(2007,2018+1)]

# Read each yearly file exactly once and stack them with a single concat.
# Growing a dataframe with concat inside a loop re-copies every accumulated row on each
# pass, and no empty "template" frame is needed: concat takes the column layout from the
# first frame in the list (the 2007 file).
df = pd.concat([read_data_relative_path(path, data_type='txt_tab') for path in lst_paths])

# Write to csv for more convenient future use
write_data_relative_path(df, './data/CDC_Natality/2007_2018.csv', data_type='csv')
del df

# Import in data and rename
df_2007_2018 = read_data_relative_path('./data/CDC_Natality/2007_2018.csv', data_type='csv')
df_2007_2018.sample(5)

Unnamed: 0,Notes,Year,Year Code,Month,Month Code,State,State Code,County,County Code,Births,Average Age of Mother,Average LMP Gestational Age,Average OE Gestational Age
35732,,2011.0,2011.0,May,5.0,Florida,12.0,"St. Johns County, FL",12109.0,157,28.75,39.2,39.04
24548,,2009.0,2009.0,December,12.0,Virginia,51.0,"Loudoun County, VA",51107.0,438,30.82,38.57,38.41
85377,Total,2017.0,2017.0,May,5.0,North Dakota,38.0,,,895,28.36,38.78,38.6
56356,,2013.0,2013.0,November,11.0,Kentucky,21.0,"Jefferson County, KY",21111.0,817,27.62,38.54,38.38
65379,,2014.0,2014.0,December,12.0,New Hampshire,33.0,"Merrimack County, NH",33013.0,101,29.67,38.81,38.72


## Combine the two datasets

In [4]:
# Gather the column labels of both dataframes so they can be compared below
col_2003 = list(df_2003_2006.columns)
col_2007 = list(df_2007_2018.columns)

def Diff(li1, li2):
    """Return the distinct items of ``li1`` that do not occur in ``li2``.

    Items come back in order of first appearance in ``li1``, so the result (and anything
    printed from it) is the same on every run; iterating a plain ``set`` of strings can
    yield a different order in each interpreter session.

    Parameters
    ----------
    li1, li2 : iterables of hashable items

    Returns
    -------
    list
        Each item of ``li1`` that is absent from ``li2``, without duplicates.
    """
    excluded = set(li2)
    # dict.fromkeys removes duplicates while keeping insertion order
    return list(dict.fromkeys(item for item in li1 if item not in excluded))

print('The difference between the two datasets is the column: ', Diff(col_2007, col_2003))
# The reverse comparison, Diff(col_2003, col_2007), shows no difference

# Drop this column from the 2007 DF.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: an in-place drop raises KeyError on a second run because the column is
# already gone.
df_2007_2018 = df_2007_2018.drop(columns=['Average OE Gestational Age'], errors='ignore')

# Concatenate the two dataframes (2003-2006 rows followed by 2007-2018 rows)
df = pd.concat([df_2003_2006, df_2007_2018])
df.sample(5)

The difference between the two datasets is the column:  ['Average OE Gestational Age']


Unnamed: 0,Notes,Year,Year Code,Month,Month Code,State,State Code,County,County Code,Births,Average Age of Mother,Average LMP Gestational Age
42047,,2012.0,2012.0,February,2.0,Maryland,24.0,"Cecil County, MD",24015.0,Missing County,Missing County,Missing County
26327,,2006.0,2006.0,July,7.0,Florida,12.0,"Miami-Dade County, FL",12086.0,2799,28.06,38.37
78941,Total,2016.0,2016.0,August,8.0,Illinois,17.0,,,14068,29.19,38.66
5434,,2007.0,2007.0,August,8.0,Wisconsin,55.0,"Winnebago County, WI",55139.0,181,27.32,38.84
13056,,2008.0,2008.0,August,8.0,Colorado,8.0,"Denver County, CO",8031.0,973,28.22,38.74


In [5]:
# There are some rows that:
#     are footnotes about the dataset (i.e. everything is null besides the note column)
#     are totals of each year, month, and state grouping
# Both kinds are removed here: only rows whose 'Notes' value is null are kept.
# Before that we check that every row belongs to exactly one of these three groups.

# number of rows of full dataset
num_total = df.shape[0]

# number of rows where everything is null besides note column (the pesky footnote rows)
# 'Notes' is excluded by name so the check does not depend on the number of columns
num_all_null = int(df.drop(columns=['Notes']).isnull().all(axis=1).sum())

# number of rows where note is null (the per-county rows that are kept)
num_note_null = int(df['Notes'].isna().sum())

# number of rows where note is 'Total' (the aggregate rows)
num_note_total = int((df['Notes'] == 'Total').sum())

# sum up these row counts and see if they match
print(num_total, num_all_null + num_note_null + num_note_total, num_all_null, num_note_null, num_note_total)

# Enforce the check instead of relying on someone reading the printed numbers
assert num_total == num_all_null + num_note_null + num_note_total, (
    "row groups (footnotes / null notes / 'Total') do not add up to the full dataset"
)

# since they do, keep only the rows without a note (drops footnote and 'Total' rows);
# .copy() gives an independent frame so later assignments do not write into a slice of
# the concatenated dataframe
df = df[df['Notes'].isna()].copy()

128638 128638 737 117888 10013


## - Change the 'Missing County' label to NA

In [6]:
# Replace every cell that is exactly the string 'Missing County' with NaN, in one pass.
# DataFrame.replace matches whole cell values, so no column-by-column `== 'Missing County'`
# comparison is run against the numeric columns (the saved output of the original loop
# shows a pandas warning coming from that comparison), and because it returns a new
# dataframe nothing is assigned into a filtered slice of an earlier frame.
df = df.replace('Missing County', np.nan)

  res_values = method(rvalues)


## - Convert appropriate columns to int

In [7]:
# Year and code columns that are displayed as floats above (e.g. 2004.0) and should be integers
col_lst = ['Year', 'Year Code', 'Month Code', 'State Code', 'County Code']

# Cast all of them with a single astype call using a column -> dtype mapping
df = df.astype({col: int for col in col_lst})

## - Write result to csv

In [8]:
# Save the cleaned, combined 2003-2018 table as csv for more convenient future use
path_csv_2003_2018 = './data/CDC_Natality/2003_2018.csv'
write_data_relative_path(df, path_csv_2003_2018, data_type='csv')

# The dataframe is no longer needed once it has been written out
del df