In [1]:
import pandas as pd
import numpy as np
from read_path_module import read_data_relative_path
from write_path_module import write_data_relative_path

## Importing 2003 - 2006 Data

In [2]:
# One-time conversion: load the tab-delimited 2003-2006 export and save it
# as csv, the more convenient format for future use.
raw_2003_2006 = read_data_relative_path('./data/CDC_Natality/2003_2006.txt', data_type='txt_tab')
write_data_relative_path(raw_2003_2006, './data/CDC_Natality/2003_2006.csv', data_type='csv')
del raw_2003_2006  # the txt-derived copy is not needed once the csv exists

# Read the csv back under a descriptive name and preview three random rows
df_2003_2006 = read_data_relative_path('./data/CDC_Natality/2003_2006.csv', data_type='csv')
df_2003_2006.sample(3)

Unnamed: 0,Notes,Year,Year Code,Month,Month Code,State,State Code,County,County Code,Births,Average Age of Mother,Average LMP Gestational Age
28587,,2006.0,2006.0,October,10.0,South Carolina,45.0,"Horry County, SC",45051.0,311.0,26.35,38.08
14385,,2004.0,2004.0,December,12.0,California,6.0,"Alameda County, CA",6001.0,1693.0,29.31,38.94
23935,,2006.0,2006.0,March,3.0,Louisiana,22.0,"Orleans Parish, LA",22071.0,159.0,27.35,38.2


## Importing 2007 - 2018 Data

In [3]:
# Build the list of yearly txt exports, 2007 through 2018 inclusive
lst_paths = ['./data/CDC_Natality/'+str(year)+'.txt' for year in range(2007,2018+1)]

# Read each yearly file exactly once and stack them with a single concat.
# Growing a frame with pd.concat inside a loop re-copies every accumulated
# row on each pass, and an empty "template" frame is not needed: concat takes
# its columns from the frames themselves.
yearly_frames = [read_data_relative_path(path, data_type='txt_tab') for path in lst_paths]
df = pd.concat(yearly_frames)
del yearly_frames

# Write to csv for more convenient future use
write_data_relative_path(df, './data/CDC_Natality/2007_2018.csv', data_type='csv')
del df

# Read the csv back under a descriptive name and preview five random rows
df_2007_2018 = read_data_relative_path('./data/CDC_Natality/2007_2018.csv', data_type='csv')
df_2007_2018.sample(5)

Unnamed: 0,Notes,Year,Year Code,Month,Month Code,State,State Code,County,County Code,Births,Average Age of Mother,Average LMP Gestational Age,Average OE Gestational Age
83646,,2017.0,2017.0,March,3.0,Colorado,8.0,"Jefferson County, CO",8059.0,527,30.47,38.78,38.84
14248,,2008.0,2008.0,September,9.0,Texas,48.0,"Guadalupe County, TX",48187.0,Missing County,Missing County,Missing County,Missing County
60990,,2014.0,2014.0,June,6.0,California,6.0,"Madera County, CA",6039.0,195,27.54,38.86,38.79
15737,,2008.0,2008.0,December,12.0,California,6.0,"Butte County, CA",6007.0,248,26.30,38.88,38.59
46505,Total,2012.0,2012.0,August,8.0,Virginia,51.0,,,9190,28.53,38.78,38.54


## Combine the two datasets

In [4]:
# Gather each frame's column labels as plain lists so they can be compared
col_2003 = list(df_2003_2006.columns)
col_2007 = list(df_2007_2018.columns)

def Diff(li1, li2):
    """Return the items of ``li1`` that do not occur in ``li2``.

    Each distinct item is reported once, in the order of its first
    occurrence in ``li1``.  Building the result by walking ``li1`` (rather
    than iterating a set difference) makes the output order the same on
    every run.

    Parameters
    ----------
    li1 : iterable of hashable
        Items to keep unless they also appear in ``li2``.
    li2 : iterable of hashable
        Items to exclude.

    Returns
    -------
    list
        Distinct items found only in ``li1``; empty if there are none.
    """
    excluded = set(li2)
    seen = set()
    only_in_first = []
    for item in li1:
        # skip anything excluded or already reported
        if item in excluded or item in seen:
            continue
        seen.add(item)
        only_in_first.append(item)
    return only_in_first

# Report the columns found in the 2007-2018 data but not in 2003-2006.
# (The reverse comparison, Diff(col_2003, col_2007), showed no difference.)
extra_cols = Diff(col_2007, col_2003)
print('The difference between the two datasets is the column: ', extra_cols)

# Remove that extra column so both frames share the same set of columns
df_2007_2018 = df_2007_2018.drop(columns=['Average OE Gestational Age'])

# Stack the two frames row-wise and preview five random rows.
# NOTE: concat keeps each frame's own row labels, so labels can repeat in `df`.
df = pd.concat([df_2003_2006, df_2007_2018])
df.sample(5)

The difference between the two datasets is the column:  ['Average OE Gestational Age']


Unnamed: 0,Notes,Year,Year Code,Month,Month Code,State,State Code,County,County Code,Births,Average Age of Mother,Average LMP Gestational Age
23996,,2009.0,2009.0,December,12.0,Colorado,8.0,"Denver County, CO",8031.0,851,28.07,38.88
19351,,2005.0,2005.0,August,8.0,Alabama,1.0,"Madison County, AL",1089.0,333,26.88,38.02
18187,,2009.0,2009.0,March,3.0,New Mexico,35.0,"Unidentified Counties, NM",35999.0,1098,25.6,38.59
34345,,2011.0,2011.0,March,3.0,Florida,12.0,"Alachua County, FL",12001.0,220,28.19,38.94
95269,,2018.0,2018.0,August,8.0,Colorado,8.0,"Pueblo County, CO",8101.0,168,27.32,38.23


In [5]:
# Some rows are not county-level observations:
#   - footnotes about the dataset (every column except 'Notes' is null)
#   - 'Total' rows that sum up each year, month, and state grouping
# Both kinds are removed here. Before doing so, confirm that every row falls
# into one of the three groups counted below, so nothing else is dropped.

# number of rows of the full dataset
num_total = df.shape[0]

# footnote rows: everything other than the 'Notes' column is null
# (the other columns are selected by name instead of by position)
is_footnote = df.drop(columns=['Notes']).isnull().all(axis=1)
num_all_null = int(is_footnote.sum())

# rows with no note at all: these are the county-level rows that are kept
num_note_null = int(df['Notes'].isna().sum())

# rows whose note is 'Total': recognised aggregate rows, removed below
num_note_total = int((df['Notes'] == 'Total').sum())

# the three groups must account for every row
print(num_total, num_all_null + num_note_null + num_note_total, num_all_null, num_note_null, num_note_total)
assert num_total == num_all_null + num_note_null + num_note_total, \
    'Some rows are neither footnotes, Total rows, nor un-noted rows'

# Keep only the rows without a note; this drops the footnote and Total rows.
# .copy() gives later cells an independent frame to modify.
df = df[df['Notes'].isna()].copy()

128638 128638 737 117888 10013


## - Change the 'Missing County' label to NA

In [6]:
# Replace every 'Missing County' placeholder with NaN in one pass over the
# whole frame. This avoids comparing numeric columns against a string one
# column at a time and writing the result back into a filtered frame.
df = df.replace('Missing County', np.nan)

  res_values = method(rvalues)


## - Convert appropriate columns to int

In [7]:
# Year and code columns that should be stored as integers
col_lst = ['Year', 'Year Code', 'Month Code', 'State Code', 'County Code']

# Cast all of them with a single astype call
df = df.astype({col: int for col in col_lst})

### - Rename columns

In [8]:
# Label the year/month columns as birth values
birth_column_names = {
    "Year": "Birth Year",
    "Month": "Birth Month",
    "Month Code": "Birth Month Code"}
df = df.rename(columns=birth_column_names)

### - Remove NA rows where appropriate

In [9]:
# Count the rows that have no birth count before filtering
births_missing = df['Births'].isna()
print('There were ',births_missing.sum(), ' Births NA rows.')

# Keep only rows where birth data is present. notna() builds the keep-mask
# directly; negating a boolean mask with unary minus is not the supported
# way to invert it (the logical-not operator is `~`).
df = df[df['Births'].notna()]

print( 'There are now ', df['Births'].isna().sum(), ' Births NA rows.')

There were  4824  Births NA rows.
There are now  0  Births NA rows.


### - Create pandas datetime column

In [10]:
# Only the birth year and month are available, so the 15th of the month is
# used as the day when building the date.
birth_year_text = df['Birth Year'].astype(str)
birth_month_text = df['Birth Month Code'].astype(str)
df['Birth Date'] = pd.to_datetime(
    birth_year_text + '-' + birth_month_text + '-15',
    format='%Y-%m-%d',
)
df.head(2)

Unnamed: 0,Notes,Birth Year,Year Code,Birth Month,Birth Month Code,State,State Code,County,County Code,Births,Average Age of Mother,Average LMP Gestational Age,Birth Date
0,,2003,2003,January,1,Alabama,1,"Baldwin County, AL",1003,131,26.77,37.97,2003-01-15
1,,2003,2003,January,1,Alabama,1,"Calhoun County, AL",1015,124,25.32,38.63,2003-01-15


### - Backtrack to calculate conception date
- At a high level
    - A menstrual cycle starts
    - Two weeks later ovulation occurs
    - Sparks fly and conception occurs
    - ~38 weeks later, the parents' sleep cycles are forever changed...
    
    
- The dataset gives the average LMP gestational age: the length of the pregnancy in weeks, counted from the start of the last menstrual period (LMP)


- To calculate conception date we:
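    - Take the date of birth (set to the 15th of the birth month, so accurate to about +/- 15 days), subtract the LMP gestational age, then add the 2 weeks between the start of the cycle and ovulation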

In [11]:
# Gestational age is multiplied by 7 to express it in days, giving one
# timedelta per row for the date arithmetic below.
gestation_lengths = pd.Series(
    [pd.Timedelta(float(weeks)*7, unit='days') for weeks in df['Average LMP Gestational Age']]
)

# Time from the last menstrual period to ovulation
ovulation_offset = pd.Timedelta(2, unit='W')

# Birth date - LMP gestational age + 2 weeks to ovulation.
# .values drops the new Series' own 0..n-1 labels so the subtraction is
# done row by row in order.
df['Conception Date'] = df['Birth Date'] - gestation_lengths.values + ovulation_offset
df[['Birth Date','Conception Date']].head(2)

Unnamed: 0,Birth Date,Conception Date
0,2003-01-15,2002-05-08 05:02:24
1,2003-01-15,2002-05-03 14:09:36


### - Create Conception Year, Month, and Month Code Columns

In [12]:
# Pull the calendar parts out of the conception timestamp; the year and the
# month number are stored as integers, the month name as text.
conception_parts = df['Conception Date'].dt

df['Conception Year'] = conception_parts.year.astype(int)
df['Conception Month'] = conception_parts.month_name()
df['Conception Month Code'] = conception_parts.month.astype(int)

### - Clean, reorder, and create new columns

In [13]:
# Derive the state abbreviation and the bare county name from values such as
# 'Baldwin County, AL'. Both use the .str accessor, whose results carry the
# same row labels as `df`. Wrapping a plain list in pd.Series gives it fresh
# 0..n-1 labels, and assigning that to a column matches rows by label, not by
# position. The labels of `df` no longer run 0..n-1 after the earlier concat
# and row filtering, so that approach puts acronyms on the wrong rows.
df['State Acronym'] = df['County'].str[-2:]

# Keep only the text before the first comma as the county name
df['County'] = df['County'].str.split(',').str[0]

# Final column order: birth/conception pairs first, then location, then measures
col_lst = ['Birth Year', 'Conception Year', 'Birth Month', 'Conception Month',
           'Birth Month Code', 'Conception Month Code',
           'Birth Date', 'Conception Date', 'State', 'State Acronym',
           'State Code','County','County Code', 'Births','Average Age of Mother',
           'Average LMP Gestational Age','Notes']

df = df[col_lst]
del col_lst
print(df.shape)
df.head(3)

(113064, 17)


Unnamed: 0,Birth Year,Conception Year,Birth Month,Conception Month,Birth Month Code,Conception Month Code,Birth Date,Conception Date,State,State Acronym,State Code,County,County Code,Births,Average Age of Mother,Average LMP Gestational Age,Notes
0,2003,2002,January,May,1,5,2003-01-15,2002-05-08 05:02:24,Alabama,AL,1,Baldwin County,1003,131,26.77,37.97,
1,2003,2002,January,May,1,5,2003-01-15,2002-05-03 14:09:36,Alabama,AL,1,Calhoun County,1015,124,25.32,38.63,
2,2003,2002,January,May,1,5,2003-01-15,2002-05-05 13:12:00,Alabama,AL,1,Etowah County,1055,102,25.02,38.35,


## - Write result to csv

In [14]:
# Save the cleaned, combined 2003-2018 dataset as csv for more convenient
# future use, then release the frame.
combined_csv_path = './data/CDC_Natality/2003_2018.csv'
write_data_relative_path(df, combined_csv_path, data_type='csv')
del df