In [127]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from read_path_module import read_data_relative_path
from write_path_module import write_data_relative_path

### - Import the dataset

In [151]:
# Load the CDC natality extract through the project's relative-path reader.
# NOTE(review): the helper is defined outside this notebook; the cells below
# treat its return value as a pandas DataFrame -- confirm against the helper.
df = read_data_relative_path('./data/CDC_Natality/2003_2018.csv', data_type='csv')

### - Rename columns

In [152]:
# Prefix the generic date columns with "Birth" so they stay unambiguous next
# to the "Conception ..." columns derived further down.
# rename() returns a new frame; rebinding `df` instead of using inplace=True
# keeps the step chain-friendly. Keys that are no longer present are ignored,
# so re-running the cell is harmless.
df = df.rename(
    columns={
        "Year": "Birth Year",
        "Month": "Birth Month",
        "Month Code": "Birth Month Code",
    }
)

### - Remove NA rows where appropriate

In [153]:
# Report how many rows have no birth count before filtering.
print('There were ',df['Births'].isna().sum(), ' Births NA rows.')

# Keep only rows where birth data is present.
# notna() states the intent directly; negating a boolean mask with unary
# minus only works because pandas special-cases it. The explicit copy makes
# `df` an independent frame, so the column assignments in later cells do not
# write to a slice of the unfiltered data (SettingWithCopyWarning).
df = df[df['Births'].notna()].copy()

# Confirm the filter removed every missing value.
print( 'There are now ', df['Births'].isna().sum(), ' Births NA rows.')

There were  4824  Births NA rows.
There are now  0  Births NA rows.


### - Create pandas datetime column

In [154]:
# Only year and month are known, so every birth is pinned to the 15th of its
# month. Build a "YYYY-M-15" string per row and parse it in one call.
birth_year_text = df['Birth Year'].astype(str)
birth_month_text = df['Birth Month Code'].astype(str)
birth_date_text = birth_year_text + '-' + birth_month_text + '-' + '15'

df['Birth Date'] = pd.to_datetime(birth_date_text, format='%Y-%m-%d')
df.head(2)

Unnamed: 0,Notes,Birth Year,Year Code,Birth Month,Birth Month Code,State,State Code,County,County Code,Births,Average Age of Mother,Average LMP Gestational Age,Birth Date
0,,2003,2003,January,1,Alabama,1,"Baldwin County, AL",1003,131.0,26.77,37.97,2003-01-15
1,,2003,2003,January,1,Alabama,1,"Calhoun County, AL",1015,124.0,25.32,38.63,2003-01-15


### - Backtrack to calculate conception date
- At a high level
    - A menstrual cycle starts
    - Two weeks later ovulation occurs
    - Sparks fly and conception occurs
    - ~38 weeks later parents' sleep cycles are forever changed...
    
    
- The dataset provides the LMP gestational age: the length of the pregnancy in weeks, counted from the start of the mother's last menstrual period (LMP), i.e. roughly two weeks before conception


- To calculate conception date we:
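    - Take the date of birth (set to the 15th of the birth month, so accurate to about +/- 15 days), subtract the LMP gestational age, then add the 2 weeks between the last menstrual period and ovulation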

In [155]:
# Transforming the gestational age into a timedelta for time arithmetic
lst_time_intrval = [pd.Timedelta(x*7, unit='days') for x in df['Average LMP Gestational Age']]
    
# Date of birth - LMP Age + 2 weeks to ovulation
df['Conception Date'] = df['Birth Date'] - pd.Series(lst_time_intrval).values + pd.Timedelta(2, unit='W')
df[['Birth Date','Conception Date']].head(2)

Unnamed: 0,Birth Date,Conception Date
0,2003-01-15,2002-05-08 05:02:24
1,2003-01-15,2002-05-03 14:09:36


### - Create Conception Year, Month, and Month Code Columns

In [158]:
# Split the conception timestamp into calendar parts via the .dt accessor.
conception_parts = df['Conception Date'].dt

# Year and month code are stored as plain integers; the month name stays text.
df['Conception Year'] = conception_parts.year.astype(int)
df['Conception Month'] = conception_parts.month_name()
df['Conception Month Code'] = conception_parts.month.astype(int)

### - Clean, reorder, and create new columns

In [159]:
# County arrives as "<County Name>, <ST>". Take the trailing two characters
# of EACH row as the state acronym. The previous df['County'][0][-2:] read
# only the row labelled 0 and broadcast that single value to every row (and
# raised KeyError if that row had been dropped).
df['State Acronym'] = df['County'].str.strip().str[-2:]

# Keep just the county name. Splitting once from the right tolerates names
# that contain additional commas, and values without any comma are left
# unchanged, so no temporary column is needed.
df['County'] = df['County'].str.rsplit(',', n=1).str[0].str.strip()

# Final column order: each birth field is followed by its conception twin,
# then location, then measures. Columns not listed here are dropped.
col_lst = ['Birth Year', 'Conception Year', 'Birth Month', 'Conception Month',
           'Birth Month Code', 'Conception Month Code',
           'Birth Date', 'Conception Date', 'State', 'State Acronym',
           'State Code','County','County Code', 'Births','Average Age of Mother',
           'Average LMP Gestational Age','Notes']

df = df[col_lst]
del col_lst
df.head(2)

Unnamed: 0,Birth Year,Conception Year,Birth Month,Conception Month,Birth Month Code,Conception Month Code,Birth Date,Conception Date,State,State Acronym,State Code,County,County Code,Births,Average Age of Mother,Average LMP Gestational Age,Notes
0,2003,2002,January,May,1,5,2003-01-15,2002-05-08 05:02:24,Alabama,AL,1,Baldwin County,1003,131.0,26.77,37.97,
1,2003,2002,January,May,1,5,2003-01-15,2002-05-03 14:09:36,Alabama,AL,1,Calhoun County,1015,124.0,25.32,38.63,


### - Get unique list of county codes

In [None]:
# One entry per distinct county code, in order of first appearance.
lst_county_codes = [county_code for county_code in df['County Code'].unique()]

### - Births per county over time (Line)

In [None]:
def Random_County_Plotter(df, lst_county_codes, time='birth'):
    """Plot the birth counts of one randomly chosen county over time.

    Three series are drawn on a single axes: the raw counts, a median over
    2-month (month-end) resampling bins, and a rolling median over 4
    consecutive observations.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'County Code', 'Birth Date',
        'Conception Date', 'Births', 'County' and 'State Acronym'.
    lst_county_codes : sequence
        County codes to draw the random county from.
    time : str, default 'birth'
        'birth' (case-insensitive) puts 'Birth Date' on the x axis; any other
        value puts 'Conception Date' there.

    Raises
    ------
    ValueError
        If ``lst_county_codes`` is empty.
    """
    if len(lst_county_codes) == 0:
        raise ValueError('lst_county_codes is empty; there is no county to plot.')

    county_code = random.choice(lst_county_codes)
    use_birth_dates = time.lower() == 'birth'
    date_column = 'Birth Date' if use_birth_dates else 'Conception Date'

    # set_index returns a new frame instead of mutating a filtered slice;
    # sorting guarantees the resample / rolling windows see dates in order.
    df_county = (
        df[df['County Code'] == county_code]
        .set_index(date_column)
        .sort_index()
    )
    births = df_county['Births']

    fig, ax = plt.subplots(figsize=(15, 6))
    births.plot(ax=ax, legend=True, label='Raw')
    # MonthEnd(2) is the offset the "2M" alias stands for; passing the object
    # avoids depending on how a given pandas version spells month-end aliases.
    births.resample(pd.offsets.MonthEnd(2)).median().plot(ax=ax, legend=True, label='2 Month Median Resample')
    births.rolling(4).median().plot(ax=ax, legend=True, label='4 Month Rolling Median')

    # Label the axis after the timeline actually plotted.
    ax.set_xlabel('Birth Year' if use_birth_dates else 'Conception Year')
    ax.set_ylabel('Births')
    ax.set_title(df_county['County'].to_list()[0] + ', ' + df_county['State Acronym'].to_list()[0])
    plt.show()


In [None]:
# Plot a random county; any `time` value other than 'birth' selects the
# conception-date timeline.
Random_County_Plotter(df, lst_county_codes, time='conception')

### - Births per county over each Birth Year (Line)

In [None]:
def Random_County_Plotter_Annual(df, lst_county_codes):
    """Overlay one births-by-month line per birth year for a random county.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'County Code', 'Birth Year', 'Birth Month',
        'Birth Month Code', 'Births', 'County' and 'State Acronym'.
    lst_county_codes : sequence
        County codes to draw the random county from.

    Raises
    ------
    ValueError
        If ``lst_county_codes`` is empty.
    """
    if len(lst_county_codes) == 0:
        raise ValueError('lst_county_codes is empty; there is no county to plot.')

    county_code = random.choice(lst_county_codes)
    df_county = df[df['County Code'] == county_code]

    fig, ax = plt.subplots(figsize=(20, 10))

    # One line per year; groupby yields the years in ascending order and
    # filters each year once. Sorting by month keeps every line left-to-right.
    for yr, year_rows in df_county.groupby('Birth Year'):
        year_rows = year_rows.sort_values('Birth Month Code')
        ax.plot(year_rows['Birth Month Code'], year_rows['Births'], label=str(yr))

    # Build ticks and labels from the same code -> name pairs so they always
    # match in length and order, even if the last year plotted has missing
    # months or the data does not start in January.
    months = (
        df_county[['Birth Month Code', 'Birth Month']]
        .drop_duplicates('Birth Month Code')
        .sort_values('Birth Month Code')
    )
    # rotation must be numeric; current matplotlib rejects the string '45'.
    plt.xticks(months['Birth Month Code'].to_list(),
               labels=months['Birth Month'].to_list(),
               rotation=45)

    ax.set_xlabel('Birth Month')
    ax.set_ylabel('Births')
    ax.legend(title='Birth Year', ncol=2)
    ax.set_title(df_county['County'].to_list()[0] + ', ' + df_county['State Acronym'].to_list()[0])
    plt.show()

In [None]:
# Draw the per-year monthly birth lines for a randomly chosen county.
Random_County_Plotter_Annual(df, lst_county_codes)

### - Births per county over each Birth Year (Scatter)

In [None]:
def Random_County_Plotter_Scatter(df, lst_county_codes):
    """Scatter the birth counts of a random county against birth month.

    Every row of the county (all years) becomes one point, so each month
    column shows the spread of that month across years.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'County Code', 'Birth Month',
        'Birth Month Code', 'Births', 'County' and 'State Acronym'.
    lst_county_codes : sequence
        County codes to draw the random county from.

    Raises
    ------
    ValueError
        If ``lst_county_codes`` is empty.
    """
    if len(lst_county_codes) == 0:
        raise ValueError('lst_county_codes is empty; there is no county to plot.')

    county_code = random.choice(lst_county_codes)
    df_county = df[df['County Code'] == county_code]

    fig, ax = plt.subplots(figsize=(20, 8))
    ax.scatter(df_county['Birth Month Code'], df_county['Births'])

    # One tick per distinct month. Passing the per-row x values as tick
    # positions produced one tick per row against only ~12 labels, which
    # matplotlib rejects as a length mismatch.
    months = (
        df_county[['Birth Month Code', 'Birth Month']]
        .drop_duplicates('Birth Month Code')
        .sort_values('Birth Month Code')
    )
    # rotation must be numeric; current matplotlib rejects the string '45'.
    plt.xticks(months['Birth Month Code'].to_list(),
               labels=months['Birth Month'].to_list(),
               rotation=45)

    # The x axis holds months, not years.
    ax.set_xlabel('Birth Month')
    ax.set_ylabel('Births')
    ax.set_title(df_county['County'].to_list()[0] + ', ' + df_county['State Acronym'].to_list()[0])
    plt.show()

In [None]:
# Scatter births against birth month for a randomly chosen county.
Random_County_Plotter_Scatter(df, lst_county_codes)

### - Births per county over each Birth Year (Boxplot)

In [None]:
def Random_County_Plotter_Boxplot(df, lst_county_codes):
    """Box-plot the birth counts of a random county, one box per birth month.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'County Code', 'Birth Month Code', 'Births',
        'County' and 'State Acronym'.
    lst_county_codes : sequence
        County codes to draw the random county from.

    Raises
    ------
    ValueError
        If ``lst_county_codes`` is empty.
    """
    if len(lst_county_codes) == 0:
        raise ValueError('lst_county_codes is empty; there is no county to plot.')

    county_code = random.choice(lst_county_codes)
    df_county = df[df['County Code'] == county_code]

    # One box per month code, pooling that month across all years.
    df_county.boxplot(column='Births', by='Birth Month Code', figsize=(15,6))

    # The boxes are grouped by month code, so label the axis accordingly.
    plt.xlabel('Birth Month Code')
    plt.ylabel('Births')
    plt.title(df_county['County'].to_list()[0] + ', ' + df_county['State Acronym'].to_list()[0])
    plt.show()

In [None]:
# Box-plot births per birth month code for a randomly chosen county.
Random_County_Plotter_Boxplot(df, lst_county_codes)

### - 

### - 

### - 

### - 

### - 

### - 

### - 

### - 

### - 

### - 

### - 

### - 

### - 

### - 

### - 

### - 