In [95]:
#import dependencies
import pandas as pd

In [96]:
# Define a function that reads a text file with two columns and stores it in a dictionary.
def read_file_to_dict(file_path):
    """
    Build a lookup dictionary from a two-column, whitespace-delimited text file.

    Each line is stripped and split once on its first run of whitespace: the
    text before it becomes the key and everything after it (which may itself
    contain spaces) becomes the value. Lines that do not produce exactly two
    parts (blank lines, single-token lines) are skipped. When a key occurs
    more than once, the value from the last occurrence is kept.

    Args:
    file_path (str): The path to the text file.

    Returns:
    dict: Keys taken from the first column, values from the rest of the line.
    """
    with open(file_path, 'r') as handle:
        # Lazily split every line; the comprehension below consumes the
        # generator while the file is still open.
        split_lines = (raw.strip().split(maxsplit=1) for raw in handle)
        return {fields[0]: fields[1] for fields in split_lines if len(fields) == 2}

In [97]:
# Load the state and county lookup tables from the resources folder.
# read_file_to_dict uses the first token of each line as the key and the
# remainder of the line as the value.
state_dict, county_dict = map(
    read_file_to_dict,
    ('resources/state_codes.txt', 'resources/county_codes.txt'),
)

In [98]:
# Re-key the state lookup so it matches the State codes in the birth data:
# a code that starts with '0' has that single leading character removed
# (e.g. '01' -> '1'); every other code is kept unchanged.
state_dict_new = {
    (code[1:] if code[0] == '0' else code): name
    for code, name in state_dict.items()
}

In [None]:
# Load the raw birth records from the CSV file, preview the first and last
# rows, and print the column dtypes and non-null counts.
birth_data_df = pd.read_csv('resources/allBirthData.csv')
display(birth_data_df.head(), birth_data_df.tail())
birth_data_df.info()

In [None]:
# Cast the State and County code columns to strings so they can later be
# looked up in the code -> name dictionaries, whose keys are strings.
for code_column in ('State', 'County'):
    birth_data_df[code_column] = birth_data_df[code_column].astype(str)
birth_data_df.info()

In [None]:
# Normalise the County codes after the string conversion.
#
# Step 1: remove a trailing '.0' (presumably left over because the column was
# read as float before being cast to str - TODO confirm).
# The pattern is an explicit, end-anchored regex. The previous un-anchored '.0'
# was interpreted as a regex by pandas < 2.0 (where '.' matches ANY character,
# so e.g. the '10' inside a code was deleted as well) and as a literal that
# could match anywhere in the string by pandas >= 2.0.
birth_data_df['County'] = birth_data_df['County'].str.replace(r'\.0$', '', regex=True)
display(birth_data_df.head())

# Step 2: codes that are exactly 4 characters long get a leading '0';
# values of any other length are left untouched.
birth_data_df['County'] = birth_data_df['County'].mask(
    birth_data_df['County'].str.len() == 4,
    '0' + birth_data_df['County'],
)
display(birth_data_df.head())


In [None]:
# Add a 'StateName' column by looking up each row's State code in
# state_dict_new, then preview the first and last rows.
birth_data_df['StateName'] = birth_data_df['State'].map(state_dict_new)
display(birth_data_df.head(), birth_data_df.tail())


In [None]:
# Add a 'CountyName' column by looking up each row's County code in
# county_dict, then preview the first and last rows.
birth_data_df['CountyName'] = birth_data_df['County'].map(county_dict)
display(birth_data_df.head(), birth_data_df.tail())


In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index that was saved into
# the CSV - TODO confirm). Only this one column is dropped here.
# errors='ignore' keeps the cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
birth_data_df = birth_data_df.drop(columns=['Unnamed: 0'], errors='ignore')
display(birth_data_df.head())


In [None]:
# Keep only the rows whose Year is 1990 or later. Note that only a lower bound
# is applied: the "2015" in the variable name is not enforced by this filter.
birth_data_1990_2015_df = birth_data_df[birth_data_df['Year'].ge(1990)]
display(birth_data_1990_2015_df.head())

In [None]:
# Build a state-level frame by removing the county-specific columns
# ('County', 'CountyName' and 'countyBirths'); preview both ends of the result.
birth_data_state_1990_2015_df = birth_data_1990_2015_df.drop(
    ['County', 'CountyName', 'countyBirths'], axis=1
)
display(birth_data_state_1990_2015_df.head(), birth_data_state_1990_2015_df.tail())

In [None]:
# Renumber the rows 0..n-1 after the year filter. reset_index(drop=True)
# discards the old index without adding an 'index' column, so an 'index'
# column is only present if the data already carried one; it is dropped when
# it exists and silently skipped otherwise (the previous unconditional `del`
# raised KeyError in that case, and on every re-run of this cell).
birth_data_state_1990_2015_df = (
    birth_data_state_1990_2015_df
    .reset_index(drop=True)
    .drop(columns=['index'], errors='ignore')
)
display(birth_data_state_1990_2015_df.head())
# info() prints its report itself and returns None, so it is called directly
# rather than wrapped in display(), which would render an extra "None".
birth_data_state_1990_2015_df.info()

In [None]:
# Remove rows that are exact duplicates across all remaining columns.
birth_data_state_1990_2015_updated_df = birth_data_state_1990_2015_df.drop_duplicates()
display(birth_data_state_1990_2015_updated_df.head())
# info() prints its report itself and returns None, so it is called directly
# rather than wrapped in display(), which would render an extra "None".
birth_data_state_1990_2015_updated_df.info()



In [113]:
# Total stateBirths per (Year, State, StateName) combination, returned as a
# flat frame with the grouping keys as ordinary columns.
# State is a string column here, so groups sort as '1', '10', '11', ... rather
# than numerically.
# NOTE(review): in the saved output of this cell, code 10 is labelled DELAWARE
# with 199,339 births and code 12 is labelled FLORIDA with 20,489 births for
# 1990. Those figures look implausible for those states; the State codes in
# the birth data may follow a different coding scheme than the codes in
# state_codes.txt - verify the mapping behind state_dict_new.
birth_yearly_statewise_total_df = (
    birth_data_state_1990_2015_updated_df
    .groupby(['Year', 'State', 'StateName'], as_index=False)['stateBirths']
    .sum()
)
display(birth_yearly_statewise_total_df.head())

Unnamed: 0,Year,State,StateName,stateBirths
0,1990,1,ALABAMA,63487
1,1990,10,DELAWARE,199339
2,1990,11,DISTRICT OF COLUMBIA,112666
3,1990,12,FLORIDA,20489
4,1990,13,GEORGIA,16433


In [116]:
# Collapse the per-state totals into one row per Year: the sum of stateBirths
# over all states present in birth_yearly_statewise_total_df.
birth_yearly_total_df = birth_yearly_statewise_total_df.groupby('Year')['stateBirths'].sum().reset_index()
display(birth_yearly_total_df)
# info() prints its report itself and returns None, so it is called directly
# rather than wrapped in display(), which rendered a stray "None" in the output.
birth_yearly_total_df.info()

Unnamed: 0,Year,stateBirths
0,1990,3768342
1,1991,3725491
2,1992,3683602
3,1993,3620679
4,1994,3573818
5,1995,3523807
6,1996,3514769
7,1997,3506805
8,1998,3559506
9,1999,3575091


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Year         26 non-null     int64
 1   stateBirths  26 non-null     int64
dtypes: int64(2)
memory usage: 548.0 bytes


None

In [118]:
# Add 'BirthRateChange': the fractional change in total births (stateBirths)
# relative to the previous row, i.e. the previous year. Despite the column
# name, this is the change in birth counts, not in a per-capita rate. The
# first row has no predecessor and is therefore NaN.
birth_yearly_total_df = birth_yearly_total_df.assign(
    BirthRateChange=lambda yearly: yearly['stateBirths'].pct_change()
)
display(birth_yearly_total_df)

Unnamed: 0,Year,stateBirths,BirthRateChange
0,1990,3768342,
1,1991,3725491,-0.011371
2,1992,3683602,-0.011244
3,1993,3620679,-0.017082
4,1994,3573818,-0.012943
5,1995,3523807,-0.013994
6,1996,3514769,-0.002565
7,1997,3506805,-0.002266
8,1998,3559506,0.015028
9,1999,3575091,0.004378
