In [47]:
#import dependencies
import pandas as pd

In [48]:
# Define a function that reads a text file with two columns and stores it in a dictionary.
def read_file_to_dict(file_path):
    """
    Reads a text file with two columns and stores it in a dictionary.
    
    Args:
    file_path (str): The path to the text file.
    
    Returns:
    dict: A dictionary with keys from the first column and values from the second column.
    """
    data_dict = {}
    
    with open(file_path, 'r') as file:
        for line in file:
            # Strip leading/trailing whitespace and split by whitespace
            parts = line.strip().split(maxsplit=1)
            if len(parts) == 2:
                key, value = parts
                data_dict[key] = value
    
    return data_dict

In [49]:
#Use the function to read the state data into dictionary

state_dict = read_file_to_dict('resources/state_codes.txt')


In [50]:
# for key in state_dict: if key begins with 0, then the '0' should not be used for mapping the state code to the state name. Also change the state name to title case.
# Create a new dictionary with the correct state codes.
new_state_dict = {}
for key in state_dict:
    if key[0] == '0':
        new_key = key[1:]
    else:
        new_key = key
    new_state_dict[new_key] = state_dict[key].title()
print(new_state_dict)

{'state-level': 'Place', 'FIPS': 'Code     Name', '-----------': '-------', '1': 'Alabama', '2': 'Alaska', '4': 'Arizona', '5': 'Arkansas', '6': 'California', '8': 'Colorado', '9': 'Connecticut', '10': 'Delaware', '11': 'District Of Columbia', '12': 'Florida', '13': 'Georgia', '15': 'Hawaii', '16': 'Idaho', '17': 'Illinois', '18': 'Indiana', '19': 'Iowa', '20': 'Kansas', '21': 'Kentucky', '22': 'Louisiana', '23': 'Maine', '24': 'Maryland', '25': 'Massachusetts', '26': 'Michigan', '27': 'Minnesota', '28': 'Mississippi', '29': 'Missouri', '30': 'Montana', '31': 'Nebraska', '32': 'Nevada', '33': 'New Hampshire', '34': 'New Jersey', '35': 'New Mexico', '36': 'New York', '37': 'North Carolina', '38': 'North Dakota', '39': 'Ohio', '40': 'Oklahoma', '41': 'Oregon', '42': 'Pennsylvania', '44': 'Rhode Island', '45': 'South Carolina', '46': 'South Dakota', '47': 'Tennessee', '48': 'Texas', '49': 'Utah', '50': 'Vermont', '51': 'Virginia', '53': 'Washington', '54': 'West Virginia', '55': 'Wisconsi

In [51]:
# Read the birth data from csv file and store it in a dataframe
birth_data_df = pd.read_csv('resources/allBirthData.csv')
display(birth_data_df.head())
display(birth_data_df.tail())
birth_data_df.info()

Unnamed: 0.1,index,Unnamed: 0,State,Month,Year,countyBirths,stateBirths,County
0,0,1,1,1,1985,36.0,5027,1001.0
1,1,2,1,2,1985,36.0,4627,1001.0
2,2,3,1,3,1985,43.0,4738,1001.0
3,3,4,1,4,1985,40.0,4626,1001.0
4,4,5,1,5,1985,34.0,4834,1001.0


Unnamed: 0.1,index,Unnamed: 0,State,Month,Year,countyBirths,stateBirths,County
321470,321470,321471,55,12,2015,111.0,5632,55131.0
321471,321471,321472,55,12,2015,307.0,5632,55133.0
321472,321472,321473,55,12,2015,154.0,5632,55139.0
321473,321473,321474,55,12,2015,1848.0,5632,55999.0
321474,321474,321475,56,12,2015,661.0,661,56999.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321475 entries, 0 to 321474
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   index         321475 non-null  int64  
 1   Unnamed: 0    321475 non-null  int64  
 2   State         321475 non-null  int64  
 3   Month         321475 non-null  int64  
 4   Year          321475 non-null  int64  
 5   countyBirths  293021 non-null  float64
 6   stateBirths   321475 non-null  int64  
 7   County        312919 non-null  float64
dtypes: float64(2), int64(6)
memory usage: 19.6 MB


In [52]:
# convert the state column to string
birth_data_df['State'] = birth_data_df['State'].astype(str)
birth_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321475 entries, 0 to 321474
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   index         321475 non-null  int64  
 1   Unnamed: 0    321475 non-null  int64  
 2   State         321475 non-null  object 
 3   Month         321475 non-null  int64  
 4   Year          321475 non-null  int64  
 5   countyBirths  293021 non-null  float64
 6   stateBirths   321475 non-null  int64  
 7   County        312919 non-null  float64
dtypes: float64(2), int64(5), object(1)
memory usage: 19.6+ MB


In [53]:
# drop columns index, Unnamed: 0, countyBirths and County
birth_data_df = birth_data_df.drop(columns=['index', 'Unnamed: 0', 'countyBirths', 'County'])
display(birth_data_df.head())

Unnamed: 0,State,Month,Year,stateBirths
0,1,1,1985,5027
1,1,2,1985,4627
2,1,3,1985,4738
3,1,4,1985,4626
4,1,5,1985,4834


In [54]:
# create a new column 'StateName' by mapping the 'State' column to the state names using the state_dict
birth_data_df['StateName'] = birth_data_df['State'].map(new_state_dict)
display(birth_data_df.head())
display(birth_data_df.tail())
birth_data_df.info()


Unnamed: 0,State,Month,Year,stateBirths,StateName
0,1,1,1985,5027,Alabama
1,1,2,1985,4627,Alabama
2,1,3,1985,4738,Alabama
3,1,4,1985,4626,Alabama
4,1,5,1985,4834,Alabama


Unnamed: 0,State,Month,Year,stateBirths,StateName
321470,55,12,2015,5632,Wisconsin
321471,55,12,2015,5632,Wisconsin
321472,55,12,2015,5632,Wisconsin
321473,55,12,2015,5632,Wisconsin
321474,56,12,2015,661,Wyoming


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321475 entries, 0 to 321474
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   State        321475 non-null  object
 1   Month        321475 non-null  int64 
 2   Year         321475 non-null  int64 
 3   stateBirths  321475 non-null  int64 
 4   StateName    304381 non-null  object
dtypes: int64(3), object(2)
memory usage: 12.3+ MB


In [55]:
# drop the 'State' column
birth_data_df = birth_data_df.drop(columns=['State'])
display(birth_data_df.head())

Unnamed: 0,Month,Year,stateBirths,StateName
0,1,1985,5027,Alabama
1,2,1985,4627,Alabama
2,3,1985,4738,Alabama
3,4,1985,4626,Alabama
4,5,1985,4834,Alabama


In [56]:
# delete duplicate rows because the original data had county births and corresponding state births, since we are only interested in state births we can delete the duplicate rows
birth_data_df = birth_data_df.drop_duplicates()
display(birth_data_df.head())
birth_data_df.info()

Unnamed: 0,Month,Year,stateBirths,StateName
0,1,1985,5027,Alabama
1,2,1985,4627,Alabama
2,3,1985,4738,Alabama
3,4,1985,4626,Alabama
4,5,1985,4834,Alabama


<class 'pandas.core.frame.DataFrame'>
Index: 18972 entries, 0 to 321474
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Month        18972 non-null  int64 
 1   Year         18972 non-null  int64 
 2   stateBirths  18972 non-null  int64 
 3   StateName    18108 non-null  object
dtypes: int64(3), object(1)
memory usage: 741.1+ KB


In [57]:
# drop all rows with NaN values
birth_data_df = birth_data_df.dropna()
display(birth_data_df.head())
birth_data_df.info()

Unnamed: 0,Month,Year,stateBirths,StateName
0,1,1985,5027,Alabama
1,2,1985,4627,Alabama
2,3,1985,4738,Alabama
3,4,1985,4626,Alabama
4,5,1985,4834,Alabama


<class 'pandas.core.frame.DataFrame'>
Index: 18108 entries, 0 to 321474
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Month        18108 non-null  int64 
 1   Year         18108 non-null  int64 
 2   stateBirths  18108 non-null  int64 
 3   StateName    18108 non-null  object
dtypes: int64(3), object(1)
memory usage: 707.3+ KB


In [58]:
#create a new dataframe forall years  and reset index
birth_data_all_years_df = birth_data_df
birth_data_all_years_df = birth_data_all_years_df.reset_index(drop=True)    
display(birth_data_all_years_df.head())
birth_data_all_years_df.info()

Unnamed: 0,Month,Year,stateBirths,StateName
0,1,1985,5027,Alabama
1,2,1985,4627,Alabama
2,3,1985,4738,Alabama
3,4,1985,4626,Alabama
4,5,1985,4834,Alabama


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18108 entries, 0 to 18107
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Month        18108 non-null  int64 
 1   Year         18108 non-null  int64 
 2   stateBirths  18108 non-null  int64 
 3   StateName    18108 non-null  object
dtypes: int64(3), object(1)
memory usage: 566.0+ KB


In [59]:
# change the Month column to YYYY-MM-DD format where DD is the first day of the month
birth_data_all_years_df['Month'] = birth_data_all_years_df['Month'].astype(str)
birth_data_all_years_df['Month'] = pd.to_datetime(birth_data_all_years_df[['Year', 'Month']].assign(Day=1))
display(birth_data_all_years_df.head())
birth_data_all_years_df.info()

Unnamed: 0,Month,Year,stateBirths,StateName
0,1985-01-01,1985,5027,Alabama
1,1985-02-01,1985,4627,Alabama
2,1985-03-01,1985,4738,Alabama
3,1985-04-01,1985,4626,Alabama
4,1985-05-01,1985,4834,Alabama


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18108 entries, 0 to 18107
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Month        18108 non-null  datetime64[ns]
 1   Year         18108 non-null  int64         
 2   stateBirths  18108 non-null  int64         
 3   StateName    18108 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 566.0+ KB


In [60]:
# rename the StateName column to State and stateBirths to Births and Month to Date

birth_data_all_years_df = birth_data_all_years_df.rename(columns={'StateName': 'State', 'stateBirths': 'Births', 'Month': 'Date'})
display(birth_data_all_years_df.head())
birth_data_all_years_df.info()

Unnamed: 0,Date,Year,Births,State
0,1985-01-01,1985,5027,Alabama
1,1985-02-01,1985,4627,Alabama
2,1985-03-01,1985,4738,Alabama
3,1985-04-01,1985,4626,Alabama
4,1985-05-01,1985,4834,Alabama


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18108 entries, 0 to 18107
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    18108 non-null  datetime64[ns]
 1   Year    18108 non-null  int64         
 2   Births  18108 non-null  int64         
 3   State   18108 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 566.0+ KB


In [61]:
# save the cleaned data to a new csv file
birth_data_all_years_df.to_csv('resources/birthdata/master_birth_data.csv', index=False)


In [62]:
# create a new dataframe for New York, New Jersey and Connecticut
ny_nj_ct_df = birth_data_all_years_df.loc[birth_data_all_years_df['State'].isin(['New York', 'New Jersey', 'Connecticut'])]
ny_nj_ct_df = ny_nj_ct_df.reset_index(drop=True)
display(ny_nj_ct_df.head())
ny_nj_ct_df.info()


Unnamed: 0,Date,Year,Births,State
0,1985-01-01,1985,825,Connecticut
1,1985-02-01,1985,740,Connecticut
2,1985-03-01,1985,783,Connecticut
3,1985-04-01,1985,794,Connecticut
4,1985-05-01,1985,815,Connecticut


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1116 entries, 0 to 1115
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    1116 non-null   datetime64[ns]
 1   Year    1116 non-null   int64         
 2   Births  1116 non-null   int64         
 3   State   1116 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 35.0+ KB


In [63]:
# save the cleaned data to a new csv file
ny_nj_ct_df.to_csv('resources/birthdata/ny_nj_ct_911_birth_data.csv', index=False)

In [64]:
# create a new dataframe for all states except New York, New Jersey and Connecticut
other_states_df = birth_data_all_years_df.loc[~birth_data_all_years_df['State'].isin(['New York', 'New Jersey', 'Connecticut'])]
other_states_df = other_states_df.reset_index(drop=True)
display(other_states_df.head())
other_states_df.info()

Unnamed: 0,Date,Year,Births,State
0,1985-01-01,1985,5027,Alabama
1,1985-02-01,1985,4627,Alabama
2,1985-03-01,1985,4738,Alabama
3,1985-04-01,1985,4626,Alabama
4,1985-05-01,1985,4834,Alabama


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16992 entries, 0 to 16991
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    16992 non-null  datetime64[ns]
 1   Year    16992 non-null  int64         
 2   Births  16992 non-null  int64         
 3   State   16992 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 531.1+ KB


In [65]:
# save the cleaned data to a new csv file
other_states_df.to_csv('resources/birthdata/other_states_911_birth_data.csv', index=False)


In [70]:
# create a new dataframe for state of louisiana for event of Katrina
la_df = birth_data_all_years_df.loc[birth_data_all_years_df['State'] == 'Louisiana']
la_df = la_df.reset_index(drop=True)
display(la_df.head())
display(la_df.tail())
la_df.info()


Unnamed: 0,Date,Year,Births,State
0,1985-01-01,1985,6234,Louisiana
1,1985-02-01,1985,6071,Louisiana
2,1985-03-01,1985,6797,Louisiana
3,1985-04-01,1985,6753,Louisiana
4,1985-05-01,1985,7254,Louisiana


Unnamed: 0,Date,Year,Births,State
367,2015-08-01,2015,5853,Louisiana
368,2015-09-01,2015,5881,Louisiana
369,2015-10-01,2015,5751,Louisiana
370,2015-11-01,2015,5344,Louisiana
371,2015-12-01,2015,5607,Louisiana


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372 entries, 0 to 371
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    372 non-null    datetime64[ns]
 1   Year    372 non-null    int64         
 2   Births  372 non-null    int64         
 3   State   372 non-null    object        
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 11.8+ KB


In [67]:
# save the cleaned data to a new csv file
la_df.to_csv('resources/birthdata/la_katrina_birth_data.csv', index=False)


In [68]:
# create a new dataframe for all states except Louisiana
other_states_df = birth_data_all_years_df.loc[birth_data_all_years_df['State'] != 'Louisiana']
other_states_df = other_states_df.reset_index(drop=True)
display(other_states_df.head())
other_states_df.info()


Unnamed: 0,Date,Year,Births,State
0,1985-01-01,1985,5027,Alabama
1,1985-02-01,1985,4627,Alabama
2,1985-03-01,1985,4738,Alabama
3,1985-04-01,1985,4626,Alabama
4,1985-05-01,1985,4834,Alabama


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17736 entries, 0 to 17735
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    17736 non-null  datetime64[ns]
 1   Year    17736 non-null  int64         
 2   Births  17736 non-null  int64         
 3   State   17736 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 554.4+ KB


In [69]:
# save the cleaned data to a new csv file
other_states_df.to_csv('resources/birthdata/other_states_katrina_birth_data.csv', index=False)
