# RADI Dataset Preprocessing

## Import Required Libraries

In [14]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [15]:
import os
curr_dir_path = os.path.dirname(os.path.abspath('preprocessing.ipynb'))
raw_data_path = os.path.join(curr_dir_path, 'data/raw_data/')
processed_data_path = os.path.join(curr_dir_path, 'data/processed_data/')

## Load Datasets

### Health Outcome File Sources

In [16]:
# 0. Health Outcome File Sources

# a. CDC Wonder

# b. County Health Rankings
county_health_rankings_df = pd.read_csv(raw_data_path + 'analytic_data2024.csv')

# c. HHS
hhs_df = pd.read_csv(raw_data_path + 'Heart_Disease_Mortality_Data_Among_US_Adults__35___by_State_Territory_and_County___2019-2021.csv')

### RUCC 2023 Codes

In [17]:
# 1. RUCC 2023 Codes
rucc_df = pd.read_csv(raw_data_path + 'Ruralurbancontinuumcodes2023.csv', encoding='latin-1')

### US Census Bureau

In [18]:
# 2. US Census Bureau

# a. Income
income_df = pd.read_csv(raw_data_path + 'ACSDT5Y2023.B19013-Data.csv')

# b. Poverty
poverty_df = pd.read_csv(raw_data_path + 'ACSST5Y2023.S1701-Data.csv')

# c. Housing
plumbing_df = pd.read_csv(raw_data_path + 'ACSDT5Y2023.B25047-Data.csv')
kitchen_df = pd.read_csv(raw_data_path + 'ACSDT5Y2023.B25051-Data.csv')
cost_burden_df = pd.read_csv(raw_data_path + 'ACSDT5Y2023.B25070-Data.csv')

# d. Demographics
age_df = pd.read_csv(raw_data_path + 'ACSST5Y2023.S0101-Data.csv')
race_df = pd.read_csv(raw_data_path + 'ACSDT5Y2023.B02001-Data.csv')

# e. Education
education_df = pd.read_csv(raw_data_path + 'ACSST5Y2023.S1501-Data.csv')

# f. Total Uninsured
uninsured_df = pd.read_csv(raw_data_path + 'ACSDT5Y2023.B27001-Data.csv')

# g. Presence and Types of Internet
internet_df = pd.read_csv(raw_data_path + 'ACSDT5Y2023.B28002-Data.csv')

# h. Transportation (Commuting Time)
transportation_df = pd.read_csv(raw_data_path + 'ACSDT5Y2023.B08012-Data.csv')

# i. Civilian Unemployment
unemployment_df = pd.read_csv(raw_data_path + 'ACSST5Y2023.S2301-Data.csv')

# j. Households with No Vehicles
vehicle_df = pd.read_csv(raw_data_path + 'ACSDT5Y2023.B08201-Data.csv')

# k. Local Government Revenue

### Health Access (HRSA)

In [19]:
# 3. Health Access (HRSA)

# a. Area Health Resource Files

# b. Health Center Service Delivery
health_center_service_delivery_df = pd.read_csv(raw_data_path + 'Health_Center_Service_Delivery_and_LookAlike_Sites.csv')

In [20]:
for column in county_health_rankings_df.columns:
    print(column)

State FIPS Code
County FIPS Code
5-digit FIPS Code
State Abbreviation
Name
Release Year
County Clustered (Yes=1/No=0)
Premature Death raw value
Premature Death numerator
Premature Death denominator
Premature Death CI low
Premature Death CI high
Premature Death flag (0 = No Flag/1=Unreliable/2=Suppressed)
Premature Death (AIAN)
Premature Death CI low (AIAN)
Premature Death CI high (AIAN)
Premature Death flag (AIAN) (. = No Flag/1=Unreliable/2=Suppressed)
Premature Death (Asian/Pacific Islander)
Premature Death CI low (Asian/Pacific Islander)
Premature Death CI high (Asian/Pacific Islander)
Premature Death flag (Asian/Pacific Islander) (. = No Flag/1=Unreliable/2=Suppressed)
Premature Death (Black)
Premature Death CI low (Black)
Premature Death CI high (Black)
Premature Death flag (Black) (. = No Flag/1=Unreliable/2=Suppressed)
Premature Death (Hispanic)
Premature Death CI low (Hispanic)
Premature Death CI high (Hispanic)
Premature Death flag (Hispanic) (. = No Flag/1=Unreliable/2=Suppre

### Environmental

In [21]:
# 4. Environmental

# a. Air Quality
air_quality_df = pd.read_excel(raw_data_path + 'ctyfactbook2023.xlsx')

# b. Drinking Water Quality (County Health Rankings)
drinking_water_df = county_health_rankings_df[['State FIPS Code', 'County FIPS Code', '5-digit FIPS Code', 'State Abbreviation', 
                                                'Name', 'Release Year', 'County Clustered (Yes=1/No=0)',
                                                'Drinking Water Violations raw value', 'Drinking Water Violations numerator', 
                                                'Drinking Water Violations denominator', 'Drinking Water Violations CI low', 
                                                'Drinking Water Violations CI high']]

# c. Natural Disaster Vulnerability
natural_disaster_df = pd.read_csv(raw_data_path + 'NRI_Table_Counties.csv')

# d. Food Insecurity (County Health Rankings)
food_insecurity_df = county_health_rankings_df[['State FIPS Code', 'County FIPS Code', '5-digit FIPS Code', 'State Abbreviation', 
                                                'Name', 'Release Year', 'County Clustered (Yes=1/No=0)', 
                                                'Food Environment Index raw value', 'Food Environment Index numerator', 
                                                'Food Environment Index denominator', 'Food Environment Index CI low', 
                                                'Food Environment Index CI high', 'Food Insecurity raw value', 
                                                'Food Insecurity numerator', 'Food Insecurity denominator', 
                                                'Food Insecurity CI low', 'Food Insecurity CI high',
                                                'Limited Access to Healthy Foods raw value', 'Limited Access to Healthy Foods numerator', 
                                                'Limited Access to Healthy Foods denominator', 'Limited Access to Healthy Foods CI low', 
                                                'Limited Access to Healthy Foods CI high',]]


### Health Outcome Files

In [22]:
# 5. Health Outcomes Files

# a. All-cause Mortality (CDC Wonder)

# b. Infant Mortality (CDC Wonder)

# c. Preventable Hospital Stays (County Health Rankings)
preventable_hospital_stays_df = county_health_rankings_df[['State FIPS Code', 'County FIPS Code', '5-digit FIPS Code', 'State Abbreviation', 
                                                            'Name', 'Release Year', 'County Clustered (Yes=1/No=0)', 
                                                            'Preventable Hospital Stays raw value', 'Preventable Hospital Stays numerator', 
                                                            'Preventable Hospital Stays denominator', 'Preventable Hospital Stays CI low', 
                                                            'Preventable Hospital Stays CI high', 'Preventable Hospital Stays (AIAN)', 
                                                            'Preventable Hospital Stays (Asian/Pacific Islander)', 'Preventable Hospital Stays (Black)', 
                                                            'Preventable Hospital Stays (Hispanic)', 'Preventable Hospital Stays (White)']]

# d. Heart Disease Mortality (HHS)
heart_disease_mortality_df = hhs_df

# e. Poor or Fair Health % (County Health Rankings)
poor_or_fair_health_df = county_health_rankings_df[['State FIPS Code', 'County FIPS Code', '5-digit FIPS Code', 'State Abbreviation', 
                                                    'Name', 'Release Year', 'County Clustered (Yes=1/No=0)', 
                                                    'Poor or Fair Health raw value', 'Poor or Fair Health numerator', 
                                                    'Poor or Fair Health denominator', 'Poor or Fair Health CI low', 
                                                    'Poor or Fair Health CI high']]

## Remove Unnecessary Rows

### Health Outcome File Sources

#### CDC Wonder

#### County Health Rankings

In [23]:
county_health_rankings_df.head()

Unnamed: 0,State FIPS Code,County FIPS Code,5-digit FIPS Code,State Abbreviation,Name,Release Year,County Clustered (Yes=1/No=0),Premature Death raw value,Premature Death numerator,Premature Death denominator,...,% Female raw value,% Female numerator,% Female denominator,% Female CI low,% Female CI high,% Rural raw value,% Rural numerator,% Rural denominator,% Rural CI low,% Rural CI high
0,statecode,countycode,fipscode,state,county,year,county_clustered,v001_rawvalue,v001_numerator,v001_denominator,...,v057_rawvalue,v057_numerator,v057_denominator,v057_cilow,v057_cihigh,v058_rawvalue,v058_numerator,v058_denominator,v058_cilow,v058_cihigh
1,00,000,00000,US,United States,2024,,7971.5097891,4535347,921750763,...,0.504081237,168004004,333287557,,,0.2000313707,66300254,331449281,,
2,01,000,01000,AL,Alabama,2024,,11415.734833,98140,13812804,...,0.5137532379,2606936,5074296,,,0.4226276049,2123399,5024279,,
3,01,001,01001,AL,Autauga County,2024,1,9407.9484384,942,159452,...,0.5129603909,30654,59759,,,0.406768132,23920,58805,,
4,01,003,01003,AL,Baldwin County,2024,1,8981.5753533,3789,633571,...,0.5123906913,126271,246435,,,0.3758645536,87113,231767,,


In [24]:
county_health_rankings_df = county_health_rankings_df[1:].reset_index(drop=True)

#### HHS

In [25]:
hhs_df.head()

Unnamed: 0,Year,LocationAbbr,LocationDesc,GeographicLevel,DataSource,Class,Topic,Data_Value,Data_Value_Unit,Data_Value_Type,...,Data_Value_Footnote,StratificationCategory1,Stratification1,StratificationCategory2,Stratification2,TopicID,LocationID,Y_lat,X_lon,Georeference
0,2020,AK,Kenai Peninsula,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,165.1,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,,Gender,Male,Race/Ethnicity,Hispanic,T2,2122,60.193263,-150.280744,POINT (-150.2807443 60.193262972)
1,2020,AL,Walker County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,109.0,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,,Gender,Overall,Race/Ethnicity,Hispanic,T2,1127,33.810226,-87.29707,POINT (-87.29707047 33.810226394)
2,2020,AL,St. Clair County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,90.0,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,,Gender,Overall,Race/Ethnicity,Asian,T2,1115,33.716065,-86.31496,POINT (-86.31496031 33.716065391)
3,2020,AR,Yell County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,Insufficient Data,Gender,Female,Race/Ethnicity,Asian,T2,5149,35.005864,-93.401676,POINT (-93.40167591 35.00586398)
4,2020,AS,American Samoa County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,Insufficient Data,Gender,Male,Race/Ethnicity,Black,T2,60000,-14.301754,-170.719474,POINT (-170.7194738 -14.30175426)


### RUCC 

In [26]:
rucc_df.head()

Unnamed: 0,FIPS,State,County_Name,Attribute,Value
0,1001,AL,Autauga County,Population_2020,58805
1,1001,AL,Autauga County,RUCC_2023,2
2,1001,AL,Autauga County,Description,"Metro - Counties in metro areas of 250,000 to ..."
3,1003,AL,Baldwin County,Population_2020,231767
4,1003,AL,Baldwin County,RUCC_2023,3


In [27]:
rucc_df = rucc_df[rucc_df['Attribute'] == 'RUCC_2023']
rucc_df.head()

Unnamed: 0,FIPS,State,County_Name,Attribute,Value
1,1001,AL,Autauga County,RUCC_2023,2
4,1003,AL,Baldwin County,RUCC_2023,3
7,1005,AL,Barbour County,RUCC_2023,6
10,1007,AL,Bibb County,RUCC_2023,1
13,1009,AL,Blount County,RUCC_2023,1


### US Census Bureau

In [28]:
def print_variable(variable):
    variable_name = [name for name, value in globals().items() if value is variable][0]
    return variable_name

In [29]:
def remove_unnamed_cols(df):
    unnamed_cols = [col for col in df.columns if 'Unnamed' in col]
    return df.drop(columns=unnamed_cols, inplace=True)

In [30]:
def merge_first_row_with_columns(df):
    for col in df.columns[2:]:
        df[f'{col}__{df[col][0]}'] = df[col]
        df.drop(columns=[col], inplace=True)
    df.drop(index=0, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

In [31]:
census_bureau_dfs = [income_df, poverty_df, plumbing_df, kitchen_df, 
                     cost_burden_df, age_df, race_df, education_df, 
                     uninsured_df, internet_df, transportation_df, 
                     unemployment_df, vehicle_df]

In [32]:
for i in range(len(census_bureau_dfs)):
    print(f'Processing {print_variable(census_bureau_dfs[i])}')
    try:
        remove_unnamed_cols(census_bureau_dfs[i])
        merge_first_row_with_columns(census_bureau_dfs[i])
    except Exception as e:
        print(f'Error in processing: {print_variable(census_bureau_dfs[i])} unable to be processed due to: {e}')

# Verify successful processing by ensuring first row contains data for Antuaga County, Alabama (0500000US01001)
for df in census_bureau_dfs:
    if df['GEO_ID'][0] != '0500000US01001':
        print(f'Error in processing: {print_variable(df)} does not contain data for Antuaga County, Alabama (0500000US01001)')

print('All processing successful!')

Processing income_df
Processing poverty_df
Processing plumbing_df
Processing kitchen_df
Processing cost_burden_df
Processing age_df
Processing race_df
Processing education_df
Processing uninsured_df
Processing internet_df
Processing transportation_df
Processing unemployment_df
Processing vehicle_df
All processing successful!


In [33]:
income_df.head()

Unnamed: 0,GEO_ID,NAME,B19013_001E__Estimate!!Median household income in the past 12 months (in 2023 inflation-adjusted dollars),B19013_001M__Margin of Error!!Median household income in the past 12 months (in 2023 inflation-adjusted dollars)
0,0500000US01001,"Autauga County, Alabama",69841,5512
1,0500000US01003,"Baldwin County, Alabama",75019,2751
2,0500000US01005,"Barbour County, Alabama",44290,2762
3,0500000US01007,"Bibb County, Alabama",51215,6678
4,0500000US01009,"Blount County, Alabama",61096,3328


### Health Access (HRSA)

#### Area Health Resource Files

#### Health Center Service Delivery

In [34]:
health_center_service_delivery_df.head()

Unnamed: 0,Health Center Type,Health Center Number,BHCMIS Organization Identification Number,BPHC Assigned Number,Site Name,Site Address,Site City,Site State Abbreviation,Site Postal Code,Site Telephone Number,...,State Name,State FIPS and Congressional District Number Code,Congressional District Number,Congressional District Name,Congressional District Code,U.S. Congressional Representative Name,Name of U.S. Senator Number One,Name of U.S. Senator Number Two,Data Warehouse Record Create Date,Unnamed: 55
0,Federally Qualified Health Center (FQHC),H80CS00770,052030,BPS-H80-015059,Cedar Springs Campus,204 E Muskegon St,Cedar Springs,MI,49319-9326,616-696-7330,...,Michigan,2602,2,Michigan District 02,MI-02,John R. Moolenaar,Debbie Stabenow,Elissa Slotkin,01/22/2025,
1,Federally Qualified Health Center (FQHC),H80CS24147,03E00494,BPS-H80-021927,"Community Health & Dental Care, Inc.",351 W Schuylkill Rd,Pottstown,PA,19465-7438,610-326-9460 x222,...,Pennsylvania,4206,6,Pennsylvania District 06,PA-06,Chrissy Houlahan,"Robert P. Casey, Jr.",David McCormick,01/22/2025,
2,Federally Qualified Health Center (FQHC),H80CS00578,042610,BPS-H80-008140,TRAVELERS REST,1588 Geer Hwy,Travelers Rest,SC,29690-9204,864-836-1109,...,South Carolina,4504,4,South Carolina District 04,SC-04,"William R. Timmons, IV",Lindsey Graham,Tim Scott,01/22/2025,
3,Federally Qualified Health Center (FQHC),H80CS00747,020890,BPS-H80-013789,Centro de Servicios Primarios de Salud- Santa...,32 Calle Luis Munoz Rivera,Santa Isabel,PR,00757-2609,787-839-4320,...,Puerto Rico,7298,98,Puerto Rico Resident Commissioner,PR-98,Pablo Jose Hernandez,,,01/22/2025,
4,Federally Qualified Health Center (FQHC),H80CS00402,053160,BPS-H80-029697,THUNDER BAY COMMUNITY HEALTH SERVICE - FAIRVIEW,1910 E Miller Rd,Fairview,MI,48621-8731,989-848-5644,...,Michigan,2601,1,Michigan District 01,MI-01,Jack Bergman,Debbie Stabenow,Elissa Slotkin,01/22/2025,


In [35]:
remove_unnamed_cols(health_center_service_delivery_df)
health_center_service_delivery_df.head()

Unnamed: 0,Health Center Type,Health Center Number,BHCMIS Organization Identification Number,BPHC Assigned Number,Site Name,Site Address,Site City,Site State Abbreviation,Site Postal Code,Site Telephone Number,...,State FIPS Code,State Name,State FIPS and Congressional District Number Code,Congressional District Number,Congressional District Name,Congressional District Code,U.S. Congressional Representative Name,Name of U.S. Senator Number One,Name of U.S. Senator Number Two,Data Warehouse Record Create Date
0,Federally Qualified Health Center (FQHC),H80CS00770,052030,BPS-H80-015059,Cedar Springs Campus,204 E Muskegon St,Cedar Springs,MI,49319-9326,616-696-7330,...,26,Michigan,2602,2,Michigan District 02,MI-02,John R. Moolenaar,Debbie Stabenow,Elissa Slotkin,01/22/2025
1,Federally Qualified Health Center (FQHC),H80CS24147,03E00494,BPS-H80-021927,"Community Health & Dental Care, Inc.",351 W Schuylkill Rd,Pottstown,PA,19465-7438,610-326-9460 x222,...,42,Pennsylvania,4206,6,Pennsylvania District 06,PA-06,Chrissy Houlahan,"Robert P. Casey, Jr.",David McCormick,01/22/2025
2,Federally Qualified Health Center (FQHC),H80CS00578,042610,BPS-H80-008140,TRAVELERS REST,1588 Geer Hwy,Travelers Rest,SC,29690-9204,864-836-1109,...,45,South Carolina,4504,4,South Carolina District 04,SC-04,"William R. Timmons, IV",Lindsey Graham,Tim Scott,01/22/2025
3,Federally Qualified Health Center (FQHC),H80CS00747,020890,BPS-H80-013789,Centro de Servicios Primarios de Salud- Santa...,32 Calle Luis Munoz Rivera,Santa Isabel,PR,00757-2609,787-839-4320,...,72,Puerto Rico,7298,98,Puerto Rico Resident Commissioner,PR-98,Pablo Jose Hernandez,,,01/22/2025
4,Federally Qualified Health Center (FQHC),H80CS00402,053160,BPS-H80-029697,THUNDER BAY COMMUNITY HEALTH SERVICE - FAIRVIEW,1910 E Miller Rd,Fairview,MI,48621-8731,989-848-5644,...,26,Michigan,2601,1,Michigan District 01,MI-01,Jack Bergman,Debbie Stabenow,Elissa Slotkin,01/22/2025


### Environmental

#### Air Quality

In [36]:
air_quality_df.head()

Unnamed: 0,"Air Quality Statistics by County, 2023",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,Note: The values shown are the highest among t...,,,,,,,,,,,,
1,State,County,County FIPS Code,2010 Population,CO 8-hr (ppm),Pb 3-mo (µg/m3),NO2 AM (ppb),NO2 1-hr (ppb),O3 8-hr (ppm),PM10 24-hr (µg/m3),PM2.5 Wtd AM (µg/m3),PM2.5 24-hr (µg/m3),SO2 1-hr (ppb)
2,Alabama,Baldwin County,01003,182265,ND,ND,ND,ND,0.065,ND,7.6,18,ND
3,Alabama,Clay County,01027,13932,ND,ND,ND,ND,ND,ND,IN,IN,ND
4,Alabama,Colbert County,01033,54428,ND,ND,ND,ND,ND,ND,IN,IN,ND


In [37]:
air_quality_df.columns = air_quality_df.iloc[1]
air_quality_df = air_quality_df[2:].reset_index(drop=True)
air_quality_df.head()

1,State,County,County FIPS Code,2010 Population,CO 8-hr (ppm),Pb 3-mo (µg/m3),NO2 AM (ppb),NO2 1-hr (ppb),O3 8-hr (ppm),PM10 24-hr (µg/m3),PM2.5 Wtd AM (µg/m3),PM2.5 24-hr (µg/m3),SO2 1-hr (ppb)
0,Alabama,Baldwin County,1003,182265,ND,ND,ND,ND,0.065,ND,7.6,18,ND
1,Alabama,Clay County,1027,13932,ND,ND,ND,ND,ND,ND,IN,IN,ND
2,Alabama,Colbert County,1033,54428,ND,ND,ND,ND,ND,ND,IN,IN,ND
3,Alabama,DeKalb County,1049,71109,ND,ND,ND,ND,0.066,ND,8.9,21,ND
4,Alabama,Elmore County,1051,79303,ND,ND,ND,ND,0.061,ND,ND,ND,ND


#### Drinking Water

In [38]:
drinking_water_df.head()

Unnamed: 0,State FIPS Code,County FIPS Code,5-digit FIPS Code,State Abbreviation,Name,Release Year,County Clustered (Yes=1/No=0),Drinking Water Violations raw value,Drinking Water Violations numerator,Drinking Water Violations denominator,Drinking Water Violations CI low,Drinking Water Violations CI high
0,statecode,countycode,fipscode,state,county,year,county_clustered,v124_rawvalue,v124_numerator,v124_denominator,v124_cilow,v124_cihigh
1,00,000,00000,US,United States,2024,,,,,,
2,01,000,01000,AL,Alabama,2024,,0.223880597,,,,
3,01,001,01001,AL,Autauga County,2024,1,0,,,,
4,01,003,01003,AL,Baldwin County,2024,1,1,,,,


In [39]:
drinking_water_df = drinking_water_df[1:]

drinking_water_df.head()

Unnamed: 0,State FIPS Code,County FIPS Code,5-digit FIPS Code,State Abbreviation,Name,Release Year,County Clustered (Yes=1/No=0),Drinking Water Violations raw value,Drinking Water Violations numerator,Drinking Water Violations denominator,Drinking Water Violations CI low,Drinking Water Violations CI high
1,0,0,0,US,United States,2024,,,,,,
2,1,0,1000,AL,Alabama,2024,,0.223880597,,,,
3,1,1,1001,AL,Autauga County,2024,1.0,0.0,,,,
4,1,3,1003,AL,Baldwin County,2024,1.0,1.0,,,,
5,1,5,1005,AL,Barbour County,2024,1.0,0.0,,,,


#### Natural Disaster

In [40]:
natural_disaster_df.head()

Unnamed: 0,OID_,NRI_ID,STATE,STATEABBRV,STATEFIPS,COUNTY,COUNTYTYPE,COUNTYFIPS,STCOFIPS,POPULATION,...,WNTW_EALS,WNTW_EALR,WNTW_ALRB,WNTW_ALRP,WNTW_ALRA,WNTW_ALR_NPCTL,WNTW_RISKV,WNTW_RISKS,WNTW_RISKR,NRI_VER
0,1,C01001,Alabama,AL,1,Autauga,County,1,1001,58764,...,15.784587,Very Low,2.687716e-07,7.410082e-09,8.725777e-06,10.461158,8494.906508,12.217626,Very Low,March 2023
1,2,C01003,Alabama,AL,1,Baldwin,County,3,1003,231365,...,56.205509,Relatively Moderate,1.268231e-09,2.28712e-08,1.54836e-07,13.339523,65619.701638,52.083996,Relatively Low,March 2023
2,3,C01005,Alabama,AL,1,Barbour,County,5,1005,25160,...,18.632002,Relatively Low,5.78805e-07,2.347236e-08,7.606598e-07,16.125039,15501.730335,19.535476,Very Low,March 2023
3,4,C01007,Alabama,AL,1,Bibb,County,7,1007,22239,...,13.308573,Very Low,9.014679e-07,1.2703e-08,1.202015e-05,16.991643,7496.18694,11.104041,Very Low,March 2023
4,5,C01009,Alabama,AL,1,Blount,County,9,1009,58992,...,23.64593,Relatively Low,5.268425e-07,1.482016e-08,2.002965e-07,12.039616,17175.160729,21.44448,Very Low,March 2023


#### Food Insecurity

In [41]:
food_insecurity_df.head()

Unnamed: 0,State FIPS Code,County FIPS Code,5-digit FIPS Code,State Abbreviation,Name,Release Year,County Clustered (Yes=1/No=0),Food Environment Index raw value,Food Environment Index numerator,Food Environment Index denominator,...,Food Insecurity raw value,Food Insecurity numerator,Food Insecurity denominator,Food Insecurity CI low,Food Insecurity CI high,Limited Access to Healthy Foods raw value,Limited Access to Healthy Foods numerator,Limited Access to Healthy Foods denominator,Limited Access to Healthy Foods CI low,Limited Access to Healthy Foods CI high
0,statecode,countycode,fipscode,state,county,year,county_clustered,v133_rawvalue,v133_numerator,v133_denominator,...,v139_rawvalue,v139_numerator,v139_denominator,v139_cilow,v139_cihigh,v083_rawvalue,v083_numerator,v083_denominator,v083_cilow,v083_cihigh
1,00,000,00000,US,United States,2024,,7.7,0.0610019647,0.104,...,0.104,33844000,,,,0.0610019647,18834084.4,308745538,,
2,01,000,01000,AL,Alabama,2024,,5.4,0.0876054853,0.148,...,0.148,746550,,,,0.0876054853,418731.09187,4779736,,
3,01,001,01001,AL,Autauga County,2024,1,6.7,0.1302099797,0.133,...,0.133,7770,,,,0.1302099797,7105.6888029,54571,,
4,01,003,01003,AL,Baldwin County,2024,1,7.5,0.0793677936,0.118,...,0.118,26830,,,,0.0793677936,14465.970897,182265,,


In [42]:
food_insecurity_df = food_insecurity_df[1:]

food_insecurity_df.head()

Unnamed: 0,State FIPS Code,County FIPS Code,5-digit FIPS Code,State Abbreviation,Name,Release Year,County Clustered (Yes=1/No=0),Food Environment Index raw value,Food Environment Index numerator,Food Environment Index denominator,...,Food Insecurity raw value,Food Insecurity numerator,Food Insecurity denominator,Food Insecurity CI low,Food Insecurity CI high,Limited Access to Healthy Foods raw value,Limited Access to Healthy Foods numerator,Limited Access to Healthy Foods denominator,Limited Access to Healthy Foods CI low,Limited Access to Healthy Foods CI high
1,0,0,0,US,United States,2024,,7.7,0.0610019647,0.104,...,0.104,33844000,,,,0.0610019647,18834084.4,308745538,,
2,1,0,1000,AL,Alabama,2024,,5.4,0.0876054853,0.148,...,0.148,746550,,,,0.0876054853,418731.09187,4779736,,
3,1,1,1001,AL,Autauga County,2024,1.0,6.7,0.1302099797,0.133,...,0.133,7770,,,,0.1302099797,7105.6888029,54571,,
4,1,3,1003,AL,Baldwin County,2024,1.0,7.5,0.0793677936,0.118,...,0.118,26830,,,,0.0793677936,14465.970897,182265,,
5,1,5,1005,AL,Barbour County,2024,1.0,6.0,0.1043317167,0.178,...,0.178,4500,,,,0.1043317167,2864.6359459,27457,,


### Health Outcome Files

#### All-cause Mortality

#### Infant Mortality

#### Preventable Hospital Stays

In [43]:
preventable_hospital_stays_df.head()

Unnamed: 0,State FIPS Code,County FIPS Code,5-digit FIPS Code,State Abbreviation,Name,Release Year,County Clustered (Yes=1/No=0),Preventable Hospital Stays raw value,Preventable Hospital Stays numerator,Preventable Hospital Stays denominator,Preventable Hospital Stays CI low,Preventable Hospital Stays CI high,Preventable Hospital Stays (AIAN),Preventable Hospital Stays (Asian/Pacific Islander),Preventable Hospital Stays (Black),Preventable Hospital Stays (Hispanic),Preventable Hospital Stays (White)
0,statecode,countycode,fipscode,state,county,year,county_clustered,v005_rawvalue,v005_numerator,v005_denominator,v005_cilow,v005_cihigh,v005_race_aian,v005_race_asian,v005_race_black,v005_race_hispanic,v005_race_white
1,00,000,00000,US,United States,2024,,2681,,,,,3956,1575,4427,2659,2527
2,01,000,01000,AL,Alabama,2024,,3280,,,,,5665,1757,4310,1938,3093
3,01,001,01001,AL,Autauga County,2024,1,3915,,,,,,,6203,,3633
4,01,003,01003,AL,Baldwin County,2024,1,2799,,,,,,,5376,,2680


In [44]:
preventable_hospital_stays_df = preventable_hospital_stays_df[1:].reset_index(drop=True)

preventable_hospital_stays_df.head()

Unnamed: 0,State FIPS Code,County FIPS Code,5-digit FIPS Code,State Abbreviation,Name,Release Year,County Clustered (Yes=1/No=0),Preventable Hospital Stays raw value,Preventable Hospital Stays numerator,Preventable Hospital Stays denominator,Preventable Hospital Stays CI low,Preventable Hospital Stays CI high,Preventable Hospital Stays (AIAN),Preventable Hospital Stays (Asian/Pacific Islander),Preventable Hospital Stays (Black),Preventable Hospital Stays (Hispanic),Preventable Hospital Stays (White)
0,0,0,0,US,United States,2024,,2681,,,,,3956.0,1575.0,4427,2659.0,2527
1,1,0,1000,AL,Alabama,2024,,3280,,,,,5665.0,1757.0,4310,1938.0,3093
2,1,1,1001,AL,Autauga County,2024,1.0,3915,,,,,,,6203,,3633
3,1,3,1003,AL,Baldwin County,2024,1.0,2799,,,,,,,5376,,2680
4,1,5,1005,AL,Barbour County,2024,1.0,3040,,,,,,,4814,,2448


#### Heart Disease Mortality

In [45]:
heart_disease_mortality_df.head()

Unnamed: 0,Year,LocationAbbr,LocationDesc,GeographicLevel,DataSource,Class,Topic,Data_Value,Data_Value_Unit,Data_Value_Type,...,Data_Value_Footnote,StratificationCategory1,Stratification1,StratificationCategory2,Stratification2,TopicID,LocationID,Y_lat,X_lon,Georeference
0,2020,AK,Kenai Peninsula,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,165.1,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,,Gender,Male,Race/Ethnicity,Hispanic,T2,2122,60.193263,-150.280744,POINT (-150.2807443 60.193262972)
1,2020,AL,Walker County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,109.0,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,,Gender,Overall,Race/Ethnicity,Hispanic,T2,1127,33.810226,-87.29707,POINT (-87.29707047 33.810226394)
2,2020,AL,St. Clair County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,90.0,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,,Gender,Overall,Race/Ethnicity,Asian,T2,1115,33.716065,-86.31496,POINT (-86.31496031 33.716065391)
3,2020,AR,Yell County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,Insufficient Data,Gender,Female,Race/Ethnicity,Asian,T2,5149,35.005864,-93.401676,POINT (-93.40167591 35.00586398)
4,2020,AS,American Samoa County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,Insufficient Data,Gender,Male,Race/Ethnicity,Black,T2,60000,-14.301754,-170.719474,POINT (-170.7194738 -14.30175426)


#### Poor or Fair Health %

In [46]:
poor_or_fair_health_df.head()

Unnamed: 0,State FIPS Code,County FIPS Code,5-digit FIPS Code,State Abbreviation,Name,Release Year,County Clustered (Yes=1/No=0),Poor or Fair Health raw value,Poor or Fair Health numerator,Poor or Fair Health denominator,Poor or Fair Health CI low,Poor or Fair Health CI high
0,statecode,countycode,fipscode,state,county,year,county_clustered,v002_rawvalue,v002_numerator,v002_denominator,v002_cilow,v002_cihigh
1,00,000,00000,US,United States,2024,,0.142,,,0.137,0.157
2,01,000,01000,AL,Alabama,2024,,0.178,,,0.164,0.193
3,01,001,01001,AL,Autauga County,2024,1,0.173,,,0.146,0.204
4,01,003,01003,AL,Baldwin County,2024,1,0.152,,,0.127,0.179


In [47]:
poor_or_fair_health_df = poor_or_fair_health_df[1:].reset_index(drop=True)

poor_or_fair_health_df.head()

Unnamed: 0,State FIPS Code,County FIPS Code,5-digit FIPS Code,State Abbreviation,Name,Release Year,County Clustered (Yes=1/No=0),Poor or Fair Health raw value,Poor or Fair Health numerator,Poor or Fair Health denominator,Poor or Fair Health CI low,Poor or Fair Health CI high
0,0,0,0,US,United States,2024,,0.142,,,0.137,0.157
1,1,0,1000,AL,Alabama,2024,,0.178,,,0.164,0.193
2,1,1,1001,AL,Autauga County,2024,1.0,0.173,,,0.146,0.204
3,1,3,1003,AL,Baldwin County,2024,1.0,0.152,,,0.127,0.179
4,1,5,1005,AL,Barbour County,2024,1.0,0.273,,,0.237,0.31


## Storing Processed Dataset

In [48]:
rucc_df.to_csv(processed_data_path + 'rucc.csv', index=False)

hhs_df.to_csv(processed_data_path + 'hhs.csv', index=False)

county_health_rankings_df.to_csv(processed_data_path + 'county_health_rankings.csv', index=False)

income_df.to_csv(processed_data_path + 'income.csv', index=False)

poverty_df.to_csv(processed_data_path + 'poverty.csv', index=False)

plumbing_df.to_csv(processed_data_path + 'plumbing.csv', index=False)

kitchen_df.to_csv(processed_data_path + 'kitchen.csv', index=False)

cost_burden_df.to_csv(processed_data_path + 'cost_burden.csv', index=False)

age_df.to_csv(processed_data_path + 'age.csv', index=False)

race_df.to_csv(processed_data_path + 'race.csv', index=False)

education_df.to_csv(processed_data_path + 'education.csv', index=False)

uninsured_df.to_csv(processed_data_path + 'uninsured.csv', index=False)

internet_df.to_csv(processed_data_path + 'internet.csv', index=False)

transportation_df.to_csv(processed_data_path + 'transportation.csv', index=False)

unemployment_df.to_csv(processed_data_path + 'unemployment.csv', index=False)

vehicle_df.to_csv(processed_data_path + 'vehicle.csv', index=False)

health_center_service_delivery_df.to_csv(processed_data_path + 'health_center_service_delivery.csv', index=False)

air_quality_df.to_csv(processed_data_path + 'air_quality.csv', index=False)

drinking_water_df.to_csv(processed_data_path + 'drinking_water.csv', index=False)

natural_disaster_df.to_csv(processed_data_path + 'natural_disaster.csv', index=False)

food_insecurity_df.to_csv(processed_data_path + 'food_insecurity.csv', index=False)

preventable_hospital_stays_df.to_csv(processed_data_path + 'preventable_hospital_stays.csv', index=False)

heart_disease_mortality_df.to_csv(processed_data_path + 'heart_disease_mortality.csv', index=False)

poor_or_fair_health_df.to_csv(processed_data_path + 'poor_or_fair_health.csv', index=False)