# Process HGHI data

This notebook reads in the HGHI datasets and converts them to GeoJSON format.

Data from https://globalepidemics.org/2020/03/17/caring-for-covid-19-patients/.

See description in `data/README.md` for details.


In [1]:
import pandas as pd
import geopandas as gpd

from covidcaremap.constants import state_name_to_abbreviation
from covidcaremap.data import external_data_path, processed_data_path

In [2]:
# Load the raw HGHI CSV exports: the state-level capacity table plus the HRR
# scorecards for the 60%, 40% and 20% population scenarios.
hghi_state_df = pd.read_csv(
    external_data_path('HGHI - Hospital Capacity by State.csv')
)
hghi_hrr_60_df = pd.read_csv(
    external_data_path('HGHI - HRR Scorecard - 60% Population.csv')
)
hghi_hrr_40_df = pd.read_csv(
    external_data_path('HGHI - HRR Scorecard - 40% Population.csv')
)
hghi_hrr_20_df = pd.read_csv(
    external_data_path('HGHI - HRR Scorecard - 20% Population.csv')
)

# Make sure numeric values are numbers.
#
# The three lists below name the columns that are coerced to a numeric dtype
# after loading. A column is only converted when it is present in a given
# frame, so the lists may name columns that exist in only some of the files.

# Whole-number counts (beds, population, projected individuals).
# NOTE(review): 'Proejcted Hospitalized Individuals' (sic) is listed next to
# the correctly spelled name; presumably one of the input files carries the
# misspelled header -- confirm against the CSVs before removing it.
int_columns = [
    'Total Hospital Beds',
    'Total ICU Beds',
    'Available Hospital Beds',
    'Hospital Beds Needed, Six Months',
    'Hospital Beds Needed, Twelve Months',
    'Hospital Beds Needed, Eighteen Months',
    'ICU Beds Needed, Six Months',
    'ICU Beds Needed, Twelve Months',
    'ICU Beds Needed, Eighteen Months',
    'Potentially Available Hospital Beds*',
    'Available ICU Beds',
    'Potentially Available ICU Beds*',
    'Adult Population',
    'Population 65+',
    'Projected Infected Individuals',
    'Proejcted Hospitalized Individuals',
    'Projected Hospitalized Individuals',
    'Projected Individuals Needing ICU Care',
]

# Occupancy rates, cast to float.
float_columns = [
    'Hospital Bed Occupancy Rate',
    'ICU Bed Occupancy Rate',
]

# Percentage columns, converted from percent to fraction (divided by 100).
percent_columns = [
    # Hospital beds
    'Percentage of Available Beds Needed, Six Months',
    'Percentage of Potentially Available Beds Needed, Six Months',
    'Percentage of Total Beds Needed, Six Months',
    'Percentage of Available Beds Needed, Twelve Months',
    'Percentage of Potentially Available Beds Needed, Twelve Months',
    'Percentage of Total Beds Needed, Twelve Months',
    'Percentage of Available Beds Needed, Eighteen Months',
    'Percentage of Potentially Available Beds Needed, Eighteen Months',
    'Percentage of Total Beds Needed, Eighteen Months',
    # ICU beds
    'Percentage of Available ICU Beds Needed, Six Months',
    'Percentage of Potentially Available ICU Beds Needed, Six Months',
    'Percentage of Total ICU Beds Needed, Six Months',
    'Percentage of Available ICU Beds Needed, Twelve Months',
    'Percentage of Potentially Available ICU Beds Needed, Twelve Months',
    'Percentage of Total ICU Beds Needed, Twelve Months',
    'Percentage of Available ICU Beds Needed, Eighteen Months',
    'Percentage of Potentially Available ICU Beds Needed, Eighteen Months',
    'Percentage of Total ICU Beds Needed, Eighteen Months',
]

# Coerce the string-encoded numeric columns of every HGHI frame, in place.
# Each column is only touched when it is present in the frame at hand.
for df in [hghi_state_df, hghi_hrr_60_df, hghi_hrr_40_df, hghi_hrr_20_df]:
    for c in int_columns:
        # Only object (string) columns need cleaning: strip the thousands
        # separators before casting. Columns already parsed as numbers are
        # left untouched.
        if c in df.columns and df[c].dtype == object:
            df[c] = df[c].str.replace(',', '', regex=False).astype('int')

    for c in float_columns:
        if c in df.columns:
            df[c] = df[c].astype('float')

    for c in percent_columns:
        if c not in df.columns:
            continue
        percent_values = df[c]
        # Guard the .str accessor the same way the integer branch does: it
        # raises AttributeError on a column pandas already parsed as numeric.
        # Thousands separators (e.g. '1,313%') are stripped along with the
        # percent sign so large percentages still parse.
        if percent_values.dtype == object:
            percent_values = (
                percent_values
                .str.replace('%', '', regex=False)
                .str.replace(',', '', regex=False)
            )
        # Values are percentages; store them as fractions.
        # NOTE(review): a column that arrives already numeric is assumed to
        # hold percent values as well -- confirm if such a file ever shows up.
        df[c] = percent_values.astype('float') / 100.0

In [4]:
# Region geometries that the HGHI tables are joined onto further down:
# one file for US states, one for HRRs.
state_gdf = gpd.read_file(
    external_data_path('us_states.geojson'),
    encoding='utf-8',
)
hrr_gdf = gpd.read_file(
    external_data_path('us_hrr.geojson'),
    encoding='utf-8',
)

## Generate HRR GeoJSON

### Merge HGHI scenarios

Merge the 20%, 40% and 60% scenario DataFrames into a single dataframe that does not duplicate common properties.

In [5]:
# Columns whose values are taken separately from each of the 20% / 40% / 60%
# scenario frames. Each one is carried into the merged frame once per
# scenario, tagged with a scenario suffix.
non_common_properties = [
    # Projected case counts
    'Projected Infected Individuals',
    'Projected Hospitalized Individuals',
    'Projected Individuals Needing ICU Care',
    # Hospital beds, six months
    'Hospital Beds Needed, Six Months',
    'Percentage of Available Beds Needed, Six Months',
    'Percentage of Potentially Available Beds Needed, Six Months',
    'Percentage of Total Beds Needed, Six Months',
    # Hospital beds, twelve months
    'Hospital Beds Needed, Twelve Months',
    'Percentage of Available Beds Needed, Twelve Months',
    'Percentage of Potentially Available Beds Needed, Twelve Months',
    'Percentage of Total Beds Needed, Twelve Months',
    # Hospital beds, eighteen months
    'Hospital Beds Needed, Eighteen Months',
    'Percentage of Available Beds Needed, Eighteen Months',
    'Percentage of Potentially Available Beds Needed, Eighteen Months',
    'Percentage of Total Beds Needed, Eighteen Months',
    # ICU beds, six months
    'ICU Beds Needed, Six Months',
    'Percentage of Available ICU Beds Needed, Six Months',
    'Percentage of Potentially Available ICU Beds Needed, Six Months',
    'Percentage of Total ICU Beds Needed, Six Months',
    # ICU beds, twelve months
    'ICU Beds Needed, Twelve Months',
    'Percentage of Available ICU Beds Needed, Twelve Months',
    'Percentage of Potentially Available ICU Beds Needed, Twelve Months',
    'Percentage of Total ICU Beds Needed, Twelve Months',
    # ICU beds, eighteen months
    'ICU Beds Needed, Eighteen Months',
    'Percentage of Available ICU Beds Needed, Eighteen Months',
    'Percentage of Potentially Available ICU Beds Needed, Eighteen Months',
    'Percentage of Total ICU Beds Needed, Eighteen Months',
]

# Per-scenario slices: the HRR key plus the scenario-specific columns, with
# every column name (the 'HRR' key included) tagged by its scenario suffix.
scenario_columns = ['HRR'] + non_common_properties
hghi_hrr_60_subset_df = hghi_hrr_60_df[scenario_columns].add_suffix(' (60%)')
hghi_hrr_40_subset_df = hghi_hrr_40_df[scenario_columns].add_suffix(' (40%)')
hghi_hrr_20_subset_df = hghi_hrr_20_df[scenario_columns].add_suffix(' (20%)')

# Properties kept once, without a scenario suffix; they are read from the
# 60% frame only.
common_properties = [
    'HRR',
    'Total Hospital Beds',
    'Total ICU Beds',
    'Available Hospital Beds',
    'Potentially Available Hospital Beds*',
    'Available ICU Beds',
    'Potentially Available ICU Beds*',
    'Adult Population',
    'Population 65+',
]

hghi_hrr_common_subset_df = hghi_hrr_60_df[common_properties]

In [6]:
# Index every scenario slice by its (suffixed) HRR key so that the frames
# line up on HRR name when joined onto the common properties.
scenario_frames_by_hrr = [
    hghi_hrr_60_subset_df.set_index('HRR (60%)'),
    hghi_hrr_40_subset_df.set_index('HRR (40%)'),
    hghi_hrr_20_subset_df.set_index('HRR (20%)'),
]

hghi_hrr_merged_df = (
    hghi_hrr_common_subset_df
    .set_index('HRR')
    .join(scenario_frames_by_hrr)
    .reset_index()
)
hghi_hrr_merged_df

Unnamed: 0,HRR,Total Hospital Beds,Total ICU Beds,Available Hospital Beds,Potentially Available Hospital Beds*,Available ICU Beds,Potentially Available ICU Beds*,Adult Population,Population 65+,Projected Infected Individuals (60%),...,"Percentage of Potentially Available ICU Beds Needed, Six Months (20%)","Percentage of Total ICU Beds Needed, Six Months (20%)","ICU Beds Needed, Twelve Months (20%)","Percentage of Available ICU Beds Needed, Twelve Months (20%)","Percentage of Potentially Available ICU Beds Needed, Twelve Months (20%)","Percentage of Total ICU Beds Needed, Twelve Months (20%)","ICU Beds Needed, Eighteen Months (20%)","Percentage of Available ICU Beds Needed, Eighteen Months (20%)","Percentage of Potentially Available ICU Beds Needed, Eighteen Months (20%)","Percentage of Total ICU Beds Needed, Eighteen Months (20%)"
0,"Abilene, TX",980,127,565,772,68,98,226444,50412,135866,...,1.42,1.09,70,1.03,0.71,0.55,45,0.66,0.46,0.35
1,"Akron, OH",1358,186,518,938,94,140,547990,111042,328794,...,2.36,1.78,165,1.76,1.18,0.89,108,1.15,0.77,0.58
2,"Alameda County, CA",2695,293,665,1680,139,216,1310189,214991,786113,...,3.52,2.59,380,2.73,1.76,1.30,248,1.78,1.15,0.85
3,"Albany, GA",704,60,221,462,27,43,157143,30466,94286,...,2.19,1.57,47,1.74,1.09,0.78,31,1.15,0.72,0.52
4,"Albany, NY",4804,425,1579,3191,193,309,1477723,318695,886634,...,2.92,2.12,452,2.34,1.46,1.06,294,1.52,0.95,0.69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,"Winston-Salem, NC",2870,499,843,1857,40,269,899862,197294,539917,...,2.05,1.11,276,6.90,1.03,0.55,180,4.50,0.67,0.36
301,"Worcester, MA",1525,169,425,975,43,106,652386,122123,391432,...,3.65,2.29,194,4.51,1.83,1.15,126,2.93,1.19,0.75
302,"Yakima, WA",369,58,200,285,33,45,220972,40707,132583,...,2.91,2.26,65,1.97,1.44,1.12,43,1.30,0.96,0.74
303,"York, PA",868,79,267,567,8,43,343334,73773,206000,...,4.88,2.66,105,13.13,2.44,1.33,68,8.50,1.58,0.86


### Reformat HRR name in HGHI data

HRRs are defined as `city, state` in the HGHI data, and `state- city` in the HRR geojson.

Also, there are some differing names beyond formatting. Account for this with an explicit mapping.

**NOTE:** HGHI data is missing 'FL- ST PETERSBURG' HRR that is contained in the HRR region dataset.

In [7]:
# Compare how many HRRs each side of the upcoming join contains.
hghi_hrr_count = len(hghi_hrr_merged_df['HRR'])
region_hrr_count = len(hrr_gdf['HRRCITY'])
print('HGHI data contains {} HRRs, region data contains {}'.format(
    hghi_hrr_count, region_hrr_count))

HGHI data contains 305 HRRs, region data contains 306


In [8]:
# Explicit overrides for HRR names that still differ after reformatting.
# Keys are names in the 'STATE- CITY' form produced by reformat_hrr_name from
# the HGHI data; values are the spellings used in the region dataset's HRRCITY
# column, which the merged frame is joined on.
#
# HGHI missing 'FL- ST PETERSBURG'
# NOTE(review): the 'FL- ST. PETERSBURG' entry below only takes effect if that
# HRR appears in the HGHI data; given the note above it may be unused --
# confirm.
hghi_to_hrr_mapping = {
    'AK- ANCHORAGE': 'AK-Anchorage',
    'CA- ALAMEDA COUNTY': 'CA- ALAMEDA CO.',
    'CA- CONTRA COSTA COUNTY': 'CA- CONTRA COSTA CO.',
    'CA- ORANGE COUNTY': 'CA- ORANGE CO.',
    'CA- PALM SPRINGS/RANCHO MIRA': 'CA- PALM SPR/RANCHO MIR.',
    'CA- SAN MATEO COUNTY': 'CA- SAN MATEO CO.',
    'FL- ST. PETERSBURG': 'FL- ST PETERSBURG',
    'HI- HONOLULU': 'HI-Honolulu',
    'MI- ST. JOSEPH': 'MI- ST JOSEPH',
    'MN- ST. CLOUD': 'MN- ST CLOUD',
    'MN- ST. PAUL': 'MN- ST PAUL',
    'MO- ST. LOUIS': 'MO- ST LOUIS',
    'ND- FARGO/MOORHEAD MN': 'ND- FARGO MOORHEAD -MN',
    'NY- MANHATTAN': 'NY- NEW YORK'
}

In [9]:
def reformat_hrr_name(hrr):
    """Convert an HGHI HRR name into the form used by the HRR region data.

    HGHI names look like ``'city, state'``; the result looks like
    ``'STATE- CITY'`` with the city upper-cased. Names with an entry in
    ``hghi_to_hrr_mapping`` are replaced by the mapped spelling.

    Parameters
    ----------
    hrr : str
        HRR name in ``'city, state'`` form.

    Returns
    -------
    str
        The mapped name when one exists, otherwise the reformatted name.

    Raises
    ------
    ValueError
        If ``hrr`` does not contain a ``', '`` separator.
    """
    # Split on the LAST ', ' so that a city which itself contains a comma
    # keeps its full name instead of breaking the two-way unpacking.
    city, separator, state = hrr.rpartition(', ')
    if not separator:
        raise ValueError(
            "Expected an HRR name of the form 'city, state', got {!r}".format(hrr))

    name = '{}- {}'.format(state, city.upper())
    # Fall back to the reformatted name when no explicit override exists.
    return hghi_to_hrr_mapping.get(name, name)

# Add the HRRCITY column by reformatting each HGHI HRR name; it is the key
# used to join against the region dataset.
hghi_hrr_names = hghi_hrr_merged_df['HRR']
hghi_hrr_merged_df['HRRCITY'] = hghi_hrr_names.apply(reformat_hrr_name)

In [10]:
# Join the region dataset's columns onto the HGHI rows by HRRCITY (the HGHI
# frame is the left side of the join), then wrap the result as a
# GeoDataFrame with CRS 4326.
hrr_regions_by_city = hrr_gdf.set_index('HRRCITY')
hghi_hrr_joined_df = (
    hghi_hrr_merged_df
    .set_index('HRRCITY')
    .join(hrr_regions_by_city)
    .reset_index()
)
hghi_hrr_gdf = gpd.GeoDataFrame(hghi_hrr_joined_df, crs=4326)

In [12]:
# Write the HRR-level result to the processed data directory as GeoJSON.
hghi_hrr_gdf.to_file(
    processed_data_path('hghi_hrr_data.geojson'),
    driver='GeoJSON',
    encoding='utf-8',
)

## Generate State GeoJSON

Note: the state-level data does not differentiate between scenarios. Judging from `Projected Infected Individuals` / `Adult Population`, it appears to be the 60% scenario.

In [13]:
# Invert the name -> abbreviation table so the 'State' column can be
# translated into the name used as the join key below.
abbrv_to_name = {
    abbreviation: name
    for name, abbreviation in state_name_to_abbreviation.items()
}

# Direct indexing (rather than .get) keeps the KeyError on an unknown
# abbreviation.
hghi_state_df['State Name'] = hghi_state_df['State'].apply(
    lambda abbreviation: abbrv_to_name[abbreviation]
)

In [14]:
# Region rows indexed by NAME, with the region file's STATE column dropped
# before joining.
state_regions_by_name = state_gdf.drop(columns=['STATE']).set_index('NAME')

# Join the region columns onto the HGHI state rows by state name, then wrap
# the result as a GeoDataFrame with CRS 4326.
hghi_state_joined_df = (
    hghi_state_df
    .set_index('State Name')
    .join(state_regions_by_name)
    .reset_index()
)
hghi_state_gdf = gpd.GeoDataFrame(hghi_state_joined_df, crs=4326)

In [15]:
# Write the state-level result to the processed data directory as GeoJSON.
hghi_state_gdf.to_file(
    processed_data_path('hghi_state_data.geojson'),
    driver='GeoJSON',
    encoding='utf-8',
)