In [None]:
import json
from copy import deepcopy

import numpy as np
import pandas as pd
import geopandas as gpd

from covidcaremap.constants import *
from covidcaremap.data import (processed_data_path, 
                               published_data_path,
                               external_data_path)

# Generate CovidCareMap facility data

The CovidCareMap (CCM) facility data describes the capacity of US healthcare facilities
in terms of beds.

## Methods 

This section describes methods that were used to construct the CovidCareMap facility
data.

### Data Sources

We use data from [Definitive Healthcare](https://coronavirus-resources.esri.com/datasets/definitivehc::definitive-healthcare-usa-hospital-beds?geometry=125.859%2C-16.820%2C-150.821%2C72.123) (DH) and [HCRIS](https://www.cms.gov/Research-Statistics-Data-and-Systems/Downloadable-Public-Use-Files/Cost-Reports/Hospital-2010-form).

The facilities between the two datasets are merged and data from both sources are used if
available. It's important to note that the merging happens by merging HCRIS data **into** 
DH data; therefore the complete DH data is represented, while unmerged HCRIS facilities are
left out of the data. In other words, the join between DH and HCRIS data is a [left outer join](https://en.wikipedia.org/wiki/Join_(SQL)#Left_outer_join), as opposed to a full join. This choice was made because DH contained more facility information than HCRIS, HCRIS is
known to have some data issues, and the DH data is maintained by a commercial company. There is another facility-level dataset, [HIFLD](https://hifld-geoplatform.opendata.arcgis.com/datasets/hospitals), which is currently not used in this project because the matching logic is not robust to more than two datasets. Integrating this and other facility-level data is future work.

For each data column, the source of its value is recorded in a companion column with the same name suffixed with ` - SOURCE`. The following codes are used to describe the data source:

- `DH-{column}`: The data is sourced from DH data, from the column `{column}`.
- `HCRIS-{column}`: The data is sourced from HCRIS data, from the column `{column}`. Note that these columns are from the processed HCRIS data generated as part of this project's workflow. See the `Process HCRIS Data` notebook for more information about how the column's data is derived.
- `None`: There was no data available for this column and so the value is based on an assumption (e.g. 0 or NaN)

### How the data is computed

#### Staffed All Beds

- Use the value from DH if present; if not, use the HCRIS data.
- If there is no total staffed bed data available, this value is NaN.
  
#### Staffed ICU Beds

- Use the value from DH if present; if not, use the HCRIS data.
- If there is no staffed ICU bed data available, this value is set to NaN.

#### Licensed All Beds

- Use the value from DH data. HCRIS does not have this value.
- There are some cases where the number of licensed beds from the DH data is lower than the number of staffed beds in the HCRIS or DH data. If this is the case, we use the number of staffed beds for this value.

#### All Bed Occupancy Rate

- Use the value from DH if present; if not, use the HCRIS data.
- If there is no data, we set this value to NaN.

#### ICU Bed Occupancy Rate

- Use the value from HCRIS if present.
- If there is no HCRIS data, we set this value to NaN.

In [None]:
# Load the merged DH/HCRIS facility data as a plain GeoJSON dict.
with open(processed_data_path('dh_hcris_merged_facility_data.geojson')) as merged_file:
    geojson = json.load(merged_file)

In [None]:
# Show the fields we'll be working with, using the second feature
# (index 1) of the loaded GeoJSON as a sample.
geojson['features'][1]

In [None]:
# Target columns: the columns written to the published CSV.
print('CovidCareMap data columns:')
print(json.dumps(CCM_CSV_COLUMNS, indent=2))

In [None]:
def get_staffed_icu_beds(props):
    """Return the staffed ICU bed count for a facility and where it came from.

    The DH value (``NUM_ICU_BE``) wins when present; otherwise the HCRIS
    value (``ICU Total Staffed Beds``) is used. When neither is available
    the count is NaN and the source is the string ``'None'``.

    Returns a ``(value, source)`` tuple.
    """
    dh_value = props['NUM_ICU_BE']
    if dh_value is not None:
        return dh_value, 'DH-NUM_ICU_BE'

    hcris_value = props['ICU Total Staffed Beds']
    if hcris_value is not None:
        return hcris_value, 'HCRIS-ICU Total Staffed Beds'

    # Neither dataset has a value.
    return np.nan, 'None'

def get_total_staffed_beds(props):
    """Return the total staffed bed count for a facility and its source.

    Candidates are tried in priority order: DH (``NUM_STAFFE``) first, then
    HCRIS (``Total Staffed Beds``). The first non-None value wins. If both
    are None the count is NaN and the source is the string ``'None'``.

    Returns a ``(value, source)`` tuple.
    """
    candidates = (
        ('NUM_STAFFE', 'DH-NUM_STAFFE'),
        ('Total Staffed Beds', 'HCRIS-Total Staffed Beds'),
    )

    for key, label in candidates:
        beds = props[key]
        if beds is not None:
            return beds, label

    return np.nan, 'None'

def get_licensed_beds(props, check_against_staffed=True):
    """Return the licensed bed count for a facility and its source.

    The count comes from the DH column ``NUM_LICENS``. When
    ``check_against_staffed`` is truthy and the licensed count is lower than
    the total staffed bed count, the staffed count and its source are
    returned instead.

    Returns a ``(value, source)`` tuple.

    Raises:
        Exception: if ``NUM_LICENS`` is None. The facility's properties are
            printed first to help locate the record.
    """
    licensed = props['NUM_LICENS']

    if licensed is None:
        print(json.dumps(props, indent=4))
        raise Exception('No licensed bed data found for the printed facility')

    if not check_against_staffed:
        return licensed, 'DH-NUM_LICENS'

    # In some cases the licensed beds are less than staffed beds.
    # If this happens, use the larger number.
    staffed, staffed_source = get_total_staffed_beds(props)
    if licensed < staffed:
        return staffed, staffed_source

    return licensed, 'DH-NUM_LICENS'
    
def get_bed_occupancy_rate(props):
    """Return the all-bed occupancy rate for a facility and its source.

    Starts from the DH value (``BED_UTILIZ``), falls back to the HCRIS value
    (``Total Bed Occupancy Rate``) and finally to NaN with the source string
    ``'None'`` when no occupancy data exists.

    Returns a ``(value, source)`` tuple.
    """
    rate, origin = props['BED_UTILIZ'], 'DH-BED_UTILIZ'

    if rate is None:
        rate, origin = props['Total Bed Occupancy Rate'], 'HCRIS-Total Bed Occupancy Rate'

    if rate is None:
        # No occupancy data
        rate, origin = np.nan, 'None'

    return rate, origin

def get_icu_bed_occupancy_rate(props):
    """Return the ICU bed occupancy rate for a facility and its source.

    Only HCRIS (``ICU Occupancy Rate``) is consulted. When that value is
    None the rate is NaN and the source is the string ``'None'``.

    Returns a ``(value, source)`` tuple.
    """
    rate = props['ICU Occupancy Rate']

    if rate is None:
        # No occupancy data
        return np.nan, 'None'

    return rate, 'HCRIS-ICU Occupancy Rate'


In [None]:
# Some debugging methods.
# DEBUG gates the output of pdebug() and the per-facility inconsistency
# check; set it to True to enable them.
DEBUG = False

def notify_of_inconsistencies(props):
    """Print a message if a facility has inconsistent staffed bed data.

    Compares the HCRIS value (``Total Staffed Beds``) against the DH value
    (``NUM_STAFFE``) when both are present, and reports the facility when
    they differ by more than 10 (an arbitrary threshold).
    """
    hcris_staffed_beds = props['Total Staffed Beds']
    if hcris_staffed_beds is None:
        return

    dh_staffed_beds = props['NUM_STAFFE']
    if dh_staffed_beds is None:
        return

    gap = abs(hcris_staffed_beds - dh_staffed_beds)
    if gap > 10:
        message = ('Facility {} ({}) has a disagreement about '
                   'staffed bed numbers! HCRIS: {} vs DH: {}')
        print(message.format(props['HOSPITAL_N'], props['OBJECTID'],
                             hcris_staffed_beds, dh_staffed_beds))

def pdebug(msg):
    """Print ``msg`` only when the module-level DEBUG flag is truthy."""
    if not DEBUG:
        return
    print(msg)

def printprops(props):
    """Pretty-print a properties dict as JSON with a 4-space indent."""
    pretty = json.dumps(props, indent=4)
    print(pretty)

In [None]:
# Properties to carry over unchanged, mapped by
# new_property_name -> source_property_name.
# The source names are looked up in each feature's properties from the
# loaded GeoJSON.
properties_to_directly_map = {
    'Name': 'HOSPITAL_N',
    'Hospital Type': 'HOSPITAL_T',
    'Address': 'HQ_ADDRESS',
    'Address_2': 'HQ_ADDRE_1',
    'City': 'HQ_CITY',
    'State': 'HQ_STATE',
    'Zipcode': 'HQ_ZIP_COD',
    'County': 'COUNTY_NAM',
    'DH-OBJECTID': 'OBJECTID',
    'HCRIS-Provider Number': 'Provider Number'
}

# Properties to compute, mapped by CCM column name -> getter function.
# Each getter takes a facility's source properties and returns a
# (value, source) tuple.
properties_to_compute = {
    # Count of staffed beds during 'Business As Usual' (BAU)
    CCM_STAFFED_BEDS_COLUMN: get_total_staffed_beds,

    # Count of staffed ICU beds during BAU
    CCM_STAFFED_ICU_BEDS_COLUMN: get_staffed_icu_beds,

    # Count of beds the facility is licensed to operate
    CCM_LICENSED_BEDS_COLUMN: get_licensed_beds,
    
    # Average ratio of beds occupied during BAU
    CCM_BED_OCCUPANCY_COLUMN: get_bed_occupancy_rate,
    
    # Average ratio of ICU beds occupied during BAU
    CCM_ICU_BED_OCCUPANCY_COLUMN: get_icu_bed_occupancy_rate
}

In [None]:
# Build the CCM feature collection on a copy so the loaded GeoJSON is left
# untouched.
new_gj = deepcopy(geojson)

for feature in new_gj['features']:
    props = feature['properties']

    if DEBUG:
        notify_of_inconsistencies(props)

    new_props = {}

    # Computed columns: every getter yields the value and a label naming
    # the dataset/column it was taken from.
    for prop_column, prop_getter in properties_to_compute.items():
        value, source = prop_getter(props)
        new_props.update({
            prop_column: value,
            source_column(prop_column): source,
        })

    # Columns copied over as-is under their new names.
    new_props.update(
        (new_name, props[old_name])
        for new_name, old_name in properties_to_directly_map.items()
    )

    # Create a facility ID. For now, use DH_ID
    # since that's the base facility data we are using.
    # In the future this may change as we add new facility data sources.
    new_props[CCM_ID] = new_props[DH_ID]

    feature['properties'] = new_props

print(json.dumps(new_gj['features'][1], indent=4))

### Manual Override

There are instances where the data is off and can be corrected manually. We place facility information into the same format that is constructed here, and include information about why we are overriding the data and the new source of information. 

In [None]:
# Load the manual overrides. Each CSV row describes one facility and carries
# Latitude/Longitude plus two columns explaining the override.
manual_override_data = pd.read_csv(external_data_path('covidcaremap-ushcsc-facility-manual-override.csv'))
# 'records' is the documented orient; the abbreviation 'record' was deprecated
# and is rejected by pandas >= 2.0.
override_dict = manual_override_data.to_dict(orient='records')
override_dict_by_ccm_id = {r['CCM_ID']: r for r in override_dict}

# Pass 1: patch facilities that already exist in the feature collection.
for feat in new_gj['features']:
    props = feat['properties']
    override = override_dict_by_ccm_id.get(props['CCM_ID'])
    if override is None:
        continue

    print('Overriding facility {} for reason "{}" with new data source {}'.format(
        props['CCM_ID'],
        override['Manual Override Reason'],
        override['Manual Override New Data Source']
    ))

    for prop, value in override.items():
        # Coordinates live in the geometry, not in the properties.
        if prop in ('Latitude', 'Longitude'):
            continue
        # Only overwrite properties the facility already has; this skips the
        # override bookkeeping columns.
        if prop in props:
            props[prop] = value

    # Move the point only when the override supplies both coordinates, so a
    # blank cell (read as NaN) cannot wipe out the existing geometry.
    lat, lng = override.get('Latitude'), override.get('Longitude')
    if not (pd.isna(lat) or pd.isna(lng)):
        feat['geometry']['coordinates'] = [lng, lat]

# Pass 2: append override rows whose CCM_ID is not in the data yet.
existing_ids = {f['properties']['CCM_ID'] for f in new_gj['features']}
override_ids = set(override_dict_by_ccm_id)
new_ids = override_ids - existing_ids

# Override columns that must not end up in a new feature's properties.
non_property_columns = {
    'Latitude',
    'Longitude',
    'Manual Override Reason',
    'Manual Override New Data Source',
}

# Iterate the dict (not the set) so new facilities are appended in CSV order.
for ccm_id, facility in override_dict_by_ccm_id.items():
    if ccm_id not in new_ids:
        continue

    print('Adding new facility {} for reason "{}" with new data source {}'.format(
        facility['CCM_ID'],
        facility['Manual Override Reason'],
        facility['Manual Override New Data Source']
    ))

    new_feature = {
        'type': 'Feature',
        'geometry': {
            'type': 'Point',
            'coordinates': [facility['Longitude'], facility['Latitude']]
        },
        'properties': {
            prop: value
            for prop, value in facility.items()
            if prop not in non_property_columns
        }
    }

    new_gj['features'].append(new_feature)
    print(json.dumps(new_feature, indent=4))
        

### Write out files

In [None]:
# Both published files share one base name and differ only in extension.
facility_fname = 'us_healthcare_capacity-facility-CovidCareMap'
geojson_path = published_data_path(facility_fname + '.geojson')
csv_path = published_data_path(facility_fname + '.csv')

In [None]:
# Write out GeoJSON
with open(geojson_path, 'w') as out_file:
    json.dump(new_gj, out_file, indent=2)

In [None]:
# Write out CSV
def get_lon(row):
    """Return the x coordinate of the row's geometry (used as Longitude)."""
    point = row['geometry']
    return point.x

def get_lat(row):
    """Return the y coordinate of the row's geometry (used as Latitude)."""
    point = row['geometry']
    return point.y

# Read the published GeoJSON back and flatten the point geometry into
# explicit Latitude/Longitude columns before selecting the CSV columns.
final_gdf = gpd.read_file(geojson_path)
for coord_column, coord_getter in (('Latitude', get_lat), ('Longitude', get_lon)):
    final_gdf[coord_column] = final_gdf.apply(coord_getter, axis=1)

final_df = final_gdf[CCM_CSV_COLUMNS]
final_df.to_csv(csv_path, index=False)