# Merge facility information

Merge facility data from the HCRIS (Healthcare Cost Reporting Information System) and DH (Definitive Healthcare) datasets.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import geopandas as gpd
import numpy as np
from os.path import join, isdir

from covidcaremap.geo import spatial_join_facilities
from covidcaremap.data import processed_data_path

from covidcaremap.mapping import HospMap
from covidcaremap.merge import Matcher

In [3]:
# Load the two geocoded facility layers from the processed-data directory.
# NOTE(review): the DH file name ends in '0326202', which looks like a truncated
# date stamp -- confirm it matches the file on disk.
hcris_path = processed_data_path('usa_facilities_hcris_geocoded.geojson')
dh_path = processed_data_path('dh_geocoded_v1_0326202.geojson')

hcris = gpd.read_file(hcris_path, encoding='utf-8')
dh = gpd.read_file(dh_path, encoding='utf-8')

In [4]:
# Give both frames the STATE_NAME column that match_by_state filters on, and
# align the HCRIS zip-code column name with the upper-case spelling.
hcris_column_renames = {'ST_ABBR': 'STATE_NAME', 'Zip_Code': 'ZIP_CODE'}
hcris.rename(columns=hcris_column_renames, inplace=True)

# DH keeps its ST_ABBR column; STATE_NAME is added alongside it as a copy.
dh['STATE_NAME'] = dh['ST_ABBR']

In [5]:
# (frame, label, unique-id column) triples in the shape match_by_state unpacks.
hcris_info = hcris, 'HCRIS', 'Provider Number'
dh_info = dh, 'DH', 'OBJECTID'

In [47]:
# States iterated by match_by_state. The list is taken from HCRIS only, so a
# state value that occurs solely in DH is never processed.
all_states = pd.unique(hcris['STATE_NAME'])

In [48]:
def match_by_state(d1, d2, map_dir=None, str_match_method='name', states=None):
    """Match the facilities of two datasets against each other, state by state.

    Parameters
    ----------
    d1, d2 : tuple
        ``(dataframe, label, uid_column)`` triples. Each dataframe must have a
        ``STATE_NAME`` column; ``label`` is used in the result keys and on the
        maps; ``uid_column`` is handed to ``Matcher``. Facilities of ``d1``
        are the ones counted in the printed match rate.
    map_dir : str, optional
        Directory name (resolved through ``processed_data_path``) in which one
        ``<state>.html`` validation map is saved per state. When omitted, no
        directory is created and no maps are written.
    str_match_method : str
        Forwarded unchanged to ``Matcher.match_point_set``.
    states : iterable, optional
        State values to process. Defaults to the notebook-level ``all_states``.

    Returns
    -------
    state_matches : dict
        Maps each processed state to its ``Matcher`` instance.
    ds : dict
        Keys ``'<label>_matched'`` and ``'<label>_unmatched'`` for both labels
        plus ``'matching_dfs'``; each value is the concatenation over all
        states. GeoDataFrames are reprojected to ``epsg:4326``.

    Raises
    ------
    ValueError
        If there is no state to process.
    """
    d1, name1, uid1 = d1
    d2, name2, uid2 = d2

    if states is None:
        states = all_states

    # Only touch the filesystem when maps were actually requested; the
    # original resolved and created the directory even for map_dir=None.
    map_path = None
    if map_dir:
        map_path = processed_data_path(map_dir)
        os.makedirs(map_path, exist_ok=True)

    state_matches = {}
    for state in states:
        if state in state_matches:
            # Duplicate entry in `states`; already handled.
            continue
        print('Matching facilities in {}'.format(state))
        d1_s = d1[d1['STATE_NAME'] == state].reset_index().copy()
        d2_s = d2[d2['STATE_NAME'] == state].reset_index().copy()
        m = Matcher(d1_s, d2_s, uid1, uid2)
        # NOTE(review): (100, 500) and 10 are passed through as-is; their
        # meaning is defined by Matcher.match_point_set -- confirm there.
        m.match_point_set((100, 500), 10, str_match_method=str_match_method)
        if map_path is not None:
            all_map = m.map_all((name1, name2), ['match source', 'dist_apart'])
            all_map.add_layer_selector()
            all_map.save(join(map_path, '{}.html'.format(state)))
        state_matches[state] = m

    if not state_matches:
        # pd.concat would fail below with a less helpful message.
        raise ValueError('match_by_state: no states to process')

    ds = {
        f'{name1}_matched': [],
        f'{name2}_matched': [],
        f'{name1}_unmatched': [],
        f'{name2}_unmatched': [],
        'matching_dfs': []
    }

    for m in state_matches.values():
        ds[f'{name1}_matched'].append(m.d1_matched)
        ds[f'{name2}_matched'].append(m.d2_matched)
        ds[f'{name1}_unmatched'].append(m.d1_unmatched)
        ds[f'{name2}_unmatched'].append(m.d2_unmatched)
        ds['matching_dfs'].append(m.matching_key_df())

    # Collapse each per-state list into a single frame.
    for key in list(ds):
        combined = pd.concat(ds[key])
        if isinstance(combined, gpd.GeoDataFrame):
            combined = combined.to_crs('epsg:4326')
        ds[key] = combined

    print('------------')
    n_matched = len(ds['matching_dfs'])
    n_unmatched = len(ds[f'{name1}_unmatched'])
    n_total = n_matched + n_unmatched
    # Guard against an empty result so the summary never divides by zero.
    pct_matched = round((n_matched / n_total) * 100, 1) if n_total else 0.0
    print(f'{name1} to {name2} matches: {pct_matched}% ({n_matched} of {n_total})')

    return state_matches, ds

In [49]:
# Match HCRIS facilities against DH and write one validation map per state.
# NOTE(review): this directory is stamped '03-31-21' while the DH-to-HCRIS run
# uses '03-31-20' -- confirm which date is intended.
hcris_to_dh_matches, hcris_to_dh_data = match_by_state(
    hcris_info,
    dh_info,
    map_dir='state_validation_maps_03-31-21_hcris-to-dh',
)

# Persist the concatenated per-state matching key table.
HtoD_matches = hcris_to_dh_data['matching_dfs']
HtoD_key_path = processed_data_path('HCRIS_to_DH_matching_key.csv')
HtoD_matches.to_csv(HtoD_key_path, index=False)

Matching facilities in AL
Completed matching and deduping facilities, matched 109 of 136
Matching facilities in AK
Completed matching and deduping facilities, matched 23 of 26
Matching facilities in AZ
Completed matching and deduping facilities, matched 104 of 125
Matching facilities in AR
Completed matching and deduping facilities, matched 95 of 109
Matching facilities in CA
Completed [250] of 445 facilities, prelim matched 230
Completed matching and deduping facilities, matched 401 of 445
Matching facilities in CO
Completed matching and deduping facilities, matched 101 of 110
Matching facilities in CT
Completed matching and deduping facilities, matched 40 of 43
Matching facilities in DE
Completed matching and deduping facilities, matched 14 of 16
Matching facilities in DC
Completed matching and deduping facilities, matched 11 of 13
Matching facilities in FL
Completed [250] of 273 facilities, prelim matched 236
Completed matching and deduping facilities, matched 246 of 273
Matching fa

In [15]:
# Reverse direction: match DH facilities against HCRIS, again writing one
# validation map per state.
dh_to_hcris_matches, dh_to_hcris_data = match_by_state(
    dh_info,
    hcris_info,
    map_dir='state_validation_maps_03-31-20_dh-to-hcris',
)

# Persist the concatenated per-state matching key table.
DtoH_matches = dh_to_hcris_data['matching_dfs']
DtoH_key_path = processed_data_path('DH_to_HCRIS_matching_key.csv')
DtoH_matches.to_csv(DtoH_key_path, index=False)

Matching facilities in AL
Completed matching and deduping facilities, matched 109 of 120
Matching facilities in AK
Completed matching and deduping facilities, matched 23 of 28
Matching facilities in AZ
Completed matching and deduping facilities, matched 104 of 126
Matching facilities in AR
Completed matching and deduping facilities, matched 93 of 106
Matching facilities in CA
Completed [250] of 466 facilities, prelim matched 239
Completed matching and deduping facilities, matched 398 of 466
Matching facilities in CO
Completed matching and deduping facilities, matched 101 of 113
Matching facilities in CT
Completed matching and deduping facilities, matched 40 of 46
Matching facilities in DE
Completed matching and deduping facilities, matched 14 of 16
Matching facilities in DC
Completed matching and deduping facilities, matched 13 of 14
Matching facilities in FL
Completed [250] of 276 facilities, prelim matched 232
Completed matching and deduping facilities, matched 247 of 276
Matching fa

In [40]:
# Spot check: HCRIS rows whose name starts with 'CALIFORNIA P' or 'CPMC'.
hcris_names = hcris['HOSP10_Name']
hcris[hcris_names.str.startswith('CALIFORNIA P') | hcris_names.str.startswith('CPMC')]

Unnamed: 0,Provider Number,FYB,FYE,STATUS,CTRL_TYPE,HOSP10_Name,Street_Addr,PO_Box,CITY_NAME,STATE_NAME,ZIP_CODE,COUNTY_NAME,source,geometry
399,50008,01-JAN-18,31-DEC-18,As Submitted,2,CPMC-R.K. DAVIES MEDICAL CENTER,601 DUBOCE AVE,,SAN FRANCISCO,CA,94117-3389,SAN FRANCISCO,google,POINT (-122.43473 37.76859)
420,50047,01-JAN-18,31-DEC-18,As Submitted,2,CALIFORNIA PACIFIC MEDICAL CENTER,2333 BUCHANAN ST,,SAN FRANCISCO,CA,94115-1925,SAN FRANCISCO,google,POINT (-122.43175 37.79069)
422,50055,01-JAN-18,31-DEC-18,As Submitted,2,CPMC - MISSION BERNAL CAMPUS,3555 CESAR CHAVEZ STREET,,SAN FRANCISCO,CA,94110-4403,SAN FRANCISCO,google,POINT (-122.42104 37.74754)


In [36]:
# Spot check: DH rows whose name starts with 'California P'.
dh_name_matches = dh['HOSP10_Name'].str.startswith('California P')
dh[dh_name_matches]

Unnamed: 0,orig_index,OBJECTID,HOSP10_Name,HOSPITAL_T,Street_Addr,HQ_ADDRE_1,CITY_NAME,ST_ABBR,ZIP_CODE,COUNTY_NAME,...,FIPS,NUM_LICENS,NUM_STAFFE,NUM_ICU_BE,BED_UTILIZ,Potential_,source,confirmation_source,distance,geometry
2489,2498,499,California Pacific Medical Center - Mission Be...,Short Term Acute Care Hospital,3555 Cesar Chavez,,San Francisco,CA,94110,San Francisco,...,6075,149.0,120.0,10.0,0.325355,29,google,zip code,47.42497682199675,POINT (-122.42104 37.74754)
2490,2499,500,California Pacific Medical Center - Davies Campus,Short Term Acute Care Hospital,45 Castro St,,San Francisco,CA,94114,San Francisco,...,6075,185.0,137.0,8.0,0.314669,48,google,zip code,115.1081923886214,POINT (-122.43469 37.76835)
5464,5477,6478,California Pacific Medical Center - Van Ness C...,Short Term Acute Care Hospital,1101 Van Ness Ave,,San Francisco,CA,94109,San Francisco,...,6075,274.0,274.0,36.0,0.527958,0,google,zip code,0.0,POINT (-122.42231 37.78598)


#### Filter facilities with unusable data.


In [None]:
# NOTE(review): dh_gdf and hcris_gdf are not assigned in any cell visible in
# this notebook (the frames loaded earlier are named `dh` and `hcris`) --
# confirm where they are expected to come from before running this cell.

# Keep DH rows that have both a geometry and a licensed-bed value.
dh_has_geometry = dh_gdf['geometry'].notna()
dh_has_licensed_beds = dh_gdf['NUM_LICENS'].notnull()
dh_gdf = dh_gdf[dh_has_geometry & dh_has_licensed_beds]

# Keep HCRIS rows reporting a positive number of staffed beds.
hcris_gdf = hcris_gdf[hcris_gdf['Total Staffed Beds'].gt(0.0)]

Perform the matching on facilities based on a spatial join and similarity score between address and name columns.

In [None]:
# Unique-id column of each dataset.
id_columns = dict(HCRIS='Provider Number', DH='OBJECTID')

# Columns compared for the similarity score, listed per dataset in the same
# order (name first, then address).
# NOTE(review): the DH frame displayed earlier in this notebook shows
# 'HOSP10_Name' / 'Street_Addr' rather than 'HOSPITAL_N' / 'HQ_ADDRESS' (its
# column listing is truncated) -- confirm these names exist in dh_gdf.
similarity_columns = dict(
    HCRIS=['HOSP10_Name', 'Street_Addr'],
    DH=['HOSPITAL_N', 'HQ_ADDRESS'],
)

In [None]:
# Reduce each frame to geometry, its id column and its similarity columns.
hcris_keep_columns = ['geometry', id_columns['HCRIS']] + similarity_columns['HCRIS']
dh_keep_columns = ['geometry', id_columns['DH']] + similarity_columns['DH']

hcris_filtered_gdf = hcris_gdf[hcris_keep_columns]
dh_filtered_gdf = dh_gdf[dh_keep_columns]

In [None]:
# Join DH (left) to HCRIS (right).
# NOTE(review): the two similarity weights presumably pair with the two
# similarity columns, and the unit of `distance` is defined by
# spatial_join_facilities -- confirm against that function.
joined_dh_hcris = spatial_join_facilities(
    left=dh_filtered_gdf,
    right=hcris_filtered_gdf,
    lid_property=id_columns['DH'],
    rid_property=id_columns['HCRIS'],
    lsimilarity_properties=similarity_columns['DH'],
    rsimilarity_properties=similarity_columns['HCRIS'],
    similarity_weights=[0.6, 0.4],
    distance=1000,
    merge_unmatched=False,
)

#### Save off unmatched HCRIS data to be inspected 

In [None]:
# Provider numbers present in the join result (nulls excluded) versus all
# provider numbers in HCRIS; the difference identifies the unmatched rows.
matched_hcris = set(joined_dh_hcris['Provider Number'].dropna())
total_hcris = set(hcris_gdf['Provider Number'])
unmatched_provider_numbers = total_hcris - matched_hcris
unmatched_hcris = hcris_gdf[hcris_gdf['Provider Number'].isin(unmatched_provider_numbers)]

In [None]:
# Write the unmatched HCRIS rows (index included) for manual inspection.
unmatched_hcris_path = processed_data_path('hcris-unmatched-to-dh.csv')
unmatched_hcris.to_csv(unmatched_hcris_path)

#### Save merged facility data

In [None]:
# Attach the full DH attributes (inner join on OBJECTID), then the HCRIS
# attributes (left join on Provider Number). The merged result carries
# 'geometry_x' and 'geometry_y' columns; 'geometry_x' is kept as the geometry
# and 'geometry_y' is dropped.
full_df = gpd.GeoDataFrame(
    joined_dh_hcris
    .merge(dh_gdf, on='OBJECTID')
    .merge(hcris_gdf, how='left', on='Provider Number')
    .drop(columns=['geometry_y'])
    .rename(columns={'geometry_x': 'geometry'}),
    crs=4326,
)

In [None]:
# Export the merged facility layer as GeoJSON in the processed-data directory.
merged_facilities_path = processed_data_path('dh_hcris_merged_facility_data.geojson')
full_df.to_file(merged_facilities_path, encoding='utf-8', driver='GeoJSON')