### Cal-CRAI Metric Calculation for: Society & Economy / Health Shortage Metrics
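This notebook calculates 3 metrics. The two shortage area scores are sourced from the Health Resources and Services Administration (HRSA); the narcotic treatment program data is sourced from California Health and Human Services (CHHS).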
* Mental health professional shortage area score
* Primary health care professional shortage area score
* Number of licensed narcotic treatment programs per 10,000 people

In [1]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [None]:
# pull the .gpkg files for these metrics from the AWS bucket directory
# (presumably downloaded into the working directory, since the next cell reads them by bare filename — TODO confirm)
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/society_economy/social_services/ca_health_human_services/'

pull_gpkg_from_directory(bucket_name, aws_dir)

In [3]:
# local copies of the two HPSA layers
mental_gpkg = 'society_hrsa_mental_care_shortage.gpkg'
primary_gpkg = 'society_hrsa_primary_care_shortage.gpkg'

# read each layer and reproject it to EPSG:4269
mental_healthcare_shortage_data = gpd.read_file(mental_gpkg).to_crs(crs=4269)
primary_healthcare_shortage_data = gpd.read_file(primary_gpkg).to_crs(crs=4269)

# both layers are in memory now, so the local files can be deleted
for gpkg_path in (mental_gpkg, primary_gpkg):
    os.remove(gpkg_path)

Start with primary care shortage data

In [None]:
# preview the primary care shortage GeoDataFrame loaded above
primary_healthcare_shortage_data

In [None]:
# map the primary care shortage records, colored by their HpsScore value
primary_healthcare_shortage_data.plot(column="HpsScore")

Read in the California 2021 TIGER census tract data

In [None]:
# read in CA census tiger file
tract_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"

ca_tract_boundaries = gpd.read_file(tract_shp_dir)
# NOTE(review): `column_names` is only referenced by the disabled "USCB_" prefix rename on the next line
column_names = ca_tract_boundaries.columns
# new_column_names = ["USCB_"+column for column in column_names if column != "geometry"]
# expose the tract identifier as 'TRACT', the key used by the merges later in this notebook
ca_tract_boundaries = ca_tract_boundaries.rename(columns={'GEOID':"TRACT"})
# reproject to EPSG:4269, the same CRS the HPSA layers were converted to
ca_tract_boundaries = ca_tract_boundaries.to_crs(crs=4269) 
ca_tract_boundaries

Spatially join (`sjoin`) the shortage data to the census tracts and take the mean of all the point data within each tract

In [None]:
# spatially join the HPSA records to the census tracts (one row per tract/record match)
joined_primary_df = ca_tract_boundaries.sjoin(primary_healthcare_shortage_data).reset_index()
# take the mean HpsScore of the records joined to each tract.
# The tract id column is 'TRACT' (renamed from GEOID when the boundaries were loaded);
# the "USCB_" prefix rename is disabled, so there is no 'USCB_GEOID' column to group on.
avg_primary_df = joined_primary_df.groupby('TRACT')['HpsScore'].agg(['mean']).reset_index()
# right-merge onto the tract boundaries so every tract is kept (NaN where no score exists),
# then convert back to a GeoDataFrame so the result can be plotted
avg_primary_df = pd.merge(avg_primary_df, ca_tract_boundaries, on="TRACT", how='right')
avg_primary_df = gpd.GeoDataFrame(avg_primary_df, geometry="geometry")
avg_primary_df.plot(column="mean")

isolate for relevant columns

In [None]:
# keep only the tract id and its mean score, under the final metric column names
columns_keep = ['TRACT', 'mean']
primary_metric_names = {'TRACT':'census_tract', 'mean':'avg_hpsscore_primary_care_metric'}

primary_healthcare_shortage_metric = avg_primary_df[columns_keep].rename(columns=primary_metric_names)
primary_healthcare_shortage_metric

## Now count how many Indian Health Service designation types are in the primary healthcare shortage dataset
* per census tract
* then merge to our calculated metric dataframe based on tract

In [None]:
# keep only the tract id and designation type before merging, matching the mental
# health workflow below (avoids dragging duplicate geometry/attribute columns through the merge)
columns_to_keep = ['TRACT', 'HpsTypDes']
primary_native_tribe_column = joined_primary_df[columns_to_keep]

# right-merge onto the tract boundaries, then drop repeated tract / designation pairs
native_tribe_merge = pd.merge(primary_native_tribe_column, ca_tract_boundaries, on="TRACT", how='right')
cleaned_native_tribe = native_tribe_merge.drop_duplicates(subset=['TRACT', 'HpsTypDes'])

# keep only the Indian Health Service / tribal designation rows
filtered_df = cleaned_native_tribe[cleaned_native_tribe['HpsTypDes'].isin(['Indian Health Service, Tribal Health, and Urban Indian Health Organizations'])]
primary_native_filtered = filtered_df[columns_to_keep]

# NOTE(review): duplicates on (TRACT, HpsTypDes) were dropped above and only one designation
# value is kept, so each tract has at most one row here and 'count' can only be 1.
# Confirm whether a presence flag or a true facility count is intended.
grouped_native_counts = primary_native_filtered.groupby('TRACT')['HpsTypDes'].agg(['count']).reset_index()
grouped_native_counts = grouped_native_counts.rename(columns={'TRACT':'census_tract'})
grouped_native_counts

In [None]:
# attach the tribal health service counts to the tract-level metric;
# the right merge keeps every row of the metric table
primary_healthcare_shortage_metric_tribe_count = pd.merge(
    grouped_native_counts,
    primary_healthcare_shortage_metric,
    on='census_tract',
    how='right',
).rename(columns={'count':'number_tribal_health_services'})
primary_healthcare_shortage_metric_tribe_count

In [11]:
# save the primary care metric locally (without the index); this filename is passed
# to healthcare_shortage_metric_calc further down
primary_healthcare_shortage_metric_tribe_count.to_csv('society_primary_healthcare_shortage_metric.csv', index=False)

## Now we do the same steps for mental healthcare shortage areas

In [None]:
# preview the mental health care shortage GeoDataFrame loaded above
mental_healthcare_shortage_data

In [None]:
# map the mental health care shortage records, colored by their HpsScore value
mental_healthcare_shortage_data.plot(column="HpsScore")

In [None]:
# spatially join the HPSA records to the census tracts (one row per tract/record match)
joined_mental_df = ca_tract_boundaries.sjoin(mental_healthcare_shortage_data).reset_index()
# take the mean HpsScore of the records joined to each tract.
# The tract id column is 'TRACT' (renamed from GEOID when the boundaries were loaded);
# the "USCB_" prefix rename is disabled, so there is no 'USCB_GEOID' column to group on.
avg_mental_df = joined_mental_df.groupby('TRACT')['HpsScore'].agg(['mean']).reset_index()
# right-merge onto the tract boundaries so every tract is kept (NaN where no score exists),
# then convert back to a GeoDataFrame so the result can be plotted
avg_mental_df = pd.merge(avg_mental_df, ca_tract_boundaries, on="TRACT", how='right')
avg_mental_df = gpd.GeoDataFrame(avg_mental_df, geometry="geometry")
avg_mental_df.plot(column="mean")

In [None]:
# keep only the tract id and its mean score, under the final metric column names
columns_keep = ['TRACT', 'mean']
mental_metric_names = {'TRACT':'census_tract', 'mean':'avg_hpsscore_mental_shortage_metric'}

mental_healthcare_shortage_metric = avg_mental_df[columns_keep].rename(columns=mental_metric_names)
mental_healthcare_shortage_metric

## Now count how many Indian Health Service designation types are in the mental healthcare shortage dataset
* per census tract
* then merge to our calculated metric dataframe based on tract

In [None]:
# start from the tract / mental HPSA spatial join built above
native_tribe_column = joined_mental_df

columns_to_keep = ['TRACT', 'HpsTypDes']

# keep only the tract id and designation type, right-merge onto the tract boundaries,
# then drop repeated tract / designation pairs
native_tribe_column = native_tribe_column[columns_to_keep]
native_tribe_merge = pd.merge(native_tribe_column,ca_tract_boundaries,on="TRACT", how='right')
cleaned_native_tribe = native_tribe_merge.drop_duplicates(subset=['TRACT', 'HpsTypDes'])

# keep only the Indian Health Service / tribal designation rows
filtered_df = cleaned_native_tribe[cleaned_native_tribe['HpsTypDes'].isin(['Indian Health Service, Tribal Health, and Urban Indian Health Organizations'])]
mental_native_filtered = filtered_df[columns_to_keep]

# NOTE(review): duplicates on (TRACT, HpsTypDes) were dropped above and only one designation
# value is kept, so each tract has at most one row here and 'count' can only be 1.
# Confirm whether a presence flag or a true facility count is intended.
grouped_native_counts = mental_native_filtered.groupby('TRACT')['HpsTypDes'].agg(['count']).reset_index()
grouped_native_counts_mental_shortage = grouped_native_counts.rename(columns={'TRACT':'census_tract'})
grouped_native_counts_mental_shortage

In [None]:
# attach the tribal health service counts to the tract-level metric;
# the right merge keeps every row of the metric table
tribal_count_name = {'count':'number_tribal_health_services'}
mental_healthcare_shortage_metric_tribe_count = pd.merge(
    grouped_native_counts_mental_shortage,
    mental_healthcare_shortage_metric,
    on='census_tract',
    how='right',
).rename(columns=tribal_count_name)

mental_healthcare_shortage_metric_tribe_count

In [18]:
# save the mental health care metric locally (without the index); this filename is passed
# to healthcare_shortage_metric_calc further down
mental_healthcare_shortage_metric_tribe_count.to_csv('society_mental_healthcare_shortage_metric.csv', index=False)

## Function Call for both mental health and primary care shortages

In [19]:
@append_metadata
def healthcare_shortage_metric_calc(input_csv, export=False, varname=''):
    '''
    Calculates the average Healthcare Professional Shortage Area (HPSA) score per California census tract.
    Data was sourced from: https://data.hrsa.gov/data/download
    The function calculates metrics for mental and primary healthcare shortages.

    The number of tribal health facilities were also retained and summed per census tract.

    Methods
    -------
    Data was spatially joined to 2021 census tract boundaries and HPSA scores were averaged
    per tract to attribute scores to all California tracts. Duplicate tract / designation type
    entries were removed before counting tribal health designations.
    The metric itself is computed in the notebook cells above; this function only uploads
    the resulting csv and removes the local copy.

    Parameters
    ----------
    input_csv: string
        name of the local csv file holding the calculated metric, to be uploaded to AWS
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI HPSA score metric to AWS
        True = will upload resulting df containing CAL CRAI HPSA score metric to AWS
    varname: string
        not used in the function body; presumably consumed by the append_metadata
        decorator to identify the variable (TODO confirm)

    Script
    ------
    society_healthcare_shortage.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are
    stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    The local csv is removed whether or not it was uploaded.
    '''
    print('Data transformation: data converted to Cal-CRAI standardized coordinate reference system (CRS): 4269.')
    print('Data transformation: merge data to California tracts and calculate census tract average.')
    print('Data transformation: nan values are retained for each census tract without an HPSA score.')

    bucket_name = 'ca-climate-index'
    directory = '3_fair_data/index_data'
    export_filename = [input_csv]

    if export:
        upload_csv_aws(export_filename, bucket_name, directory)
        # only report an upload when one actually happened
        print(f'{export_filename} uploaded to AWS.')
    else:
        print(f'{export_filename} not uploaded to AWS (export=False).')

    # clean up the local copy of the csv
    if os.path.exists(input_csv):
        os.remove(input_csv)

In [20]:
# locally saved metric files and the variable name each one is registered under
input_csv = [
    'society_primary_healthcare_shortage_metric.csv',
    'society_mental_healthcare_shortage_metric.csv',
]
varnames = [
    'society_hrsa_primary_care_shortage',
    'society_hrsa_mental_care_shortage',
]

# upload each file together with its matching variable name
for csv_file, variable_name in zip(input_csv, varnames):
    healthcare_shortage_metric_calc(csv_file, export=True, varname=variable_name)

### Narcotic Treatment Facilities Metric
* num. of licensed narcotic treatment programs per 10,000 people

In [21]:
# read the narcotic treatment program layer, then delete the local copy
# (presumably downloaded by the pull_gpkg_from_directory call near the top of the notebook — TODO confirm)
narcotic_support_data = gpd.read_file('society_hrsa_narcotic_support.gpkg')
os.remove('society_hrsa_narcotic_support.gpkg')

In [None]:
# columns needed for the narcotic treatment metric
narcotic_columns = ['OBJECTID', 'CountyName', 'DBA', 'OTP_CA10', 'geometry']

# subset the data and use 'County' as the county column name
narcotic_support = narcotic_support_data[narcotic_columns].rename(columns={'CountyName': 'County'})

# strip the " County" suffix (case-insensitive) from the county names
narcotic_support['County'] = narcotic_support['County'].str.replace(' County', '', case=False)

unique_entries = narcotic_support['County'].unique()

narcotic_support
unique_entries

Checking to make sure only CA counties are included

In [None]:
# split the data on the 'County' column; presumably the first result holds rows with a
# California county and the second the rows that were left out (verify against filter_counties)
ca_narcotic_support, omitted_rows = filter_counties(narcotic_support, 'County')
print(ca_narcotic_support)
print(f'number of omitted rows:{len(omitted_rows)}')

Getting rid of duplicates based on a few columns

In [None]:
# Columns that together identify one facility record
selected_columns = ['OTP_CA10', 'geometry', 'DBA']

# Flag every row that shares these columns with another row (keep=False marks all copies)
duplicates = ca_narcotic_support.duplicated(subset=selected_columns, keep=False)

# Count duplicates
duplicate_count = duplicates.sum()

# Select the flagged rows from the same frame the mask was built on; applying the mask
# to the unfiltered `narcotic_support` misaligns the index whenever rows were omitted
duplicate_rows = ca_narcotic_support[duplicates]

print(f"Number of duplicate entries in columns {selected_columns}: {duplicate_count}")
print("Duplicate rows:")
display(duplicate_rows)

# Drop duplicates from the county-filtered data (so the filter above is not discarded),
# keeping the first occurrence
narcotic_support_unique_values = ca_narcotic_support.drop_duplicates(subset=selected_columns, keep='first')

In [None]:
# preview the de-duplicated narcotic treatment records
narcotic_support_unique_values

In [26]:
# read in the CA tract / county lookup csv
census_shp_dir = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_counties_tract = gpd.read_file(census_shp_dir)

# drop the columns that are not needed for the county merge
ca_counties = ca_counties_tract.drop(columns=['field_1', 'TRACT', 'geometry'])

Merge narcotic support data with California counties

In [None]:
# Columns that together identify one facility record
selected_columns = ['OTP_CA10', 'geometry', 'DBA']

# attach the county columns to each facility, then collapse the repeated
# facility rows produced by the merge back to one row per facility
merge_df = pd.merge(
    narcotic_support_unique_values,
    ca_counties,
    how='left',
    on='County',
).drop_duplicates(subset=selected_columns, keep='first')
merge_df

Make a new df containing counts of narcotic support facilities per California county

In [None]:
# Group by 'COUNTYFP' and 'County' and count the number of facility rows in each group
facilities_by_county = merge_df.groupby(['COUNTYFP', 'County'])
count_narcotic_support_facilities = facilities_by_county.size().reset_index(name='num_narcotic_support_facilities')

print(len(count_narcotic_support_facilities))
count_narcotic_support_facilities

In [29]:
# read in CA estimated county population
pull_county_pop = "s3://ca-climate-index/0_map_data/county_est_pop_2022.csv"
ca_county_pop = gpd.read_file(pull_county_pop)

# drop unneeded columns and rename the key to 'County' for the merge below
ca_counties_pop = (
    ca_county_pop
    .drop(columns=['field_1', 'geometry'])
    .rename(columns={'county':'County'})
)

In [None]:
# Coerce both inputs of the ratio to numeric; values that cannot be parsed become NaN
facility_col = 'num_narcotic_support_facilities'
count_narcotic_support_facilities[facility_col] = pd.to_numeric(count_narcotic_support_facilities[facility_col], errors='coerce')
ca_counties_pop['est_total_pop'] = pd.to_numeric(ca_counties_pop['est_total_pop'], errors='coerce')

# keep every county that has a facility count, then scale facilities per person to per 10,000 people
ca_pop_narcotic_support_merge = pd.merge(ca_counties_pop, count_narcotic_support_facilities, on='County', how='right')
facilities_per_person = ca_pop_narcotic_support_merge[facility_col] / ca_pop_narcotic_support_merge['est_total_pop']
ca_pop_narcotic_support_merge['narcotic_support_per_10000'] = facilities_per_person * 10000
ca_pop_narcotic_support_merge

In [None]:
# preview the tract / county lookup table
ca_counties_tract

In [None]:
# tract-level lookup without the columns that are not needed here
ca_tracts = ca_counties_tract.drop(columns=['field_1', 'geometry', 'COUNTYFP'])

# give every tract the values of the county it belongs to
tract_merge = pd.merge(
    ca_tracts,
    ca_pop_narcotic_support_merge,
    on='County',
    how='left',
)
tract_merge

Function to check data per county

In [None]:
# Define a function to display data for a specific county
def display_county_data(df, county_name):
    """Show the rows of `df` whose 'County' value equals `county_name`.

    Prints a "No data found" message when no row matches; otherwise prints a
    header line and displays the matching rows.
    """
    matches = df.loc[df['County'] == county_name]
    if not matches.empty:
        print(f"Data for {county_name}:")
        display(matches)
        return
    print(f"No data found for {county_name}")

display_county_data(tract_merge, 'Orange')

## Function Call for Narcotic Support

In [34]:
# read in the CA tract / county lookup csv; the function defined below reads this
# notebook-level `ca_counties_tract` variable directly
census_shp_dir = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_counties_tract = gpd.read_file(census_shp_dir)
ca_counties_tract = ca_counties_tract.drop(columns={'field_1', 'geometry'})

@append_metadata
def narcotic_support_metric_calc(df, export=False, export_filename=None, varname=''):
    '''
    Calculates the number of Licensed Narcotic Treatment Programs per California county per
    10,000 people.
    Data was sourced from: https://data.chhs.ca.gov/dataset/licensed-narcotic-treatment-programs
    The function calculates the metrics for narcotic support.

    Methods
    -------
    Data was cleaned by removing duplicate entries containing the same location and facility identifier.
    Facilities were merged to California counties and counted per county, combined with 2022 county
    population estimates, and the county value was attributed to all California tracts within it.

    Parameters
    ----------
    df: DataFrame
        the dataframe containing the initial narcotic support data; must include the
        'County', 'OTP_CA10', 'geometry' and 'DBA' columns
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI narcotic support metric to AWS
        True = will upload resulting df containing CAL CRAI narcotic support metric to AWS
    export_filename: string
        name of the csv file to be uploaded to AWS (required when export is True)
    varname: string
        not used in the function body; presumably consumed by the append_metadata
        decorator to identify the variable (TODO confirm)

    Returns
    -------
    DataFrame
        one row per row of the tract / county lookup, carrying its county's facility count,
        population estimate and facilities per 10,000 people

    Raises
    ------
    ValueError
        if export is True and no export_filename is given

    Script
    ------
    society_healthcare_shortage.ipynb

    Note:
    This function reads the notebook-level `ca_counties_tract` lookup table.
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    if export and export_filename is None:
        raise ValueError('export_filename must be provided when export is True.')

    print('Data transformation: isolate relevant columns and remove duplicate entries.')
    print('Data transformation: merge data to California counties and calculate county totals.')
    print('Data transformation: merge data to California tracts, keeping county sums for all tracts within.')
    print('Data transformation: import county population estimates and merge into data.')
    print('Data transformation: calculate metric by dividing county level narcotic treatment facilities by county population and multiply by 10,000.')

    # Columns that together identify one facility record
    selected_columns = ['OTP_CA10', 'geometry', 'DBA']

    # Drop duplicates from the dataframe that was passed in (not a notebook-level global),
    # keeping the first occurrence
    narcotic_support_unique_values = df.drop_duplicates(subset=selected_columns, keep='first')

    # Attach county columns; the lookup has one row per tract, so collapse the repeated
    # facility rows back to one row per facility afterwards
    merge_df = pd.merge(narcotic_support_unique_values, ca_counties_tract, how='left', on='County')
    merge_df = merge_df.drop_duplicates(subset=selected_columns, keep='first')

    count_narcotic_support_facilities = merge_df.groupby(['COUNTYFP', 'County']).size().reset_index(name='num_narcotic_support_facilities')

    # read in CA estimated county population
    pull_county_pop = "s3://ca-climate-index/0_map_data/county_est_pop_2022.csv"
    ca_county_pop = gpd.read_file(pull_county_pop)
    ca_counties_pop = ca_county_pop.drop(columns=['field_1', 'geometry'])
    ca_counties_pop = ca_counties_pop.rename(columns={'county':'County'})

    # Ensure the 'num_narcotic_support_facilities' and 'est_total_pop' columns are numeric
    count_narcotic_support_facilities['num_narcotic_support_facilities'] = pd.to_numeric(count_narcotic_support_facilities['num_narcotic_support_facilities'], errors='coerce')
    ca_counties_pop['est_total_pop'] = pd.to_numeric(ca_counties_pop['est_total_pop'], errors='coerce')

    # facilities per 10,000 people for every county that has a facility count
    ca_pop_narcotic_support_merge = pd.merge(ca_counties_pop, count_narcotic_support_facilities, on='County', how='right')
    ca_pop_narcotic_support_merge['narcotic_support_per_10000'] = (ca_pop_narcotic_support_merge['num_narcotic_support_facilities'] / ca_pop_narcotic_support_merge['est_total_pop']) * 10000

    # give every tract the values of the county it belongs to
    ca_tracts = ca_counties_tract.drop(columns=['COUNTYFP'])
    tract_merge = pd.merge(ca_tracts, ca_pop_narcotic_support_merge, on='County', how='left')
    tract_merge = tract_merge.drop(columns='COUNTYFP')

    # export to csv and upload to AWS
    if export:
        tract_merge.to_csv(export_filename)
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        upload_csv_aws([export_filename], bucket_name, directory)
        # only report an upload when one actually happened
        print(f'{export_filename} uploaded to AWS.')

        # remove the local copy written above
        if os.path.exists(export_filename):
            os.remove(export_filename)
    else:
        print(f'{export_filename} not uploaded to AWS (export=False).')

    return tract_merge

In [None]:
# pull the .gpkg files for this metric from the AWS bucket directory again
# (the local narcotic support file was deleted earlier in the notebook)
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/society_economy/social_services/ca_health_human_services/'

pull_gpkg_from_directory(bucket_name, aws_dir)

narcotic_support_data = gpd.read_file('society_hrsa_narcotic_support.gpkg')

# same column selection and county-name cleanup as in the exploratory cells above
narcotic_columns = ['OBJECTID',
                    'CountyName',
                    'DBA',
                    'OTP_CA10',
                    'geometry'
                    ]
narcotic_support = narcotic_support_data[narcotic_columns]
narcotic_support = narcotic_support.rename(columns={'CountyName': 'County'})
narcotic_support['County'] = narcotic_support['County'].str.replace(' County', '', case=False)

# NOTE(review): varname is set to 'test' while the apparent intended name is left in the
# trailing comment — confirm which value should be used for the final run
narcotic_support_metric_calc(narcotic_support, export=True, export_filename='society_narcotic_support_metric.csv', varname='test')#'society_hrsa_narcotic_support')