In [1]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [22]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/society_economy/social_services/ca_health_human_services/'

pull_gpkg_from_directory(bucket_name, aws_dir)

Saved GeoPackage as 'society_hrsa_mental_care_shortage.gpkg' locally
Saved GeoPackage as 'society_hrsa_primary_care_shortage.gpkg' locally
Saved GeoPackage as 'society_hrsa_narcotic_support.gpkg' locally


In [23]:
mental_healthcare_shortage_data = gpd.read_file('society_hrsa_mental_care_shortage.gpkg')
primary_healthcare_shortage_data = gpd.read_file('society_hrsa_primary_care_shortage.gpkg')

## Mental Healthcare Shortage

In [24]:
relevant_columns = [
                    'HpsScore',
                    'CntFips'
                    ]
mental_healthcare_shortage = mental_healthcare_shortage_data[relevant_columns]

In [25]:
mental_healthcare_shortage

Unnamed: 0,HpsScore,CntFips
0,18,103
1,15,103
2,17,103
3,18,103
4,21,039
...,...,...
450,18,073
451,13,073
452,19,007
453,18,019


In [26]:
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_counties_tract = gpd.read_file(census_shp_dir)
ca_counties = ca_counties_tract.drop(columns={'field_1', 'TRACT', 'geometry'})
ca_counties.rename(columns={'COUNTYFP':'CntFips'}, inplace=True)

ca_counties

Unnamed: 0,CntFips,County
0,085,Santa Clara
1,085,Santa Clara
2,085,Santa Clara
3,085,Santa Clara
4,085,Santa Clara
...,...,...
9124,059,Orange
9125,059,Orange
9126,059,Orange
9127,013,Contra Costa


## Remove duplicate rows that contain same county and HPS Score

In [27]:
# Display the length of the original DataFrame
print("Length of original DataFrame:", len(mental_healthcare_shortage))

# Removing duplicates based on both 'CntFips' and 'HpsScore' columns
mental_healthcare_shortage_cleaned = mental_healthcare_shortage.drop_duplicates(subset=['CntFips', 'HpsScore'], keep='first')

# Display the length of the cleaned DataFrame
print("Length of cleaned DataFrame:", len(mental_healthcare_shortage_cleaned))

# Display the first few rows of the cleaned DataFrame
print(mental_healthcare_shortage_cleaned.head())

Length of original DataFrame: 455
Length of cleaned DataFrame: 227
   HpsScore CntFips
0        18     103
1        15     103
2        17     103
4        21     039
5        18     107


In [28]:
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_counties_tract = gpd.read_file(census_shp_dir)
ca_counties = ca_counties_tract.drop(columns={'field_1', 'TRACT', 'geometry'})
ca_counties.rename(columns={'COUNTYFP':'CntFips'}, inplace=True)

ca_counties

Unnamed: 0,CntFips,County
0,085,Santa Clara
1,085,Santa Clara
2,085,Santa Clara
3,085,Santa Clara
4,085,Santa Clara
...,...,...
9124,059,Orange
9125,059,Orange
9126,059,Orange
9127,013,Contra Costa


## Merge the mental healthcare shortage data to the CA county data to attribute fips to county

In [29]:
mental_merge = pd.merge(mental_healthcare_shortage_cleaned, ca_counties, how='left', on='CntFips')
mental_merge = mental_merge.drop_duplicates(subset=['CntFips', 'HpsScore'], keep='first')

mental_merge


Unnamed: 0,HpsScore,CntFips,County
0,18,103,Tehama
14,15,103,Tehama
28,17,103,Tehama
42,21,039,Madera
76,18,107,Tulare
...,...,...,...
65908,20,087,Santa Cruz
65978,22,095,Solano
66078,16,025,Imperial
66118,13,073,San Diego


## Calculate average HPS score per county and assign a flag for any avg value over 0

In [30]:
average_hpsscore_county = mental_merge.groupby('CntFips')['HpsScore'].mean().reset_index()
average_hpsscore_county = average_hpsscore_county.rename(columns={'HpsScore':'Avg_HpsScore'})
average_hpsscore_county = pd.merge(average_hpsscore_county, ca_counties, how='right', on='CntFips')
average_hpsscore_county = average_hpsscore_county.drop_duplicates(subset=['CntFips', 'Avg_HpsScore'], keep='first')
average_hpsscore_county['Hps_Flag'] = average_hpsscore_county['Avg_HpsScore'].apply(lambda x: 1 if pd.notna(x) and x > 0 else None)

print(len(average_hpsscore_county))
average_hpsscore_county.head()

58


Unnamed: 0,CntFips,Avg_HpsScore,County,Hps_Flag
0,85,18.2,Santa Clara,1.0
55,59,18.333333,Orange,1.0
80,1,17.25,Alameda,1.0
121,29,17.0,Kern,1.0
129,71,17.714286,San Bernardino,1.0


In [31]:
ca_tracts = ca_counties_tract.drop(columns={'field_1', 'geometry'})
ca_tracts.rename(columns={'COUNTYFP':'CntFips'}, inplace=True)
ca_tracts = ca_tracts.drop(columns={'County'})

ca_tracts

Unnamed: 0,TRACT,CntFips
0,06085504321,085
1,06085504410,085
2,06085507003,085
3,06085507004,085
4,06085502204,085
...,...,...
9124,06059001303,059
9125,06059001304,059
9126,06059001401,059
9127,06013367200,013


## Merge final mental health dataset to California census tract data

In [32]:
tract_merge = pd.merge(ca_tracts, average_hpsscore_county, on='CntFips', how='right')
tract_merge = tract_merge.drop(columns={'CntFips', 'County'})
tract_merge

Unnamed: 0,TRACT,Avg_HpsScore,Hps_Flag
0,06085504321,18.2,1.0
1,06085504410,18.2,1.0
2,06085507003,18.2,1.0
3,06085507004,18.2,1.0
4,06085502204,18.2,1.0
...,...,...,...
9124,06021010300,15.0,1.0
9125,06051000102,16.0,1.0
9126,06051000101,16.0,1.0
9127,06051000201,16.0,1.0


## Function Call for both mental health and primary care shortages

In [2]:
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_counties_tract = gpd.read_file(census_shp_dir)
ca_counties = ca_counties_tract.drop(columns={'field_1', 'TRACT', 'geometry'})
ca_counties.rename(columns={'COUNTYFP':'CntFips'}, inplace=True)

@append_metadata
def healthcare_shortage_metric_calc(df, export=False, export_filename=None, varname=''):    
    '''
    Calculates the average Health Professional Shortage Area (HPSA) score per California county.
    This function also adds a boolean flag column that indicates if a county has shortage area score.
    Data was sourced from: https://data.hrsa.gov/data/download

    The function calculates these metrics for mental and primary health care shortages.

    Methods
    -------
    Data was cleaned by removing duplicate entries containing the same county and HPSA score.
    Data columns were renamed, merged to 2022 census tract data, and averaged to attribute
    scores to all California tracts.
    
    Parameters
    ----------
    df: string
        the dataframe containing the initial emergency management performance grant data
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI emergency management grant metric to AWS
        True = will upload resulting df containing CAL CRAI emergency management grant metric to AWS
    export_filename: string
        name of the csv file to be uploaded to AWS

    Script
    ------
    society_healthcare_shortage.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    print('Data transformation: isolate relevant columns and remove duplicate entries.')
    print('Data transformation: merge data to California counties and calculate county average.')
    print('Data transformation: merge data to California tracts and add flags to indicate HPSA scores.')

    healthcare_shortage_df_cleaned = df.drop_duplicates(subset=['CntFips', 'HpsScore'], keep='first')
    merge_df = pd.merge(healthcare_shortage_df_cleaned, ca_counties, how='left', on='CntFips')
    merge_df = merge_df.drop_duplicates(subset=['CntFips', 'HpsScore'], keep='first')

    average_hpsscore_county = merge_df.groupby('CntFips')['HpsScore'].mean().reset_index()
    average_hpsscore_county = average_hpsscore_county.rename(columns={'HpsScore':'Avg_HpsScore'})
    average_hpsscore_county = pd.merge(average_hpsscore_county, ca_counties, how='right', on='CntFips')
    average_hpsscore_county = average_hpsscore_county.drop_duplicates(subset=['CntFips', 'Avg_HpsScore'], keep='first')
    average_hpsscore_county['Hps_Flag'] = average_hpsscore_county['Avg_HpsScore'].apply(lambda x: 1 if pd.notna(x) and x > 0 else None)

    ca_tracts = ca_counties_tract.drop(columns={'field_1', 'geometry'})
    ca_tracts.rename(columns={'COUNTYFP':'CntFips'}, inplace=True)
    ca_tracts = ca_tracts.drop(columns={'County'})

    tract_merge = pd.merge(ca_tracts, average_hpsscore_county, on='CntFips', how='right')
    tract_merge = tract_merge.drop(columns={'CntFips', 'County'})
       
    if export_filename == 'society_primary_healthcare_shortage_metric.csv':
        tract_merge = tract_merge.rename(columns={'Avg_HpsScore':'Avg_HpsScore_Primary_Care',
                                                  'Hps_Flag':'Hps_Flag_Primary_Care'})
    else:
        tract_merge = tract_merge.rename(columns={'Avg_HpsScore':'Avg_HpsScore_Mental_Care',
                                                  'Hps_Flag':'Hps_Flag_Mental_Care'})
    # export to csv and upload to AWS
    if export == True:
        tract_merge.to_csv(export_filename)
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        export_filename = [export_filename]
        upload_csv_aws(export_filename, bucket_name, directory)

        if os.path.exists(export_filename[0]):
            os.remove(export_filename[0])

    return tract_merge

In [3]:
# Calling for primary care

# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/society_economy/social_services/ca_health_human_services/'

pull_gpkg_from_directory(bucket_name, aws_dir)

primary_healthcare_shortage_data = gpd.read_file('society_hrsa_primary_care_shortage.gpkg')

relevant_columns = [
                    'HpsScore',
                    'CntFips'
                    ]
primary_healthcare_shortage = primary_healthcare_shortage_data[relevant_columns]

healthcare_shortage_metric_calc(primary_healthcare_shortage, export=True, export_filename='society_primary_healthcare_shortage_metric.csv', varname='society_hrsa_primary_care_shortage')


Saved GeoPackage as 'society_hrsa_mental_care_shortage.gpkg' locally
Saved GeoPackage as 'society_hrsa_primary_care_shortage.gpkg' locally
Saved GeoPackage as 'society_hrsa_narcotic_support.gpkg' locally


Unnamed: 0,TRACT,Avg_HpsScore_Primary_Care,Hps_Flag_Primary_Care
0,06085504321,10.5,1.0
1,06085504410,10.5,1.0
2,06085507003,10.5,1.0
3,06085507004,10.5,1.0
4,06085502204,10.5,1.0
...,...,...,...
9124,06021010300,17.0,1.0
9125,06051000102,12.0,1.0
9126,06051000101,12.0,1.0
9127,06051000201,12.0,1.0


In [4]:
# Calling for mental healthcare

# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/society_economy/social_services/ca_health_human_services/'

pull_gpkg_from_directory(bucket_name, aws_dir)

mental_healthcare_shortage_data = gpd.read_file('society_hrsa_mental_care_shortage.gpkg')
relevant_columns = [
                    'HpsScore',
                    'CntFips'
                    ]
mental_healthcare_shortage = mental_healthcare_shortage_data[relevant_columns]

healthcare_shortage_metric_calc(mental_healthcare_shortage, export=True, export_filename='society_mental_healthcare_shortage_metric.csv', varname='society_hrsa_mental_care_shortage')

Saved GeoPackage as 'society_hrsa_mental_care_shortage.gpkg' locally
Saved GeoPackage as 'society_hrsa_primary_care_shortage.gpkg' locally
Saved GeoPackage as 'society_hrsa_narcotic_support.gpkg' locally


Unnamed: 0,TRACT,Avg_HpsScore_Mental_Care,Hps_Flag_Mental_Care
0,06085504321,18.2,1.0
1,06085504410,18.2,1.0
2,06085507003,18.2,1.0
3,06085507004,18.2,1.0
4,06085502204,18.2,1.0
...,...,...,...
9124,06021010300,15.0,1.0
9125,06051000102,16.0,1.0
9126,06051000101,16.0,1.0
9127,06051000201,16.0,1.0


# Narcotic Treatment Facilities Metric

In [4]:
narcotic_support_data = gpd.read_file('society_hrsa_narcotic_support.gpkg')

In [5]:
narcotic_support_data.columns

Index(['OBJECTID', 'County', 'Licensee', 'License__', 'DBA', 'OTP_CA10',
       'Address', 'City', 'State', 'Program_Zi', 'Phone_Numb', 'Total_Slot',
       'Operating_', 'Dispensing', 'Weekend_Ho', 'Weekend_Di', 'Program_Di',
       'Medical_Di', 'Latitude', 'Longitude', 'CountyName', 'USCB_STATEFP',
       'USCB_COUNTYFP', 'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME',
       'USCB_NAMELSAD', 'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND',
       'USCB_AWATER', 'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [6]:
narcotic_columns = ['OBJECTID',
                    'CountyName',
                    'DBA',
                    'OTP_CA10',
                    'geometry'
                    ]
narcotic_support = narcotic_support_data[narcotic_columns]
narcotic_support = narcotic_support.rename(columns={'CountyName': 'County'})

narcotic_support['County'] = narcotic_support['County'].str.replace(' County', '', case=False)

unique_entries = narcotic_support['County'].unique()

narcotic_support
unique_entries

array(['Alameda', 'Butte', 'Contra Costa', 'Fresno', 'Humboldt',
       'Imperial', 'Kern', 'Los Angeles', 'Marin', 'Merced', 'Monterey',
       'Mendocino', 'Orange', 'El Dorado', 'Placer', 'Riverside',
       'Sacramento', 'San Bernardino', 'San Diego', 'San Francisco',
       'San Joaquin', 'San Luis Obispo', 'San Mateo', 'Santa Barbara',
       'Santa Clara', 'Santa Cruz', 'Solano', 'Sonoma', 'Tuolumne',
       'Shasta', 'Stanislaus', 'Nevada', 'Yuba', 'Tulare', 'Ventura',
       'Yolo'], dtype=object)

Checking to make sure only CA counties are included

In [7]:
ca_narcotic_support, omitted_rows = filter_counties(narcotic_support, 'County')
print(ca_narcotic_support)
print(f'number of omitted rows:{len(omitted_rows)}')

     OBJECTID   County                                  DBA OTP_CA10  \
0           1  Alameda                                  N/A     249M   
1           2  Alameda                                  N/A     309M   
2           3  Alameda               BAART Programs Oakland     186M   
3           4  Alameda               BAART Programs Oakland     186M   
4           5  Alameda  MedMark Treatment Centers - Hayward     304M   
..        ...      ...                                  ...      ...   
177       178  Ventura                                  N/A     298M   
178       179  Ventura                                  N/A     236M   
179       181  Ventura                                  N/A     386M   
180       182     Yolo                                  N/A     398M   
181       183  Ventura                                  N/A     174M   

                        geometry  
0    POINT (-121.88414 37.71620)  
1    POINT (-122.10225 37.68718)  
2    POINT (-122.24830 37.7912

Getting rid of duplicates based on a few columns

In [8]:
# Columns to check for duplicates
selected_columns = ['OTP_CA10', 'geometry', 'DBA']

# Check for duplicates
duplicates = ca_narcotic_support.duplicated(subset=selected_columns, keep=False)

# Count duplicates
duplicate_count = duplicates.sum()

# Filter out the duplicate rows for display
duplicate_rows = narcotic_support[duplicates]

print(f"Number of duplicate entries in columns {selected_columns}: {duplicate_count}")
print("Duplicate rows:")
display(duplicate_rows)

# Drop duplicates, keeping the first occurrence
narcotic_support_unique_values = narcotic_support.drop_duplicates(subset=selected_columns, keep='first')


Number of duplicate entries in columns ['OTP_CA10', 'geometry', 'DBA']: 16
Duplicate rows:


Unnamed: 0,OBJECTID,County,DBA,OTP_CA10,geometry
2,3,Alameda,BAART Programs Oakland,186M,POINT (-122.24830 37.79120)
3,4,Alameda,BAART Programs Oakland,186M,POINT (-122.24830 37.79120)
62,63,Los Angeles,BAART Programs Lynwood,205M,POINT (-118.18875 33.91724)
63,65,Los Angeles,BAART Programs Lynwood,205M,POINT (-118.18875 33.91724)
75,76,Los Angeles,BAART Programs Boyle Heights,279M,POINT (-118.21035 34.06239)
76,77,Los Angeles,BAART Programs Boyle Heights,279M,POINT (-118.21035 34.06239)
108,108,Sacramento,BAART Programs Carmicheal,293M,POINT (-121.32939 38.61252)
109,111,Sacramento,BAART Programs Carmicheal,293M,POINT (-121.32939 38.61252)
110,109,Sacramento,BAART Programs Norwood,274M,POINT (-121.45857 38.63813)
111,110,Sacramento,BAART Programs Norwood,274M,POINT (-121.45857 38.63813)


In [9]:
narcotic_support_unique_values

Unnamed: 0,OBJECTID,County,DBA,OTP_CA10,geometry
0,1,Alameda,,249M,POINT (-121.88414 37.71620)
1,2,Alameda,,309M,POINT (-122.10225 37.68718)
2,3,Alameda,BAART Programs Oakland,186M,POINT (-122.24830 37.79120)
4,5,Alameda,MedMark Treatment Centers - Hayward,304M,POINT (-122.08032 37.66542)
5,6,Alameda,,354M,POINT (-122.16715 37.73928)
...,...,...,...,...,...
177,178,Ventura,,298M,POINT (-119.06645 34.35216)
178,179,Ventura,,236M,POINT (-118.78578 34.27040)
179,181,Ventura,,386M,POINT (-119.22469 34.27695)
180,182,Yolo,,398M,POINT (-121.57790 38.57071)


In [10]:
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_counties_tract = gpd.read_file(census_shp_dir)
ca_counties = ca_counties_tract.drop(columns={'field_1', 'TRACT', 'geometry'})

In [11]:
ca_counties

Unnamed: 0,COUNTYFP,County
0,085,Santa Clara
1,085,Santa Clara
2,085,Santa Clara
3,085,Santa Clara
4,085,Santa Clara
...,...,...
9124,059,Orange
9125,059,Orange
9126,059,Orange
9127,013,Contra Costa


Merge narcotic support data with California counties

In [12]:
# Columns to check for duplicates
selected_columns = ['OTP_CA10', 'geometry', 'DBA']

merge_df = pd.merge(narcotic_support_unique_values, ca_counties, how='left', on='County')
merge_df = merge_df.drop_duplicates(subset=selected_columns, keep='first')
merge_df

Unnamed: 0,OBJECTID,County,DBA,OTP_CA10,geometry,COUNTYFP
0,1,Alameda,,249M,POINT (-121.88414 37.71620),001
379,2,Alameda,,309M,POINT (-122.10225 37.68718),001
758,3,Alameda,BAART Programs Oakland,186M,POINT (-122.24830 37.79120),001
1137,5,Alameda,MedMark Treatment Centers - Hayward,304M,POINT (-122.08032 37.66542),001
1516,6,Alameda,,354M,POINT (-122.16715 37.73928),001
...,...,...,...,...,...,...
154958,178,Ventura,,298M,POINT (-119.06645 34.35216),111
155148,179,Ventura,,236M,POINT (-118.78578 34.27040),111
155338,181,Ventura,,386M,POINT (-119.22469 34.27695),111
155528,182,Yolo,,398M,POINT (-121.57790 38.57071),113


Make a new df containing counts of narcotic support facilities per California county

In [13]:
# Group by 'CntFips' and 'County' and count the number of entries
count_narcotic_support_facilities = merge_df.groupby(['COUNTYFP', 'County']).size().reset_index(name='num_narcotic_support_facilities')

print(len(count_narcotic_support_facilities))
count_narcotic_support_facilities

36


Unnamed: 0,COUNTYFP,County,num_narcotic_support_facilities
0,1,Alameda,8
1,7,Butte,1
2,13,Contra Costa,6
3,17,El Dorado,1
4,19,Fresno,8
5,23,Humboldt,1
6,25,Imperial,2
7,29,Kern,6
8,37,Los Angeles,48
9,41,Marin,1


In [14]:
# read in CA estimated county population
pull_county_pop = "s3://ca-climate-index/0_map_data/county_est_pop_2022.csv"
ca_county_pop = gpd.read_file(pull_county_pop)
ca_counties_pop = ca_county_pop.drop(columns={'field_1', 'geometry'})
ca_counties_pop = ca_counties_pop.rename(columns={'county':'County'})


In [15]:
# Ensure the 'num_narcotic_support_facilities' and 'est_total_pop' columns are numeric
count_narcotic_support_facilities['num_narcotic_support_facilities'] = pd.to_numeric(count_narcotic_support_facilities['num_narcotic_support_facilities'], errors='coerce')
ca_counties_pop['est_total_pop'] = pd.to_numeric(ca_counties_pop['est_total_pop'], errors='coerce')

ca_pop_narcotic_support_merge = pd.merge(ca_counties_pop, count_narcotic_support_facilities, on='County', how='right')
ca_pop_narcotic_support_merge['narcotic_support_per_10000'] = (ca_pop_narcotic_support_merge['num_narcotic_support_facilities'] / ca_pop_narcotic_support_merge['est_total_pop']) * 10000

ca_pop_narcotic_support_merge

Unnamed: 0,County,est_total_pop,COUNTYFP,num_narcotic_support_facilities,narcotic_support_per_10000
0,Alameda,1663823,1,8,0.048082
1,Butte,213605,7,1,0.046815
2,Contra Costa,1162648,13,6,0.051606
3,El Dorado,191713,17,1,0.052161
4,Fresno,1008280,19,8,0.079343
5,Humboldt,136132,23,1,0.073458
6,Imperial,179578,25,2,0.111372
7,Kern,906883,29,6,0.066161
8,Los Angeles,9936690,37,48,0.048306
9,Marin,260485,41,1,0.03839


In [17]:
ca_counties_tract

Unnamed: 0,field_1,TRACT,COUNTYFP,County,geometry
0,0,06085504321,085,Santa Clara,
1,1,06085504410,085,Santa Clara,
2,2,06085507003,085,Santa Clara,
3,3,06085507004,085,Santa Clara,
4,4,06085502204,085,Santa Clara,
...,...,...,...,...,...
9124,9124,06059001303,059,Orange,
9125,9125,06059001304,059,Orange,
9126,9126,06059001401,059,Orange,
9127,9127,06013367200,013,Contra Costa,


In [16]:
ca_tracts = ca_counties_tract.drop(columns={'field_1', 'geometry', 'COUNTYFP'})
#count_narcotic_support_facilities = count_narcotic_support_facilities.drop(columns={'County'})

tract_merge = pd.merge(ca_tracts, ca_pop_narcotic_support_merge, on='County', how='left')
tract_merge

Unnamed: 0,TRACT,County,est_total_pop,COUNTYFP,num_narcotic_support_facilities,narcotic_support_per_10000
0,06085504321,Santa Clara,1916831.0,085,3.0,0.015651
1,06085504410,Santa Clara,1916831.0,085,3.0,0.015651
2,06085507003,Santa Clara,1916831.0,085,3.0,0.015651
3,06085507004,Santa Clara,1916831.0,085,3.0,0.015651
4,06085502204,Santa Clara,1916831.0,085,3.0,0.015651
...,...,...,...,...,...,...
9124,06059001303,Orange,3175227.0,059,6.0,0.018896
9125,06059001304,Orange,3175227.0,059,6.0,0.018896
9126,06059001401,Orange,3175227.0,059,6.0,0.018896
9127,06013367200,Contra Costa,1162648.0,013,6.0,0.051606


Function to check data per county

In [20]:
# Define a function to display data for a specific county
def display_county_data(df, county_name):
    county_data = df[df['County'] == county_name]
    if county_data.empty:
        print(f"No data found for {county_name}")
    else:
        print(f"Data for {county_name}:")
        display(county_data)

display_county_data(tract_merge, 'Orange')

Data for Orange:


Unnamed: 0,TRACT,County,est_total_pop,COUNTYFP,num_narcotic_support_facilities,narcotic_support_per_10000
55,06059062651,Orange,3175227.0,059,6.0,0.018896
56,06059062650,Orange,3175227.0,059,6.0,0.018896
57,06059062654,Orange,3175227.0,059,6.0,0.018896
58,06059052531,Orange,3175227.0,059,6.0,0.018896
59,06059052532,Orange,3175227.0,059,6.0,0.018896
...,...,...,...,...,...,...
9108,06059074102,Orange,3175227.0,059,6.0,0.018896
9109,06059074103,Orange,3175227.0,059,6.0,0.018896
9124,06059001303,Orange,3175227.0,059,6.0,0.018896
9125,06059001304,Orange,3175227.0,059,6.0,0.018896


## Function Call for Narcotic Support

In [28]:
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_counties_tract = gpd.read_file(census_shp_dir)
ca_counties_tract = ca_counties_tract.drop(columns={'field_1', 'TRACT', 'geometry'})

@append_metadata
def narcotic_support_metric_calc(df, export=False, export_filename=None, varname=''):
    '''
    Calculates the number of Licensed Narcotic Treatment Programs per California county per
    10,000 people
    Data was sourced from: https://data.chhs.ca.gov/dataset/licensed-narcotic-treatment-programs

    The function calculates these metrics for mental and primary health care shortages.

    Methods
    -------
    Data was cleaned by removing duplicate entries containing the same location and facility identifier.
    Data columns were renamed, merged to 2022 census tract data, and summed to attribute scores to all 
    California tracts.
    
    Parameters
    ----------
    df: string
        the dataframe containing the initial emergency management performance grant data
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI emergency management grant metric to AWS
        True = will upload resulting df containing CAL CRAI emergency management grant metric to AWS
    export_filename: string
        name of the csv file to be uploaded to AWS

    Script
    ------
    society_healthcare_shortage.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    print('Data transformation: isolate relevant columns and remove duplicate entries.')
    print('Data transformation: merge data to California counties and calculate county totals.')
    print('Data transformation: merge data to California tracts, keeping county sums for all tracts within.')
    print('Data transformation: import county population estimates and merge into data.')
    print(f'Data transformation: calculate metric by dividing county level narcotic treatment facilities by county population and multiply by 10,000.')

    # Columns to check for duplicates
    selected_columns = ['OTP_CA10', 'geometry', 'DBA']

    # Check for duplicates
    duplicates = df.duplicated(subset=selected_columns, keep=False)

    # Filter out the duplicate rows for display
    duplicate_rows = narcotic_support[duplicates]

    # Drop duplicates, keeping the first occurrence
    narcotic_support_unique_values = narcotic_support.drop_duplicates(subset=selected_columns, keep='first')

    # Columns to check for duplicates
    selected_columns = ['OTP_CA10', 'geometry', 'DBA']

    merge_df = pd.merge(narcotic_support_unique_values, ca_counties_tract, how='left', on='County')
    merge_df = merge_df.drop_duplicates(subset=selected_columns, keep='first')
    merge_df
    count_narcotic_support_facilities = merge_df.groupby(['COUNTYFP', 'County']).size().reset_index(name='num_narcotic_support_facilities')

    # read in CA estimated county population
    pull_county_pop = "s3://ca-climate-index/0_map_data/county_est_pop_2022.csv"
    ca_county_pop = gpd.read_file(pull_county_pop)
    ca_counties_pop = ca_county_pop.drop(columns={'field_1', 'geometry'})
    ca_counties_pop = ca_counties_pop.rename(columns={'county':'County'})

    # Ensure the 'num_narcotic_support_facilities' and 'est_total_pop' columns are numeric
    count_narcotic_support_facilities['num_narcotic_support_facilities'] = pd.to_numeric(count_narcotic_support_facilities['num_narcotic_support_facilities'], errors='coerce')
    ca_counties_pop['est_total_pop'] = pd.to_numeric(ca_counties_pop['est_total_pop'], errors='coerce')

    ca_pop_narcotic_support_merge = pd.merge(ca_counties_pop, count_narcotic_support_facilities, on='County', how='right')
    ca_pop_narcotic_support_merge['narcotic_support_per_10000'] = (ca_pop_narcotic_support_merge['num_narcotic_support_facilities'] / ca_pop_narcotic_support_merge['est_total_pop']) * 10000

    ca_tracts = ca_counties_tract.drop(columns={'COUNTYFP'})
    
    tract_merge = pd.merge(ca_tracts, ca_pop_narcotic_support_merge, on='County', how='left')

    # export to csv and upload to AWS
    if export == True:
        tract_merge.to_csv(export_filename)
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        export_filename = [export_filename]
        upload_csv_aws(export_filename, bucket_name, directory)

        if os.path.exists(export_filename[0]):
            os.remove(export_filename[0])

    return tract_merge

In [29]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/society_economy/social_services/ca_health_human_services/'

pull_gpkg_from_directory(bucket_name, aws_dir)

narcotic_support_data = gpd.read_file('society_hrsa_narcotic_support.gpkg')

narcotic_columns = ['OBJECTID',
                    'CountyName',
                    'DBA',
                    'OTP_CA10',
                    'geometry'
                    ]
narcotic_support = narcotic_support_data[narcotic_columns]
narcotic_support = narcotic_support.rename(columns={'CountyName': 'County'})
narcotic_support['County'] = narcotic_support['County'].str.replace(' County', '', case=False)

narcotic_support_metric_calc(narcotic_support, export=True, export_filename='society_narcotic_support_metric.csv', varname='society_hrsa_narcotic_support')

Saved GeoPackage as 'society_hrsa_mental_care_shortage.gpkg' locally
Saved GeoPackage as 'society_hrsa_primary_care_shortage.gpkg' locally
Saved GeoPackage as 'society_hrsa_narcotic_support.gpkg' locally


Unnamed: 0,County,est_total_pop,COUNTYFP,num_narcotic_support_facilities,narcotic_support_per_10000
0,Santa Clara,1916831.0,085,3.0,0.015651
1,Santa Clara,1916831.0,085,3.0,0.015651
2,Santa Clara,1916831.0,085,3.0,0.015651
3,Santa Clara,1916831.0,085,3.0,0.015651
4,Santa Clara,1916831.0,085,3.0,0.015651
...,...,...,...,...,...
9124,Orange,3175227.0,059,6.0,0.018896
9125,Orange,3175227.0,059,6.0,0.018896
9126,Orange,3175227.0,059,6.0,0.018896
9127,Contra Costa,1162648.0,013,6.0,0.051606
