In [73]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [74]:
# Pull the GeoPackage (.gpkg) files stored under `aws_dir` in the AWS bucket
# and save them to the current working directory
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/society_economy/social_services/ca_health_human_services/'

pull_gpkg_from_directory(bucket_name, aws_dir)

Saved GeoPackage as 'society_hrsa_mental_care_shortage.gpkg' locally
Saved GeoPackage as 'society_hrsa_primary_care_shortage.gpkg' locally
Saved GeoPackage as 'society_hrsa_narcotic_support.gpkg' locally


In [75]:
# Load each locally saved GeoPackage into its own GeoDataFrame
(
    mental_healthcare_shortage_data,
    primary_healthcare_shortage_data,
    narcotic_support_data,
) = (
    gpd.read_file(gpkg_path)
    for gpkg_path in (
        'society_hrsa_mental_care_shortage.gpkg',
        'society_hrsa_primary_care_shortage.gpkg',
        'society_hrsa_narcotic_support.gpkg',
    )
)


In [76]:
# List the columns available in the mental healthcare shortage layer
mental_healthcare_shortage_data.columns

Index(['objectid_1', 'HpsSrcID', 'HpsNM', 'HpsStatCD', 'HpsStatDes',
       'HpsTypCD', 'HpsTypDes', 'HpsScore', 'HpsShtg', 'HpsFormlRt', 'HpsFte',
       'HpsPvt', 'HpsAddr', 'HpsCity', 'HpsStAbbr', 'HpsZipCD', 'HpsDgrShtg',
       'HpsDgnDT', 'HpsDgnLUDT', 'HpsDgnPp', 'HpsEsUsvPp', 'HpsEsSvPp',
       'HpsPpPdRtG', 'DscpClsNum', 'DscpClsDes', 'GeoID', 'CntFips', 'CntNM',
       'StCntFips', 'StFips', 'StAbbr', 'StNM', 'PriStNM', 'PriStFips',
       'PriRegNM', 'UMBdCntInd', 'UMBd100Ind', 'DwRecCrtDT', 'DwRecCtDtT',
       'BrkDgnIND', 'X', 'Y', 'HpsPpTypCD', 'HpsPpTypDe', 'HpsRCPp', 'CCityNM',
       'CZipCD', 'CCntStAbbr', 'CStCntFips', 'CStAbbr', 'CStNM', 'CStFips',
       'CRegNM', 'HpsWdrDT', 'HpsWdrDtT', 'PrvdTypDes', 'SdHpsID', 'SdHpsNM',
       'SdHpsScore', 'SdHpsSrcID', 'RurStatCD', 'RurStatDes', 'USCB_STATEFP',
       'USCB_COUNTYFP', 'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME',
       'USCB_NAMELSAD', 'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND',
       'USCB_AWATER', 'USCB_I

In [77]:
# Columns kept from both HRSA shortage layers: the score and the geometry
relevant_columns = ['HpsScore', 'geometry']

In [78]:
# Keep only the columns named in `relevant_columns` for the mental healthcare layer
mental_healthcare_shortage = mental_healthcare_shortage_data[relevant_columns]
mental_healthcare_shortage

Unnamed: 0,HpsScore,geometry
0,18,POINT (-122.17551 39.92793)
1,15,POINT (-122.17440 39.92868)
2,17,POINT (-122.16986 39.92789)
3,18,POINT (-122.16875 39.92779)
4,21,POINT (-120.15683 37.10569)
...,...,...
450,18,POINT (-116.72574 32.83643)
451,13,POINT (-116.37038 32.67142)
452,19,POINT (-121.87646 39.74770)
453,18,POINT (-120.35831 36.13848)


In [79]:
# Keep only the columns named in `relevant_columns` for the primary healthcare layer
primary_healthcare_shortage = primary_healthcare_shortage_data[relevant_columns]

primary_healthcare_shortage

Unnamed: 0,HpsScore,geometry
0,16,POINT (-120.38213 36.75691)
1,24,POINT (-120.39234 36.73197)
2,12,POINT (-118.24042 34.06099)
3,17,POINT (-118.23971 34.06256)
4,16,POINT (-119.09612 36.21296)
...,...,...
438,14,POINT (-123.33934 39.38994)
439,18,POINT (-117.09204 32.66109)
440,15,POINT (-117.65218 34.06799)
441,20,POINT (-121.41701 36.85325)


In [80]:
# List the columns available in the narcotic support layer
narcotic_support_data.columns


Index(['OBJECTID', 'County', 'Licensee', 'License__', 'DBA', 'OTP_CA10',
       'Address', 'City', 'State', 'Program_Zi', 'Phone_Numb', 'Total_Slot',
       'Operating_', 'Dispensing', 'Weekend_Ho', 'Weekend_Di', 'Program_Di',
       'Medical_Di', 'Latitude', 'Longitude', 'CountyName', 'USCB_STATEFP',
       'USCB_COUNTYFP', 'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME',
       'USCB_NAMELSAD', 'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND',
       'USCB_AWATER', 'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [81]:
# Columns kept from the narcotic support layer
narcotic_columns = ['County', 'DBA', 'OTP_CA10', 'geometry']

# Subset the layer, rename 'County' to 'countyfp', and left-pad the county
# code with zeros so every entry is a three-character string
narcotic_support = (
    narcotic_support_data[narcotic_columns]
    .rename(columns={'County': 'countyfp'})
    .assign(countyfp=lambda frame: frame['countyfp'].astype(str).str.zfill(3))
)
narcotic_support


Unnamed: 0,countyfp,DBA,OTP_CA10,geometry
0,001,,249M,POINT (-121.88414 37.71620)
1,001,,309M,POINT (-122.10225 37.68718)
2,001,BAART Programs Oakland,186M,POINT (-122.24830 37.79120)
3,001,BAART Programs Oakland,186M,POINT (-122.24830 37.79120)
4,001,MedMark Treatment Centers - Hayward,304M,POINT (-122.08032 37.66542)
...,...,...,...,...
177,056,,298M,POINT (-119.06645 34.35216)
178,056,,236M,POINT (-118.78578 34.27040)
179,056,,386M,POINT (-119.22469 34.27695)
180,057,,398M,POINT (-121.57790 38.57071)


In [82]:
# A record counts as a duplicate when both of these columns match another row
selected_columns = ['OTP_CA10', 'geometry']

# keep=False flags every member of a duplicated group, not only the repeats
duplicates = narcotic_support.duplicated(subset=selected_columns, keep=False)

# Rows involved in any duplicated group, and how many there are
duplicate_rows = narcotic_support.loc[duplicates]
duplicate_count = duplicates.sum()

# Report what was found
print(f"Number of duplicate entries in columns {selected_columns}: {duplicate_count}")
print("Duplicate rows:")
display(duplicate_rows)

# Retain only the first occurrence of each duplicated record
narcotic_support_unique_values = narcotic_support.drop_duplicates(
    subset=selected_columns,
    keep='first',
)


Number of duplicate entries in columns ['OTP_CA10', 'geometry']: 19
Duplicate rows:


Unnamed: 0,countyfp,DBA,OTP_CA10,geometry
2,1,BAART Programs Oakland,186M,POINT (-122.24830 37.79120)
3,1,BAART Programs Oakland,186M,POINT (-122.24830 37.79120)
62,19,BAART Programs Lynwood,205M,POINT (-118.18875 33.91724)
63,19,BAART Programs Lynwood,205M,POINT (-118.18875 33.91724)
75,19,BAART Programs Boyle Heights,279M,POINT (-118.21035 34.06239)
76,19,BAART Programs Boyle Heights,279M,POINT (-118.21035 34.06239)
108,34,BAART Programs Carmicheal,293M,POINT (-121.32939 38.61252)
109,34,BAART Programs Carmicheal,293M,POINT (-121.32939 38.61252)
110,34,BAART Programs Norwood,274M,POINT (-121.45857 38.63813)
111,34,BAART Programs Norwood,274M,POINT (-121.45857 38.63813)


In [107]:
# Drop the 'countyfp' column from the de-duplicated narcotic support records
narcotic_support_unique = narcotic_support_unique_values.drop(columns='countyfp')
narcotic_support_unique

Unnamed: 0,DBA,OTP_CA10,geometry
0,,249M,POINT (-121.88414 37.71620)
1,,309M,POINT (-122.10225 37.68718)
2,BAART Programs Oakland,186M,POINT (-122.24830 37.79120)
4,MedMark Treatment Centers - Hayward,304M,POINT (-122.08032 37.66542)
5,,354M,POINT (-122.16715 37.73928)
...,...,...,...
177,,298M,POINT (-119.06645 34.35216)
178,,236M,POINT (-118.78578 34.27040)
179,,386M,POINT (-119.22469 34.27695)
180,,398M,POINT (-121.57790 38.57071)


In [49]:
# Read the CA census TIGER tract file straight from the S3 directory
# (the path points at the 2022 census tract folder)
census_shp_dir = "s3://ca-climate-index/0_map_data/2022_tiger_census_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)

In [84]:
# Preview the census tract boundaries that were just loaded
ca_boundaries

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,GEOID,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,06,037,137504,06037137504,1375.04,Census Tract 1375.04,G5020,S,3837562,0,+34.1480383,-118.5720594,"POLYGON ((-118.58119 34.14318, -118.58099 34.1..."
1,06,037,138000,06037138000,1380,Census Tract 1380,G5020,S,4472196,0,+34.1488008,-118.5910495,"POLYGON ((-118.60573 34.14585, -118.60561 34.1..."
2,06,037,139200,06037139200,1392,Census Tract 1392,G5020,S,1152028,0,+34.1756961,-118.5246446,"POLYGON ((-118.53082 34.18024, -118.52952 34.1..."
3,06,087,120901,06087120901,1209.01,Census Tract 1209.01,G5020,S,18372491,0,+37.0781624,-122.0154263,"POLYGON ((-122.04607 37.07105, -122.04505 37.0..."
4,06,087,120902,06087120902,1209.02,Census Tract 1209.02,G5020,S,6474889,0,+37.0628452,-122.0106243,"POLYGON ((-122.02513 37.04320, -122.02500 37.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9124,06,001,444500,06001444500,4445,Census Tract 4445,G5020,S,1959283,0,+37.5342653,-122.0273658,"POLYGON ((-122.03998 37.52944, -122.03971 37.5..."
9125,06,001,450200,06001450200,4502,Census Tract 4502,G5020,S,1612601,0,+37.7224277,-121.9154123,"POLYGON ((-121.92582 37.72685, -121.92470 37.7..."
9126,06,001,450300,06001450300,4503,Census Tract 4503,G5020,S,3133641,0,+37.7106517,-121.9177272,"POLYGON ((-121.92929 37.72608, -121.92864 37.7..."
9127,06,105,000500,06105000500,5,Census Tract 5,G5020,S,3706834371,4577022,+40.3425806,-123.2757792,"POLYGON ((-123.54464 40.59402, -123.54445 40.6..."


In [56]:
# Build the tract lookup in one chain:
#   1. keep only the tract id, geometry, and county code columns
#   2. rename 'GEOID' -> 'tract' and 'COUNTYFP' -> 'countyfp'
#   3. strip the first character of each tract id (the leading '0' in the
#      GEOIDs shown above)
filtered_ca_boundaries = (
    ca_boundaries[['GEOID', 'geometry', 'COUNTYFP']]
    .rename(columns={'GEOID': 'tract', 'COUNTYFP': 'countyfp'})
    .assign(tract=lambda frame: frame['tract'].str[1:])
)

# Show the resulting frame
filtered_ca_boundaries

Unnamed: 0,tract,geometry,countyfp
0,6037137504,"POLYGON ((-118.58119 34.14318, -118.58099 34.1...",037
1,6037138000,"POLYGON ((-118.60573 34.14585, -118.60561 34.1...",037
2,6037139200,"POLYGON ((-118.53082 34.18024, -118.52952 34.1...",037
3,6087120901,"POLYGON ((-122.04607 37.07105, -122.04505 37.0...",087
4,6087120902,"POLYGON ((-122.02513 37.04320, -122.02500 37.0...",087
...,...,...,...
9124,6001444500,"POLYGON ((-122.03998 37.52944, -122.03971 37.5...",001
9125,6001450200,"POLYGON ((-121.92582 37.72685, -121.92470 37.7...",001
9126,6001450300,"POLYGON ((-121.92929 37.72608, -121.92864 37.7...",001
9127,6105000500,"POLYGON ((-123.54464 40.59402, -123.54445 40.6...",105


In [109]:
# Re-display the mental healthcare shortage points ahead of the spatial join
mental_healthcare_shortage

Unnamed: 0,HpsScore,geometry
0,18,POINT (-122.17551 39.92793)
1,15,POINT (-122.17440 39.92868)
2,17,POINT (-122.16986 39.92789)
3,18,POINT (-122.16875 39.92779)
4,21,POINT (-120.15683 37.10569)
...,...,...
450,18,POINT (-116.72574 32.83643)
451,13,POINT (-116.37038 32.67142)
452,19,POINT (-121.87646 39.74770)
453,18,POINT (-120.35831 36.13848)


In [136]:
# Point layers to attach to census tracts, keyed by the base name used for
# the joined result
gdfs = {
    'mental_healthcare_shortage': mental_healthcare_shortage,
    'primary_healthcare_shortage': primary_healthcare_shortage,
    'narcotic_support_unique': narcotic_support_unique
}

# Put the tract polygons in EPSG:4269 so both sides of each join share a CRS
filtered_ca_boundaries = filtered_ca_boundaries.to_crs(epsg=4269)

# Joined results, keyed by '<layer name>_tract'
result_gdfs = {}

# Loop through each GeoDataFrame, perform the spatial join, and store the result
for name, gdf in gdfs.items():
    # Reproject the point layer to the same CRS as the tract polygons
    gdf_crs_corrected = gdf.to_crs(epsg=4269)

    # Left join keeps every tract: a tract is repeated once per intersecting
    # point, and its joined attributes are NaN when no point intersects it
    result_gdf = gpd.sjoin(filtered_ca_boundaries, gdf_crs_corrected, how='left', predicate='intersects')

    # Store the result in the dictionary with the new name
    result_name = name + '_tract'
    result_gdfs[result_name] = result_gdf

# Show each joined GeoDataFrame
for name, gdf in result_gdfs.items():
    print(f"{name}:")
    display(gdf)

# Bind each result to an explicit variable rather than injecting names through
# globals(), so the variables used by later cells have a visible definition.
# Copies are stored so edits to these frames do not alter result_gdfs.
mental_healthcare_shortage_tract = result_gdfs['mental_healthcare_shortage_tract'].copy()
primary_healthcare_shortage_tract = result_gdfs['primary_healthcare_shortage_tract'].copy()
narcotic_support_unique_tract = result_gdfs['narcotic_support_unique_tract'].copy()


mental_healthcare_shortage_tract:


Unnamed: 0,tract,geometry,countyfp,index_right,HpsScore
0,6037137504,"POLYGON ((-118.58119 34.14318, -118.58099 34.1...",037,,
1,6037138000,"POLYGON ((-118.60573 34.14585, -118.60561 34.1...",037,,
2,6037139200,"POLYGON ((-118.53082 34.18024, -118.52952 34.1...",037,,
3,6087120901,"POLYGON ((-122.04607 37.07105, -122.04505 37.0...",087,,
4,6087120902,"POLYGON ((-122.02513 37.04320, -122.02500 37.0...",087,,
...,...,...,...,...,...
9125,6001450200,"POLYGON ((-121.92582 37.72685, -121.92470 37.7...",001,,
9126,6001450300,"POLYGON ((-121.92929 37.72608, -121.92864 37.7...",001,,
9127,6105000500,"POLYGON ((-123.54464 40.59402, -123.54445 40.6...",105,64.0,19.0
9127,6105000500,"POLYGON ((-123.54464 40.59402, -123.54445 40.6...",105,63.0,14.0


primary_healthcare_shortage_tract:


Unnamed: 0,tract,geometry,countyfp,index_right,HpsScore
0,6037137504,"POLYGON ((-118.58119 34.14318, -118.58099 34.1...",037,,
1,6037138000,"POLYGON ((-118.60573 34.14585, -118.60561 34.1...",037,,
2,6037139200,"POLYGON ((-118.53082 34.18024, -118.52952 34.1...",037,,
3,6087120901,"POLYGON ((-122.04607 37.07105, -122.04505 37.0...",087,,
4,6087120902,"POLYGON ((-122.02513 37.04320, -122.02500 37.0...",087,,
...,...,...,...,...,...
9125,6001450200,"POLYGON ((-121.92582 37.72685, -121.92470 37.7...",001,,
9126,6001450300,"POLYGON ((-121.92929 37.72608, -121.92864 37.7...",001,,
9127,6105000500,"POLYGON ((-123.54464 40.59402, -123.54445 40.6...",105,8.0,19.0
9127,6105000500,"POLYGON ((-123.54464 40.59402, -123.54445 40.6...",105,7.0,17.0


narcotic_support_unique_tract:


Unnamed: 0,tract,geometry,countyfp,index_right,DBA,OTP_CA10
0,6037137504,"POLYGON ((-118.58119 34.14318, -118.58099 34.1...",037,,,
1,6037138000,"POLYGON ((-118.60573 34.14585, -118.60561 34.1...",037,,,
2,6037139200,"POLYGON ((-118.53082 34.18024, -118.52952 34.1...",037,,,
3,6087120901,"POLYGON ((-122.04607 37.07105, -122.04505 37.0...",087,,,
4,6087120902,"POLYGON ((-122.02513 37.04320, -122.02500 37.0...",087,,,
...,...,...,...,...,...,...
9124,6001444500,"POLYGON ((-122.03998 37.52944, -122.03971 37.5...",001,,,
9125,6001450200,"POLYGON ((-121.92582 37.72685, -121.92470 37.7...",001,,,
9126,6001450300,"POLYGON ((-121.92929 37.72608, -121.92864 37.7...",001,,,
9127,6105000500,"POLYGON ((-123.54464 40.59402, -123.54445 40.6...",105,,,


In [137]:
# Column whose populated rows we want to inspect
column_name = 'HpsScore'

# True for tracts where the joined column has a value (i.e. not NaN)
mask = primary_healthcare_shortage_tract[column_name].notna()

# Keep only the rows flagged by the mask
non_null_rows = primary_healthcare_shortage_tract.loc[mask]

# Show the tracts that received a score
non_null_rows

Unnamed: 0,tract,geometry,countyfp,index_right,HpsScore
41,6037231302,"POLYGON ((-118.30891 34.01188, -118.30891 34.0...",037,325.0,19.0
69,6047000305,"POLYGON ((-120.74270 37.39928, -120.74261 37.3...",047,430.0,20.0
83,6037139302,"POLYGON ((-118.54073 34.18019, -118.54070 34.1...",037,90.0,12.0
110,6067001102,"POLYGON ((-121.48617 38.57591, -121.48593 38.5...",067,342.0,16.0
110,6067001102,"POLYGON ((-121.48617 38.57591, -121.48593 38.5...",067,341.0,18.0
...,...,...,...,...,...
9099,6037194500,"POLYGON ((-118.37880 34.07639, -118.37865 34.0...",037,191.0,15.0
9120,6037195710,"POLYGON ((-118.27351 34.07208, -118.27292 34.0...",037,170.0,13.0
9127,6105000500,"POLYGON ((-123.54464 40.59402, -123.54445 40.6...",105,8.0,19.0
9127,6105000500,"POLYGON ((-123.54464 40.59402, -123.54445 40.6...",105,7.0,17.0
