In [1]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [2]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/society_economy/social_services/ca_health_human_services/'

pull_gpkg_from_directory(bucket_name, aws_dir)

Saved GeoPackage as 'society_hrsa_mental_care_shortage.gpkg' locally
Saved GeoPackage as 'society_hrsa_primary_care_shortage.gpkg' locally
Saved GeoPackage as 'society_hrsa_narcotic_support.gpkg' locally


In [3]:
mental_healthcare_shortage_data = gpd.read_file('society_hrsa_mental_care_shortage.gpkg')
primary_healthcare_shortage_data = gpd.read_file('society_hrsa_primary_care_shortage.gpkg')
narcotic_support_data = gpd.read_file('society_hrsa_narcotic_support.gpkg')


In [4]:
relevant_columns = [
                    'HpsScore',
                    'CntFips'
                    ]

In [5]:
mental_healthcare_shortage = mental_healthcare_shortage_data[relevant_columns]
primary_healthcare_shortage = primary_healthcare_shortage_data[relevant_columns]

In [6]:
mental_healthcare_shortage

Unnamed: 0,HpsScore,CntFips
0,18,103
1,15,103
2,17,103
3,18,103
4,21,039
...,...,...
450,18,073
451,13,073
452,19,007
453,18,019


In [7]:
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_counties_tract = gpd.read_file(census_shp_dir)
ca_counties = ca_counties_tract.drop(columns={'field_1', 'TRACT', 'geometry'})
ca_counties.rename(columns={'COUNTYFP':'CntFips'}, inplace=True)

ca_counties

Unnamed: 0,CntFips,County
0,085,Santa Clara
1,085,Santa Clara
2,085,Santa Clara
3,085,Santa Clara
4,085,Santa Clara
...,...,...
9124,059,Orange
9125,059,Orange
9126,059,Orange
9127,013,Contra Costa


In [7]:
# Display the length of the original DataFrame
print("Length of original DataFrame:", len(mental_healthcare_shortage))

# Removing duplicates based on both 'CntFips' and 'HpsScore' columns
mental_healthcare_shortage_cleaned = mental_healthcare_shortage.drop_duplicates(subset=['CntFips', 'HpsScore'], keep='first')

# Display the length of the cleaned DataFrame
print("Length of cleaned DataFrame:", len(mental_healthcare_shortage_cleaned))

# Display the first few rows of the cleaned DataFrame
print(mental_healthcare_shortage_cleaned.head())

Length of original DataFrame: 455
Length of cleaned DataFrame: 227
   HpsScore CntFips  Flag
0        18     103     1
1        15     103     1
2        17     103     1
4        21     039     1
5        18     107     1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mental_healthcare_shortage['Flag'] = mental_healthcare_shortage['HpsScore'].apply(lambda x: 1 if x > 0 else 0)


In [8]:
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_counties_tract = gpd.read_file(census_shp_dir)
ca_counties = ca_counties_tract.drop(columns={'field_1', 'TRACT', 'geometry'})
ca_counties.rename(columns={'COUNTYFP':'CntFips'}, inplace=True)

ca_counties

Unnamed: 0,CntFips,County
0,085,Santa Clara
1,085,Santa Clara
2,085,Santa Clara
3,085,Santa Clara
4,085,Santa Clara
...,...,...
9124,059,Orange
9125,059,Orange
9126,059,Orange
9127,013,Contra Costa


In [9]:
mental_merge = pd.merge(mental_healthcare_shortage_cleaned, ca_counties, how='left', on='CntFips')
mental_merge = mental_merge.drop_duplicates(subset=['CntFips', 'HpsScore'], keep='first')

mental_merge


Unnamed: 0,HpsScore,CntFips,Flag,County
0,18,103,1,Tehama
14,15,103,1,Tehama
28,17,103,1,Tehama
42,21,039,1,Madera
76,18,107,1,Tulare
...,...,...,...,...
65908,20,087,1,Santa Cruz
65978,22,095,1,Solano
66078,16,025,1,Imperial
66118,13,073,1,San Diego


In [11]:
average_hpsscore_county = mental_merge.groupby('CntFips')['HpsScore'].mean().reset_index()
average_hpsscore_county = average_hpsscore_county.rename(columns={'HpsScore':'Avg_HpsScore'})
average_hpsscore_county = pd.merge(average_hpsscore_county, ca_counties, how='right', on='CntFips')
average_hpsscore_county = average_hpsscore_county.drop_duplicates(subset=['CntFips', 'Avg_HpsScore'], keep='first')
average_hpsscore_county['Hps_Flag'] = average_hpsscore_county['Avg_HpsScore'].apply(lambda x: 1 if pd.notna(x) and x > 0 else None)

print(len(average_hpsscore_county))
average_hpsscore_county

58


Unnamed: 0,CntFips,Avg_HpsScore,County,Hps_Flag
0,85,18.2,Santa Clara,1.0
55,59,18.333333,Orange,1.0
80,1,17.25,Alameda,1.0
121,29,17.0,Kern,1.0
129,71,17.714286,San Bernardino,1.0
162,55,16.0,Napa,1.0
163,17,17.8,El Dorado,1.0
168,75,18.5,San Francisco,1.0
177,53,16.166667,Monterey,1.0
215,13,14.0,Contra Costa,1.0


In [22]:
ca_tracts = ca_counties_tract.drop(columns={'field_1', 'geometry'})
ca_tracts.rename(columns={'COUNTYFP':'CntFips'}, inplace=True)
ca_tracts = ca_tracts.drop(columns={'County'})

ca_tracts

Unnamed: 0,TRACT,CntFips
0,06085504321,085
1,06085504410,085
2,06085507003,085
3,06085507004,085
4,06085502204,085
...,...,...
9124,06059001303,059
9125,06059001304,059
9126,06059001401,059
9127,06013367200,013


In [24]:
tract_merge = pd.merge(ca_tracts, average_hpsscore_county, on='CntFips', how='right')
tract_merge = tract_merge.drop(columns={'CntFips', 'County'})
tract_merge

Unnamed: 0,TRACT,Avg_HpsScore,Hps_Flag
0,06085504321,18.2,1.0
1,06085504410,18.2,1.0
2,06085507003,18.2,1.0
3,06085507004,18.2,1.0
4,06085502204,18.2,1.0
...,...,...,...
9124,06021010300,15.0,1.0
9125,06051000102,16.0,1.0
9126,06051000101,16.0,1.0
9127,06051000201,16.0,1.0


Function Call

In [31]:
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_counties_tract = gpd.read_file(census_shp_dir)
ca_counties = ca_counties_tract.drop(columns={'field_1', 'TRACT', 'geometry'})
ca_counties.rename(columns={'COUNTYFP':'CntFips'}, inplace=True)

def healthcare_shortage_calc(df):    
    healthcare_shortage_df_cleaned = df.drop_duplicates(subset=['CntFips', 'HpsScore'], keep='first')
    merge_df = pd.merge(healthcare_shortage_df_cleaned, ca_counties, how='left', on='CntFips')
    merge_df = merge_df.drop_duplicates(subset=['CntFips', 'HpsScore'], keep='first')

    average_hpsscore_county = merge_df.groupby('CntFips')['HpsScore'].mean().reset_index()
    average_hpsscore_county = average_hpsscore_county.rename(columns={'HpsScore':'Avg_HpsScore'})
    average_hpsscore_county = pd.merge(average_hpsscore_county, ca_counties, how='right', on='CntFips')
    average_hpsscore_county = average_hpsscore_county.drop_duplicates(subset=['CntFips', 'Avg_HpsScore'], keep='first')
    average_hpsscore_county['Hps_Flag'] = average_hpsscore_county['Avg_HpsScore'].apply(lambda x: 1 if pd.notna(x) and x > 0 else None)

    ca_tracts = ca_counties_tract.drop(columns={'field_1', 'geometry'})
    ca_tracts.rename(columns={'COUNTYFP':'CntFips'}, inplace=True)
    ca_tracts = ca_tracts.drop(columns={'County'})

    tract_merge = pd.merge(ca_tracts, average_hpsscore_county, on='CntFips', how='right')
    tract_merge = tract_merge.drop(columns={'CntFips', 'County'})

    # Calculate and print the number of rows with NaN or 0 in Hps_Flag
    na_count = tract_merge['Hps_Flag'].isna().sum()
    zero_count = (tract_merge['Hps_Flag'] == 0).sum()
    print(f"Number of rows with NaN in Hps_Flag: {na_count}")
    print(f"Number of rows with 0 in Hps_Flag: {zero_count}")
    
    print(len(tract_merge))
    return tract_merge


healthcare_shortage_calc(primary_healthcare_shortage)

Number of rows with NaN in Hps_Flag: 2
Number of rows with 0 in Hps_Flag: 0
9129


Unnamed: 0,TRACT,Avg_HpsScore,Hps_Flag
0,06085504321,10.5,1.0
1,06085504410,10.5,1.0
2,06085507003,10.5,1.0
3,06085507004,10.5,1.0
4,06085502204,10.5,1.0
...,...,...,...
9124,06021010300,17.0,1.0
9125,06051000102,12.0,1.0
9126,06051000101,12.0,1.0
9127,06051000201,12.0,1.0


In [23]:
narcotic_columns = ['County',
                    'DBA',
                    'OTP_CA10',
                    'geometry'
                    ]
narcotic_support = narcotic_support_data[narcotic_columns]
narcotic_support = narcotic_support.rename(columns={'County': 'countyfp'})

# Ensure all entries in 'County' column are three digits
narcotic_support['countyfp'] = narcotic_support['countyfp'].astype(str).str.zfill(3)
narcotic_support


Unnamed: 0,countyfp,DBA,OTP_CA10,geometry
0,001,,249M,POINT (-121.88414 37.71620)
1,001,,309M,POINT (-122.10225 37.68718)
2,001,BAART Programs Oakland,186M,POINT (-122.24830 37.79120)
3,001,BAART Programs Oakland,186M,POINT (-122.24830 37.79120)
4,001,MedMark Treatment Centers - Hayward,304M,POINT (-122.08032 37.66542)
...,...,...,...,...
177,056,,298M,POINT (-119.06645 34.35216)
178,056,,236M,POINT (-118.78578 34.27040)
179,056,,386M,POINT (-119.22469 34.27695)
180,057,,398M,POINT (-121.57790 38.57071)


In [8]:
# Columns to check for duplicates
selected_columns = ['OTP_CA10', 'geometry']

# Check for duplicates
duplicates = narcotic_support.duplicated(subset=selected_columns, keep=False)

# Count duplicates
duplicate_count = duplicates.sum()

# Filter out the duplicate rows for display
duplicate_rows = narcotic_support[duplicates]

print(f"Number of duplicate entries in columns {selected_columns}: {duplicate_count}")
print("Duplicate rows:")
display(duplicate_rows)

# Drop duplicates, keeping the first occurrence
narcotic_support_unique_values = narcotic_support.drop_duplicates(subset=selected_columns, keep='first')


Number of duplicate entries in columns ['OTP_CA10', 'geometry']: 19
Duplicate rows:


Unnamed: 0,countyfp,DBA,OTP_CA10,geometry
2,1,BAART Programs Oakland,186M,POINT (-122.24830 37.79120)
3,1,BAART Programs Oakland,186M,POINT (-122.24830 37.79120)
62,19,BAART Programs Lynwood,205M,POINT (-118.18875 33.91724)
63,19,BAART Programs Lynwood,205M,POINT (-118.18875 33.91724)
75,19,BAART Programs Boyle Heights,279M,POINT (-118.21035 34.06239)
76,19,BAART Programs Boyle Heights,279M,POINT (-118.21035 34.06239)
108,34,BAART Programs Carmicheal,293M,POINT (-121.32939 38.61252)
109,34,BAART Programs Carmicheal,293M,POINT (-121.32939 38.61252)
110,34,BAART Programs Norwood,274M,POINT (-121.45857 38.63813)
111,34,BAART Programs Norwood,274M,POINT (-121.45857 38.63813)
