In [1]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [2]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/natural_systems/ecosystem_conservation/cal_protected_areas/'

pull_gpkg_from_directory(bucket_name, aws_dir)

Saved GeoPackage as 'natural_cnra_protected_areas.gpkg' locally


In [3]:
cnra_protected_areas_data = gpd.read_file('natural_cnra_protected_areas.gpkg')

In [5]:
cnra_protected_areas_data.head()

Unnamed: 0,HOLDING_ID,ACCESS_TYP,UNIT_ID,UNIT_NAME,SUID_NMA,AGNCY_ID,AGNCY_NAME,AGNCY_LEV,AGNCY_TYP,AGNCY_WEB,...,USCB_GEOID,USCB_NAME,USCB_NAMELSAD,USCB_MTFCC,USCB_FUNCSTAT,USCB_ALAND,USCB_AWATER,USCB_INTPTLAT,USCB_INTPTLON,geometry
0,3,No Public Access,35058,El Corte de Madera Creek Open Space Preserve,28198,2079,Midpeninsula Regional Open Space District,Special District,Recreation/Parks District,http://www.openspace.org,...,6081613800,6138.0,Census Tract 6138,G5020,S,467817425,17465118,37.2827241,-122.3443974,"POLYGON ((-122.34611 37.40300, -122.34619 37.4..."
1,53,Open Access,45393,La Raggione Tot Lot,27912,1303,"San Jose, City of",City,City Agency,http://www.sanjoseca.gov/index.aspx?NID=204,...,6085503213,5032.13,Census Tract 5032.13,G5020,S,1099444,0,37.3016723,-121.8373674,"POLYGON ((-121.84377 37.30875, -121.84373 37.3..."
2,56,Open Access,45387,Shady Oaks Park,27915,1303,"San Jose, City of",City,City Agency,http://www.sanjoseca.gov/index.aspx?NID=204,...,6085512001,5120.01,Census Tract 5120.01,G5020,S,20047195,0,37.2568596,-121.7680942,"POLYGON ((-121.79364 37.26456, -121.79239 37.2..."
3,52,Open Access,45391,West Evergreen Park,27916,1303,"San Jose, City of",City,City Agency,http://www.sanjoseca.gov/index.aspx?NID=204,...,6085503304,5033.04,Census Tract 5033.04,G5020,S,1487588,0,37.3126099,-121.8212218,"POLYGON ((-121.81556 37.30859, -121.81557 37.3..."
4,67,Open Access,45281,Martin Griffin Preserve,21904,3005,Audubon Canyon Ranch,Non Profit,Non Profit - Conservation,http://www.egret.org,...,6041132100,1321.0,Census Tract 1321,G5020,S,42866847,10752772,37.9216421,-122.6984111,"POLYGON ((-122.67464 37.92389, -122.67472 37.9..."


In [10]:
cnra_protected_areas_data.columns

Index(['HOLDING_ID', 'ACCESS_TYP', 'UNIT_ID', 'UNIT_NAME', 'SUID_NMA',
       'AGNCY_ID', 'AGNCY_NAME', 'AGNCY_LEV', 'AGNCY_TYP', 'AGNCY_WEB',
       'LAYER', 'MNG_AG_ID', 'MNG_AGNCY', 'MNG_AG_LEV', 'MNG_AG_TYP',
       'SITE_NAME', 'ALT_SITE_N', 'PARK_URL', 'LAND_WATER', 'SPEC_USE', 'CITY',
       'COUNTY', 'ACRES', 'LABEL_NAME', 'DATE_REVIS', 'SRC_ATTR', 'SRC_ALIGN',
       'YR_PROTECT', 'YR_EST', 'GAP1_acres', 'GAP2_acres', 'GAP3_acres',
       'GAP4_acres', 'GAP_tot_ac', 'GAP_Source', 'USCB_STATEFP',
       'USCB_COUNTYFP', 'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME',
       'USCB_NAMELSAD', 'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND',
       'USCB_AWATER', 'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [15]:
# selecting relevant columns for processing
selected_columns = ['HOLDING_ID', 'UNIT_ID', 'UNIT_NAME', 'AGNCY_NAME', 'AGNCY_TYP',
       'SITE_NAME', 'COUNTY', 'USCB_COUNTYFP', 'USCB_TRACTCE', 'USCB_GEOID', 'geometry']

filtered_protected_areas_data = cnra_protected_areas_data[selected_columns]
filtered_protected_areas_data

Unnamed: 0,HOLDING_ID,UNIT_ID,UNIT_NAME,AGNCY_NAME,AGNCY_TYP,SITE_NAME,COUNTY,USCB_COUNTYFP,USCB_TRACTCE,USCB_GEOID,geometry
0,3,35058,El Corte de Madera Creek Open Space Preserve,Midpeninsula Regional Open Space District,Recreation/Parks District,El Corte de Madera Creek Open Space Preserve,San Mateo,081,613800,06081613800,"POLYGON ((-122.34611 37.40300, -122.34619 37.4..."
1,53,45393,La Raggione Tot Lot,"San Jose, City of",City Agency,La Raggione Tot Lot,Santa Clara,085,503213,06085503213,"POLYGON ((-121.84377 37.30875, -121.84373 37.3..."
2,56,45387,Shady Oaks Park,"San Jose, City of",City Agency,Shady Oaks Park,Santa Clara,085,512001,06085512001,"POLYGON ((-121.79364 37.26456, -121.79239 37.2..."
3,52,45391,West Evergreen Park,"San Jose, City of",City Agency,West Evergreen Park,Santa Clara,085,503304,06085503304,"POLYGON ((-121.81556 37.30859, -121.81557 37.3..."
4,67,45281,Martin Griffin Preserve,Audubon Canyon Ranch,Non Profit - Conservation,Martin Griffin Preserve,Marin,041,132100,06041132100,"POLYGON ((-122.67464 37.92389, -122.67472 37.9..."
...,...,...,...,...,...,...,...,...,...,...,...
138787,98698,47808,Balboa,Coastside Land Trust,Non Profit - Land Trust,Balboa,San Mateo,081,613501,06081613501,"POLYGON ((-122.44440 37.46815, -122.44434 37.4..."
138788,106465,48234,Gazos Creek Redwoods,Sempervirens Fund,Non Profit - Land Trust,Gazos Creek Redwoods,San Mateo,081,613800,06081613800,"POLYGON ((-122.30589 37.20080, -122.31470 37.2..."
138789,98702,50475,Wavecrest,Coastside Land Trust,Non Profit - Land Trust,Wavecrest,San Mateo,081,613701,06081613701,"POLYGON ((-122.43742 37.44232, -122.43770 37.4..."
138790,5295,2072,Camp Comfort Park,"Ventura, County of",County Agency,Camp Comfort Park,Ventura,111,001102,06111001102,"MULTIPOLYGON (((-119.25859 34.42709, -119.2585..."


In [21]:
print(filtered_protected_areas_data.AGNCY_TYP.unique())

['Recreation/Parks District' 'City Agency' 'Non Profit - Conservation'
 'County Agency' 'Flood District' 'State Agency' 'Non Profit - Land Trust'
 'Federal Agency' 'Water District' 'Cemetery District'
 'County Agency - Parks' 'School District' 'Non Profit - Other' 'Utility'
 'Special District - Other' 'Open Space District' 'Port/Harbor District'
 'Private' 'Sanitation District' 'Joint Powers Authority'
 'Conservation District' 'Irrigation District'
 'Non Profit - Cemetery District' 'County Agency - Other'
 'Community Services District' 'Mitigation Bank' 'Transportation Agency'
 'Home Owners Association' 'Commercial' 'Other'
 'Metropolitan Planning Organization' 'Non Profit - Education'
 'Mitigation' 'Education' 'Special District' 'Fire District'
 'Airport District' 'Unknown']


In [31]:
public_agencies_non_profits = [
    'Recreation/Parks District', 'City Agency', 'Non Profit - Conservation', 'County Agency', 'Flood District',
    'State Agency', 'Non Profit - Land Trust', 'Federal Agency', 'Water District', 'Cemetery District',
    'County Agency - Parks', 'School District', 'Non Profit - Other', 'Utility', 'Special District - Other',
    'Open Space District', 'Port/Harbor District', 'Sanitation District', 'Joint Powers Authority',
    'Conservation District', 'Irrigation District', 'Non Profit - Cemetery District', 'County Agency - Other',
    'Community Services District', 'Transportation Agency', 'Metropolitan Planning Organization', 
    'Non Profit - Education', 'Fire District', 'Airport District'
]


In [36]:
public_agencies_non_profits_areas_data = filtered_protected_areas_data[filtered_protected_areas_data['AGNCY_TYP'].isin(public_agencies_non_profits)]
public_agencies_non_profits_areas_data

Unnamed: 0,HOLDING_ID,UNIT_ID,UNIT_NAME,AGNCY_NAME,AGNCY_TYP,SITE_NAME,COUNTY,USCB_COUNTYFP,USCB_TRACTCE,USCB_GEOID,geometry
0,3,35058,El Corte de Madera Creek Open Space Preserve,Midpeninsula Regional Open Space District,Recreation/Parks District,El Corte de Madera Creek Open Space Preserve,San Mateo,081,613800,06081613800,"POLYGON ((-122.34611 37.40300, -122.34619 37.4..."
1,53,45393,La Raggione Tot Lot,"San Jose, City of",City Agency,La Raggione Tot Lot,Santa Clara,085,503213,06085503213,"POLYGON ((-121.84377 37.30875, -121.84373 37.3..."
2,56,45387,Shady Oaks Park,"San Jose, City of",City Agency,Shady Oaks Park,Santa Clara,085,512001,06085512001,"POLYGON ((-121.79364 37.26456, -121.79239 37.2..."
3,52,45391,West Evergreen Park,"San Jose, City of",City Agency,West Evergreen Park,Santa Clara,085,503304,06085503304,"POLYGON ((-121.81556 37.30859, -121.81557 37.3..."
4,67,45281,Martin Griffin Preserve,Audubon Canyon Ranch,Non Profit - Conservation,Martin Griffin Preserve,Marin,041,132100,06041132100,"POLYGON ((-122.67464 37.92389, -122.67472 37.9..."
...,...,...,...,...,...,...,...,...,...,...,...
138787,98698,47808,Balboa,Coastside Land Trust,Non Profit - Land Trust,Balboa,San Mateo,081,613501,06081613501,"POLYGON ((-122.44440 37.46815, -122.44434 37.4..."
138788,106465,48234,Gazos Creek Redwoods,Sempervirens Fund,Non Profit - Land Trust,Gazos Creek Redwoods,San Mateo,081,613800,06081613800,"POLYGON ((-122.30589 37.20080, -122.31470 37.2..."
138789,98702,50475,Wavecrest,Coastside Land Trust,Non Profit - Land Trust,Wavecrest,San Mateo,081,613701,06081613701,"POLYGON ((-122.43742 37.44232, -122.43770 37.4..."
138790,5295,2072,Camp Comfort Park,"Ventura, County of",County Agency,Camp Comfort Park,Ventura,111,001102,06111001102,"MULTIPOLYGON (((-119.25859 34.42709, -119.2585..."


In [27]:
import geopandas as gpd

# Assuming gdf1 is your GeoDataFrame containing public agencies and non-profits

# Reproject to California Albers
gdf1 = filtered_protected_areas_data.to_crs(epsg=3310)

# Check for and fix invalid geometries
gdf1['geometry'] = gdf1['geometry'].apply(lambda geom: geom.buffer(0) if not geom.is_valid else geom)
print("Protected areas geometries valid:", gdf1.is_valid.all())

# Dissolve by USCB_GEOID to calculate the total area for public agencies and non-profits
public_agencies_non_profits_area = gdf1.dissolve(by='USCB_GEOID', aggfunc='sum')['geometry'].area.reset_index(name='est_non_agency_area')

# Calculate total area for all agencies (assuming `gdf` contains all agencies)
# Reproject to California Albers
gdf = gdf.to_crs(epsg=3310)

# Check for and fix invalid geometries
gdf['geometry'] = gdf['geometry'].apply(lambda geom: geom.buffer(0) if not geom.is_valid else geom)
print("All agencies geometries valid:", gdf.is_valid.all())

# Dissolve by USCB_GEOID to calculate the total area for all agencies
all_agencies_area = gdf.dissolve(by='USCB_GEOID', aggfunc='sum')['geometry'].area.reset_index(name='total_area')

# Merge the two DataFrames on USCB_GEOID
merged_areas = all_agencies_area.merge(public_agencies_non_profits_area, on='USCB_GEOID', how='left')

# Calculate the percentage of area for public agencies and non-profits
merged_areas['percentage_non_agency_area'] = (merged_areas['est_non_agency_area'] / merged_areas['total_area']) * 100

# Handle any NaN values in 'est_non_agency_area' by setting them to 0
merged_areas['est_non_agency_area'].fillna(0, inplace=True)

# Ensure that no division by zero occurs
merged_areas['total_area'].replace(0, np.nan, inplace=True)

# Calculate the percentage of area for public agencies and non-profits again after handling NaNs
merged_areas['percentage_non_agency_area'] = (merged_areas['est_non_agency_area'] / merged_areas['total_area']) * 100

# Print the result
print(merged_areas[['USCB_GEOID', 'total_area', 'est_non_agency_area', 'percentage_non_agency_area']])


Protected areas geometries valid: True


NameError: name 'gdf' is not defined

In [7]:
# read in CA census tiger file
ca_tract_county = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county)
ca_tract_county = ca_tract_county.drop(columns={'field_1', 'geometry'})
ca_tract_county.columns = ca_tract_county.columns.str.lower()
ca_tract_county = ca_tract_county.applymap(lambda s: s.lower() if type(s) == str else s)

  ca_tract_county = ca_tract_county.applymap(lambda s: s.lower() if type(s) == str else s)


In [8]:
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)
ca_boundaries = ca_boundaries[['GEOID', 'geometry']]
ca_boundaries = ca_boundaries.rename(columns={'GEOID':'tract'})

In [9]:
ca_tract_county_spatial = pd.merge(ca_tract_county, ca_boundaries, on='tract', how='left')
ca_tract_county_spatial = ca_tract_county_spatial.rename(columns={'tract':'USCB_GEOID'})
ca_tract_county_spatial

Unnamed: 0,USCB_GEOID,countyfp,county,geometry
0,06085504321,085,santa clara,"POLYGON ((-121.87556 37.39924, -121.87535 37.3..."
1,06085504410,085,santa clara,"POLYGON ((-121.88886 37.40758, -121.88576 37.4..."
2,06085507003,085,santa clara,"POLYGON ((-122.02489 37.21683, -122.02459 37.2..."
3,06085507004,085,santa clara,"POLYGON ((-121.99304 37.22562, -121.99249 37.2..."
4,06085502204,085,santa clara,"POLYGON ((-121.93167 37.29803, -121.92801 37.3..."
...,...,...,...,...
9124,06059001303,059,orange,"POLYGON ((-117.95917 33.92458, -117.95888 33.9..."
9125,06059001304,059,orange,"POLYGON ((-117.95918 33.92820, -117.95831 33.9..."
9126,06059001401,059,orange,"POLYGON ((-117.95056 33.94503, -117.95055 33.9..."
9127,06013367200,013,contra costa,"POLYGON ((-122.34551 37.96355, -122.34550 37.9..."


In [11]:
geo_ca_tract_county = gpd.GeoDataFrame(ca_tract_county_spatial)

In [22]:
# Step 1: Check the CRS
print("Initial CRS of protected areas:", filtered_protected_areas_data.crs)
print("Initial CRS of CA tract/counties dataset:", geo_ca_tract_county.crs)

# Step 2: Reproject to an appropriate CRS for area calculations
gdf1 = filtered_protected_areas_data.to_crs(epsg=3310)  # California Albers
gdf2 = geo_ca_tract_county.to_crs(epsg=3310)  # California Albers

print("Reprojected CRS of cnra protected areas:", gdf1.crs)
print("Reprojected CRS of Counties:", gdf2.crs)

# Check for and fix invalid geometries in both GeoDataFrames
gdf1['geometry'] = gdf1['geometry'].apply(lambda geom: geom.buffer(0) if not geom.is_valid else geom)
gdf2['geometry'] = gdf2['geometry'].apply(lambda geom: geom.buffer(0) if not geom.is_valid else geom)

# Verify if all geometries are now valid
print("Protected areas geometries valid:", gdf1.is_valid.all())
print("County geometries valid:", gdf2.is_valid.all())

# Step 3: Calculate the total area of protected areas per tract
protected_area_tract = gdf1.dissolve(by='USCB_GEOID', aggfunc='sum')['geometry'].area.reset_index(name='protected_tract_area')

# Step 4: Calculate the total area of each county
county_area = gdf2.dissolve(by='USCB_GEOID', aggfunc='sum')['geometry'].area.reset_index(name='tract_area')

# Step 5: Merge the two datasets on the county column
merged_df = pd.merge(protected_area_tract, county_area, on='USCB_GEOID')

# Step 6: Calculate the spatial percentage of NTMPs per county
merged_df['protected_areas_percentage'] = (merged_df['protected_tract_area'] / merged_df['tract_area']) * 100

Initial CRS of protected areas: EPSG:4269
Initial CRS of CA tract/counties dataset: EPSG:4269
Reprojected CRS of cnra protected areas: EPSG:3310
Reprojected CRS of Counties: EPSG:3310
Protected areas geometries valid: True
County geometries valid: True


In [13]:
merged_df

Unnamed: 0,USCB_GEOID,protected_tract_area,tract_area,protected_areas_percentage
0,06001400100,1.202579e+06,6.945851e+06,17.313634
1,06001400300,4.206150e+03,1.110836e+06,0.378647
2,06001400400,1.488980e+03,7.161367e+05,0.207918
3,06001400500,3.134222e+04,5.914232e+05,5.299457
4,06001400700,5.882607e+03,8.657521e+05,0.679479
...,...,...,...,...
6810,06115040902,1.182862e+03,9.384312e+07,0.001260
6811,06115041001,3.482645e+05,2.776416e+08,0.125437
6812,06115041002,5.443862e+07,1.976832e+08,27.538314
6813,06115041101,2.098810e+08,4.814353e+08,43.594855


In [None]:
# merge back to CA tract/county data so we have our 9129 census tracts
protected_areas_metric = pd.merge(ca_tract_county_spatial, merged_df, on='USCB_GEOID', how='left')
protected_areas_metric