In [1]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Make the project-local `scripts` package importable by adding the directory two
# levels up to the import path (presumably the repository root — confirm).
# NOTE: expanduser() only expands a leading '~', so '../../' is passed through unchanged.
sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [2]:
# Pull the reprojected species biodiversity GeoPackage (not a csv) from the AWS S3 bucket.
# The helper saves the file into the current working directory.
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/natural_systems/ecosystem_condition/ca_dept_fish_wildlife/species_biodiversity/'

pull_gpkg_from_directory(bucket_name, aws_dir)

Saved GeoPackage as 'natural_fws_biodiversity.gpkg' locally


In [34]:
# Load the locally saved GeoPackage into a GeoDataFrame.
species_biodiversity_data = gpd.read_file('natural_fws_biodiversity.gpkg')

In [35]:
# Inspect the available column names.
species_biodiversity_data.columns

Index(['Hex_ID', 'HUC12', 'Name', 'SpBioRnkEco', 'SpBioWtEco', 'SpBioRnkSW',
       'BioTrRnkEco', 'BioTrRnkSW', 'BioAqRnkSW', 'NtvSpRnkEco', 'NtvSpRnkSW',
       'NtvTrRnkEco', 'NtvTrRnkSW', 'NtvAqRnkSW', 'RarRnkEco', 'RarRnkSW',
       'RarTrRnkEco', 'RarTrRnkSW', 'RarAqRnkSW', 'RwiRankEco', 'RwiRankSW',
       'RwiTrRnkEco', 'RwiTrRnkSW', 'RwiAqRnkSW', 'TerrConnRank',
       'TerrClimRank', 'TerrHabRank', 'AqHabRank', 'Eco_Sect', 'Eco_Name',
       'Jepson_Eco', 'County', 'Shape_Length', 'Shape_Area', 'USCB_STATEFP',
       'USCB_COUNTYFP', 'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME',
       'USCB_NAMELSAD', 'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND',
       'USCB_AWATER', 'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [4]:
# Preview the loaded biodiversity table.
species_biodiversity_data

Unnamed: 0,Hex_ID,HUC12,Name,SpBioRnkEco,SpBioWtEco,SpBioRnkSW,BioTrRnkEco,BioTrRnkSW,BioAqRnkSW,NtvSpRnkEco,...,USCB_GEOID,USCB_NAME,USCB_NAMELSAD,USCB_MTFCC,USCB_FUNCSTAT,USCB_ALAND,USCB_AWATER,USCB_INTPTLAT,USCB_INTPTLON,geometry
0,100.0,171003090103,Dutch Creek-Elliott Creek,2,0.340206,4,4.0,5.0,3,1,...,06093001300,13,Census Tract 13,G5020,S,3724670003,12972083,+41.7792602,-123.2366787,"POLYGON ((-123.13610 41.98144, -123.13605 41.9..."
1,100.0,171003090104,Middle Fork Applegate River-Applegate River,3,0.343531,4,4.0,5.0,3,1,...,06093001300,13,Census Tract 13,G5020,S,3724670003,12972083,+41.7792602,-123.2366787,"POLYGON ((-123.15674 41.97867, -123.16679 41.9..."
2,101.0,171003090103,Dutch Creek-Elliott Creek,2,0.329657,4,3.0,4.0,3,1,...,06093001300,13,Census Tract 13,G5020,S,3724670003,12972083,+41.7792602,-123.2366787,"POLYGON ((-123.09973 41.98008, -123.10977 41.9..."
3,102.0,171003090103,Dutch Creek-Elliott Creek,2,0.317662,3,3.0,4.0,3,1,...,06093001300,13,Census Tract 13,G5020,S,3724670003,12972083,+41.7792602,-123.2366787,"POLYGON ((-123.04272 41.98147, -123.05276 41.9..."
4,1024.0,180101010301,Upper South Fork Smith River,2,0.301721,4,2.0,4.0,4,2,...,06093001300,13,Census Tract 13,G5020,S,3724670003,12972083,+41.7792602,-123.2366787,"MULTIPOLYGON (((-123.68987 41.80387, -123.6899..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173107,9561.0,180201540305,Clover Creek-Sacramento River,5,0.544980,5,5.0,3.0,5,5,...,06089012102,121.02,Census Tract 121.02,G5020,S,10507081,386121,+40.4565423,-122.2681737,"POLYGON ((-122.24421 40.46160, -122.24446 40.4..."
173108,9640.0,180201540502,Anderson Creek-Sacramento River,5,0.487189,4,4.0,3.0,5,4,...,06089012102,121.02,Census Tract 121.02,G5020,S,10507081,386121,+40.4565423,-122.2681737,"POLYGON ((-122.27435 40.45484, -122.27250 40.4..."
173109,9640.0,180201540305,Clover Creek-Sacramento River,5,0.506300,5,4.0,3.0,5,5,...,06089012102,121.02,Census Tract 121.02,G5020,S,10507081,386121,+40.4565423,-122.2681737,"POLYGON ((-122.26035 40.45536, -122.26121 40.4..."
173110,9719.0,180201540502,Anderson Creek-Sacramento River,5,0.488015,4,4.0,3.0,5,4,...,06089012102,121.02,Census Tract 121.02,G5020,S,10507081,386121,+40.4565423,-122.2681737,"POLYGON ((-122.28619 40.44043, -122.28672 40.4..."


SpBioRnkEco - Ranks of 1-5 assigned to the ecoregionally normalized biodiversity values, with all zero values removed and remaining values broken into 5 quantiles.
SpBioWtEco - Aggregated total of ecoregionally normalized biodiversity values including native species richness, rare species richness, and rarity weighted index. Final sum is re-normalized to 0-1 statewide for ease of interpretation.


In [41]:
# Keep only the hexagon/watershed identifiers, the two ecoregion biodiversity
# fields, the census tract GEOID used for merging, and the geometry.
columns = [
    'Hex_ID',
    'Name',
    'SpBioRnkEco',
    'SpBioWtEco',
    'USCB_GEOID',
    'geometry',
]
filtered_species_biodiversity_data = species_biodiversity_data[columns]
filtered_species_biodiversity_data

Unnamed: 0,Hex_ID,Name,SpBioRnkEco,SpBioWtEco,USCB_GEOID,geometry
0,100.0,Dutch Creek-Elliott Creek,2,0.340206,06093001300,"POLYGON ((-123.13610 41.98144, -123.13605 41.9..."
1,100.0,Middle Fork Applegate River-Applegate River,3,0.343531,06093001300,"POLYGON ((-123.15674 41.97867, -123.16679 41.9..."
2,101.0,Dutch Creek-Elliott Creek,2,0.329657,06093001300,"POLYGON ((-123.09973 41.98008, -123.10977 41.9..."
3,102.0,Dutch Creek-Elliott Creek,2,0.317662,06093001300,"POLYGON ((-123.04272 41.98147, -123.05276 41.9..."
4,1024.0,Upper South Fork Smith River,2,0.301721,06093001300,"MULTIPOLYGON (((-123.68987 41.80387, -123.6899..."
...,...,...,...,...,...,...
173107,9561.0,Clover Creek-Sacramento River,5,0.544980,06089012102,"POLYGON ((-122.24421 40.46160, -122.24446 40.4..."
173108,9640.0,Anderson Creek-Sacramento River,5,0.487189,06089012102,"POLYGON ((-122.27435 40.45484, -122.27250 40.4..."
173109,9640.0,Clover Creek-Sacramento River,5,0.506300,06089012102,"POLYGON ((-122.26035 40.45536, -122.26121 40.4..."
173110,9719.0,Anderson Creek-Sacramento River,5,0.488015,06089012102,"POLYGON ((-122.28619 40.44043, -122.28672 40.4..."


In [17]:
# Print the max, min and mean (in that order) of the biodiversity weight column.
biodiversity_weight = species_biodiversity_data['SpBioWtEco']
for summary in (biodiversity_weight.max, biodiversity_weight.min, biodiversity_weight.mean):
    print(summary())

1.0
0.0
0.35859792288229586


In [27]:
# Read the California census tract / county table from S3, discard the columns
# that are not needed, and rename the tract identifier so it matches the
# biodiversity data's 'USCB_GEOID' merge key.
ca_tract_county_uri = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = (
    gpd.read_file(ca_tract_county_uri)
    .drop(columns=['field_1', 'geometry'])
    .rename(columns={'TRACT': 'USCB_GEOID'})
)

In [28]:
# Preview the tract/county lookup table.
ca_tract_county

Unnamed: 0,USCB_GEOID,COUNTYFP,County
0,06085504321,085,Santa Clara
1,06085504410,085,Santa Clara
2,06085507003,085,Santa Clara
3,06085507004,085,Santa Clara
4,06085502204,085,Santa Clara
...,...,...,...
9124,06059001303,059,Orange
9125,06059001304,059,Orange
9126,06059001401,059,Orange
9127,06013367200,013,Contra Costa


In [37]:
# Left join: every census tract is kept, even when no biodiversity record matches it.
biodiversity_merge = ca_tract_county.merge(
    filtered_species_biodiversity_data,
    how='left',
    on='USCB_GEOID',
)
biodiversity_merge

Unnamed: 0,USCB_GEOID,COUNTYFP,County,Hex_ID,Name,SpBioRnkEco,SpBioWtEco,geometry
0,06085504321,085,Santa Clara,28153.0,Lower Penitencia Creek-Frontal San Francisco B...,3.0,0.403570,"POLYGON ((-121.86618 37.38771, -121.87434 37.3..."
1,06085504321,085,Santa Clara,28241.0,Lower Penitencia Creek-Frontal San Francisco B...,2.0,0.351030,"POLYGON ((-121.86618 37.38771, -121.86620 37.3..."
2,06085504321,085,Santa Clara,28330.0,Lower Penitencia Creek-Frontal San Francisco B...,1.0,0.309290,"POLYGON ((-121.86618 37.38771, -121.85933 37.3..."
3,06085504321,085,Santa Clara,28330.0,Upper Penitencia Creek-Coyote Creek,2.0,0.348707,"POLYGON ((-121.86185 37.38586, -121.86155 37.3..."
4,06085504410,085,Santa Clara,28065.0,Lower Penitencia Creek-Frontal San Francisco B...,2.0,0.350678,"POLYGON ((-121.88576 37.40879, -121.88427 37.4..."
...,...,...,...,...,...,...,...,...
173124,06013367200,013,Contra Costa,24262.0,San Pablo Creek,5.0,0.706609,"POLYGON ((-122.34391 37.96354, -122.34357 37.9..."
173125,06013367200,013,Contra Costa,24341.0,Pinole Creek-Frontal San Pablo Bay Estuaries,5.0,0.796818,"POLYGON ((-122.34391 37.96354, -122.34476 37.9..."
173126,06013367200,013,Contra Costa,24341.0,San Pablo Creek,5.0,0.725536,"POLYGON ((-122.34476 37.96361, -122.34391 37.9..."
173127,06037578100,037,Los Angeles,59080.0,Alamitos Bay,1.0,0.301606,"POLYGON ((-118.12235 33.78001, -118.12235 33.7..."


In [40]:
# Rows count as duplicates when every non-geometry field matches;
# only the first occurrence of each is retained.
columns_to_check = [
    'USCB_GEOID', 'COUNTYFP', 'County',
    'Hex_ID', 'Name', 'SpBioRnkEco', 'SpBioWtEco',
]

filtered_species_biodiversity_merge = biodiversity_merge.drop_duplicates(
    subset=columns_to_check,
    keep='first',
)
print(len(filtered_species_biodiversity_merge))

173122


In [46]:
# Spot-check all de-duplicated rows belonging to a single census tract.
is_target_tract = filtered_species_biodiversity_merge['USCB_GEOID'] == '06001400100'
check_tract = filtered_species_biodiversity_merge[is_target_tract]
check_tract


Unnamed: 0,USCB_GEOID,COUNTYFP,County,Hex_ID,Name,SpBioRnkEco,SpBioWtEco,geometry
119337,6001400100,1,Alameda,24820.0,Pinole Creek-Frontal San Pablo Bay Estuaries,5.0,0.984712,"MULTIPOLYGON (((-122.23784 37.88305, -122.2382..."
119338,6001400100,1,Alameda,24820.0,Cerrito Creek-Frontal San Francisco Bay Estuaries,5.0,0.818047,"POLYGON ((-122.24652 37.88496, -122.24624 37.8..."
119339,6001400100,1,Alameda,24899.0,Pinole Creek-Frontal San Pablo Bay Estuaries,5.0,0.915967,"POLYGON ((-122.22366 37.87856, -122.22388 37.8..."
119340,6001400100,1,Alameda,24899.0,Cerrito Creek-Frontal San Francisco Bay Estuaries,5.0,0.749302,"POLYGON ((-122.21783 37.86853, -122.21787 37.8..."
119341,6001400100,1,Alameda,24899.0,San Pablo Creek,5.0,0.736807,"MULTIPOLYGON (((-122.21651 37.86921, -122.2165..."
119342,6001400100,1,Alameda,24977.0,Cerrito Creek-Frontal San Francisco Bay Estuaries,5.0,0.774291,"POLYGON ((-122.23778 37.87339, -122.22843 37.8..."
119343,6001400100,1,Alameda,25056.0,Cerrito Creek-Frontal San Francisco Bay Estuaries,5.0,0.669923,"POLYGON ((-122.21354 37.85828, -122.21338 37.8..."
119344,6001400100,1,Alameda,25056.0,San Pablo Creek,5.0,0.657428,"POLYGON ((-122.21354 37.85828, -122.21377 37.8..."


In [42]:
# Collapse to one row per census tract by averaging both biodiversity fields
# over all rows that share the same USCB_GEOID.
metric_columns = ['SpBioRnkEco', 'SpBioWtEco']
processed_species_biodiversity = (
    filtered_species_biodiversity_merge
    .groupby('USCB_GEOID')[metric_columns]
    .mean()
    .reset_index()
)

In [43]:
# Preview the per-tract averaged metric table.
processed_species_biodiversity

Unnamed: 0,USCB_GEOID,SpBioRnkEco,SpBioWtEco
0,06001400100,5.000000,0.788310
1,06001400200,3.500000,0.585978
2,06001400300,2.500000,0.434713
3,06001400400,3.333333,0.547906
4,06001400500,3.000000,0.471762
...,...,...,...
9124,06115040902,2.363636,0.321223
9125,06115041001,2.721154,0.348921
9126,06115041002,3.049383,0.346516
9127,06115041101,3.410256,0.357201


In [47]:
# Tally missing values for both metric columns in a single pass
# (presumably tracts left unmatched by the left join — confirm).
nan_counts = filtered_species_biodiversity_merge[['SpBioRnkEco', 'SpBioWtEco']].isna().sum()
num_nan_SpBioRnkEco = nan_counts['SpBioRnkEco']
num_nan_SpBioWtEco = nan_counts['SpBioWtEco']

print(f"Number of NaN values in SpBioRnkEco: {num_nan_SpBioRnkEco}")
print(f"Number of NaN values in SpBioWtEco: {num_nan_SpBioWtEco}")


Number of NaN values in SpBioRnkEco: 17
Number of NaN values in SpBioWtEco: 17


In [48]:
# Write the per-tract metric to a local CSV (without the index column); this is the
# file later passed to the upload function.
processed_species_biodiversity.to_csv('natural_species_biodiversity_metric.csv', index=False)

## Function Call

In [49]:
@append_metadata
def species_biodiversity_upload(input_csv, export=False, varname=''):
    '''
    Uploads the species biodiversity metric to S3 bucket. The metric is:
    
    * Ecoregion Biodiversity Weight score

    Data for this metric was sourced from the California Department of Fish and Wildlife
    species biodiversity dataset.

    Methods
    -------
    Relevant data columns were isolated.
    Biodiversity data were merged with California census tract data on the tract GEOID.
    Duplicate rows based on biodiversity score and location were dropped.
    Data were then grouped by census tract and averaged so each tract has a single score.
    
    Parameters
    ----------
    input_csv: string
        csv species biodiversity metric data 
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI species biodiversity metric to AWS
        True = will upload resulting df containing CAL CRAI species biodiversity metric to AWS
    varname: string
        not read inside this function; accepted so it can be supplied by keyword
        (presumably consumed by the append_metadata decorator -- confirm)

    Script
    ------
    natural_species_biodiversity.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    # NOTE(review): the script name in the docstring is inferred from the metric CSV
    # name (the previous value pointed at governance_protected_areas.ipynb, a
    # copy-paste leftover) -- confirm against the actual notebook filename.
    print('Data transformation: relevant columns were isolated and renamed.')
    print('Data transformation: biodiversity data were merged with California census tract data.')
    print('Data transformation: duplicate rows based on biodiversity score and location were dropped')
    print('Data transformation: data were then grouped by census tract and averaged so each tract has a single score.')

    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        # upload_csv_aws is called with a list of filenames, so wrap the single path.
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)
    else:
        # Previously this branch claimed the file was uploaded even though
        # nothing is sent to AWS when export is False.
        print(f'{input_csv} not uploaded to AWS.')

    # The local copy of input_csv is intentionally left on disk; no cleanup is performed.

In [51]:
input_csv = 'natural_species_biodiversity_metric.csv'
# NOTE(review): `var` is assigned but never passed below -- the call uses varname='test'.
# Presumably a placeholder left over from testing; confirm which value should be supplied.
var = 'natural_fws_biodiversity'

# export=True takes the upload branch of the function, sending input_csv to S3 via upload_csv_aws.
species_biodiversity_upload(input_csv, export=True, varname='test')