## Cal-CRAI Metric Calculation
Domain: Climate Risks \
Indicator: Flooding Exposure

This notebook calculates one metric, sourced from Koordinates:
* Metric 1: Percentage of a tract in a 100 year floodplain area

In [1]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas PerformanceWarning messages, which are purely informational here
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [2]:
# make a list of paths, since this is actually 10 files
def climate_floodplain_pull(
    path='2b_reproject/climate_risk/flood/exposure/koordinates/climate_koordinates_floodplain.parquet.gzip',
    bucket_name='ca-climate-index',
):
    """List the S3 object keys stored under a prefix.

    Parameters
    ----------
    path: string
        Key prefix to filter on; every object whose key starts with this
        prefix is included.
    bucket_name: string
        Name of the S3 bucket to search. Defaults to the bucket this
        notebook has always used, so existing calls are unaffected.

    Returns
    -------
    list of string
        Object keys (not full ``s3://`` URIs) in the order S3 returns them.
        Empty when nothing matches the prefix.
    """
    # a default session picks up whatever AWS credentials are configured locally
    s3 = boto3.Session().resource('s3')
    bucket = s3.Bucket(bucket_name)
    return [obj.key for obj in bucket.objects.filter(Prefix=path)]

In [None]:
# pull the floodplain GeoPackage from AWS
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/climate_risk/flood/exposure/koordinates/'

# NOTE(review): presumably downloads the .gpkg file(s) under `aws_dir` into the
# working directory, since the next cell reads a local file -- confirm against
# scripts/utils/file_helpers
pull_gpkg_from_directory(bucket_name, aws_dir)

In [4]:
# load the floodplain layer from the local GeoPackage file
floodplain_data = gpd.read_file('climate_koordinates_floodplain.gpkg')

In [None]:
# preview the first rows of the floodplain layer
floodplain_data.head()

In [None]:
# list the distinct values present in the FloodZone column
floodplain_data.FloodZone.unique()

In [None]:
# quick visual check of the floodplain geometries
floodplain_data.plot()

In [None]:
# keep only the tract identifier, the flood zone label and the geometry
selected_columns = ['USCB_GEOID', 'FloodZone', 'geometry']
floodplain_columns = floodplain_data[selected_columns]

# rows sharing an identical geometry are collapsed to their first occurrence
one_hundred_yr_floodplain_drop_duplicates = floodplain_columns.drop_duplicates(subset='geometry')
one_hundred_yr_floodplain_drop_duplicates

In [None]:
# Count how many rows were removed as duplicate geometries.
# The count must compare the frame before de-duplication with the frame after it;
# checking the de-duplicated frame for duplicates always reports 0.
duplicate_count = len(floodplain_columns) - len(one_hundred_yr_floodplain_drop_duplicates)

print(f"Number of dropped duplicate geometries: {duplicate_count}")

In [None]:
# read in the CA tract/county lookup table
ca_tract_county_path = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
# NOTE(review): the CSV is read through geopandas rather than pandas, which also
# yields the 'geometry' column dropped below; presumably this keeps tract IDs as
# text -- confirm before switching readers
ca_tract_county = gpd.read_file(ca_tract_county_path)
# 'field_1' is dropped as unneeded
ca_tract_county = ca_tract_county.drop(columns=['field_1'])
ca_tract_county.columns = ca_tract_county.columns.str.lower()
# lower-case every string cell; non-string cells pass through untouched
# (DataFrame.applymap is deprecated from pandas 2.1 in favour of DataFrame.map)
ca_tract_county = ca_tract_county.applymap(lambda s: s.lower() if isinstance(s, str) else s)
# the geometry column is not needed here; tract shapes are attached in a later merge
ca_tract_county = ca_tract_county.drop(columns='geometry')

In [11]:
# load the 2021 TIGER census tract shapes for California, keeping only the
# tract identifier (renamed from 'GEOID' to 'tract') and the geometry
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = (
    gpd.read_file(census_shp_dir)[['GEOID', 'geometry']]
    .rename(columns={'GEOID': 'tract'})
)

In [None]:
# attach tract geometries to the tract/county table (left join keeps every
# row of the tract/county table), then rename the key to 'USCB_GEOID'
ca_tract_county_spatial = (
    ca_tract_county
    .merge(ca_boundaries, on='tract', how='left')
    .rename(columns={'tract': 'USCB_GEOID'})
)
ca_tract_county_spatial

In [13]:
# wrap the merged table in a GeoDataFrame so spatial methods (crs, to_crs, dissolve) are available
geo_ca_tract_county = gpd.GeoDataFrame(ca_tract_county_spatial)

In [None]:
def _repair_invalid_geometries(gdf):
    """Replace invalid geometries in `gdf` with their zero-width buffer, in place."""
    # missing geometries also report as not valid; buffering them leaves them
    # missing instead of raising, unlike a per-geometry attribute call
    invalid = ~gdf.is_valid
    if invalid.any():
        gdf.loc[invalid, 'geometry'] = gdf.loc[invalid, 'geometry'].buffer(0)
    return gdf


def _area_by_tract(gdf, area_column):
    """Dissolve `gdf` by USCB_GEOID and return one row per tract with its area."""
    # only the key and the geometry are needed; dropping the other columns avoids
    # aggregating (string-concatenating) attributes that are never used
    dissolved = gdf[['USCB_GEOID', 'geometry']].dissolve(by='USCB_GEOID')
    return dissolved.area.reset_index(name=area_column)


# Step 1: Check the CRS
print("Initial CRS of floodplain areas:", one_hundred_yr_floodplain_drop_duplicates.crs)
print("Initial CRS of CA tract/counties dataset:", geo_ca_tract_county.crs)

# Step 2: Reproject to an appropriate CRS for area calculations
gdf1 = one_hundred_yr_floodplain_drop_duplicates.to_crs(epsg=3310)  # California Albers
gdf2 = geo_ca_tract_county.to_crs(epsg=3310)  # California Albers

print("Reprojected CRS of floodplain areas:", gdf1.crs)
print("Reprojected CRS of CA tracts:", gdf2.crs)

# Check for and fix invalid geometries in both GeoDataFrames
gdf1 = _repair_invalid_geometries(gdf1)
gdf2 = _repair_invalid_geometries(gdf2)

# Verify if all geometries are now valid
print("Floodplain geometries valid:", gdf1.is_valid.all())
print("Tract geometries valid:", gdf2.is_valid.all())

# Step 3: Calculate the total area of floodplain areas per tract
# (dissolving merges each tract's floodplain polygons into one geometry first)
floodplain_area_tract = _area_by_tract(gdf1, 'floodplain_tract_area')

# Step 4: Calculate the total area of each tract
tract_area = _area_by_tract(gdf2, 'tract_area')

# Step 5: Merge the two datasets on the tract column
# (left join: tracts with no floodplain record keep a missing floodplain area,
# and therefore a missing percentage, rather than 0)
merged_df = pd.merge(tract_area, floodplain_area_tract, on='USCB_GEOID', how='left')

# Step 6: Calculate the spatial percentage of floodplains per tract
merged_df['floodplain_percentage'] = (merged_df['floodplain_tract_area'] / merged_df['tract_area']) * 100

# Round the percentages to two decimal places
merged_df['floodplain_percentage'] = merged_df['floodplain_percentage'].round(2)

# Display the results where the percentage exceeds 100 for further investigation
over_100_percent = merged_df[merged_df['floodplain_percentage'] > 100]
print('number of entries over 100 percent:', len(over_100_percent))

In [None]:
# sanity check: mean floodplain percentage across tracts
merged_df.floodplain_percentage.mean()

In [None]:
# display the final per-tract metric table
merged_df

In [18]:
# write the metric table to a local CSV (without the index); this file name is passed to the upload function below
merged_df.to_csv('climate_floodplain_areas_metric.csv', index=False)

## Function Call

In [19]:
@append_metadata
def floodplain_areas_upload(input_csv, export=False, varname=''):
    '''
    Uploads the floodplain areas metric to S3 bucket. The metric is:
    
    * % of tract in 100 year floodplain area

    Data for this metric was sourced from the Koordinates at:
    https://koordinates.com/layer/96056-california-fema-100-year-floodplains/

    Methods
    -------
    Relevant data columns were isolated.
    One hundred year floodplain entries were filtered for.
    Data was reprojected to match California tract data.
    Using both datasets 'geometry' columns, a total area column for each tract entry was calculated.
    Estimated floodplain tract percentage was calculated by dividing estimated tract floodplain land
    by estimated total tract area.
    These values were rounded to the nearest hundredth to eliminate 18 instances of being >100%.
    
    Parameters
    ----------
    input_csv: string
        csv floodplain areas metric data 
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI floodplain areas metric to AWS
        True = will upload resulting df containing CAL CRAI floodplain areas metric to AWS
    varname: string
        Not used inside this function; presumably consumed by the append_metadata decorator.

    Script
    ------
    climate_floodplain.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    print('Data transformation: relevant columns were isolated and renamed.')
    print('Data transformation: data reprojected to epsg 3310.')
    print('Data transformation: a new column was created to estimate percentage of each tract within a 100 year floodplain.')

    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        # upload_csv_aws is called with a list of file names
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)
    else:
        # nothing is uploaded on this path, so say so instead of claiming success
        print(f'{input_csv} was not uploaded to AWS (export=False).')

    # Local clean-up of the CSV is intentionally disabled:
    # if os.path.exists(input_csv):
    #     os.remove(input_csv)

In [20]:
input_csv = 'climate_floodplain_areas_metric.csv'
# NOTE(review): `variable` is assigned but never used below; the call passes
# varname='test' instead -- confirm which value is intended
variable = 'climate_koordinates_floodplain'

# export=True takes the upload branch of floodplain_areas_upload
floodplain_areas_upload(input_csv, varname='test', export=True)