## This notebook calculates the climate risk metrics sourced from Pacific Institute. The metrics include:

* % tract vulnerable to sea level rise under baseline conditions
* % tract vulnerable to sea level rise under 1.4m rise conditions
* number of fire stations vulnerable to sea level rise under baseline conditions
* number of fire stations vulnerable to sea level rise under 1.4m rise conditions
* number of police stations vulnerable to sea level rise under baseline conditions
* number of police stations vulnerable to sea level rise under 1.4m rise conditions
* number of schools vulnerable to sea level rise under baseline conditions
* number of schools vulnerable to sea level rise under 1.4m rise conditions
* number of wastewater treatment plants vulnerable to sea level rise under baseline conditions
* number of wastewater treatment plants vulnerable to sea level rise under 1.4m rise conditions
* number of hospitals vulnerable to sea level rise under baseline conditions
* number of hospitals vulnerable to sea level rise under 1.4m rise conditions
* number of superfund sites vulnerable to flooding following 1.4m sea level rise


In [21]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd
import numpy as np

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, pull_csv_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [22]:
# pull the Pacific Institute sea level rise exposure files from AWS:
# the GeoPackages first, then the csv(s) in the same directory (zipped files are not searched)
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/climate_risk/sea_level_rise/exposure/projections/pacific_institute/'

pull_gpkg_from_directory(bucket_name, aws_dir)
pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved GeoPackage as 'climate_pacific_institute_slr_exposure_fire_stations_2000.gpkg' locally
Saved GeoPackage as 'climate_pacific_institute_slr_exposure_fire_stations_2100.gpkg' locally
Saved GeoPackage as 'climate_pacific_institute_slr_exposure_hospitals_2000.gpkg' locally
Saved GeoPackage as 'climate_pacific_institute_slr_exposure_hospitals_2100.gpkg' locally
Saved GeoPackage as 'climate_pacific_institute_slr_exposure_police_stations_2000.gpkg' locally
Saved GeoPackage as 'climate_pacific_institute_slr_exposure_police_stations_2100.gpkg' locally
Saved GeoPackage as 'climate_pacific_institute_slr_exposure_schools_2000.gpkg' locally
Saved GeoPackage as 'climate_pacific_institute_slr_exposure_schools_2100.gpkg' locally
Saved GeoPackage as 'climate_pacific_institute_slr_exposure_superfund.gpkg' locally
Saved GeoPackage as 'climate_pacific_institute_slr_exposure_wastewater_treatment.gpkg' locally
Saved GeoPackage as 'climate_pacific_institute_slr_exposure_wastewater_treatment_2000.gpkg' l

# Metric 1) Starting with the single csv file
* % tract vulnerable to sea level rise under base and 1.4m rise conditions
    *  the data is at the census block level, so there are multiple entries for census tracts
    * group by census tract and take the average percentage per tract

In [23]:
# load the census vulnerability table, then delete the local copy of the csv
census_vulnerability_csv = 'climate_pacific_institute_slr_exposure_census_vulnerability.csv'
census_vulnerable_slr_tracts = pd.read_csv(census_vulnerability_csv)
os.remove(census_vulnerability_csv)
census_vulnerable_slr_tracts

Unnamed: 0,CensusBlock,CountyFIPS,BlkArea_m2,Perc_2000,Perc_2100,CensusTract
0,60014017001002,1,7.400145e+04,0.013629,0.103142,6001401700
1,60014017001009,1,1.661440e+04,0.162421,0.818744,6001401700
2,60014017001011,1,2.034830e+04,0.033993,0.145896,6001401700
3,60014017002024,1,2.437019e+04,0.007894,0.054463,6001401700
4,60014017002025,1,1.384990e+04,0.218054,0.323897,6001401700
...,...,...,...,...,...,...
9415,61110073001115,111,6.546174e+04,0.008397,0.037215,6111007300
9416,61110073001120,111,2.331707e+04,0.010518,0.024219,6111007300
9417,61110073001121,111,3.143619e+06,0.000482,0.001053,6111007300
9418,61110073001124,111,2.357066e+04,0.006471,0.271358,6111007300


In [24]:
# keep only the tract id and the two exposure columns, renamed for the later merge
column_renames = {'CensusTract': 'tract', 'Perc_2000': 'percent_2000', 'Perc_2100': 'percent_2100'}
slr_tracts_columns = census_vulnerable_slr_tracts[list(column_renames)].rename(columns=column_renames)
slr_tracts_columns

Unnamed: 0,tract,percent_2000,percent_2100
0,6001401700,0.013629,0.103142
1,6001401700,0.162421,0.818744
2,6001401700,0.033993,0.145896
3,6001401700,0.007894,0.054463
4,6001401700,0.218054,0.323897
...,...,...,...
9415,6111007300,0.008397,0.037215
9416,6111007300,0.010518,0.024219
9417,6111007300,0.000482,0.001053
9418,6111007300,0.006471,0.271358


In [25]:
# the source rows are census blocks, so each census tract appears several times;
# collapse to one row per tract by averaging both exposure columns
# NOTE(review): this is an unweighted mean over blocks; the source table also carries
# block areas (BlkArea_m2) -- confirm an area-weighted mean is not wanted
slr_tracts_grouped = slr_tracts_columns.groupby('tract', as_index=False)[['percent_2000', 'percent_2100']].mean()
slr_tracts_grouped

Unnamed: 0,tract,percent_2000,percent_2100
0,6001401700,0.173551,0.332988
1,6001401900,0.130183,0.389499
2,6001403200,0.010183,0.186697
3,6001403300,0.302783,0.452502
4,6001403400,0.205320,0.411437
...,...,...,...
626,6111004600,0.911470,0.959438
627,6111004702,0.522970,0.805990
628,6111004704,0.235746,0.650934
629,6111005600,0.361981,0.665406


In [26]:
# count the distinct tracts left after grouping
unique = slr_tracts_grouped['tract'].unique()
print(unique.size)

631


In [27]:
# read in CA census tiger file
ca_tract_county_path = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county_path)

# adjust the columns and entries within so merging with slr data is easier
ca_tract_county.columns = ca_tract_county.columns.str.lower()
# DataFrame.applymap is deprecated (pandas >= 2.1) and raises a FutureWarning;
# DataFrame.map is its element-wise replacement. Non-string entries pass through unchanged.
ca_tract_county = ca_tract_county.map(lambda s: s.lower() if isinstance(s, str) else s)
ca_tract_county = ca_tract_county.drop(columns=['field_1', 'geometry', 'countyfp'])

# Remove leading zeros from the 'tract' column so the ids line up with the slr data,
# whose tract ids carry no leading zero
ca_tract_county['tract'] = ca_tract_county['tract'].str.lstrip('0')

ca_tract_county

  ca_tract_county = ca_tract_county.applymap(lambda s: s.lower() if type(s) == str else s)


Unnamed: 0,tract,county
0,6085504321,santa clara
1,6085504410,santa clara
2,6085507003,santa clara
3,6085507004,santa clara
4,6085502204,santa clara
...,...,...
9124,6059001303,orange
9125,6059001304,orange
9126,6059001401,orange
9127,6013367200,contra costa


In [28]:
# the merge key must have the same dtype on both sides, so cast both tract columns to str
for tract_table in (ca_tract_county, slr_tracts_grouped):
    tract_table['tract'] = tract_table['tract'].astype(str)

# left join: every California tract is kept, tracts absent from the slr data get NaN
merged_slr_tracts = ca_tract_county.merge(slr_tracts_grouped, on='tract', how='left')
merged_slr_tracts

Unnamed: 0,tract,county,percent_2000,percent_2100
0,6085504321,santa clara,,
1,6085504410,santa clara,,
2,6085507003,santa clara,,
3,6085507004,santa clara,,
4,6085502204,santa clara,,
...,...,...,...,...
9124,6059001303,orange,,
9125,6059001304,orange,,
9126,6059001401,orange,,
9127,6013367200,contra costa,,


In [29]:
# one dataframe per metric: baseline (percent_2000) and 1.4m projected slr (percent_2100)
id_columns = ['tract', 'county']
baseline_slr_vulnerability = merged_slr_tracts[id_columns + ['percent_2000']]
future_slr_vulnerability = merged_slr_tracts[id_columns + ['percent_2100']]

## Take a look! The maximum values in the dataset are just under 1, so the values are likely fractions (0-1) rather than percents and may need to be scaled to percents. That would put some tracts at close to 100% flooding vulnerability, which could make sense depending on the tract.

In [30]:
# Locate the row with the largest baseline value ...
baseline_percent = baseline_slr_vulnerability['percent_2000']
max_index = baseline_percent.idxmax()

# ... and show that tract's full record
max_row = baseline_slr_vulnerability.loc[max_index]
print(max_row)


tract           6081608002
county           san mateo
percent_2000       0.99985
Name: 3215, dtype: object


In [31]:
# write each tract-level metric to its own csv (no index column)
tract_metric_files = {
    'slr_vulnerable_baseline_metric.csv': baseline_slr_vulnerability,
    'slr_vulnerable_future_metric.csv': future_slr_vulnerability,
}
for metric_file, metric_df in tract_metric_files.items():
    metric_df.to_csv(metric_file, index=False)

## Now for the rest of the metrics:
* number of fire stations vulnerable to sea level rise under baseline conditions
* number of fire stations vulnerable to sea level rise under 1.4m rise conditions
* number of police stations vulnerable to sea level rise under baseline conditions
* number of police stations vulnerable to sea level rise under 1.4m rise conditions
* number of schools vulnerable to sea level rise under baseline conditions
* number of schools vulnerable to sea level rise under 1.4m rise conditions
* number of wastewater treatment plants vulnerable to sea level rise under baseline conditions
* number of wastewater treatment plants vulnerable to sea level rise under 1.4m rise conditions
* number of hospitals vulnerable to sea level rise under baseline conditions
* number of hospitals vulnerable to sea level rise under 1.4m rise conditions
* number of superfund sites vulnerable to flooding following 1.4m sea level rise

In [32]:
# load every facility GeoPackage from the local working directory, one GeoDataFrame per file
# NOTE(review): the 2000 / 2100 suffixes presumably correspond to the baseline and 1.4m rise
# scenarios -- confirm against the data source
slr_fire_stations_2000 = gpd.read_file('climate_pacific_institute_slr_exposure_fire_stations_2000.gpkg')
slr_fire_stations_2100 = gpd.read_file('climate_pacific_institute_slr_exposure_fire_stations_2100.gpkg')
slr_hospitals_2000 = gpd.read_file('climate_pacific_institute_slr_exposure_hospitals_2000.gpkg')
slr_hospitals_2100 = gpd.read_file('climate_pacific_institute_slr_exposure_hospitals_2100.gpkg')
slr_police_stations_2000 = gpd.read_file('climate_pacific_institute_slr_exposure_police_stations_2000.gpkg')
slr_police_stations_2100 = gpd.read_file('climate_pacific_institute_slr_exposure_police_stations_2100.gpkg')
slr_schools_2000 = gpd.read_file('climate_pacific_institute_slr_exposure_schools_2000.gpkg')
slr_schools_2100 = gpd.read_file('climate_pacific_institute_slr_exposure_schools_2100.gpkg')
# the superfund and wastewater_treatment files have no year suffix; they are bound to the 2100 names
slr_superfund_2100 = gpd.read_file('climate_pacific_institute_slr_exposure_superfund.gpkg')
slr_wastewater_2100 = gpd.read_file('climate_pacific_institute_slr_exposure_wastewater_treatment.gpkg')
slr_wastewater_2000 = gpd.read_file('climate_pacific_institute_slr_exposure_wastewater_treatment_2000.gpkg')

## Looking at shared columns between all of the shape files
* all share the `USCB_COUNTYFP` column, which is what we want for grouping by county

In [33]:
# Every facility GeoDataFrame, in a fixed order that later cells rely on
gdfs = [slr_fire_stations_2000, slr_fire_stations_2100, slr_hospitals_2000, slr_hospitals_2100,
        slr_police_stations_2000, slr_police_stations_2100, slr_schools_2000, slr_schools_2100,
        slr_superfund_2100, slr_wastewater_2100, slr_wastewater_2000]

# Column names of each GeoDataFrame, held as sets so they can be compared
column_sets = [set(gdf.columns) for gdf in gdfs]

# Columns present in every GeoDataFrame
shared_columns = column_sets[0].intersection(*column_sets[1:])

# Columns each GeoDataFrame has beyond the shared ones
unique_columns = [columns.difference(shared_columns) for columns in column_sets]

# Report the shared columns, then the extras per GeoDataFrame (numbered from 1)
print("Shared columns among all GeoDataFrames:", shared_columns)
for position, extra_columns in enumerate(unique_columns, start=1):
    print(f"Unique columns in GeoDataFrame {position}: {extra_columns}")

Shared columns among all GeoDataFrames: {'USCB_AWATER', 'geometry', 'USCB_INTPTLAT', 'USCB_COUNTYFP', 'USCB_STATEFP', 'USCB_GEOID', 'USCB_TRACTCE', 'USCB_FUNCSTAT', 'USCB_INTPTLON', 'USCB_NAMELSAD', 'USCB_MTFCC', 'USCB_NAME', 'USCB_ALAND'}
Unique columns in GeoDataFrame 1: {'Address', 'City', 'Statea', 'Zipcode', 'Name'}
Unique columns in GeoDataFrame 2: {'Address', 'City', 'Statea', 'Zipcode', 'fld2100', 'Name', 'fld2000'}
Unique columns in GeoDataFrame 3: {'ZIP_CODE', 'TYPE', 'LIC_BEDS', 'CITY', 'ADDRESS_1', 'FACILITY', 'STATE', 'CATEGORY', 'COUNTY'}
Unique columns in GeoDataFrame 4: {'ZIP_CODE', 'TYPE', 'LIC_BEDS', 'CITY', 'ADDRESS_1', 'FACILITY', 'STATE', 'CATEGORY', 'COUNTY'}
Unique columns in GeoDataFrame 5: {'PoliceStat', 'Address', 'City', 'Statea', 'EfClass', 'Zipcode', 'fld2100', 'Name', 'Tract', 'fld2000'}
Unique columns in GeoDataFrame 6: {'fld_2000', 'Address', 'City', 'Statea', 'Zipcode', 'Name', 'fld_2100'}
Unique columns in GeoDataFrame 7: {'ADDRESS', 'NAME', 'CITY', 'N

In [41]:
# all geodataframes share a common county fp column, so we have a function to count number
# of rows for a specified county fp number
# this will help fact check our new dfs that contain county counts per metric
def count_entries_by_county_fp(df, county_fp_number):
    """
    Returns the count of entries in the 'USCB_COUNTYFP' column that match the given county_fp_number.

    Codes are compared as zero-padded, 3-character strings, so 59, '59' and '059'
    all match an entry stored as '059'.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data; must have a 'USCB_COUNTYFP' column.
    county_fp_number (str or int): The county FP number to match.

    Returns:
    int: The count of matching entries.

    Raises:
    KeyError: If df has no 'USCB_COUNTYFP' column.
    """
    # pad both sides: str(59) is '59', which would never equal a stored '059'
    county_codes = df['USCB_COUNTYFP'].astype(str).str.zfill(3)
    target_code = str(county_fp_number).zfill(3)
    return int(county_codes.eq(target_code).sum())

In [43]:
# spot check: number of rows in the 2100 schools data whose county FP code is '059'
count_entries_by_county_fp(slr_schools_2100, '059')

27

In [36]:
# Names matching `gdfs` position by position
gdf_names = ['slr_fire_stations_2000', 'slr_fire_stations_2100', 'slr_hospitals_2000', 'slr_hospitals_2100',
             'slr_police_stations_2000', 'slr_police_stations_2100', 'slr_schools_2000', 'slr_schools_2100',
             'slr_superfund_2100', 'slr_wastewater_2100', 'slr_wastewater_2000']

# County-level facility counts, keyed by '<gdf name>_columns'
new_dfs = {}

for gdf_name, gdf in zip(gdf_names, gdfs):
    # the facility label is everything between the leading 'slr' and the trailing token,
    # e.g. 'slr_fire_stations_2000' -> 'fire_stations'
    facility = '_'.join(gdf_name.split('_')[1:-1])

    # one row per county FP code holding the number of records in that county
    county_counts = (
        gdf[['USCB_NAME', 'USCB_COUNTYFP']]
        .groupby('USCB_COUNTYFP')
        .size()
        .reset_index(name='count')
        .rename(columns={'USCB_COUNTYFP': 'countyfp', 'count': f'{facility}_count'})
    )

    new_dfs[f"{gdf_name}_columns"] = county_counts

# Show every county count table
for name, df in new_dfs.items():
    print(f"\n{name}:\n", df)


slr_fire_stations_2000_columns:
   countyfp  fire_stations_count
0      037                    1
1      059                    1

slr_fire_stations_2100_columns:
    countyfp  fire_stations_count
0       001                    3
1       015                    1
2       037                    2
3       041                    1
4       055                    1
5       059                    1
6       075                    1
7       081                    3
8       085                    1
9       087                    2
10      095                    1

slr_hospitals_2000_columns:
   countyfp  hospitals_count
0      015                1
1      023                1
2      053                1
3      087                2

slr_hospitals_2100_columns:
   countyfp  hospitals_count
0      015                3
1      023                2
2      053                2
3      059                1
4      087                5

slr_police_stations_2000_columns:
   countyfp  police_stations_count
0 

In [37]:
# read in CA census tiger file again, this time keeping 'countyfp' for the county-level merge
ca_tract_county_path = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county_path)
ca_tract_county = ca_tract_county.drop(columns=['field_1', 'geometry'])
ca_tract_county.columns = ca_tract_county.columns.str.lower()
# DataFrame.applymap is deprecated (pandas >= 2.1) and raises a FutureWarning;
# DataFrame.map is its element-wise replacement. Non-string entries pass through unchanged.
ca_tract_county = ca_tract_county.map(lambda s: s.lower() if isinstance(s, str) else s)

ca_tract_county

  ca_tract_county = ca_tract_county.applymap(lambda s: s.lower() if type(s) == str else s)


Unnamed: 0,tract,countyfp,county
0,06085504321,085,santa clara
1,06085504410,085,santa clara
2,06085507003,085,santa clara
3,06085507004,085,santa clara
4,06085502204,085,santa clara
...,...,...,...
9124,06059001303,059,orange
9125,06059001304,059,orange
9126,06059001401,059,orange
9127,06013367200,013,contra costa


In [38]:
# Tract-level metric tables, keyed by '<gdf name>_metric'
merged_dfs = {}

for counts_name, county_counts in new_dfs.items():
    # swap the '_columns' suffix for '_metric' to name the output
    metric_name = counts_name.replace('_columns', '_metric')

    # attach each county's count to every tract in that county (left join keeps all tracts)
    # NOTE(review): tracts in counties with no listed facilities end up with NaN, not 0 -- confirm intended
    tract_metric = ca_tract_county.merge(county_counts, on='countyfp', how='left')
    merged_dfs[metric_name] = tract_metric

    # write the metric next to the notebook for the upload step
    csv_file_path = f"{metric_name}.csv"
    tract_metric.to_csv(csv_file_path, index=False)
    print(f"Saved {csv_file_path}")

# Show every merged metric table
for name, df in merged_dfs.items():
    print(f"\n{name}:\n", df)

Saved slr_fire_stations_2000_metric.csv
Saved slr_fire_stations_2100_metric.csv
Saved slr_hospitals_2000_metric.csv
Saved slr_hospitals_2100_metric.csv
Saved slr_police_stations_2000_metric.csv
Saved slr_police_stations_2100_metric.csv
Saved slr_schools_2000_metric.csv
Saved slr_schools_2100_metric.csv
Saved slr_superfund_2100_metric.csv
Saved slr_wastewater_2100_metric.csv
Saved slr_wastewater_2000_metric.csv

slr_fire_stations_2000_metric:
             tract countyfp        county  fire_stations_count
0     06085504321      085   santa clara                  NaN
1     06085504410      085   santa clara                  NaN
2     06085507003      085   santa clara                  NaN
3     06085507004      085   santa clara                  NaN
4     06085502204      085   santa clara                  NaN
...           ...      ...           ...                  ...
9124  06059001303      059        orange                  1.0
9125  06059001304      059        orange                 

## Function Call

In [39]:
@append_metadata
def slr_pacific_metric_upload(input_csv, export=False, varname=""):
    '''
    The function uploads calculated metrics for sea level rise sourced from Pacific Institute at:
    https://pacinst.org/califonia-sea-level-rise-gis-data-downloads/
    
    The metrics include:
    * % tract vulnerable to sea level rise under baseline conditions
    * % tract vulnerable to sea level rise under 1.4m rise conditions
    * # of fire stations vulnerable to sea level rise under baseline conditions
    * # of fire stations vulnerable to sea level rise under 1.4m rise conditions
    * # of police stations vulnerable to sea level rise under baseline conditions
    * # of police stations vulnerable to sea level rise under 1.4m rise conditions
    * # of schools vulnerable to sea level rise under baseline conditions
    * # of schools vulnerable to sea level rise under 1.4m rise conditions
    * # of wastewater treatment plants vulnerable to sea level rise under baseline conditions
    * # of wastewater treatment plants vulnerable to sea level rise under 1.4m rise conditions
    * # of hospitals vulnerable to sea level rise under baseline conditions
    * # of hospitals vulnerable to sea level rise under 1.4m rise conditions
    * # of superfund sites vulnerable to flooding following 1.4m sea level rise

    Parameters
    ----------
    input_csv: str
        file name of a single calculated metric csv
    export: bool, optional
        True to upload the csv to AWS and then delete the local copy.
        False (default) uploads nothing and leaves the local file in place.
    varname: str, optional
        not used inside this function; presumably consumed by the
        append_metadata decorator -- TODO confirm

    Methods
    --------
    Relevant columns for Cal-CRAI SLR metrics were isolated from the original dataset.
    For 'number of' metrics, data were grouped by county and counted.
    For 'percent of' metrics, data were grouped by tract and averaged.
    Data were then merged with California census data so 'number of' metrics represent total vulnerable facilities for each county while 'percent of' metrics represent the vulnerability at the tract level.

    Script
    ------
    climate_slr_pacific.ipynb

    Note
    ------
    This function assumes users have configured the AWS CLI such that their access key / 
    secret key pair are stored in ~/.aws/credentials. 
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    ''' 
    print('Data transformation: eliminate excess headers and columns not relevant to metric calculation.')
    print('Data transformation: for number of metrics, data were grouped and summed by county.')
    print('Data transformation: for percentage of metrics, data were grouped by tract and averaged as there were multiple entries per tract.')
    print('Data transformation: data were merged to California census tracts so each metric value were attributed to CA tracts.')
    
    if export:
        # upload csv to aws
        bucket_name = 'ca-climate-index'
        upload_csv_aws([input_csv], bucket_name, '3_fair_data/index_data')

        # report and clean up only once the upload call has returned; previously this ran
        # in the export == False branch, claiming an upload (and deleting the file) that never happened
        print(f'{input_csv} uploaded to AWS.')

        os.remove(input_csv)  # Remove local file after upload

In [40]:
# local metric csvs written by the cells above, in the same order as `varnames`
input_csvs = [
    'slr_vulnerable_baseline_metric.csv',
    'slr_vulnerable_future_metric.csv',
    'slr_fire_stations_2000_metric.csv',
    'slr_fire_stations_2100_metric.csv',
    'slr_police_stations_2000_metric.csv',
    'slr_police_stations_2100_metric.csv',
    'slr_schools_2000_metric.csv',
    'slr_schools_2100_metric.csv',
    'slr_wastewater_2000_metric.csv',
    'slr_wastewater_2100_metric.csv',
    'slr_hospitals_2000_metric.csv',
    'slr_hospitals_2100_metric.csv',
    'slr_superfund_2100_metric.csv',
]

# variable name for each csv, position by position
varnames = [
    'climate_pacific_institute_slr_exposure_tracts_2000',
    'climate_pacific_institute_slr_exposure_tracts_2100',
    'climate_pacific_institute_slr_exposure_fire_stations_2000',
    'climate_pacific_institute_slr_exposure_fire_stations_2100',
    'climate_pacific_institute_slr_exposure_police_stations_2000',
    'climate_pacific_institute_slr_exposure_police_stations_2100',
    'climate_pacific_institute_slr_exposure_schools_2000',
    'climate_pacific_institute_slr_exposure_schools_2100',
    'climate_pacific_institute_slr_exposure_wastewater_treatment_2000',
    'climate_pacific_institute_slr_exposure_wastewater_treatment_2100',
    'climate_pacific_institute_slr_exposure_hospitals_2000',
    'climate_pacific_institute_slr_exposure_hospitals_2100',
    'climate_pacific_institute_slr_exposure_superfund',
]

# Process the data and export
# NOTE(review): every call passes varname='test'; the paired entry from `varnames`
# is not used -- confirm whether it should be passed instead
for metric_csv, metric_varname in zip(input_csvs, varnames):
    slr_pacific_metric_upload(metric_csv, export=True, varname='test') #var