## Cal-CRAI Metric Calculation
Domain: Climate Risks \
Indicator: Flooding Loss

This notebook calculates two metrics, sourced from NOAA:
* Metric 1: Number of direct flood/flash flood fatalities since 1996
* Metric 2: Estimated monetary crop damage

In [2]:
import os
import sys
import pandas as pd
import io
import numpy as np
import geopandas as gpd
from shapely.geometry import Point

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import (
    append_metadata
)
from scripts.utils.file_helpers import (
    pull_csv_from_directory, upload_csv_aws
) 
pd.set_option('display.max_columns', None)

In [None]:
# S3 bucket and key prefix of the NOAA storm events files, plus the folder
# name handed to the pull helper (the next cell reads from 'csv_folder')
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/climate_risk/flood/loss/noaa/downloaded_files/all_events/'
folder = 'csv_folder'

# project helper; presumably downloads the CSVs under aws_dir into `folder` -- confirm in scripts/utils/file_helpers
pull_csv_from_directory(bucket_name, aws_dir, folder, search_zipped=False)

In [None]:
# read in the NOAA storm events csv from the local download folder
# (plain string literal: the path contains no placeholders, so no f-string is needed)
noaa_storm_event_data = pd.read_csv('csv_folder/all_noaa_storm_events_ca.csv')
noaa_storm_event_data

In [None]:
# list the available columns to decide which ones the metrics need
noaa_storm_event_data.columns

In [None]:
# isolate for relevant columns: event id/year/type, the county-or-zone
# identifiers (CZ_TYPE, CZ_FIPS), the two loss fields, and the start coordinates
columns = [
'EVENT_ID',
'YEAR', 'EVENT_TYPE', 'CZ_TYPE', 'CZ_FIPS', 'DEATHS_DIRECT',
'DAMAGE_CROPS','BEGIN_LAT', 'BEGIN_LON'
]
noaa_storm_columns = noaa_storm_event_data[columns]
noaa_storm_columns

In [None]:
# count how many events are missing a starting latitude
nan_count = noaa_storm_columns['BEGIN_LAT'].isna().sum()
print(f"Number of NaN values in BEGIN_LAT: {nan_count}")

In [None]:
# view all distinct entries within the event type column, to pick the flood related ones
noaa_storm_columns.EVENT_TYPE.unique()

In [None]:
# select for relevant flood related events
flood_types = [    
    'Flash Flood',
    'Heavy Rain',  
    'Flood',
    'Coastal Flood',  
    'Tsunami'
]

# CZ_TYPE value used by the following cell to keep county-level records
cz_type_county = ['C']

# filter for flood events based on event type
flood_events = noaa_storm_columns[noaa_storm_columns['EVENT_TYPE'].isin(flood_types)]
flood_events

In [None]:
# only a small percentage of the data has lat & lon entries, so we will use the county entries within CZ_TYPE and CZ_FIPS
# further filter for county events (CZ_TYPE == 'C')
flood_county_events = flood_events[flood_events['CZ_TYPE'].isin(cz_type_county)]

# take an explicit copy of the filtered rows before reshaping them
flood_county_events = flood_county_events.copy()
# the coordinate columns are not needed once events are keyed by county
flood_county_event_filtered = flood_county_events.drop(columns=['BEGIN_LAT', 'BEGIN_LON'])
# display the updated DataFrame
flood_county_event_filtered

In [None]:
# flag repeated events by event id (the first occurrence is not flagged)
selected_columns = ['EVENT_ID']
flood_county_event_drop_duplicates = flood_county_event_filtered.duplicated(subset=selected_columns, keep='first')
duplicate_count = flood_county_event_drop_duplicates.sum()

# actually remove the repeated events so the county sums cannot double count;
# this is a no-op when every EVENT_ID is unique. .copy() hands later cells an
# independent frame to modify.
flood_county_event_filtered = flood_county_event_filtered.drop_duplicates(
    subset=selected_columns, keep='first'
).copy()

# number of duplicate rows that were removed
duplicate_count

In [None]:
# preview the county-level flood events before the crop damage values are parsed
flood_county_event_filtered

In [None]:
# inspect the raw crop damage entries to see which formats/suffixes need converting
flood_county_event_filtered.DAMAGE_CROPS.unique()

In [None]:
# function to convert 'K', 'M' and 'B' suffixes while properly adjusting value
def convert_to_numeric(value):
    '''
    Convert a damage entry such as '10.00K' or '1.5M' into a plain float.

    Parameters
    ----------
    value: string, number, or missing value
        Damage entry. A trailing K, M or B (any case) scales the number by a
        thousand, a million or a billion; entries without a suffix are
        converted as-is.

    Returns
    -------
    float
        The damage amount, or NaN when the entry is missing, blank, or is a
        suffix with no number in front of it.

    Raises
    ------
    ValueError
        If the numeric part of a string entry cannot be parsed as a float.
    '''
    if pd.isna(value):
        return np.nan
    # entries that were already parsed as numbers have no suffix to strip
    if not isinstance(value, str):
        return float(value)

    multipliers = {'K': 1e3, 'M': 1e6, 'B': 1e9}
    text = value.strip().upper()
    if not text:
        return np.nan

    scale = multipliers.get(text[-1])
    if scale is None:
        return float(text)

    number = text[:-1].strip()
    # a bare suffix carries no amount, so treat it as missing rather than crash
    if not number:
        return np.nan
    return float(number) * scale
    
# NOTE: this binds a second name to the same DataFrame (no copy is made), so the
# conversion below also updates flood_county_event_filtered, which later cells use
flood_county_event_adjusted = flood_county_event_filtered
# Apply the function to the DAMAGE_CROPS column
flood_county_event_adjusted['DAMAGE_CROPS'] = flood_county_event_adjusted['DAMAGE_CROPS'].apply(convert_to_numeric)

# Display the DataFrame to verify the conversion
flood_county_event_adjusted

In [None]:
# create the total fatalities column from direct deaths (only DEATHS_DIRECT is used; indirect deaths are not included)
flood_county_event_filtered['total_fatalities'] = flood_county_event_filtered['DEATHS_DIRECT']
# make sure the crop damage column is numeric before it is summed
flood_county_event_filtered['DAMAGE_CROPS'] = pd.to_numeric(flood_county_event_filtered['DAMAGE_CROPS'])

flood_county_event_filtered

In [None]:
# total the fatalities and crop damage for each county FIPS code, then rename
# the columns for the tract merge
columns_to_sum = ['total_fatalities', 'DAMAGE_CROPS']
flood_loss_sums = (
    flood_county_event_filtered
    .groupby('CZ_FIPS')[columns_to_sum]
    .sum()
    .reset_index()
    .rename(columns={'CZ_FIPS': 'countyfp', 'DAMAGE_CROPS': 'estimated_crop_loss_cost'})
)
# store the county code as a zero-padded, three-character string
flood_loss_sums['countyfp'] = flood_loss_sums['countyfp'].astype(str).str.zfill(3)

flood_loss_sums
                                                   

In [None]:
# read in the California tract/county lookup table from S3
ca_tract_county = gpd.read_file("s3://ca-climate-index/0_map_data/ca_tracts_county.csv")
# drop the columns that are not needed for the merge (list keeps the order deterministic)
ca_tract_county = ca_tract_county.drop(columns=['field_1', 'geometry'])
ca_tract_county.columns = ca_tract_county.columns.str.lower()
# lower-case every string cell and leave other values untouched
# NOTE: DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# kept here so the cell still runs on older pandas versions
ca_tract_county = ca_tract_county.applymap(lambda s: s.lower() if isinstance(s, str) else s)

ca_tract_county

In [None]:
# merge flood loss data with California census tract data on the county code;
# the left join keeps every row of the tract table, so tracts in counties with
# no summed flood losses get NaN in the metric columns
flood_loss_merge = pd.merge(ca_tract_county, flood_loss_sums, on='countyfp', how='left')
flood_loss_merge

In [19]:
# build one tract-level table per metric, renaming the value column for export
flood_crop_loss_metric = (
    flood_loss_merge[['tract', 'countyfp', 'estimated_crop_loss_cost']]
    .rename(columns={'estimated_crop_loss_cost': 'estimated_flood_crop_loss_cost'})
)
flood_fatality_loss_metric = (
    flood_loss_merge[['tract', 'countyfp', 'total_fatalities']]
    .rename(columns={'total_fatalities': 'total_flood_fatalities'})
)

# write each metric to its own local csv, ready for upload
flood_crop_loss_metric.to_csv('climate_flood_crop_loss_metric.csv', index=False)
flood_fatality_loss_metric.to_csv('climate_flood_fatality_loss_metric.csv', index=False)

In [23]:
@append_metadata
def flood_loss_upload(input_csv, export=False, varname=''):
    '''
    Uploads the flood loss metrics to S3 bucket. The metrics are:
    
    * number of direct flood/flash flood fatalities per county since 1996
    * estimated monetary crop damage
    
    Data for this metric was sourced from NOAA's - Storm event database:
    https://www.ncdc.noaa.gov/stormevents/

    Methods
    -------
    Relevant columns to our data metrics were isolated, various spatial options were considered, but county fips had the fewest NaN data.
    Data was isolated to include flooding and flood related storm events.
    Events were checked for duplicates based on storm event ID.
    Data was isolated to include populated county entries.
    Direct fatalities were used to create the total fatalities column.
    Data were grouped by county and summed to calculate final metrics.
    
    Parameters
    ----------
    input_csv: string
        csv flood loss data 
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI flood loss metrics to AWS
        True = will upload resulting df containing CAL CRAI flood loss metrics to AWS
    varname: string
        not used inside this function body; presumably consumed by the
        append_metadata decorator -- confirm in scripts/utils/write_metadata

    Script
    ------
    climate_flood_loss.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    The local input_csv file is deleted at the end of the call whether or not it was uploaded.
    '''
    print('Data transformation: relevant columns were isolated and renamed')
    print('Data transformation: duplicate entries by event ID were dropped.')
    print('Data transformation: data was grouped by county and summed.')
    print('Data transformation: data was merged with California census tracts.') 
 
    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)
        # report the upload only after it was actually attempted
        print(f'{input_csv} uploaded to AWS.')
    else:
        print(f'{input_csv} was not uploaded to AWS (export=False).')
 
    # clean up the local copy of the csv
    if os.path.exists(input_csv):
        os.remove(input_csv)

In [None]:
# metric csv files written earlier in the notebook, paired by position with
# the variable name each one is registered under
input_csvs = ['climate_flood_crop_loss_metric.csv',
               'climate_flood_fatality_loss_metric.csv']

varnames = ['climate_noaa_flood_crop_cost',
            'climate_noaa_flood_fatalities']

# Process the data and export
for input_csv, varname in zip(input_csvs, varnames):
    print(f'Processing {input_csv} with varname {varname}')
    # pass the paired varname (not a placeholder) so it matches the log lines
    flood_loss_upload(input_csv, export=True, varname=varname)
    print(f'Completed uploading {input_csv} with varname {varname}')