## Cal-CRAI Metric Calculation
Domain: Climate Risks \
Indicator: Flooding Loss

This notebook calculates one metric, sourced from the Federal Emergency Management Agency:
* Metric 1: Average flood insurance payout per number of claims per census tract

In [1]:
# standard library / third-party dependencies
import os
import sys
import pandas as pd
import io
import numpy as np
import geopandas as gpd
from shapely.geometry import Point

# Put the directory two levels up on the import path so the `scripts` package
# below can be imported. (`expanduser` only expands a leading `~`, so it leaves
# this relative path unchanged.)
sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import (
    append_metadata
)
from scripts.utils.file_helpers import (
    pull_csv_from_directory, upload_csv_aws
) 
# show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [None]:
# Load the California tract/county lookup table from S3.
ca_tract_county_path = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county_path)

# Discard the leftover index column and the geometry; only the identifiers are needed.
ca_tract_county = ca_tract_county.drop(columns=['field_1', 'geometry'])

# Normalise to lowercase: first the column names, then every string cell
# (non-string cells are passed through untouched).
# NOTE(review): DataFrame.applymap is deprecated as of pandas 2.1 in favour of
# DataFrame.map; switch once the project's minimum pandas version allows it.
ca_tract_county.columns = ca_tract_county.columns.str.lower()
ca_tract_county = ca_tract_county.applymap(
    lambda cell: cell.lower() if type(cell) == str else cell
)

In [None]:
# Local folder name handed to the helper, and the S3 location of the FEMA
# flood-claims subset.
# NOTE(review): presumably the helper downloads the CSVs found under `aws_dir`
# into `folder`; confirm against scripts/utils/file_helpers.py.
folder = 'csv_folder'
aws_dir = '2a_subset/climate_risk/flood/loss/fema/flood_claims_ca/'
bucket_name = 'ca-climate-index'

pull_csv_from_directory(
    bucket_name,
    aws_dir,
    folder,
    search_zipped=False,
)

In [None]:
# Read the FEMA claims subset from the local download folder. The path is built
# from `folder` (set in the pull cell above) so the folder name lives in one place.
flood_claim_data = pd.read_csv(os.path.join(folder, 'fema_claims_CA_subset.csv'))

In [None]:
# inspect the available columns before choosing which ones to keep
flood_claim_data.columns

In [None]:
# preview the first rows of the raw claims data
flood_claim_data.head()

In [None]:
# select relevant columns to our metrics
columns_keep = [
    'id',
    'countyCode',
    'censusTract',
    'policyCount',
    'yearOfLoss',
    'netIccPaymentAmount',
    'netContentsPaymentAmount',
    'netBuildingPaymentAmount',
    "amountPaidOnBuildingClaim", 
    "amountPaidOnContentsClaim", 
    "amountPaidOnIncreasedCostOfComplianceClaim"
]
# .copy() gives an independent frame, so the column edits below do not write
# into a slice of `flood_claim_data` (avoids SettingWithCopyWarning)
flood_claim_columns = flood_claim_data[columns_keep].copy()

# adjust county and tract columns, changing name, data type, and data formatting
# county: stringify, drop the first character, and cut any trailing '.0'
# (e.g. 6037.0 -> '037'). Missing codes are kept as NaN via .where(); without it
# astype(str) would turn NaN into the text 'an' and the row could never be
# recognised as missing a county.
county_code = flood_claim_columns['countyCode']
flood_claim_columns['countyCode'] = (
    county_code.astype(str).str[1:].str.split('.').str[0]
    .where(county_code.notna())
)
flood_claim_columns = flood_claim_columns.rename(columns={'countyCode':'countyfp'})

# tract: convert to an integer string and prefix a '0'; missing values stay missing
flood_claim_columns['censusTract'] = flood_claim_columns['censusTract'].apply(lambda x: '0' + str(int(float(x))) if pd.notnull(x) else x)
flood_claim_columns = flood_claim_columns.rename(columns={'censusTract':'tract'})

# count duplicates based on the event id column, then drop them (keeping the
# first occurrence) so the data matches the "duplicates were dropped" step
# recorded in the metadata; this is a no-op when the count is zero
selected_columns = ['id']
flood_claim_drop_duplicates = flood_claim_columns.duplicated(subset=selected_columns, keep='first')
duplicate_count = flood_claim_drop_duplicates.sum()
flood_claim_columns = flood_claim_columns.drop_duplicates(subset=selected_columns, keep='first')

duplicate_count

In [None]:
# drop rows that have no location identifier at all
# (how='all': a row is removed only when BOTH countyfp and tract are missing)
flood_claim_cleaned = flood_claim_columns.dropna(subset=['countyfp', 'tract'], how='all')

# drop rows that do not have a value when summing payout columns
columns_to_sum = ["amountPaidOnBuildingClaim", 
                  "amountPaidOnContentsClaim", 
                  "amountPaidOnIncreasedCostOfComplianceClaim"]

# Row-wise sum skips NaN, so a row whose three payout columns are all missing
# sums to 0 and is removed together with genuine zero-payout rows.
has_payout = flood_claim_cleaned[columns_to_sum].sum(axis=1) != 0

# .copy() makes the filtered result an independent frame, so adding columns to
# it later does not raise SettingWithCopyWarning
flood_claim_cleaned = flood_claim_cleaned.loc[has_payout].copy()

# Display the new dataframe
print(len(flood_claim_cleaned))
flood_claim_cleaned.head()


## Metric 1: Average flood insurance payout per number of claims per census tract

In [None]:
# create a new column summing relevant columns representing how much insurance paid out on these claims
# .assign() returns a NEW frame, so `flood_claim_cleaned` is left untouched
# (a plain `=` would only alias it and the new column would leak back into it).
# NOTE(review): `+` yields NaN for a row if any of the three net payment
# columns is NaN; confirm that is the intended treatment of partial payouts.
flood_claim_total_cost = flood_claim_cleaned.assign(
    total_insurance_payment=(
        flood_claim_cleaned['netIccPaymentAmount']
        + flood_claim_cleaned['netContentsPaymentAmount']
        + flood_claim_cleaned['netBuildingPaymentAmount']
    )
)
flood_claim_total_cost

In [None]:
# Per tract and year of loss: total insurance payout and number of claims.
yearly_aggregations = {
    'total_insurance_payment': 'sum',   # total payout in that tract/year
    'id': 'count',                      # number of claim records in that tract/year
}
claims_by_tract_year = flood_claim_total_cost.groupby(['tract', 'yearOfLoss'])
flood_claim_cost_grouped = claims_by_tract_year.agg(yearly_aggregations).reset_index()

# Preview the first rows of the result
flood_claim_cost_grouped.head(5)

In [None]:
# Collapse the yearly rows to one row per tract:
#   - mean of the yearly payout totals
#   - total number of claims across all years
# The columns are renamed in the same chain for clarity.
county_cost_averages = (
    flood_claim_cost_grouped
    .groupby('tract')
    .agg({
        'total_insurance_payment': 'mean',
        'id': 'sum',
    })
    .reset_index()
    .rename(columns={
        'total_insurance_payment': 'average_insurance_payout',
        'id': 'total_claims',
    })
)

# Preview the per-tract result
county_cost_averages.head()

In [None]:
# Metric value: the tract's mean yearly payout divided by its total number of claims.
county_cost_averages['avg_insurance_payout_per_claim'] = (
    county_cost_averages['average_insurance_payout']
    .div(county_cost_averages['total_claims'])
)

county_cost_averages

In [None]:
# Left-join onto the CA tract list so every tract is kept; tracts without any
# matching claims end up with NaN in the metric columns.
# NOTE(review): the join only matches if `tract` has the same dtype and
# zero-padding in both frames - verify against the match count computed below.
merged_flood_claims_cost = ca_tract_county.merge(
    county_cost_averages,
    how='left',
    on='tract',
)
print(len(merged_flood_claims_cost))
merged_flood_claims_cost.tail()

In [None]:
# Drop the county code and the intermediate columns, then give the metric
# column its final name.
flood_claims_metric = (
    merged_flood_claims_cost
    .drop(columns=['countyfp', 'average_insurance_payout', 'total_claims'])
    .rename(columns={'avg_insurance_payout_per_claim': 'avg_flood_insurance_payout_per_claim'})
)
flood_claims_metric

In [None]:
# How many tracts actually received a metric value (non-NaN after the left merge)
metric_values = flood_claims_metric['avg_flood_insurance_payout_per_claim']
non_nan_count = metric_values.count()
non_nan_count

In [16]:
# Write the final metric table to a local CSV (without the index column);
# the upload function below picks it up by file name.
metric_csv_name = 'climate_flood_cost_metric.csv'
flood_claims_metric.to_csv(metric_csv_name, index=False)

Function Call

In [17]:
@append_metadata
def flood_claims_cost_upload(input_csv, export=False, varname=''):
    '''
    Uploads the flood claim and cost metrics to S3 bucket. The metrics are:
    
    * Average flood insurance payout per number of claims per census tract
    
    Data for these metrics are sourced from FEMA's redacted NFIP claims at:
    https://www.fema.gov/openfema-data-page/fima-nfip-redacted-claims-v2

    Methods
    -------
    Relevant columns to our data metrics were isolated, renamed, and adjusted for consistency.
    Data was isolated to include data in non-nan census tracts.
    Duplicates were removed based on event ID.
    Only rows that had non-nan values after summing payout columns were retained.
    Data were grouped by tract and year then summed to identify number of events/cost per tract per year.
    Data were grouped again by tract and averaged to identify average cost/number of policies per census tract.
    The payout column was divided by number of claims.

    Parameters
    ----------
    input_csv: string
        csv flood claim/cost data 
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI flood claim/cost  metrics to AWS
        True = will upload resulting df containing CAL CRAI flood claim/cost metrics to AWS
    varname: string
        Not read inside this function body; presumably consumed by the
        append_metadata decorator to label the metadata (confirm in
        scripts/utils/write_metadata.py).

    Script
    ------
    climate_flood_claims.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    The local copy of input_csv is deleted at the end of the call whether or not it was uploaded.
    '''
    print('Data transformation: relevant columns were isolated and renamed')
    print('Data transformation: duplicate entries by event ID were dropped.')
    print('Data transformation: data was grouped by tract & year then summed, then grouped once again and averaged.')
    print('Data transformation: data was merged with California census tracts.') 
 
    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)
        # report success only after the upload call has actually run
        print(f'{input_csv} uploaded to AWS.')
    else:
        print(f'{input_csv} was not uploaded to AWS (export=False).')
 
    # clean up the local file (same behaviour as before, independent of `export`)
    if os.path.exists(input_csv):
        os.remove(input_csv)

In [None]:
# CSV files produced by this notebook and the metric variable name paired with each
input_csvs = ['climate_flood_cost_metric.csv']

varnames = ['climate_fema_nfip_claim_cost']

# Process the data and export
for input_csv, varname in zip(input_csvs, varnames):
    print(f'Processing {input_csv} with varname {varname}')
    # pass the paired varname through (previously the placeholder 'test' was
    # sent while the messages reported the real name)
    flood_claims_cost_upload(input_csv, export=True, varname=varname)
    print(f'Completed uploading {input_csv} with varname {varname}')