## Cal-CRAI Metric Calculation
Domain: Climate Risks \
Indicator: Wildfire Loss

This notebook calculates two metrics, sourced from CalFire's historical redbooks:
* Metric 1: Number of damaged/destroyed buildings per county per year from wildfire
* Metric 2: Number of fatalities per county per year from wildfire

In [1]:
import pandas as pd
import os
import sys
import math
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [None]:
# Pull the cleaned, stitched CalFire Redbook csv from the project's AWS bucket.
# The later read_csv call loads it from the local 'csv_folder' directory.
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/climate_risk/wildfire/loss/historical/calfire_redbooks/cleaned_stitched_calfire_redbook_data/'
folder = 'csv_folder'

# NOTE(review): search_zipped=False presumably tells the helper not to look
# inside zip archives -- confirm against scripts.utils.file_helpers
pull_csv_from_directory(bucket_name, aws_dir, folder, search_zipped=False)

In [None]:
# read in the cleaned CalFire Redbook wildfire loss data from the local csv folder
wildfire_loss = pd.read_csv(r'csv_folder/calfire_redbooks_cleaned.csv')
# report the number of rows, then preview the first five
print(len(wildfire_loss))
wildfire_loss.head(5)
# local cleanup is currently disabled:
#os.remove('calfire_redbooks_cleaned.csv')

#### The dataset has inconsistent date formats and capitalization
* we standardize the date formats and create a new year column
* we bring all county entries to lower case
* all NaN values are treated as 0

In [None]:
# Replace NaN values with 0 in every column of the frame (this is applied to
# the date and county columns as well as the numeric loss columns)
wildfire_loss.fillna(0, inplace=True)

# Convert 'date_start' column to datetime; format='mixed' lets pandas infer the
# format of each entry individually, dayfirst=True prefers day-first parsing
# NOTE(review): confirm the source dates really are day-first -- the next step
# writes them back out month-first
wildfire_loss['date_start'] = pd.to_datetime(wildfire_loss['date_start'], format='mixed', dayfirst=True)

# Format datetime to mm/dd/yyyy strings so every entry shares one layout
wildfire_loss['date_start'] = wildfire_loss['date_start'].dt.strftime('%m/%d/%Y')

# Parse the uniform mm/dd/yyyy strings back into datetimes; anything that
# cannot be parsed becomes NaT because of errors='coerce'
wildfire_loss['date_start'] = pd.to_datetime(wildfire_loss['date_start'], format='%m/%d/%Y', errors='coerce')

# Extract year from the 'date_start' column and create a new 'year' column
# (the integer cast will raise if any NaT survived the step above)
wildfire_loss['year'] = wildfire_loss['date_start'].dt.year.astype(int)

# Ensure all entries within county column are lower case
wildfire_loss['county/unit'] = wildfire_loss['county/unit'].str.lower()

# display the cleaned frame
wildfire_loss

#### There are multiple entries that contain two or more counties

The code below duplicates rows that have a '-', ',' or '/' within the county column: the original row is assigned the county preceding the delimiter, while each duplicate takes on the second or third county name. 
The method below equally splits the destroyed/damaged structures and fatality values between the rows that had shared counties (each county's share is rounded up to a whole number).

In [5]:
def _split_county_names(county_unit):
    '''
    Split a raw county/unit string into individual, whitespace-stripped county names.

    Newlines are removed first. The delimiters '-', ',' and '/' are checked in
    that order and only the first one found is used for the split; a string
    containing none of them is returned as a single-element list.
    '''
    county_unit = county_unit.replace('\n', '')
    for delimiter in ('-', ',', '/'):
        if delimiter in county_unit:
            return [name.strip() for name in county_unit.split(delimiter)]
    return [county_unit.strip()]


def _parse_count(value, cast):
    '''
    Turn a string count into a number; non-string values are returned unchanged.

    Thousands separators (',') are removed before parsing. Strings that are
    not made up purely of digits are treated as 0.
    '''
    if not isinstance(value, str):
        return value
    digits = value.replace(',', '')
    return cast(digits) if digits.isdigit() else 0


# Loss columns that are shared between counties, and the numeric type each
# string entry is converted to
loss_columns = {
    'destroyed_structures': int,
    'damaged_structures': int,
    'firefighter_fatalities': float,
    'civil_fatalities': float,
}

# Collects one row per (fire, county) pair
new_rows = []

for _index, row in wildfire_loss.iterrows():
    county_unit = row['county/unit']

    # Rows whose county/unit is not a string are passed through untouched,
    # so they end up without a value in the new 'county' column
    if not isinstance(county_unit, str):
        new_rows.append(row)
        continue

    county_names = _split_county_names(county_unit)
    num_counties = len(county_names)

    losses = {
        column: _parse_count(row[column], cast)
        for column, cast in loss_columns.items()
    }

    # Share the losses between the counties of a multi-county fire.
    # NOTE(review): each share is rounded UP, so the shares can add up to more
    # than the original total (e.g. 1 fatality over 2 counties -> 1 in each)
    if num_counties > 1:
        losses = {
            column: math.ceil(value / num_counties)
            for column, value in losses.items()
        }

    # One copy of the original row per county, carrying that county's share
    for county in county_names:
        new_row = row.copy()
        new_row['county'] = county
        for column, value in losses.items():
            new_row[column] = value
        new_rows.append(new_row)

# Create a new DataFrame from the modified rows; every copy keeps the index
# label of the row it was made from
cleaned_wildfire_loss = pd.DataFrame(new_rows)

# Known misspellings in the source data -> correct county name
rename_map = {
    'mardera': 'madera',
    'tahema': 'tehama',
    'toulumne': 'tuolumne',
    'tehema': 'tehama',
    'tuolomne': 'tuolumne',
}

# deleting problematic row that will not split, only had one data field with a value of 1
# NOTE(review): 122 is an index label tied to the current csv; this removes
# every row carrying that label and raises KeyError if the label is absent
cleaned_wildfire_loss = cleaned_wildfire_loss.drop(index=122)

# Use the replace method to rename the entries
cleaned_wildfire_loss['county'] = cleaned_wildfire_loss['county'].replace(rename_map)

#### Creating CRI metric columns by summing structure and fatality related data respectively

In [None]:
# Structures metric input: damaged plus destroyed structures for each row
damaged = cleaned_wildfire_loss['damaged_structures']
destroyed = cleaned_wildfire_loss['destroyed_structures']
cleaned_wildfire_loss['damaged_destroyed_structures'] = damaged + destroyed

# Fatalities metric input: firefighter plus civilian fatalities for each row
firefighter_deaths = cleaned_wildfire_loss['firefighter_fatalities']
civilian_deaths = cleaned_wildfire_loss['civil_fatalities']
cleaned_wildfire_loss['total_fatalities'] = firefighter_deaths + civilian_deaths

# preview the two new columns
cleaned_wildfire_loss.head(5)

#### Isolating relevant columns

In [7]:
# Keep only the columns needed for the two metrics, then run them through the
# project's county filter, which hands back the kept rows and the omitted rows
metric_columns = ['county', 'year', 'damaged_destroyed_structures', 'total_fatalities']
isolated_cleaned_wildfire_loss, omitted_rows = filter_counties(
    cleaned_wildfire_loss[metric_columns], 'county', county_list=None
)

In [None]:
# inspect the rows that filter_counties returned as omitted
omitted_rows

In [None]:
# inspect the rows that filter_counties kept
isolated_cleaned_wildfire_loss

#### Grouping data by county and aggregating the structure and fatality data so we get total damaged structures and total fatalities per county from 2008-2020

In [None]:
# Total both metric columns within each county
sum_by_column = {
    'damaged_destroyed_structures': 'sum',
    'total_fatalities': 'sum',
}
per_county = isolated_cleaned_wildfire_loss.groupby('county')
cri_wildfire_loss = per_county.agg(sum_by_column).reset_index()

# number of counties with data, followed by a preview
print(len(cri_wildfire_loss))
cri_wildfire_loss.head(5)

In [None]:
# Number of years covered by the dataset (2008-2020 inclusive = 13)
n_years = 2020 - 2008 + 1

# Average annual losses per county. True division is used so that counties
# with fewer than n_years total events keep a fractional average instead of
# being truncated to 0 by floor division.
cri_wildfire_loss['average_damaged_destroyed_structures'] = cri_wildfire_loss['damaged_destroyed_structures'] / n_years
cri_wildfire_loss['average_fatalities'] = cri_wildfire_loss['total_fatalities'] / n_years
cri_wildfire_loss.head(50)

In [None]:
# checking the total county results by isolated to one county per year
fact_checking = isolated_cleaned_wildfire_loss.groupby(['county', 'year']).agg({'damaged_destroyed_structures': 'sum', 'total_fatalities': 'sum'}).reset_index()
desired_county_data = fact_checking[fact_checking['county'] == 'butte']
print(desired_county_data)

In [13]:
# read in CA census tiger file
ca_tract_county = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county)
ca_tract_county = ca_tract_county.drop(columns={'field_1', 'geometry', 'COUNTYFP'})
ca_tract_county = ca_tract_county.rename(columns={'TRACT':'USCB_GEOID', 'County':'county'})
ca_tract_county['county'] = ca_tract_county['county'].str.lower()

In [None]:
# inspect the tract-to-county lookup table
ca_tract_county

In [None]:
# inspect the per-county totals and averages
cri_wildfire_loss

In [None]:
# Attach each county's values to every tract in that county; the left join
# keeps tracts whose county has no wildfire loss record (values stay NaN)
cri_wildfire_loss_metric = pd.merge(
    left=ca_tract_county,
    right=cri_wildfire_loss,
    how='left',
    on='county',
)
cri_wildfire_loss_metric

In [17]:
# Identifier columns shared by both metric tables
id_columns = ['USCB_GEOID', 'county']

# Structures metric: one row per tract with the county's average structure loss
cri_wildfire_structure_loss_metric = cri_wildfire_loss_metric[
    id_columns + ['average_damaged_destroyed_structures']
].rename(
    columns={'average_damaged_destroyed_structures': 'average_damaged_destroyed_structures_wildfire'}
)

# Fatalities metric: one row per tract with the county's average fatalities
cri_wildfire_fatality_loss_metric = cri_wildfire_loss_metric[
    id_columns + ['average_fatalities']
].rename(
    columns={'average_fatalities': 'average_annual_fatalities_wildfire'}
)

In [18]:
# Saving metric df to .csv file
cri_wildfire_structure_loss_metric.to_csv('climate_wildfire_redbooks_loss_buildings_metric.csv', index=False)
cri_wildfire_fatality_loss_metric.to_csv('climate_wildfire_redbooks_loss_fatalities_metric.csv', index=False)

#### Function call for this metric

In [21]:
@append_metadata
def wildfire_loss_calc(input_csv, export=False, varname=''):
    '''
    Calculates the total number of damaged/destroyed structures and fatalities 
    resulting from wildfires per California county between 2008-2020. The data 
    used to calculate these metrics are sourced from CALFIRE's Redbook data: 
    https://www.fire.ca.gov/our-impact/statistics
  
    Methods
    --------
    Data was stitched together and cleaned by utilizing a California county filter function. 
    Often, entries contained multiple counties where a single fire occurred. 
    In these cases, the number of fatalities and affected structures were equally split between the number of
    counties. 
    The number of fatalities and affected structures were summed within each county across the 13 year dataset. 
    Averages were also calculated using the total temporal range of the dataset (13).
    Data were then merged to California tract data, so each tract had the metric values from the county it resides in.

    Parameters
    ------------
    input_csv: string
        name of the csv file containing the calculated wildfire loss metric;
        this is the file uploaded to AWS when export is True
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI wildfire loss metric to AWS
        True = will upload resulting df containing CAL CRAI wildfire loss metric to AWS
    varname: string
        not read inside this function; presumably consumed by the
        append_metadata decorator (confirm in scripts.utils.write_metadata)

    Script
    ------
    climate_wildfire_redbooks_loss.ipynb

    Note
    ------
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are
    stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    # Summary of the transformations carried out earlier in the notebook
    print('Data transformation: convert fire ignition date to year.')
    print('Data transformation: nan values within the data are treated as 0.')
    print('Data transformation: rows that contained multiple counties had their data split equally between each county.')
    print('Data transformation: misspelled counties were adjusted to correct spelling.')
    print('Data transformation: data were merged to California tracts.')

    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        # upload_csv_aws is given a list of file names
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)
    else:
        # nothing was uploaded, so say so rather than reporting an upload
        print(f'{input_csv} not uploaded to AWS.')

    # Removal of the local csv after export is currently disabled:
    # if os.path.exists(input_csv):
    #     os.remove(input_csv)

In [None]:
# Metric csv files written earlier in the notebook
input_csvs = ['climate_wildfire_redbooks_loss_fatalities_metric.csv',
               'climate_wildfire_redbooks_loss_buildings_metric.csv']

# Variable names paired one-to-one, in order, with the csv files above
varnames = ['climate_calfire_wildfire_fatalities','climate_calfire_wildfire_building_loss']

# Process the data and export
for input_csv, varname in zip(input_csvs, varnames):
    print(f'Processing {input_csv} with varname {varname}')
    # NOTE(review): the call passes the literal 'test' instead of the looped
    # varname, which is only printed above -- presumably a placeholder for
    # trial runs; swap in varname once confirmed against append_metadata
    wildfire_loss_calc(input_csv, export=True, varname='test') # varname