# This notebook calculates the following metrics sourced from CAL-FIRE's historical Redbooks
* number of damaged/destroyed buildings per county per year from wildfire
* number of fatalities per county per year from wildfire


In [1]:
import pandas as pd
import os
import sys
import math

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [2]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/climate_risk/wildfire/loss/historical/calfire_redbooks/cleaned_stitched_calfire_redbook_data/'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'calfire_redbooks_cleaned.csv'


In [3]:
# read in air quality data (already for state of CA)
wildfire_loss = pd.read_csv('calfire_redbooks_cleaned.csv')
print(len(wildfire_loss))
wildfire_loss.head(5)
os.remove('calfire_redbooks_cleaned.csv')

1005


#### The dataset has inconsistent date formats and capitalization
* we adjust the dating formats and creat a new year column
* we bring all entries to lower case
* all nan's are treated as 0

In [4]:
# Replace NaN values with 0
wildfire_loss.fillna(0, inplace=True)

# Convert 'date_start' column to datetime
wildfire_loss['date_start'] = pd.to_datetime(wildfire_loss['date_start'], format='mixed', dayfirst=True)

# Format datetime to mm/dd/yyyy
wildfire_loss['date_start'] = wildfire_loss['date_start'].dt.strftime('%m/%d/%Y')

# Assuming wildfire_loss is your DataFrame
wildfire_loss['date_start'] = pd.to_datetime(wildfire_loss['date_start'], format='%m/%d/%Y', errors='coerce')

# Extract year from the 'date_start' column and create a new 'year' column
wildfire_loss['year'] = wildfire_loss['date_start'].dt.year.astype(int)

# Ensure all entries within county column are lower case
wildfire_loss['county/unit'] = wildfire_loss['county/unit'].str.lower()

wildfire_loss

Unnamed: 0,incident_number,county/unit,fire_name,date_start,date_controlled,direct_protection_agency,total_burned_acres,veg_type,cause,destroyed_structures,damaged_structures,firefighter_fatalities,civil_fatalities,year
0,0,san bernardino,Bluff,2008-03-16,3/20/2008,CALFIRE,680,"Brush, Grass",Campfire,0,0,0.0,0.0,2008
1,0,tehama-glenn,Colyear,2008-06-05,5/9/2008,CALFIRE,1331,Brush,Debris,0,0,0.0,0.0,2008
2,0,fresno-kings,Avocado,2008-05-20,5/21/2008,CALFIRE,1100,Grass,Equipment Use,0,0,0.0,0.0,2008
3,0,fresno-kings,Gatos,2008-05-22,5/22/2008,CALFIRE,331,Grass,Miscellaneous,0,0,0.0,0.0,2008
4,0,santa clara,Summit,2008-05-22,6/15/2008,CALFIRE,4270,Timber,Unidentified,0,91,0.0,0.0,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,LPF3161,monterey,COLEMAN,2020-10-18,11/15/20,USFS,574,"B, G",UNDETERMINED,0,0,0.0,0.0,2020
1001,NOD-6431,lassen,LAURA 2,2020-11-17,11/23/20,BLM,2800,"B, G",UNDETERMINED,42,7,0.0,0.0,2020
1002,OVD-30860,mono,MOUNTAIN VIEW,2020-11-17,11/30/20,BLM,20385,"B, G",UNDETERMINED,97,0,0.0,1.0,2020
1003,COR-12597,riverside,AIRPORT,2020-01-12,12/17/20,LOCAL,1087,"B, G",UNDETERMINED,0,0,0.0,0.0,2020


#### There are multiple entries that contain two or more counties

The code below duplicates rows that have a '-' OR ',' within the county column and assigns the original row the county preceeding the '-' or ',' while the duplicate takes on the second or third county name. 
The method below equally splits the destroyed/damaged structures and fatality values between the rows that had shared counties

In [5]:
# Initialize an empty list to store the modified rows
new_rows = []

# Iterate through each row in the DataFrame
for index, row in wildfire_loss.iterrows():
    county_unit = row['county/unit']
    
    # Check if the county/unit is a string
    if isinstance(county_unit, str):
        county_unit = county_unit.replace('\n', '')

        # Split the county/unit string by '-' or ',' or '/'
        if '-' in county_unit:
            county_units = county_unit.split('-')
        elif ',' in county_unit:
            county_units = county_unit.split(',')
        elif '/' in county_unit:
            county_units = county_unit.split('/')
            # Strip any whitespace around the county names
            county_units = [county.strip() for county in county_units]
        else:
            county_units = [county_unit]
        
        # Count the number of counties
        num_counties = len(county_units)
        
        # Convert values in the specified columns to integers or floats
        destroyed_structures = row['destroyed_structures']
        damaged_structures = row['damaged_structures']
        firefighter_fatalities = row['firefighter_fatalities']
        civil_fatalities = row['civil_fatalities']
        
        if isinstance(destroyed_structures, str):
            destroyed_structures = destroyed_structures.replace(',', '')
            if destroyed_structures.isdigit():
                destroyed_structures = int(destroyed_structures)
            else:
                destroyed_structures = 0
        
        if isinstance(damaged_structures, str):
            damaged_structures = damaged_structures.replace(',', '')
            if damaged_structures.isdigit():
                damaged_structures = int(damaged_structures)
            else:
                damaged_structures = 0
        
        if isinstance(firefighter_fatalities, str):
            firefighter_fatalities = firefighter_fatalities.replace(',', '')
            if firefighter_fatalities.isdigit():
                firefighter_fatalities = float(firefighter_fatalities)
            else:
                firefighter_fatalities = 0
        
        if isinstance(civil_fatalities, str):
            civil_fatalities = civil_fatalities.replace(',', '')
            if civil_fatalities.isdigit():
                civil_fatalities = float(civil_fatalities)
            else:
                civil_fatalities = 0
        
        if num_counties > 1:
            destroyed_structures = math.ceil(destroyed_structures / num_counties)
            damaged_structures = math.ceil(damaged_structures / num_counties)
            firefighter_fatalities = math.ceil(firefighter_fatalities / num_counties)
            civil_fatalities = math.ceil(civil_fatalities / num_counties)
        
        # Iterate through each county/unit part
        for county in county_units:
            # Create a new row for each county/unit part
            new_row = row.copy()
            new_row['county'] = county.strip()
            new_row['destroyed_structures'] = destroyed_structures
            new_row['damaged_structures'] = damaged_structures
            new_row['firefighter_fatalities'] = firefighter_fatalities
            new_row['civil_fatalities'] = civil_fatalities
            new_rows.append(new_row)
    else:
        # If the county/unit is not a string (e.g., NaN), handle it accordingly
        # For now, just append the original row to the new_rows list
        new_rows.append(row)

# Create a new DataFrame from the modified rows
cleaned_wildfire_loss = pd.DataFrame(new_rows)

# Define a dictionary mapping old values to new values
rename_map = {
    'mardera': 'madera',
    'tahema': 'tehama',
    'toulumne' : 'tuolumne',
     'tehema' : 'tehama',
     'tuolomne' : 'tuolumne'
    }
# deleting problematic row that will not split, only had one data field with a value of 1
cleaned_wildfire_loss = cleaned_wildfire_loss.drop(index=122)

# Use the replace method to rename the entries
cleaned_wildfire_loss['county'] = cleaned_wildfire_loss['county'].replace(rename_map)

#### Creating CRI metric columns by summing structure and fatality related data respectively

In [6]:
cleaned_wildfire_loss['damaged_destroyed_structures'] = cleaned_wildfire_loss['damaged_structures'] + cleaned_wildfire_loss['destroyed_structures']
cleaned_wildfire_loss['total_fatalities'] = cleaned_wildfire_loss['firefighter_fatalities'] + cleaned_wildfire_loss['civil_fatalities']
cleaned_wildfire_loss.head(5)

Unnamed: 0,incident_number,county/unit,fire_name,date_start,date_controlled,direct_protection_agency,total_burned_acres,veg_type,cause,destroyed_structures,damaged_structures,firefighter_fatalities,civil_fatalities,year,county,damaged_destroyed_structures,total_fatalities
0,0,san bernardino,Bluff,2008-03-16,3/20/2008,CALFIRE,680,"Brush, Grass",Campfire,0,0,0.0,0.0,2008,san bernardino,0,0.0
1,0,tehama-glenn,Colyear,2008-06-05,5/9/2008,CALFIRE,1331,Brush,Debris,0,0,0.0,0.0,2008,tehama,0,0.0
1,0,tehama-glenn,Colyear,2008-06-05,5/9/2008,CALFIRE,1331,Brush,Debris,0,0,0.0,0.0,2008,glenn,0,0.0
2,0,fresno-kings,Avocado,2008-05-20,5/21/2008,CALFIRE,1100,Grass,Equipment Use,0,0,0.0,0.0,2008,fresno,0,0.0
2,0,fresno-kings,Avocado,2008-05-20,5/21/2008,CALFIRE,1100,Grass,Equipment Use,0,0,0.0,0.0,2008,kings,0,0.0


#### Isolating relevant columns

In [7]:
isolated_cleaned_wildfire_loss = cleaned_wildfire_loss[['county', 'year', 'damaged_destroyed_structures', 'total_fatalities']]
isolated_cleaned_wildfire_loss, omitted_rows = filter_counties(isolated_cleaned_wildfire_loss, 'county', county_list=None)

In [8]:
omitted_rows

Unnamed: 0,county,year,damaged_destroyed_structures,total_fatalities
156,,1970,0,0.0
219,vandenburg afb,2009,0,0.0
648,washoe,2017,0,0.0
665,jackson (or),2017,0,0.0
674,washoe (nv),2017,0,0.0
971,washoe (nv),2020,1,0.0


#### Grouping data by county and aggregate the structure and fatality data so we get total damaged structures and total fatalities per county from 2008-2020

In [9]:
cri_wildfire_loss = isolated_cleaned_wildfire_loss.groupby('county').agg({'damaged_destroyed_structures': 'sum', 'total_fatalities': 'sum'}).reset_index()
print(len(cri_wildfire_loss))
cri_wildfire_loss.head()

56


Unnamed: 0,county,damaged_destroyed_structures,total_fatalities
0,alameda,54,0.0
1,alpine,0,0.0
2,amador,967,2.0
3,butte,21634,93.0
4,calaveras,9,0.0


In [10]:
cri_wildfire_loss['average_damaged_destroyed_structures'] = cri_wildfire_loss['damaged_destroyed_structures'] // 13
cri_wildfire_loss['average_fatalities'] = cri_wildfire_loss['total_fatalities'] // 13
cri_wildfire_loss.head(5)

Unnamed: 0,county,damaged_destroyed_structures,total_fatalities,average_damaged_destroyed_structures,average_fatalities
0,alameda,54,0.0,4,0.0
1,alpine,0,0.0,0,0.0
2,amador,967,2.0,74,0.0
3,butte,21634,93.0,1664,7.0
4,calaveras,9,0.0,0,0.0


In [11]:
# checking the total county results by isolated to one county per year
fact_checking = isolated_cleaned_wildfire_loss.groupby(['county', 'year']).agg({'damaged_destroyed_structures': 'sum', 'total_fatalities': 'sum'}).reset_index()
desired_county_data = fact_checking[fact_checking['county'] == 'butte']
print(desired_county_data)

   county  year  damaged_destroyed_structures  total_fatalities
8   butte  2008                           511               0.0
9   butte  2010                             0               0.0
10  butte  2013                            68               0.0
11  butte  2014                             0               0.0
12  butte  2015                            16               0.0
13  butte  2016                             3               0.0
14  butte  2017                           239               0.0
15  butte  2018                         19558              85.0
16  butte  2019                             3               0.0
17  butte  2020                          1236               8.0


In [13]:
# Saving metric df to .csv file
cri_wildfire_loss.to_csv('climate_wildfire_redbooks_loss_metric.csv')

In [14]:
# upload final csv file to aws
bucket_name = 'ca-climate-index'
file_name = ['climate_wildfire_redbooks_loss_metric.csv']
directory = '3_fair_data/index_data'

#@append_metadata
upload_csv_aws(file_name, bucket_name, directory)
os.remove('climate_wildfire_redbooks_loss_metric.csv')

climate_wildfire_redbooks_loss_metric.csv uploaded to AWS


#### Function call for this metric

In [12]:
@append_metadata
def wildfire_loss_calc(df, export=False, export_filename=None, varname = ''):
    '''
    Calculates the total number of damaged/destroyed structures and fatalities 
    resulting from wildfires per California county between 2008-2020. The data 
    used to calculate these metrics are sourced from CALFIRE's Redbook data: 
    https://www.fire.ca.gov/our-impact/statistics
  
    Methods
    --------
    Data was stitched together and cleaned by utilizing a California county filter function. 
    Often, entries contained multiple counties where a single fire occured. 
    In these cases, the number of fatalities and affected structures were equally split between the number of counties. 
    The number of fatalities and affected structures were summed within each county across the 13 year dataset. 
    Averages were also calculated using the total temporal range of the dataset (13).

    Parameters
    ------------
    df: string
        the dataframe containing the wildfire losses data
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI wildfire loss metric to AWS
        True = will upload resulting df containing CAL CRAI wildfire loss metric to AWS
    export_filename: string
        name of the csv file to be uploaded to AWS

    Script
    ------
    climate_wildfire_redbooks_loss.ipynb

    Note
    ------
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    print('Data transformation: convert fire ignition date to year.')
    # Replace NaN values with 0
    wildfire_loss.fillna(0, inplace=True)

    # Convert 'date_start' column to datetime
    wildfire_loss['date_start'] = pd.to_datetime(wildfire_loss['date_start'], format='mixed', dayfirst=True)

    # Format datetime to mm/dd/yyyy
    wildfire_loss['date_start'] = wildfire_loss['date_start'].dt.strftime('%m/%d/%Y')
    wildfire_loss['date_start'] = pd.to_datetime(wildfire_loss['date_start'], format='%m/%d/%Y', errors='coerce')

    # Extract year from the 'date_start' column and create a new 'year' column
    wildfire_loss['year'] = wildfire_loss['date_start'].dt.year.astype(int)

    # Ensure all entries within county column are lower case
    wildfire_loss['county/unit'] = wildfire_loss['county/unit'].str.lower()
        
    # Initialize an empty list to store the modified rows
    new_rows = []

    print('Data transformation: split entries with multiple counties and divide values equally between.')
    # Iterate through each row in the DataFrame
    for index, row in wildfire_loss.iterrows():
        county_unit = row['county/unit']
        
        # Check if the county/unit is a string
        if isinstance(county_unit, str):
            county_unit = county_unit.replace('\n', '')

            # Split the county/unit string by '-' or ',' or '/'
            if '-' in county_unit:
                county_units = county_unit.split('-')
            elif ',' in county_unit:
                county_units = county_unit.split(',')
            elif '/' in county_unit:
                county_units = county_unit.split('/')
                # Strip any whitespace around the county names
                county_units = [county.strip() for county in county_units]
            else:
                county_units = [county_unit]
            
            # Count the number of counties
            num_counties = len(county_units)
            
            # Convert values in the specified columns to integers or floats
            destroyed_structures = row['destroyed_structures']
            damaged_structures = row['damaged_structures']
            firefighter_fatalities = row['firefighter_fatalities']
            civil_fatalities = row['civil_fatalities']
            
            if isinstance(destroyed_structures, str):
                destroyed_structures = destroyed_structures.replace(',', '')
                if destroyed_structures.isdigit():
                    destroyed_structures = int(destroyed_structures)
                else:
                    destroyed_structures = 0
            
            if isinstance(damaged_structures, str):
                damaged_structures = damaged_structures.replace(',', '')
                if damaged_structures.isdigit():
                    damaged_structures = int(damaged_structures)
                else:
                    damaged_structures = 0
            
            if isinstance(firefighter_fatalities, str):
                firefighter_fatalities = firefighter_fatalities.replace(',', '')
                if firefighter_fatalities.isdigit():
                    firefighter_fatalities = float(firefighter_fatalities)
                else:
                    firefighter_fatalities = 0
            
            if isinstance(civil_fatalities, str):
                civil_fatalities = civil_fatalities.replace(',', '')
                if civil_fatalities.isdigit():
                    civil_fatalities = float(civil_fatalities)
                else:
                    civil_fatalities = 0
            
            if num_counties > 1:
                destroyed_structures = math.ceil(destroyed_structures / num_counties)
                damaged_structures = math.ceil(damaged_structures / num_counties)
                firefighter_fatalities = math.ceil(firefighter_fatalities / num_counties)
                civil_fatalities = math.ceil(civil_fatalities / num_counties)
            
            # Iterate through each county/unit part
            for county in county_units:
                # Create a new row for each county/unit part
                new_row = row.copy()
                new_row['county'] = county.strip()
                new_row['destroyed_structures'] = destroyed_structures
                new_row['damaged_structures'] = damaged_structures
                new_row['firefighter_fatalities'] = firefighter_fatalities
                new_row['civil_fatalities'] = civil_fatalities
                new_rows.append(new_row)
        else:
            # If the county/unit is not a string (e.g., NaN), handle it accordingly
            # For now, just append the original row to the new_rows list
            new_rows.append(row)

    # Create a new DataFrame from the modified rows
    cleaned_wildfire_loss = pd.DataFrame(new_rows)

    print('Data transformation: fix misspelled counties.')
    # Define a dictionary mapping old values to new values
    rename_map = {
        'mardera': 'madera',
        'tahema': 'tehama',
        'toulumne' : 'tuolumne',
        'tehema' : 'tehama',
        'tuolomne' : 'tuolumne'
        }
    # deleting problematic row that will not split, only had one data field with a value of 1
    cleaned_wildfire_loss = cleaned_wildfire_loss.drop(index=122)

    # Use the replace method to rename the entries
    cleaned_wildfire_loss['county'] = cleaned_wildfire_loss['county'].replace(rename_map)

    cleaned_wildfire_loss['damaged_destroyed_structures'] = cleaned_wildfire_loss['damaged_structures'] + cleaned_wildfire_loss['destroyed_structures']
    cleaned_wildfire_loss['total_fatalities'] = cleaned_wildfire_loss['firefighter_fatalities'] + cleaned_wildfire_loss['civil_fatalities']

    isolated_cleaned_wildfire_loss = cleaned_wildfire_loss[['county', 'year', 'damaged_destroyed_structures', 'total_fatalities']]

    print('Data transformation: run "filter_counties" function out non-California entries.')
    isolated_cleaned_wildfire_loss, omitted_rows = filter_counties(isolated_cleaned_wildfire_loss, 'county', county_list=None)

    print('Data transformation: group data by county and sum fatalities and affected structures.')
    # Group the DataFrame by 'county' and sum the 'damaged_destroyed_structures' and 'total_fatalities'
    cri_wildfire_loss = isolated_cleaned_wildfire_loss.groupby('county').agg({'damaged_destroyed_structures': 'sum', 'total_fatalities': 'sum'}).reset_index()

    print('Data transformation: metric calulated by averaging total fatalities and affected structures per county over the 13 year data range.')
    cri_wildfire_loss['average_damaged_destroyed_structures_2008_2020'] = cri_wildfire_loss['damaged_destroyed_structures'] // 13
    cri_wildfire_loss['average_fatalities_2008_2020'] = cri_wildfire_loss['total_fatalities'] // 13
    
    # export to csv and upload to AWS
    if export == True:
        cri_wildfire_loss.to_csv(export_filename)
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        export_filename = [export_filename]
        upload_csv_aws(export_filename, bucket_name, directory)

        # Check if the file exists before attempting to remove it
        if os.path.exists('calfire_redbooks_cleaned.csv'):
            os.remove('calfire_redbooks_cleaned.csv')  # remove from local to clear up directory

        if os.path.exists(export_filename[0]):
            os.remove(export_filename[0])

    return cri_wildfire_loss # returns df

In [13]:
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/climate_risk/wildfire/loss/historical/calfire_redbooks/cleaned_stitched_calfire_redbook_data/'
pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

wildfire_loss = pd.read_csv('calfire_redbooks_cleaned.csv')

wildfire_loss_varnames = {'fatalities': 'climate_calfire_wildfire_fatalities',
                          'buildings': 'climate_calfire_wildfire_building_loss'}

for key, varname in wildfire_loss_varnames.items():
    wildfire_loss_calc(wildfire_loss, export=True, export_filename=f'climate_wildfire_redbooks_loss_metric_{key}.csv', varname=varname)


Saved DataFrame as 'calfire_redbooks_cleaned.csv'
