# This notebook calculates the following metrics sourced from CAL-FIRE's historical Redbooks
* number of damaged/destroyed buildings per county per year from wildfire
* number of fatalities per county per year from wildfire


In [238]:
import pandas as pd
import os
import sys
import math

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import (
    pull_csv_from_directory, upload_csv_aws
)

In [239]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/climate_risk/wildfire/loss/historical/calfire_redbooks/cleaned_stitched_calfire_redbook_data/'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'calfire_redbooks_cleaned.csv'


In [240]:
# read in air quality data (already for state of CA)
wildfire_loss = pd.read_csv('calfire_redbooks_cleaned.csv')
print(len(wildfire_loss))
wildfire_loss.head(5)
os.remove('calfire_redbooks_cleaned.csv')

1005


## The dataset has inconsistent date formats and capitalization
* we adjust the dating formats and creat a new year column
* we bring all entries to lower case
* all nan's are treated as 0

In [241]:
# Replace NaN values with 0
wildfire_loss.fillna(0, inplace=True)

# Convert 'date_start' column to datetime
wildfire_loss['date_start'] = pd.to_datetime(wildfire_loss['date_start'], format='mixed', dayfirst=True)

# Format datetime to mm/dd/yyyy
wildfire_loss['date_start'] = wildfire_loss['date_start'].dt.strftime('%m/%d/%Y')

# Assuming wildfire_loss is your DataFrame
wildfire_loss['date_start'] = pd.to_datetime(wildfire_loss['date_start'], format='%m/%d/%Y', errors='coerce')

# Extract year from the 'date_start' column and create a new 'year' column
wildfire_loss['year'] = wildfire_loss['date_start'].dt.year.astype(int)

# Ensure all entries within county column are lower case
wildfire_loss['county/unit'] = wildfire_loss['county/unit'].str.lower()

wildfire_loss

Unnamed: 0,incident_number,county/unit,fire_name,date_start,date_controlled,direct_protection_agency,total_burned_acres,veg_type,cause,destroyed_structures,damaged_structures,firefighter_fatalities,civil_fatalities,year
0,0,san bernardino,Bluff,2008-03-16,3/20/2008,CALFIRE,680,"Brush, Grass",Campfire,0,0,0.0,0.0,2008
1,0,tehama-glenn,Colyear,2008-06-05,5/9/2008,CALFIRE,1331,Brush,Debris,0,0,0.0,0.0,2008
2,0,fresno-kings,Avocado,2008-05-20,5/21/2008,CALFIRE,1100,Grass,Equipment Use,0,0,0.0,0.0,2008
3,0,fresno-kings,Gatos,2008-05-22,5/22/2008,CALFIRE,331,Grass,Miscellaneous,0,0,0.0,0.0,2008
4,0,santa clara,Summit,2008-05-22,6/15/2008,CALFIRE,4270,Timber,Unidentified,0,91,0.0,0.0,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,LPF3161,monterey,COLEMAN,2020-10-18,11/15/20,USFS,574,"B, G",UNDETERMINED,0,0,0.0,0.0,2020
1001,NOD-6431,lassen,LAURA 2,2020-11-17,11/23/20,BLM,2800,"B, G",UNDETERMINED,42,7,0.0,0.0,2020
1002,OVD-30860,mono,MOUNTAIN VIEW,2020-11-17,11/30/20,BLM,20385,"B, G",UNDETERMINED,97,0,0.0,1.0,2020
1003,COR-12597,riverside,AIRPORT,2020-01-12,12/17/20,LOCAL,1087,"B, G",UNDETERMINED,0,0,0.0,0.0,2020


# There are multiple entries that contain two or more counties

The code below duplicates rows that have a '-' OR ',' within the county column and assigns the original row the county preceeding the '-' or ',' while the duplicate takes on the second or third county name

The method below equally splits the destroyed/damaged structures and fatality values between the rows that had shared counties

# NOTE -- JUST SAW THEY ALSO SEPARATED (ONCE) WITH A SLASH, WILL ADD

In [242]:
# Initialize an empty list to store the modified rows
new_rows = []

# Iterate through each row in the DataFrame
for index, row in wildfire_loss.iterrows():
    county_unit = row['county/unit']
    
    # Check if the county/unit is a string
    if isinstance(county_unit, str):
        # Split the county/unit string by '-' or ','
        if '-' in county_unit:
            county_units = county_unit.split('-')
        elif ',' in county_unit:
            county_units = county_unit.split(',')
        else:
            county_units = [county_unit]
        
        # Count the number of counties
        num_counties = len(county_units)
        
        # Convert values in the specified columns to integers or floats
        destroyed_structures = row['destroyed_structures']
        damaged_structures = row['damaged_structures']
        firefighter_fatalities = row['firefighter_fatalities']
        civil_fatalities = row['civil_fatalities']
        
        if isinstance(destroyed_structures, str):
            destroyed_structures = destroyed_structures.replace(',', '')
            if destroyed_structures.isdigit():
                destroyed_structures = int(destroyed_structures)
            else:
                destroyed_structures = 0
        
        if isinstance(damaged_structures, str):
            damaged_structures = damaged_structures.replace(',', '')
            if damaged_structures.isdigit():
                damaged_structures = int(damaged_structures)
            else:
                damaged_structures = 0
        
        if isinstance(firefighter_fatalities, str):
            firefighter_fatalities = firefighter_fatalities.replace(',', '')
            if firefighter_fatalities.isdigit():
                firefighter_fatalities = float(firefighter_fatalities)
            else:
                firefighter_fatalities = 0
        
        if isinstance(civil_fatalities, str):
            civil_fatalities = civil_fatalities.replace(',', '')
            if civil_fatalities.isdigit():
                civil_fatalities = float(civil_fatalities)
            else:
                civil_fatalities = 0
        
        if num_counties > 1:
            destroyed_structures = math.ceil(destroyed_structures / num_counties)
            damaged_structures = math.ceil(damaged_structures / num_counties)
            firefighter_fatalities = math.ceil(firefighter_fatalities / num_counties)
            civil_fatalities = math.ceil(civil_fatalities / num_counties)
        
        # Iterate through each county/unit part
        for county in county_units:
            # Create a new row for each county/unit part
            new_row = row.copy()
            new_row['county'] = county.strip()
            new_row['destroyed_structures'] = destroyed_structures
            new_row['damaged_structures'] = damaged_structures
            new_row['firefighter_fatalities'] = firefighter_fatalities
            new_row['civil_fatalities'] = civil_fatalities
            new_rows.append(new_row)
    else:
        # If the county/unit is not a string (e.g., NaN), handle it accordingly
        # For now, just append the original row to the new_rows list
        new_rows.append(row)

# Create a new DataFrame from the modified rows
cleaned_wildfire_loss = pd.DataFrame(new_rows)

## Creating CRI metric columns by summing structure and fatality related data respectively

In [243]:
cleaned_wildfire_loss['damaged_destroyed_structures'] = cleaned_wildfire_loss['damaged_structures'] + cleaned_wildfire_loss['destroyed_structures']
cleaned_wildfire_loss['total_fatalities'] = cleaned_wildfire_loss['firefighter_fatalities'] + cleaned_wildfire_loss['civil_fatalities']

cleaned_wildfire_loss.head(5)

Unnamed: 0,incident_number,county/unit,fire_name,date_start,date_controlled,direct_protection_agency,total_burned_acres,veg_type,cause,destroyed_structures,damaged_structures,firefighter_fatalities,civil_fatalities,year,county,damaged_destroyed_structures,total_fatalities
0,0,san bernardino,Bluff,2008-03-16,3/20/2008,CALFIRE,680,"Brush, Grass",Campfire,0,0,0.0,0.0,2008,san bernardino,0,0.0
1,0,tehama-glenn,Colyear,2008-06-05,5/9/2008,CALFIRE,1331,Brush,Debris,0,0,0.0,0.0,2008,tehama,0,0.0
1,0,tehama-glenn,Colyear,2008-06-05,5/9/2008,CALFIRE,1331,Brush,Debris,0,0,0.0,0.0,2008,glenn,0,0.0
2,0,fresno-kings,Avocado,2008-05-20,5/21/2008,CALFIRE,1100,Grass,Equipment Use,0,0,0.0,0.0,2008,fresno,0,0.0
2,0,fresno-kings,Avocado,2008-05-20,5/21/2008,CALFIRE,1100,Grass,Equipment Use,0,0,0.0,0.0,2008,kings,0,0.0


## Isolating relevant columns

In [244]:
isolated_cleaned_wildfire_loss = cleaned_wildfire_loss[['county', 'year', 'damaged_destroyed_structures', 'total_fatalities']]


In [245]:
isolated_cleaned_wildfire_loss.head(5)

Unnamed: 0,county,year,damaged_destroyed_structures,total_fatalities
0,san bernardino,2008,0,0.0
1,tehama,2008,0,0.0
1,glenn,2008,0,0.0
2,fresno,2008,0,0.0
2,kings,2008,0,0.0


## Grouping data by county AND year and summing the structure and fatality data so we get total damaged structures and total fatalities per county per year

In [246]:
# Group the DataFrame by 'county' and 'year', then sum the 'damaged_destroyed_structures' and 'total_fatalities'
cri_wildfire_loss = isolated_cleaned_wildfire_loss.groupby(['county', 'year']).agg({'damaged_destroyed_structures': 'sum', 'total_fatalities': 'sum'}).reset_index()

# Print the aggregated DataFrame
cri_wildfire_loss.head(5)

Unnamed: 0,county,year,damaged_destroyed_structures,total_fatalities
0,alameda,2010,1,0.0
1,alameda,2011,0,0.0
2,alameda,2015,1,0.0
3,alameda,2018,1,0.0
4,alameda,2020,51,0.0


In [247]:
# Saving metric df to .csv file
cri_wildfire_loss.to_csv('climate_wildfire_redbooks_loss_metric.csv')

In [248]:
# upload final csv file to aws
bucket_name = 'ca-climate-index'
file_name = ['climate_wildfire_redbooks_loss_metric.csv']
directory = '3_fair_data/index_data'

#@append_metadata
upload_csv_aws(file_name, bucket_name, directory)
os.remove('climate_wildfire_redbooks_loss_metric.csv')

climate_wildfire_redbooks_loss_metric.csv uploaded to AWS
