## This notebook calculates the following metrics sourced from Cal-Adapt Tools: Wildfire
* Change in absolute tract area burned in m2

In [1]:
import pandas as pd
import os
import sys
import math
import geopandas as gpd
import xarray as xr
import cftime

import pyproj
import rioxarray as rio
import xarray as xr
import numpy as np

# projection information
import cartopy.crs as ccrs

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [2]:
def process_fire_data(fn):
    """
    Opens one Cal-Adapt wildfire simulation and reduces it to annual sums.

    Parameters
    ----------
    fn: string
        Simulation prefix (e.g. 'MIROC5_45'); placed between the bucket
        location and the '_AA.monthly_all.bau.mu.nc.zarr' suffix to build
        the path that is opened

    Returns
    -------
    Output of xr.where: the per-year sums, set to nan wherever the first
    time step of the opened data is nan
    """
    store = (
        "s3://ca-climate-index/1_pull_data/climate_risk/wildfire/loss/projections/caladapt/"
        + fn
        + "_AA.monthly_all.bau.mu.nc.zarr"
    )
    print('Opening: {}'.format(store))
    ds = xr.open_dataset(store, engine="zarr", consolidated=False, decode_times=False)

    # times are not decoded on open; rebuild the axis as month-start stamps
    # counted from the reference date found in the "units" attribute
    _, refdate = ds.time.attrs['units'].split('since')
    n_steps = ds.sizes['time']
    ds['time'] = pd.date_range(start=refdate, periods=n_steps, freq='MS')

    # keep the first time step as a template of the no-data grid cells,
    # since summing per year ends up turning the nans to 0
    first_step = ds.isel(time=0).squeeze()

    # calculate annual area burned
    print('Calculating annual sum of area burned...')
    annual = ds.resample(time='1Y').sum()

    # put the nans back where the template has them
    return xr.where(np.isnan(first_step), x=np.nan, y=annual)

def convert_30yr(start):
    """
    Returns the (first, last) years of the 30-year range for a given
    20-year start year: 5 years before `start` through 24 years after it.
    """
    first_year = start - 5
    last_year = start + 24
    return first_year, last_year

In [None]:
# this takes a few minutes because it is opening the file and then pre-processing by calculating the annual sum
# takes approx 25 min total
# RCP 4.5
miroc45, cnrm45, hadgem45, canesm45 = [
    process_fire_data(fn=sim_name)
    for sim_name in ('MIROC5_45', 'CNRM-CM5_45', 'HadGEM2-ES_45', 'CanESM2_45')
]

# RCP 8.5
miroc85, cnrm85, hadgem85, canesm85 = [
    process_fire_data(fn=sim_name)
    for sim_name in ('MIROC5_85', 'CNRM-CM5_85', 'HadGEM2-ES_85', 'CanESM2_85')
]

#### Step 1a) Calculate historical baseline (1981-2010)

In [None]:
# stack every simulation, keep the 1981-2010 baseline years, then average
# over time and across simulations (multi-model mean)
all_sims = [miroc45, miroc85, cnrm45, cnrm85, hadgem45, hadgem85, canesm45, canesm85]

stacked_sims = xr.concat(all_sims, 'simulation')
baseline_years = stacked_sims.sel(time=slice('1981', '2010'))
ds_hist = baseline_years.mean(dim='time').mean(dim='simulation')
ds_hist

Let's do a quick validation of the data before moving on.

In [None]:
# add up the baseline over the whole grid and express it in acres,
# which is the unit the historical observations are quoted in
acre_per_hectare = 2.47105
total_burned = ds_hist.sum()
ds_hist_sum = total_burned * acre_per_hectare
print(f"Historical model-mean total burned area in CA is {ds_hist_sum.hectares.values} acres.")

[Wikipedia article on CA wildfires says](https://en.wikipedia.org/wiki/List_of_California_wildfires): "...since 2000, the area that burned annually has ranged between 90,000 acres... and 1,590,000 acres...", so an average annual burn of ~452,000 acres is reasonable especially when considering the earlier time period considered in the historical data. 

#### Step 1b) Calculate warming level per model
This will have to be manually done per model
* https://github.com/mathause/cmip_warming_levels/blob/main/warming_levels/cmip5_all_ens/csv/cmip5_warming_levels_all_ens_1850_1900_no_bounds_check.csv
* Assuming all models are r1i1p1 (the CMIP5 ensemble-member label)
* The csv file at loc above has typos in it, making it unreadable programmatically -- working manually
   * the ranges below are 20-year windows (start year through start year + 19); the start year is the value recorded in `sim_yrs`
   * MIROC5 4.5 -- 2063-2082, MIROC5 8.5 -- 2039-2058
   * CNRM-CM5 4.5 -- 2049-2068, CNRM-CM5 8.5 -- 2036-2055
   * HADGEM-ES 4.5 -- 2034-2053, HADGEM-ES 8.5 -- 2026-2045
   * CANESM2 4.5 -- 2022-2041, CANESM2 8.5 -- 2017-2036

In [None]:
# first year of the 20-year window for each simulation
sim_yrs = {
    'miroc45': 2063,
    'miroc85': 2039,
    'cnrm45': 2049,
    'cnrm85': 2036,
    'hadgem45': 2034,
    'hadgem85': 2026,
    'canesm45': 2022,
    'canesm85': 2017,
}

# print the matching 30-year range next to each start year
for sim_name, start_yr in sim_yrs.items():
    print(sim_name, '20-yr-start: ', start_yr, '30-yr range:', convert_30yr(start_yr))

In [None]:
def _window_mean(sim, first_year, last_year):
    # time-average of one simulation over the given year range
    return sim.sel(time=slice(first_year, last_year)).mean(dim='time')

# RCP 4.5
miroc45_wl = _window_mean(miroc45, '2058', '2087')
cnrm45_wl = _window_mean(cnrm45, '2044', '2073')
hadgem45_wl = _window_mean(hadgem45, '2029', '2058')
canesm45_wl = _window_mean(canesm45, '2017', '2046')

# RCP 8.5
miroc85_wl = _window_mean(miroc85, '2034', '2063')
cnrm85_wl = _window_mean(cnrm85, '2031', '2060')
hadgem85_wl = _window_mean(hadgem85, '2021', '2050')
canesm85_wl = _window_mean(canesm85, '2012', '2041')

# stack the per-simulation means and average across simulations
projs = [miroc45_wl, miroc85_wl, cnrm45_wl, cnrm85_wl, hadgem45_wl, hadgem85_wl, canesm45_wl, canesm85_wl]
ds_proj = xr.concat(projs, 'simulation').mean(dim='simulation')
ds_proj

In [8]:
# scale the hectares variable by 10000 to get m2, and name both arrays
# 'burn_area_m2'
ds_proj_m2 = (ds_proj.hectares * 10000).rename('burn_area_m2')
ds_hist_m2 = (ds_hist.hectares * 10000).rename('burn_area_m2')

In [9]:
# calculate delta signal
# (projected mean minus historical baseline mean, both already in m2)
ds_delta = ds_proj_m2 - ds_hist_m2

In [None]:
# quick look at the range and mean of the delta signal
ds_delta.min(), ds_delta.max(), ds_delta.mean()

In [11]:
# read in CA census tiger file -- not working from s3 link, uploading manually to keep testing
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)

# remember the original column names before the index is changed
column_names = ca_boundaries.columns

# reproject to EPSG 3857 and index the tracts by GEOID
ca_boundaries = ca_boundaries.to_crs(crs=3857).set_index(['GEOID'])

In [None]:
# flatten the gridded delta into one row per grid point
df_delta = ds_delta.to_dataframe().reset_index()

# build point geometries from lon/lat and move them into the tract CRS
gdf_delta = gpd.GeoDataFrame(
    df_delta,
    geometry=gpd.points_from_xy(df_delta.lon, df_delta.lat),
    crs="EPSG:4326",
).to_crs(ca_boundaries.crs)

# spatially join gridded data with the census tracts
clipped_gdf = gpd.sjoin(ca_boundaries, gdf_delta, how='left', predicate='intersects')

# take an explicit copy of the column subset: adding a column to the bare
# subset is the pattern that raises SettingWithCopyWarning
clipped_gdf = clipped_gdf[["geometry", "burn_area_m2"]].copy()
clipped_gdf['tract_area'] = clipped_gdf.area
clipped_gdf

In [None]:
# keep only the joined rows that carry a burn-area value, ordered by GEOID
has_burn_value = clipped_gdf["burn_area_m2"].notna()
valid_gdf = (
    clipped_gdf[has_burn_value]
    .sort_values(by=['GEOID'])
    .reset_index()
)
valid_gdf

In [None]:
# range of the burn-area values among the rows kept as valid
valid_gdf.burn_area_m2.min(), valid_gdf.burn_area_m2.max()

In [None]:
# takes about 1.5 min
# merge the valid rows of each tract (GEOID) into one row, summing their values
rows_by_point = valid_gdf.reset_index()
diss_gdf_valid = rows_by_point.dissolve(by='GEOID', aggfunc='sum')
display(diss_gdf_valid)

In [None]:
# now make a new geodataframe with all GEOIDs and set burn area to nan
# (explicit copy: assigning a column to a bare column subset of
# ca_boundaries is the pattern that raises SettingWithCopyWarning)
gdf_to_fill = ca_boundaries[["geometry"]].copy()
gdf_to_fill["burn_area_m2"] = np.nan

# last, fill nans with valid data where it exists
filled_gdf = diss_gdf_valid.combine_first(gdf_to_fill)
filled_gdf

In [None]:
# range and mean of the burn-area column after filling
filled_gdf.burn_area_m2.min(), filled_gdf.burn_area_m2.max(), filled_gdf.burn_area_m2.mean()

In [None]:
import matplotlib.pyplot as plt

# map the tract values with colour limits symmetric about zero
cmap = 'bwr'
fig, ax = plt.subplots(figsize=(15, 7))
filled_gdf.plot(
    column='burn_area_m2',
    cmap=cmap,
    vmin=-100000,
    vmax=100000,
    legend=True,
    ax=ax,
)

In [None]:
# double check on invalid census tracts remaining as nans:
# list the tracts that do carry a value
has_value = filled_gdf.burn_area_m2.notnull()
check_valid_tracts = filled_gdf.loc[has_value]
check_valid_tracts

# of tracts checks out -- we're good to go!

#### Step 4: Min-max standardization
Using Cal-CRAI min-max standardization function, available in `utils.calculate_index.py`

In [20]:
def min_max_standardize(df, cols_to_run_on):
    '''
    Calculates min and max values for specified columns, then calculates
    min-max standardized values.

    For each column `col`, three columns are written to `df`:
    `{col}_min`, `{col}_max` and `{col}_min_max_standardized`.

    Parameters
    ----------
    df: DataFrame
        Input dataframe; the new columns are added to it in place
    cols_to_run_on: list
        List of columns to calculate min, max, and standardize

    Returns
    -------
    DataFrame
        The same `df` object that was passed in, with the new columns
    '''
    for col in cols_to_run_on:
        max_value = df[col].max()
        min_value = df[col].min()

        # Get min-max values, standardize, and add columns to df
        df[f'{col}_min'] = min_value
        df[f'{col}_max'] = max_value
        standardized = (df[col] - min_value) / (max_value - min_value)

        # keep the standardized values inside [0, 1]; clip replaces the
        # chained `df[...].loc[mask] = value` assignment, which pandas
        # warns about and which does not write through under copy-on-write
        df[f'{col}_min_max_standardized'] = standardized.clip(lower=0, upper=1)

    return df

In [None]:
# standardize the tract values (adds the min / max / standardized columns)
std_col = 'burn_area_m2_min_max_standardized'
data_std = min_max_standardize(filled_gdf, cols_to_run_on=['burn_area_m2'])

#one more quick visual
cmap = 'bwr'
fig, ax = plt.subplots(figsize=(15, 7))
data_std.plot(
    column=std_col,
    ax=ax,
    cmap="Oranges",
    legend=True,
)

# the geometry is not needed in the exported table
data_std = data_std.drop(columns=['geometry'])

#### Step 5: Export data as csv

In [22]:
# write the standardized table to a local csv
# NOTE(review): the same file name is written again further down by the
# cleaned export (with index=False) -- confirm this first write is still needed
data_std.to_csv('climate_wildfire_burned_area_metric.csv')

In [None]:
# display the standardized table
data_std

# Fix header output and move column order

In [None]:
# Column order wanted in the exported file
new_columns = ['GEOID', 'burn_area_m2_min_max_standardized', 'burn_area_m2']

# positional (0..n-1) copy of the table, so its columns line up with the
# GEOID values taken from the original index
data_std_reset = data_std.reset_index()

# Build the cleaned table in one step: GEOID from the index, the two metric
# columns from the reset copy
data_std_cleaned = pd.DataFrame(
    {
        'GEOID': data_std.index,
        'burn_area_m2_min_max_standardized': data_std_reset['burn_area_m2_min_max_standardized'],
        'burn_area_m2': data_std_reset['burn_area_m2'],
    },
    columns=new_columns,
)

# Check the new DataFrame
data_std_cleaned

In [28]:
# write the cleaned three-column table to the local csv, without the index
data_std_cleaned.to_csv('climate_wildfire_burned_area_metric.csv', index=False)

In [5]:
# reload the exported metric table from the local csv
data_std_cleaned = pd.read_csv('climate_wildfire_burned_area_metric.csv')

#### Step 6: Metadata

In [3]:
@append_metadata
def wildfire_burned_area_metadata(df, export=False, export_filename=None, varname=''):
    '''
    Transforms the raw data into the following baseline metrics:
    * Change in absolute tract area burned in m2
    
    Methods
    -------
    Data is natively modeled area burned
    Uses CMIP5 warming level targets identified from: 
    https://github.com/mathause/cmip_warming_levels/blob/main/warming_levels/cmip5_all_ens/csv/cmip5_warming_levels_all_ens_1850_1900_no_bounds_check.csv
    
    Parameters
    ----------
    df: string
        Path of the local csv file holding the metric; this is the file
        that is uploaded and then removed from disk
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI wildfire burn area metric to AWS
        True = will upload resulting df containing CAL CRAI wildfire burn area metric to AWS
    export_filename: string
        name of csv file to be uploaded to AWS
        (not read by this function body, which uploads the file named by `df`)
    varname: string
        Final metric name, for metadata generation
        
    Script
    ------
    climate_area_burned.ipynb
    '''
    print('Data transformation: monthly # of hectares summed to annual counts per model.')
          
    # historical baseline
    print("Data transformation: historical baseline data subsetted for 1981-2010, averaging across models.")
    
    # calculate with 2°C WL
    print('Data transformation: data subsetted for warming level of 2.0°C, by manually subsetting based on GWL for parent GCM, and calculating 30 year average, averaging across models.')

    # calculate delta signal
    print('Data transformation: data unit converted to m2 from hectares.')
    print("Data transformation: delta signal calculated by taking difference between chronic (2.0°C) and historical baseline.")

    # reprojection to census tracts
    print("Data transformation: data transformed from xarray dataset into pandas dataframe.")
    # the notebook builds the points as EPSG:4326 lon/lat before reprojecting
    print("Data transformation: data reprojected from EPSG:4326 (lon/lat) to CRS 3857.")
    print("Data transformation: data spatially joined with census tracts.")
    print("Data transformation: point based burn area summed within tracts with valid data (ie, all grid points contain measurements rather than nans)")
    print("Data transformation: tracts which originally contained any nan grid points are masked out with nan values.")
    print("Data transformation: valid data merged with masked data to create a complete geodataframe with all tracts.")
        
    # min-max standardization
    print("Data transformation: data min-max standardized with min_max_standardize function.")
    
    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        # the uploader is handed a one-element list of file names; passed
        # directly so the export_filename parameter is no longer overwritten
        upload_csv_aws([df], bucket_name, directory)
        # confirmation is printed only after an upload was actually requested
        print(f'{df} uploaded to AWS.')

    # NOTE(review): the local csv is removed whether or not it was exported --
    # confirm that deleting it when export=False is intended
    if os.path.exists(df):
        os.remove(df)

In [None]:
# run the metadata function on the exported csv; export=True requests the AWS upload
# NOTE(review): varname='test' looks like a placeholder -- confirm the final metric name
wildfire_burned_area_metadata('climate_wildfire_burned_area_metric.csv', export=True, export_filename=None, varname='test') # varname)