## Extreme Heat Day and Warm Night Likelihood

This notebook briefly walks through how to calculate the extreme heat exposure metric `% of change in extreme heat day and warm night event likelihood` from Cal-Adapt: Analytics Engine data. This notebook may be expanded upon for inclusion in cae-notebooks in the future. 

**Order of operations**:
1. Read data in
2. Calculate base function (FFWI, SPEI, warm nights, etc.)
3. Calculate chronic
4. Calculate delta signal
5. Reprojection to census tracts
6. Min-max standardization
7. Export data
8. Generate metadata (via Cal-CRAI environment, not AE)

**Runtime**: This notebook takes approximately 3 hours to run due to data size, warming levels, and reprojection steps. 

### Step 0: Import libraries

In [None]:
import climakitae as ck
from climakitae.explore import warming_levels 
from climakitae.util.utils import add_dummy_time_to_wl
import pandas as pd
import numpy as np
import geopandas as gpd

from xclim.indices import warm_night_frequency, hot_spell_frequency # extreme heat day and warm night function
import pyproj
import rioxarray as rio
import xarray as xr
from bokeh.models import HoverTool

# projection information
import cartopy.crs as ccrs
# Lambert Conformal projection assigned to the gridded x/y coordinates before
# they are reprojected onto the census-tract CRS.
# NOTE(review): presumably the native projection of the WRF grid -- confirm.
crs = ccrs.LambertConformal(
    central_latitude=38,
    central_longitude=-70,
    standard_parallels=[30, 60],
    false_easting=0.0,
    false_northing=0.0,
    globe=None,
)

In [None]:
# Short simulation names, used to select members along the historical
# data's "simulation" coordinate.
sims_hist = [
    'WRF_MPI-ESM1-2-HR_r3i1p1f1',
    'WRF_MIROC6_r1i1p1f1',
    'WRF_EC-Earth3_r1i1p1f1',
    'WRF_TaiESM1_r1i1p1f1',
]

# Matching labels on the warming-level data's "all_sims" coordinate: the same
# short name followed by the scenario description. Kept in the same order as
# sims_hist so the two lists can be zipped into a label lookup.
sims_wl = [
    f"{sim}_Historical + SSP 3-7.0 -- Business as Usual" for sim in sims_hist
]

def count_delta_extreme_heat_events(ds_hist, ds_wl):
    '''
    Computes the change in the mean annual number of threshold exceedances
    between the warming-level data and the historical baseline.

    The threshold is the 98th percentile of the historical April-October
    values, computed separately per simulation and grid cell. Exceedances
    are counted over every time step of each year, averaged across years and
    then across simulations, and the historical count is subtracted from the
    warming-level count.

    Parameters
    ----------
    ds_hist: xr.DataArray
        Historical data with "simulation", "time", "y" and "x" dimensions.
    ds_wl: xr.DataArray
        Warming-level data with "all_sims", "time", "y" and "x" dimensions,
        whose "all_sims" labels are entries of the module-level `sims_wl`.

    Returns
    -------
    xr.DataArray
        Change in mean annual exceedance count on the (y, x) grid. Grid
        cells that are NaN in the first time step of the first historical
        simulation are set to NaN.

    Raises
    ------
    KeyError
        If a label on the "all_sims" coordinate is not listed in `sims_wl`.
    '''
    # months (April through October) over which the 98th percentile
    # temperature threshold for a hot day or warm night is determined
    months_to_measure = list(range(4, 11))

    # lookup from warming-level simulation label to historical label
    sim_coord_dict = dict(zip(sims_wl, sims_hist))

    ds_hist = ds_hist.squeeze()
    ds_wl = ds_wl.squeeze()
    ds_template = ds_hist.isel(time=0, simulation=0).squeeze()

    # first set consistent coordinates
    ds_hist = ds_hist.sortby("simulation")
    ds_wl = ds_wl.rename({"all_sims": "simulation"})
    # Relabel every warming-level member by looking up its OWN label, and only
    # then sort. Assigning the dictionary values positionally after sorting
    # would pair members with the wrong names, because the sorted order differs
    # from the order of sims_wl; the comparison against the per-simulation
    # threshold below is aligned on these labels.
    ds_wl = ds_wl.assign_coords(
        {"simulation": [
            sim_coord_dict[str(sim)] for sim in ds_wl["simulation"].values
        ]}
    )
    ds_wl = ds_wl.sortby("simulation")
    ds_wl = ds_wl.transpose("simulation", "time", "y", "x")

    # compute 98th percentile historical temperature between April and October
    # (time is rechunked into a single chunk, as required for the quantile)
    thresh_ds = ds_hist.sel(
        time=ds_hist.time.dt.month.isin(months_to_measure)).chunk(
            dict(time=-1)).quantile(0.98, dim="time")
    # count total days > 98th percentile in historical data and take annual average
    hist_count = xr.where(ds_hist > thresh_ds, x=1, y=0).groupby(
        "time.year").sum().mean(dim="year").mean(dim="simulation")
    # count total days > 98th percentile in warming levels data and take annual average
    chronic_count = xr.where(ds_wl > thresh_ds, x=1, y=0).groupby(
        "time.year").sum().mean(dim="year").mean(dim="simulation")
    # get the delta signal
    delta_count = chronic_count - hist_count
    # nan out non-CA grid points (NaN comparisons above count as 0, not NaN)
    delta_count = xr.where(np.isnan(ds_template), x=np.nan, y=delta_count)
    return delta_count

def reproject_to_tracts(ds_delta, ca_boundaries, county):
    '''
    Aggregates a gridded delta field to the census tracts of one county.

    Each grid cell is turned into a point, reprojected from the module-level
    `crs` to the CRS of `ca_boundaries`, and joined to the county's tracts
    with a nearest-neighbour spatial join. Values are averaged per tract;
    tracts whose average is NaN are filled with the value of the tract whose
    centroid is closest.

    Parameters
    ----------
    ds_delta: xr.DataArray
        Named gridded field with "x" and "y" coordinates; its name becomes
        the value column of the output.
    ca_boundaries: gpd.GeoDataFrame
        Census tracts with "GEOID", "NAME" (county name) and "geometry"
        columns.
    county: list
        County names; only the first entry is used.

    Returns
    -------
    DataFrame indexed by GEOID with a "geometry" column and a column named
    after `ds_delta` holding the tract-level value.
    '''
    value_col = ds_delta.name

    # turn every grid cell into a point and move it onto the tract CRS
    df = ds_delta.to_dataframe().reset_index()
    gdf = gpd.GeoDataFrame(
        df, geometry=gpd.points_from_xy(df.x, df.y))
    gdf = gdf.set_crs(crs)
    gdf = gdf.to_crs(ca_boundaries.crs)

    # Restrict to the requested county BEFORE the nearest-neighbour join. The
    # join treats every tract independently, so the result is the same as
    # joining all California tracts and filtering afterwards, without the cost
    # of the statewide join (noted to take ~12 minutes with 3km data).
    county_tracts = ca_boundaries[ca_boundaries["NAME"] == county[0]]
    county_tracts = county_tracts.set_index(['GEOID'])

    clipped_gdf = gpd.sjoin_nearest(county_tracts, gdf, how='left')
    clipped_gdf = clipped_gdf.rename(
        columns={f"{value_col}_right": value_col}
    )
    ### some coastal tracts do not contain any land grid cells ###
    ### due to the WRF's underlying surface type for a given grid cell. ###

    # aggregate the gridded data to the tract level; only the geometry and the
    # value column are kept so the mean is never applied to string columns
    # such as the county name
    clipped_gdf = clipped_gdf.reset_index()[["GEOID", "geometry", value_col]]
    clipped_gdf_diss = clipped_gdf.dissolve(by='GEOID', aggfunc='mean')

    # separate tracts with data from tracts without data (copies, so the
    # centroid columns added below are not written onto a slice)
    has_data = clipped_gdf_diss[value_col].notna()
    clipped_gdf_valid = clipped_gdf_diss.loc[
        has_data, ["geometry", value_col]].copy()
    clipped_gdf_nan = clipped_gdf_diss.loc[
        ~has_data, ["geometry", value_col]].copy()

    # nothing to fill in: every tract already has a value
    if clipped_gdf_nan.empty:
        return clipped_gdf_valid

    # compute the centroid of each tract
    clipped_gdf_nan["centroid"] = clipped_gdf_nan.centroid
    clipped_gdf_nan = clipped_gdf_nan.set_geometry("centroid")
    clipped_gdf_valid["centroid"] = clipped_gdf_valid.centroid
    clipped_gdf_valid = clipped_gdf_valid.set_geometry("centroid")

    # fill in missing tracts with values from the closest tract
    # in terms of distance between the tract centroids
    clipped_gdf_filled = clipped_gdf_nan.sjoin_nearest(clipped_gdf_valid, how='left')
    clipped_gdf_filled = clipped_gdf_filled[["geometry_left", f"{value_col}_right"]]
    clipped_gdf_filled = clipped_gdf_filled.rename(columns={
        "geometry_left": "geometry", f"{value_col}_right": value_col
    })
    clipped_gdf_valid = clipped_gdf_valid.drop(columns="centroid")

    # concatenate filled-in tracts with the original tracts which had data
    gdf_all_tracts = pd.concat([clipped_gdf_valid, clipped_gdf_filled])

    return gdf_all_tracts

def min_max_standardize(df, col):
    '''
    Calculates the min and max of a column, then the min-max standardized
    values of that column.

    The input dataframe is left untouched; a new dataframe is returned.

    Parameters
    ----------
    df: DataFrame
        Input dataframe
    col: string
        Name of the column to calculate min, max, and standardize

    Returns
    -------
    DataFrame
        Copy of `df` without `col`, with three columns appended:
        `{col}_min`, `{col}_max` and `{col}_min_max_standardized`.
        The standardized values lie between 0 and 1; they are NaN where
        `col` is NaN, and for every row when max equals min.
    '''
    min_value = df[col].min()
    max_value = df[col].max()
    standardized = (df[col] - min_value) / (max_value - min_value)

    # drop() returns a new dataframe, so the columns added below never leak
    # back into the caller's dataframe
    out = df.drop(columns=[col])
    out[f'{col}_min'] = min_value
    out[f'{col}_max'] = max_value
    out[f'{col}_min_max_standardized'] = standardized

    return out

In [None]:
# Census tract polygons are read from a manually uploaded local copy of the
# CA census tiger file, because reading from the s3 link was not working:
# census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
census_shp_dir = "tl_2021_06_tract.shp"
ca_boundaries = gpd.read_file(census_shp_dir)

# county polygons, brought onto the CRS the tract file was read in
ca_counties = gpd.read_file('ca_counties/').to_crs(ca_boundaries.crs)

# attach the county name to every tract through the shared county FIPS code,
# then move the tracts to EPSG:3310
ca_boundaries = pd.merge(
    ca_boundaries[["COUNTYFP", "GEOID", "geometry"]],
    ca_counties[["COUNTYFP", "NAME"]],
    on="COUNTYFP",
).to_crs(crs=3310)

In [None]:
county = ["Los Angeles"]
print(f"Calculating metric for {county[0]} County")

# get bounding box for county + small tolerance to avoid missing edge data
# NOTE(review): minx/maxx/miny/maxy are not referenced by any later cell
# visible in this notebook -- confirm whether they are still needed.
county_bounds = ca_counties[ca_counties.NAME == county[0]].bounds
minx = county_bounds.minx.values[0] - 0.1
maxx = county_bounds.maxx.values[0] + 0.1
miny = county_bounds.miny.values[0] - 0.1
maxy = county_bounds.maxy.values[0] + 0.1

# Area requested from climakitae, derived from `county` so that the data
# retrieved always matches the county printed above and the county used to
# filter census tracts later on.
# NOTE(review): assumes the cached county names follow "<NAME> County", as
# "Los Angeles County" does -- confirm before switching counties.
county_area = f"{county[0]} County"


def _retrieve_wl_daily(variable, area):
    '''
    Retrieves daily 3 km dynamically-downscaled data for `variable` at the
    2.0 degC warming level over the cached area `area`, subset to `sims_wl`,
    with a dummy time dimension added back in.
    '''
    wl = warming_levels()
    wl.wl_params.timescale = "daily"
    wl.wl_params.downscaling_method = "Dynamical"
    wl.wl_params.variable = variable
    wl.wl_params.area_subset = "CA counties"
    wl.wl_params.cached_area = [area]
    wl.wl_params.warming_levels = ["2.0"]
    wl.wl_params.units = "degC"
    wl.wl_params.resolution = "3 km"
    wl.wl_params.anom = "No"
    wl.calculate()
    ds = wl.sliced_data["2.0"] # grab 2.0 degC data
    ds = ds.sel(all_sims = sims_wl)
    # add time dimension back in, as this is removed by WL and is required
    # for xclim functionality
    return add_dummy_time_to_wl(ds)


def _retrieve_hist_daily(variable, area):
    '''
    Retrieves daily 3 km historical (1981-2010) data for `variable` over the
    cached area `area`, subset to `sims_hist`.
    '''
    selections = ck.Select()
    selections.area_average = 'No'
    selections.timescale = 'daily'
    selections.variable = variable
    selections.area_subset = 'CA counties'
    selections.cached_area = [area]
    selections.scenario_historical = ['Historical Climate']
    selections.time_slice = (1981, 2010)
    selections.resolution = '3 km'
    selections.units = 'degC'
    hist_ds = selections.retrieve()
    return hist_ds.sel(simulation=sims_hist)


# retrieve 2 deg C temperature maximum
wl_max_ds = _retrieve_wl_daily("Maximum air temperature at 2m", county_area)

# retrieve 2 deg C temperature minimum
wl_min_ds = _retrieve_wl_daily("Minimum air temperature at 2m", county_area)

# retrieve historical baseline max temperature
hist_max_ds = _retrieve_hist_daily('Maximum air temperature at 2m', county_area)

# retrieve historical baseline min temperature
hist_min_ds = _retrieve_hist_daily('Minimum air temperature at 2m', county_area)

In [None]:
# Both metrics follow the same sequence: compute the delta lazily, load it
# into memory, then name the loaded result. The name is set last in both
# cases because it later becomes the value column used for the tract
# aggregation and standardization.

# get change in # of hot days
hd_delta_ds = count_delta_extreme_heat_events(
    hist_max_ds, wl_max_ds
)
hd_delta_ds = ck.load(hd_delta_ds)
hd_delta_ds.name = "Mean change in annual extreme heat days"

# get change in # of warm nights
wn_delta_ds = count_delta_extreme_heat_events(
    hist_min_ds, wl_min_ds
)
wn_delta_ds = ck.load(wn_delta_ds)
wn_delta_ds.name = "Mean change in annual warm nights"

In [None]:
# aggregate each gridded delta field to the census tracts of the county
hd_df = reproject_to_tracts(hd_delta_ds, ca_boundaries, county)
wn_df = reproject_to_tracts(wn_delta_ds, ca_boundaries, county)

# min-max standardize each tract-level metric on its own value column
wn_data_std = min_max_standardize(wn_df, col=wn_delta_ds.name)
hd_data_std = min_max_standardize(hd_df, col=hd_delta_ds.name)

# show the results: warm nights first, then hot days
display(wn_data_std)
display(hd_data_std)

In [None]:
# write each standardized tract-level metric to its own csv file
for metric_df, csv_name in (
    (wn_data_std, 'climate_extreme_heat_warm_night_metric.csv'),
    (hd_data_std, 'climate_extreme_heat_hot_day_metric.csv'),
):
    metric_df.to_csv(csv_name)

### Step 8: Metadata
The function below is designed to be run solely in the Cal-CRAI environment, not on the Analytics Engine JupyterHub. 

Because this workflow differs slightly from the other metrics, the process to generate metadata is:
* Open in Cal-CRAI environment
* Make sure that the csv file to export is in the same directory
* **Only run the following 3 cells**
* Upload to AWS

In [None]:
import pandas as pd
import os
import sys

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [None]:
# Load the exported metric csv so it can be passed to the metadata function.
# NOTE(review): the export cell of this notebook writes
# 'climate_extreme_heat_warm_night_metric.csv' and
# 'climate_extreme_heat_hot_day_metric.csv'; no visible cell writes
# 'climate_extreme_heat_likelihood_metric.csv' -- confirm which file is intended.
df_in = pd.read_csv('climate_extreme_heat_likelihood_metric.csv') # make sure this is in the same folder!
df_in # check

In [None]:
@append_metadata
def extreme_heat_ae_data_process(df, export=False, export_filename=None, varname=''):
    '''
    Reduces the size of the initial daily raw temperature data in order to streamline compute time.
    Transforms the raw data into the following baseline metrics:
    * Warm night frequency
    * Extreme heat day frequency
    
    Methods
    -------
    Metric is aggregated using xclim.indices functionality corresponding to the varname.
    
    Parameters
    ----------
    df: pd.DataFrame
        Input data.
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI extreme heat metric to AWS
        True = will upload resulting df containing CAL CRAI extreme heat metric to AWS
    export_filename: string
        name of csv file to be uploaded to AWS
    varname: string
        Final metric name, for metadata generation
        
    Script
    ------
    climate_ae_heatday_warmnight.ipynb
    
    Note
    ----
    Because the climate projections data is on the order of 2.4 TB in size, intermediary
    processed files are not produced for each stage of the metric calculation. All processing
    occurs in a single complete run in the notebook listed above.
    '''
    # This function does not transform `df`; it prints a description of each
    # processing step and optionally uploads the csv. `df` and `varname` are
    # not read in this body.
    # NOTE(review): presumably the docstring, the printed lines, `df` and
    # `varname` are consumed by the append_metadata decorator -- confirm
    # before rewording any of them. Several lines below refer to xclim
    # indices and to a single averaged metric; verify them against the
    # notebook's actual calculation.

    # historical baseline
    print("Data transformation: historical baseline data retrieved for 1981-2010, averaging across models.")
    print("Data transformation: dynamically-downscaled climate data subsetted for a-priori bias-corrected models.")
    print("Data transformation: drop all singleton dimensions (scenario).")
    print("Data transformation: daily minimum calculated from hourly data for input into xclim.indices.warm_night_frequency.")
    print("Data transformation: daily maximum calculated from hourly data for input into xclim.indices.hot_spell_frequency.")
    print("Data transformation: percent likelihood of event occurrence calculated for historical baseline (1981-2010) period, averaging across time.")
    
    # calculate chronic with 2°C WL
    print('Data transformation: raw projections data retrieved for warming level of 2.0°C, by manually subsetting based on GWL for parent GCM and calculating 30 year average.')
    print("Data transformation: dynamically-downscaled climate data subsetted for a-priori bias-corrected models.")
    print("Data transformation: drop all singleton dimensions (scenario).")
    print("Data transformation: daily minimum calculated from hourly data for input into xclim.indices.warm_night_frequency.")
    print("Data transformation: daily maximum calculated from hourly data for input into xclim.indices.hot_spell_frequency.")
    print("Data transformation: percent likelihood of event occurrence calculated for chronic period (2.0°C warming level), averaging across time.")
    
    # calculate delta signal
    print("Data transformation: hot spell frequency and warm night frequency likelihoods averaged together into single metric.")
    print("Data transformation: delta signal calculated by taking difference between chronic (2.0°C) and historical baseline.")

    # reprojection to census tracts
    # (the notebook reprojects the census tracts and grid points to EPSG:3310)
    print("Data transformation: data transformed from xarray dataset into pandas dataframe.")
    print("Data transformation: data reprojected from Lambert Conformal Conic CRS to CRS 3310.")
        
    # min-max standardization
    print("Data transformation: data min-max standardized with min_max_standardize function.")
    
    # export data as csv; the upload confirmation is only printed when an
    # upload was actually performed
    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        # upload_csv_aws is given the filename wrapped in a list
        upload_csv_aws([export_filename], bucket_name, directory)
        print(f'{export_filename} uploaded to AWS.')
    else:
        print(f'{export_filename} not uploaded to AWS (export=False).')

In [None]:
# Run the metadata function with export=False, so the upload branch is skipped.
# NOTE(review): export_filename names a csv that no visible cell of this
# notebook writes, and varname is the placeholder 'test' -- confirm both
# before running with export=True.
extreme_heat_ae_data_process(df_in, export=False, export_filename ='climate_extreme_heat_likelihood_metric.csv', varname='test') # varname)