In [1]:
import climakitae as ck
from climakitae.explore import warming_levels 
from climakitae.util.utils import add_dummy_time_to_wl
import pandas as pd
import numpy as np
import geopandas as gpd

from xclim.indices import warm_night_frequency, hot_spell_frequency # extreme heat day and warm night function
import pyproj
import rioxarray as rio
import xarray as xr
from bokeh.models import HoverTool

# projection information
import cartopy.crs as ccrs
crs = ccrs.LambertConformal(
    central_longitude=-70, 
    central_latitude=38, 
    false_easting=0.0, 
    false_northing=0.0,  
    standard_parallels=[30, 60], 
    globe=None,
)

In [2]:
## Helpful function set-up
sim_name_dict = {  
    'WRF_CNRM-ESM2-1_r1i1p1f2_Historical + SSP 3-7.0 -- Business as Usual' :
    'CNRM-ESM2-1',
    'WRF_EC-Earth3-Veg_r1i1p1f1_Historical + SSP 3-7.0 -- Business as Usual' :
    'EC-Earth3-Veg',
    'WRF_CESM2_r11i1p1f1_Historical + SSP 3-7.0 -- Business as Usual' :
    'CESM2',
    'WRF_FGOALS-g3_r1i1p1f1_Historical + SSP 3-7.0 -- Business as Usual' :
    'FGOALS-g3'
}

In [3]:
sims_wl = [
    'WRF_MPI-ESM1-2-HR_r3i1p1f1_Historical + SSP 3-7.0 -- Business as Usual',
    'WRF_MIROC6_r1i1p1f1_Historical + SSP 3-7.0 -- Business as Usual',
    'WRF_EC-Earth3_r1i1p1f1_Historical + SSP 3-7.0 -- Business as Usual',
    'WRF_TaiESM1_r1i1p1f1_Historical + SSP 3-7.0 -- Business as Usual',
]
sims_hist = [
    'WRF_MPI-ESM1-2-HR_r3i1p1f1',
    'WRF_MIROC6_r1i1p1f1', 
    'WRF_EC-Earth3_r1i1p1f1',
    'WRF_TaiESM1_r1i1p1f1', 
] 

def count_delta_extreme_heat_events(ds_hist,ds_wl):    
  
    # define the months over which we are going to 
    # determine the 98th percentile temperature threshold
    # to define a hot day or warm night
    months_to_measure = [m for m in np.arange(4,11,1)]
    
    sim_coord_dict = dict(zip(sims_wl,sims_hist))
    
    ds_hist = ds_hist.squeeze()
    ds_wl = ds_wl.squeeze()
    ds_template = ds_hist.isel(time=0, simulation=0).squeeze()
    # first set consistent coordinates
    ds_hist = ds_hist.sortby("simulation")
    ds_wl = ds_wl.rename({"all_sims" : "simulation"})
    ds_wl = ds_wl.sortby("simulation")
    ds_wl = ds_wl.assign_coords({'simulation': list(sim_coord_dict.values())})
    ds_wl = ds_wl.transpose("simulation","time","y","x")

    # compute 98th percentile historical temperature between April and October
    thresh_ds = ds_hist.sel(
        time=ds_hist.time.dt.month.isin(months_to_measure)).chunk(
            dict(time=-1)).quantile(0.98, dim="time")
    # count total days > 98th percentile in historical data and take annual average
    hist_count = xr.where(ds_hist > thresh_ds, x=1, y=0).groupby(
        "time.year").sum().mean(dim="year").mean(dim="simulation")
    # count total days > 98th percentile in warming levels data and take annual average
    chronic_count = xr.where(ds_wl > thresh_ds, x=1, y=0).groupby(
        "time.year").sum().mean(dim="year").mean(dim="simulation")
    # get the delta signal
    delta_count = chronic_count - hist_count
    # nan out non-CA grid points
    delta_count = xr.where(np.isnan(ds_template), x=np.nan, y=delta_count)
    return delta_count

def reproject_to_tracts(ds_delta, ca_boundaries, county):
    # this step takes about 12 minutes with 3km data (~1 min with 9km data)
    df = ds_delta.to_dataframe().reset_index()
    gdf = gpd.GeoDataFrame(
    df, geometry=gpd.points_from_xy(df.x,df.y))
    gdf = gdf.set_crs(crs)
    gdf = gdf.to_crs(ca_boundaries.crs)
    
    ca_boundaries = ca_boundaries.set_index(['GEOID'])    

    clipped_gdf = gpd.sjoin_nearest(ca_boundaries, gdf, how='left')
    clipped_gdf = clipped_gdf.drop(['index_right'], axis=1)
    clipped_gdf = clipped_gdf[clipped_gdf["NAME"]==county[0]]
    ### some coastal tracts do not contain any land grid cells ###
    ### due to the WRF's underlying surface type for a given grid cell. ###
    
    # aggregate the gridded data to the tract level
    clipped_gdf_diss = clipped_gdf.reset_index().dissolve(
        by='GEOID', aggfunc='mean')
    clipped_gdf_diss = clipped_gdf_diss.rename(
        columns={f"{ds_delta.name}_right":
                 ds_delta.name}
    )
    
    # separate tracts with data from tracts without data
    clipped_gdf_nan = clipped_gdf_diss[np.isnan(
        clipped_gdf_diss[ds_delta.name]
    )]
    clipped_gdf_nan = clipped_gdf_nan[["geometry",ds_delta.name]]
    clipped_gdf_valid = clipped_gdf_diss[~np.isnan(
        clipped_gdf_diss[ds_delta.name]
    )]
    clipped_gdf_valid = clipped_gdf_valid[["geometry",ds_delta.name]]

    # compute the centroid of each tract
    clipped_gdf_nan["centroid"] = clipped_gdf_nan.centroid
    clipped_gdf_nan = clipped_gdf_nan.set_geometry("centroid")
    clipped_gdf_valid["centroid"] = clipped_gdf_valid.centroid
    clipped_gdf_valid = clipped_gdf_valid.set_geometry("centroid")
    
    # fill in missing tracts with values from the closest tract
    # in terms of distance between the tract centroids
    clipped_gdf_filled = clipped_gdf_nan.sjoin_nearest(clipped_gdf_valid, how='left')
    clipped_gdf_filled = clipped_gdf_filled[["geometry_left",f"{ds_delta.name}_right"]]
    clipped_gdf_filled = clipped_gdf_filled.rename(columns={
        "geometry_left":"geometry", f"{ds_delta.name}_right":ds_delta.name
    })
    clipped_gdf_valid = clipped_gdf_valid.drop(columns="centroid")
 
    # concatenate filled-in tracts with the original tract which had data
    gdf_all_tracts = pd.concat([clipped_gdf_valid,clipped_gdf_filled])

    return gdf_all_tracts


def min_max_standardize(df, col):
    '''
    Calculates min and max values for specified columns, then calculates
    min-max standardized values.

    Parameters
    ----------
    df: DataFrame
        Input dataframe   
    cols_to_run_on: list
        List of columns to calculate min, max, and standardize
    '''
    max_value = df[col].max()
    min_value = df[col].min()

    # Get min-max values, standardize, and add columns to df
    prefix = col # Extracting the prefix from the column name
    df[f'{prefix}_min'] = min_value
    df[f'{prefix}_max'] = max_value
    df[f'{prefix}_min_max_standardized'] = ((df[col] - min_value) / (max_value - min_value))

    # note to add checker to make sure new min_max column values arent < 0 > 1

    # Drop the original columns
    df = df.drop(columns=[col])
     
    return df

In [4]:
# read in CA census tiger file -- not working from s3 link, uploading manually to keep testing
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_counties = "s3://ca-climate-index/0_map_data/ca_counties/"

#census_shp_dir = "tl_2021_06_tract.shp"
ca_boundaries = gpd.read_file(census_shp_dir)
ca_counties = gpd.read_file(ca_counties)
ca_counties = ca_counties.to_crs(ca_boundaries.crs)
ca_boundaries = ca_boundaries[["COUNTYFP","GEOID","geometry"]]
ca_boundaries = pd.merge(ca_boundaries,ca_counties[["COUNTYFP","NAME"]],on="COUNTYFP")
ca_boundaries = ca_boundaries.to_crs(crs=3310) 

In [5]:
county = ["El Dorado"]
print(f"Calculating metric for {county[0]} County")

# get bounding box for county + small tolerance to avoid missing edge data
county_bounds = ca_counties[ca_counties.NAME == county[0]].bounds
minx = county_bounds.minx.values[0] - 0.1
maxx = county_bounds.maxx.values[0] + 0.1
miny = county_bounds.miny.values[0] - 0.1
maxy = county_bounds.maxy.values[0] + 0.1

# retrieve 2 deg C snowfall (snow and ice) data
wl = warming_levels()
wl.wl_params.timescale = "hourly"
wl.wl_params.downscaling_method = "Dynamical"
wl.wl_params.variable = "Snowfall (snow and ice)"
wl.wl_params.area_subset = "CA counties"
wl.wl_params.cached_area = ["El Dorado County"]
wl.wl_params.warming_levels = ["2.0"]
wl.wl_params.units = "mm"
wl.wl_params.resolution = "3 km"
wl.wl_params.anom = "No"
wl.calculate()
ds = wl.sliced_data["2.0"] # grab 2.0 degC data
ds = ds.sel(all_sims = list(sim_name_dict.keys()))
total_snowfall = add_dummy_time_to_wl(ds)

# retrieve 2 deg C precipitation total data
wl = warming_levels()
wl.wl_params.timescale = "hourly"
wl.wl_params.downscaling_method = "Dynamical"
wl.wl_params.variable = "Precipitation (total)"
wl.wl_params.area_subset = "CA counties"
wl.wl_params.cached_area = ["El Dorado County"]
wl.wl_params.warming_levels = ["2.0"]
wl.wl_params.units = "mm"
wl.wl_params.resolution = "3 km"
wl.wl_params.anom = "No"
wl.calculate()
ds = wl.sliced_data["2.0"] # grab 2.0 degC data
ds = ds.sel(all_sims = list(sim_name_dict.keys()))
total_precip = add_dummy_time_to_wl(ds)


Calculating metric for El Dorado County
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!! Returned data array is huge. Operations could take 10x to infinity longer than 1GB of data !!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!



Computing each warming level:   0%|          | 0/1 [00:00<?, ?it/s]

Processing data to read 6.12 GB of data into memory... 
[########################################] | 100% Completed | 99.25 s
Complete!


  timestamps = pd.date_range(


!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!! Returned data array is huge. Operations could take 10x to infinity longer than 1GB of data !!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!



Computing each warming level:   0%|          | 0/1 [00:00<?, ?it/s]

Processing data to read 6.12 GB of data into memory... 
[########################################] | 100% Completed | 136.68 s
Complete!


  timestamps = pd.date_range(


In [6]:
# reproject data to census tracts
total_precip_reproj = reproject_to_tracts(total_precip, ca_boundaries, county)
total_snowfall_reproj = reproject_to_tracts(total_snowfall, ca_boundaries, county)

MemoryError: Unable to allocate 6.12 GiB for an array with shape (822038400,) and data type int64

In [None]:
wn_data_std = min_max_standardize(wn_df, col=wn_delta_ds.name)
display(wn_data_std)
hd_data_std = min_max_standardize(hd_df, col=hd_delta_ds.name)
display(hd_data_std)