### Extreme Event Case Study: October 2007 Santa Ana winds + wildfire
The *Historical Observations Data Platform* is a cloud-based, historical weather observations dataset that enables access to high-quality, rigorously quality-controlled open climate and weather data. The historical weather stations included in this dataset provide information that can be used to assess the severity, duration, frequency, and rate of change over time of extreme weather events, as well as to support projection downscaling efforts. Stringent QA/QC procedures, in line with international protocols, are applied, and custom modifications relevant to the Western US and the energy sector are included (such as temperature and precipitation extremes, winds, and solar radiation). This notebook is a detailed investigation into how the QA/QC protocol performed during a known extreme event that stressed communities and the electric grid.

The event took place in the following counties:
- San Diego
- Los Angeles
- Ventura
- Santa Barbara
- San Bernardino
- Orange
- Riverside


Variables of interest:
- 'hurs' or 'hurs_derived' - relative humidity, derived
- 'sfcWind' - wind speed at 10m
- 'sfcWind_dir' - wind direction
- 'tas' - air temperature

In [None]:
# imports
import boto3
import numpy as np
import pandas as pd
import xarray as xr
import geopandas as gpd
import matplotlib.pyplot as plt
import os
from shapely.geometry import Point
import sys  # Used for progress bar
from case_study_eval_utils import *
import datetime
import sys
import os

# Import qaqc stage plot functions
sys.path.append(os.path.abspath("../scripts/3_qaqc_data"))
from qaqc_plot import flagged_timeseries_plot, _plot_format_helper, id_flag


# Create the S3 handles used by this notebook. No credentials are set
# explicitly here.
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")  # for lower-level processes

# Bucket name and pipeline-stage folder names used to build S3 paths below.
BUCKET_NAME = "wecc-historical-wx"
QAQC_DIR = "3_qaqc_wx"
MERGE_DIR = "4_merge_wx"
# Full S3 path to the QA/QC-stage station list CSV.
stations_csv_path = f"s3://{BUCKET_NAME}/{QAQC_DIR}/all_network_stationlist_qaqc.csv"

In [None]:
def find_other_events(
    df, event_start, event_end, buffer=14, subset=None, return_stn_ids=True
):
    """
    Event finder not tied to specified case study events.

    Selects the stations whose period of record covers the event window
    extended by `buffer` days on each side.

    Parameters
    ----------
    df : pd.DataFrame
        stationlist; must provide "start-date" and "end-date" columns, and
        "era-id" when `return_stn_ids` is True. The input is not modified.
    event_start : str
        start of event, format "YYYY-MM-DD"
    event_end : str
        end of event, format "YYYY-MM-DD"
    buffer : int, optional
        number of days added before `event_start` and after `event_end`
        that a station record must also cover (default 14)
    subset : int, optional
        maximum number of stations to return; when more stations qualify, a
        random sample of this size is drawn without replacement.
        None (default) returns every qualifying station.
    return_stn_ids : bool, optional
        if True (default), print the "era-id" of every selected station

    Returns
    -------
    eval_stns : pd.DataFrame
        subset of stations for other events of interest, with "start-date"
        and "end-date" converted to UTC timestamps

    To dos
    ------
    1. Manual end date check no longer relevant, make sure stationlist passed is the correct updated version.
    2. Start / end date format check
    """

    print(
        f"Subsetting station record for event duration with {buffer} day buffer..."
    )

    # Work on a copy so the caller's station list keeps its original columns.
    stations = df.copy()

    # utc=True accepts both tz-naive and tz-aware inputs, so station dates
    # and event dates are always comparable.
    stations["start-date"] = pd.to_datetime(stations["start-date"], utc=True)
    stations["end-date"] = pd.to_datetime(stations["end-date"], utc=True)
    window_start = pd.to_datetime(event_start, utc=True)
    window_end = pd.to_datetime(event_end, utc=True)
    pad = pd.Timedelta(days=buffer)

    # A station qualifies only if its record spans the full buffered window.
    event_sub = stations.loc[
        (stations["start-date"] <= window_start - pad)
        & (stations["end-date"] >= window_end + pad)
    ]

    # subset to make more manageable
    if subset is not None and len(event_sub) > subset:
        eval_stns = event_sub.sample(subset, replace=False)
        print(f"{subset} stations selected for evaluation for comparison!")
    else:
        eval_stns = event_sub

    # return station ids for ease
    if return_stn_ids:
        print("Stations selected for evaluation:\n", list(eval_stns["era-id"]))

    return eval_stns

## Step 1: Subset the data

In [None]:
# Event window (inclusive date strings, "YYYY-MM-DD") and the counties affected
event_start_date, event_end_date = "2007-10-05", "2007-11-30"
event_counties = [
    "San Diego",
    "Los Angeles",
    "Ventura",
    "Santa Barbara",
    "San Bernardino",
    "Orange",
    "Riverside",
]

In [None]:
# Read in the merge-stage station list. The path is built from the shared
# BUCKET_NAME / MERGE_DIR constants so it stays in sync with the other
# S3 paths in this notebook.
stn_list = pd.read_csv(
    f"s3://{BUCKET_NAME}/{MERGE_DIR}/all_network_stationlist_merge.csv"
)

In [None]:
# Turn the station list into a GeoDataFrame: one point per station, built
# from its longitude/latitude columns and tagged as EPSG:4326
stns_gdf = gpd.GeoDataFrame(
    stn_list,
    geometry=gpd.points_from_xy(
        x=stn_list["longitude"],
        y=stn_list["latitude"],
        crs="EPSG:4326",
    ),
)

In [None]:
# Read in the CA county boundaries shapefile. The bucket comes from the
# shared BUCKET_NAME constant rather than a repeated literal.
ca_counties = gpd.read_file(
    f"s3://{BUCKET_NAME}/0_maps/ca_counties/CA_Counties.shp"
)
# Reproject the counties to the station CRS so spatial predicates line up
ca_counties = ca_counties.to_crs(stns_gdf.crs)

In [None]:
# Define the event geometry. For now we focus on one county: San Diego,
# where the largest fires occurred.
# To cover every affected county, select with
# ca_counties["NAME"].isin(event_counties) instead.
event_geom = ca_counties.loc[ca_counties["NAME"] == "San Diego"]

In [None]:
# Mark each station with whether it intersects the event polygon.
# NOTE(review): newer geopandas releases deprecate `unary_union` in favour of
# `union_all()` - confirm the installed version before switching.
stns_gdf["intersects"] = stns_gdf.intersects(event_geom.unary_union)

# Keep only the intersecting stations and renumber the index from zero
event_stns = stns_gdf.loc[stns_gdf["intersects"]].reset_index(drop=True)

In [None]:
# ...AND whose period of record overlaps the event timeframe.
# The station list is read from CSV without date parsing, so the date columns
# are converted to UTC timestamps here; comparing them to the event dates as
# raw strings would order them lexicographically rather than chronologically.
record_start = pd.to_datetime(event_stns["start-date"], utc=True)
record_end = pd.to_datetime(event_stns["end-date"], utc=True)
window_start = pd.to_datetime(event_start_date, utc=True)
window_end = pd.to_datetime(event_end_date, utc=True)

# A record overlaps the event if it starts before the event ends and ends
# after the event starts.
event_stns = event_stns[(record_start < window_end) & (record_end > window_start)]

In [None]:
# Display the stations that remain after the county and date filters above
event_stns

## Step 2: Investigate specific stations

In [None]:
# Open the merge-stage zarr store for station SGXWFO_SDUSS
url1 = f"s3://{BUCKET_NAME}/{MERGE_DIR}/SGXWFO/SGXWFO_SDUSS.zarr"
ds1 = xr.open_zarr(url1)

# Flatten the dataset into a DataFrame and move the index levels into
# ordinary columns
df1 = ds1.to_dataframe().reset_index()

In [None]:
# List the columns available in the station dataframe
df1.columns

In [None]:
# Subset the station record for the named event using a helper that is
# presumably provided by case_study_eval_utils (star import above).
# NOTE(review): the trailing 14 looks like a buffer in days - confirm against
# the helper's signature.
subset1 = event_subset(df1, 'santa_ana_wind', 14)

In [None]:
# Inspect the wind speed ("sfcWind") QA/QC flags in the event subset.
# NOTE(review): helper semantics are defined outside this notebook - confirm
# what it reports or returns.
flags_during_event(subset1, "sfcWind", "santa_ana_wind")

In [None]:
# Visualize a station for the named event using the station list.
# NOTE(review): this uses station "SGXWFO_SDL34", while the data loaded above
# is for SGXWFO_SDUSS - confirm the different station is intentional.
stn_visualize("SGXWFO_SDL34", stn_list, "santa_ana_wind")

In [None]:
# Filter the dataframe down to the event window defined in Step 1, reusing
# event_start_date / event_end_date instead of repeating the date literals.
# NOTE(review): the earlier comment described this range as "2 weeks before
# and after the event window" - confirm whether the Step 1 dates already
# include that buffer.
mask = (df1["time"] >= event_start_date) & (df1["time"] <= event_end_date)
df_filt = df1.loc[mask]

In [None]:
# Plot air temperature ("tas") from the filtered record for the named event.
# NOTE(review): plotting behaviour is defined by the helper outside this
# notebook - confirm what is drawn.
event_plot(df_filt, "tas", "santa_ana_wind")

In [None]:
def id_flag(flag_to_id: int) -> str:
    """
    Look up the name of the QA/QC function behind a numeric flag value.

    The mapping is read from "../data/era_qaqc_flag_meanings.csv" on every
    call, using its "Flag_value" and "QAQC_function" columns.

    Note: this notebook-level definition shadows the `id_flag` imported
    from `qaqc_plot` in the imports cell.

    Parameters
    ----------
    flag_to_id : int
        specific flag to identify; it is cast with int() before matching

    Returns
    -------
    fn_name : str
        name of QA/QC flag (first matching row)

    Raises
    ------
    IndexError
        if no row of the table matches the flag value
    """

    meanings = pd.read_csv("../data/era_qaqc_flag_meanings.csv")
    is_match = meanings["Flag_value"] == int(flag_to_id)
    matching_names = meanings.loc[is_match, "QAQC_function"]

    # First match wins; an empty selection raises IndexError here.
    return matching_names.values[0]

## Step 3:

In [None]:
# some kind of map

In [None]:
# table / stats "read out" on extremes during the event

In [None]:
# table / stats "read out" on QC flags, including if we think refinement to QC tests would improve coverage

In [None]:
# some function/thing in terms of how many stations "detected" the event

In [None]:
# summary information via markdown close out of what we have learned