# Historical Data Platform QA/QC Event Evaluation Procedure

**Event**: Santa Ana Wind Event<br>
Start date: 2/16/1988<br> 
End date: 2/19/1988<br>
Location: Los Angeles, Orange counties<br>
Variables: wind speed, wind direction, air temperature, humidity<br>

In [None]:
# import libraries
import pandas as pd
import numpy as np
import xarray as xr
import geopandas as gpd
from geopandas import GeoDataFrame
from shapely.geometry import Point
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cf
from matplotlib.ticker import MaxNLocator
import sys

%load_ext autoreload
%autoreload 2

### QAQC evaluation

In [None]:
# Read the QA/QC training station list into a DataFrame and preview the first
# rows. The path is relative, so it resolves against the notebook's working
# directory.
train_stns = pd.read_csv("../qaqc_training_station_list_events.csv")
train_stns.head()

In [None]:
len(train_stns)

In [None]:
from pyproj import CRS, Transformer


def latlon_to_mercator_cartopy(lat, lon):
    """
    Convert latitude/longitude (EPSG:4326) into EPSG:3857 coordinates.

    lat, lon are handed straight to the pyproj transformer; the result is
    returned as the pair (x, y).
    """
    source_crs = CRS("EPSG:4326")
    target_crs = CRS("EPSG:3857")

    # always_xy=True: the transformer takes its inputs in (lon, lat) order
    to_mercator = Transformer.from_crs(source_crs, target_crs, always_xy=True)
    easting, northing = to_mercator.transform(lon, lat)
    return easting, northing

In [None]:
# Keep only the stations whose event_type marks them as covering this event:
# either "all" events or specifically "santa_ana_wind".
event_flags = ["all", "santa_ana_wind"]
covers_event = train_stns["event_type"].isin(event_flags)
event_stns = train_stns[covers_event]

# Drop rows noted "manual check on end date" for the time being -- SNOTEL
# stations all have 2100 as their end date regardless of when the data
# actually ends.
mask = event_stns["notes"] == "manual check on end date"
event_stns = event_stns[~mask]
event_stns.head()

In [None]:
event_stns[event_stns["event_type"] == "santa_ana_wind"]

In [None]:
train_stns["event_type"].unique()

In [None]:
event_stns[event_stns["event_type"] == "santa_ana_wind"]

In [None]:
print(len(event_stns))
event_stns.network.unique()

In [None]:
# Identify the geographic region of interest for the Santa Ana wind event:
# Los Angeles and Orange counties.

# County boundaries are read from the project S3 bucket; the commented-out line
# below is the equivalent read from a local copy of the shapefile.
census_shp_dir = "s3://wecc-historical-wx/0_maps/ca_counties/"
# ca_county = gpd.read_file('../../../data/0_maps/ca_counties/CA_Counties.shp') # local
ca_county = gpd.read_file(census_shp_dir)  # from s3 bucket

# Select the two target counties by their NAME attribute.
county_names = ["Los Angeles", "Orange"]
target_counties = ca_county[ca_county["NAME"].isin(county_names)]
# Re-wrap the subset as a GeoDataFrame with its geometry column set explicitly.
target_counties = GeoDataFrame(target_counties, geometry=target_counties.geometry)
target_counties

In [None]:
# Process the event station list: attach a point geometry to every station and
# keep only the stations that fall inside the target counties.

# Project all station coordinates with a single vectorised call. Calling the
# helper once per station would rebuild the pyproj Transformer for every row.
x_merc, y_merc = latlon_to_mercator_cartopy(
    event_stns.latitude.to_numpy(), event_stns.longitude.to_numpy()
)
geometry = gpd.points_from_xy(x_merc, y_merc)
event_stns = GeoDataFrame(event_stns, geometry=geometry).set_crs(
    crs="EPSG:3857", allow_override=True
)  # adding geometry column

# NOTE(review): the overlay compares coordinates directly, so this assumes
# target_counties is also in EPSG:3857 -- confirm against the shapefile's CRS.
event_stns_local = gpd.overlay(
    event_stns, target_counties, how="intersection"
)  # subsetting for stations within county boundaries

# subset further based on number, if needed
# NOTE(review): sample() is unseeded, so re-running this cell selects a
# different set of 20 stations each time.
if len(event_stns_local) > 20:
    event_stns_local = event_stns_local.sample(20)
print(len(event_stns_local))
event_stns_local.head(3)

In [None]:
# Pick out the single station with era-id "CIMIS_75" and compute its position
# both as lon/lat and as EPSG:3857 x/y (both pairs are plotted in the map cell).
# .values[0] raises IndexError if the station is not present in event_stns.
CIMIS_75 = event_stns[event_stns["era-id"] == "CIMIS_75"]
lon, lat = CIMIS_75.longitude.values[0], CIMIS_75.latitude.values[0]
x, y = latlon_to_mercator_cartopy(lat, lon)

In [None]:
# Select the row for each target county from the county table ...
orange = ca_county.query("NAME=='Orange'")
LA = ca_county.query("NAME=='Los Angeles'")

# ... and take the geometry of the first matching row for each of them.
orange_geom, LA_geom = orange["geometry"].iloc[0], LA["geometry"].iloc[0]

In [None]:
# Map of the two target counties with the CIMIS_75 station marked.
# The axes use the EPSG:3857 projection, the CRS the stations were converted to.
fig, ax = plt.subplots(subplot_kw={"projection": ccrs.epsg(3857)})

ax.coastlines()
ax.add_feature(cf.BORDERS)
# County polygons are passed as EPSG:3857 coordinates.
# NOTE(review): this assumes the county shapefile is in EPSG:3857 -- confirm.
ax.add_geometries(LA_geom, crs=ccrs.epsg(3857), color="C0", alpha=0.25)
ax.add_geometries(orange_geom, crs=ccrs.epsg(3857), color="C1", alpha=0.25)
ax.add_feature(cf.STATES, lw=0.5)
# Extent is given as [lon_min, lon_max, lat_min, lat_max].
ax.set_extent([-118.2, -117.4, 33.3, 34])
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
# The station is drawn twice: an open black circle from lon/lat (transformed
# from PlateCarree) and a red dot from the pre-projected x/y. Presumably a
# visual check that the two agree.
ax.plot(lon, lat, "ok", markersize=8, transform=ccrs.PlateCarree(), mfc="none")
ax.plot(x, y, ".r", markersize=4)
gl = ax.gridlines(
    crs=ccrs.PlateCarree(), draw_labels=["bottom", "left"], ls=":", lw=0.5
)
# Trailing semicolon suppresses the notebook's echo of the return value.
ax.set_title("");

### Step 2: Holistic / qualitative station evaluation
* Stations are currently downloaded manually into `train_files`; ideally they would be read directly from AWS, if there is a practical way to read netCDF files from S3.

In [None]:
# phase 2 look at full timeseries for flags -- grabbing clean version, not qaqc version to build out
# alternatively.... some of these files may be very large and we should avoid reading in all because of memory concerns

# for stn in event_stns:
#     want to pull out all flags noted, frequency of flags of time record

In [None]:
# Read a single station file into an xarray Dataset.
# NOTE(review): this is a hard-coded, user-specific local path; it must be
# changed to run the notebook on another machine.
stn = xr.open_dataset("/Users/hector/Downloads/CIMIS_75.nc")
stn

In [None]:
def id_all_flags(ds):
    """
    Print all unique values of every eraqc flag variable in a dataset.

    ds: mapping-like dataset (e.g. an xarray Dataset). Its keys are variable
        names, and ds[name].data is passed to np.unique.

    For each variable whose name contains "_eraqc", prints the variable name
    followed by its unique values. If there is no such variable, prints a
    warning instead. Returns None.
    """
    # Inspect the dataset that was passed in, not a notebook-level global.
    qc_vars = [name for name in ds.keys() if "_eraqc" in name]
    if not qc_vars:
        print(
            "Station has no eraqc variables -- please double check that this station has completed QA/QC!"
        )
        return

    for var in qc_vars:
        print(var, np.unique(ds[var].data))

In [None]:
id_all_flags(stn)

In [None]:
# look at full timeseries for holistic view
stn.sfcWind.plot()

In [None]:
# Look at the timeseries of every occurrence of the event's calendar month
# (e.g., all Februaries) to get a sense of the climatological signal.
month = [2]
# Boolean mask over the time axis: True where the timestamp's month is in `month`.
stn_monthly_clim = stn.isel(time=stn.time.dt.month.isin(month))

stn_monthly_clim.sfcWind.plot()

# Not very informative as plotted; a climatology of the month might be a more
# useful comparison against the event.

In [None]:
# Look at the timeseries during the event for flags.
# Decision: should a few days before/after the event be included to evaluate
# the "event anomaly"? This may be useful.
# The Santa Ana event ran 2/16/1988 to 2/19/1988; the window below adds one
# week on each side.
event_start_date = "1988-02-09"
event_end_date = "1988-02-26"

# Subset the station record to the padded event window.
event = stn.sel(time=slice(event_start_date, event_end_date))

In [None]:
# just grabbing a single var for the event itself
event.sfcWind.plot()

In [None]:
event.sfcWind_dir.plot()

#### Append local GHCNh library path

In [None]:
# Make the local GHCNh helper library importable by adding its folder to
# sys.path.
# NOTE(review): this is a hard-coded, user-specific absolute path.
ghcnh_lib_path = "/Users/hector/ERA_work/historical-obs-platform/test_platform/scripts/3_qaqc_data/qaqc_eval_notebooks/GHCNh"
sys.path.append(ghcnh_lib_path)

In [None]:
# from GHCNh.GHCNh_lib import GHCNh  # If GHCNh is within the current folder
from GHCNh_lib import GHCNh  # If the GHCNh folder was appended to sys.path

In [None]:
%%time
# Build the GHCNh helper, restrict it to WECC stations, then load the data for
# the first station in its station table.
# NOTE(review): what each GHCNh method does is defined in GHCNh_lib, not here;
# the descriptions in this cell are inferred from the method names.
ghcnh = GHCNh(stations_local=True)
ghcnh.select_wecc()
# NOTE(review): `id` shadows the Python builtin. Later cells read this variable,
# so any rename has to be applied to them as well.
id = ghcnh.stations_df["id"].iloc[0]
ghcnh.read_data_from_url(id, save=True)
ghcnh.convert_df_to_gpd()
ghcnh.station_data.head(3)

In [None]:
# Summarise the station position as the mean of its Longitude/Latitude columns.
# This overwrites the lon/lat set earlier for the CIMIS_75 station.
lon = ghcnh.station_data.Longitude.mean()
lat = ghcnh.station_data.Latitude.mean()
print("{}, {:.5f}, {:.5f}".format(id, lon, lat))

In [None]:
# Plot temperature and dew point temperature against time on shared axes,
# titled with the station id and its mean lon/lat.
fig, ax = plt.subplots(figsize=(9, 3))

ghcnh.station_data.plot(ax=ax, x="time", y="temperature")
ghcnh.station_data.plot(ax=ax, x="time", y="dew_point_temperature")
# Trailing semicolon suppresses the notebook's echo of the return value.
ax.set_title("{}  ({:.3f}, {:.3f})".format(id, lon, lat));

In [None]:
# initial test for identifying the event: large jumps in wind speed

In [None]:
# def return_ghcn_vars(ghcn_df, input_var):
#     '''
#     Given an input variable, return GHCNh location variables and all relevant data variables,
#     rather than utilizing the whole 240 cols, or having to know how ghcnh labels the cols.

#     input_var must follow ERA naming scheme (tas, tdps, ps, pr, etc.)
#     '''
#     ghcnh_vars = pd.read_csv('ghcnh_data_headers.csv')

#     # include station-ID, time, loc, elevation (cols 1-10)
#     stn_info_cols = ['Station_ID', 'Station_name',
#                      'Year','Month','Day','Hour','Minute',
#                      'Latitude','Longitude','Elevation']

#     var_cols = []
#     if input_var == 'tas':
#         varquery = 'temperature'

#     elif input_var == 'tdps' or 'tdps_derived':
#         varquery = 'dew_point_temperature'

#     elif input_var == 'ps' or 'psl':
#         varquery = 'station_level_pressure'

#     elif input_var == 'sfcWind_dir':
#         varquery = 'wind_direction'

#     elif input_var == 'sfcWind':
#         varquery = ['wind_speed', 'wind_gust']

#     elif input_var == 'hurs':
#         varquery = 'relative_humidity'

#     elif input_var == 'rsds':
#         print('GHCNh data does not have solar radiation data to evaluate against.')
#         varquery = ''

#     elif input_var == 'pr' or input_var == 'pr_1h' or input_var == 'pr_5min':
#         varquery = 'precipitation'

#     i = ghcn_df.query(

#     var_cols = [i for i in ghcnh_vars if varquery in i]
#     cols_to_return = stn_info_cols + var_cols
#     return ghcn_df[[cols_to_return]]

In [None]:
def return_ghcn_vars(ghcn_df, input_var):
    """
    Given an input variable, return the GHCNh columns that belong to it, rather
    than utilizing the whole 240 cols, or having to know how ghcnh labels the cols.

    input_var must follow ERA naming scheme (tas, tdps, ps, pr, etc.)

    ghcn_df: DataFrame of GHCNh station data.

    Returns the slice of ghcn_df that starts at the GHCNh column matching
    input_var and spans six consecutive columns. Station-info columns
    (Station_ID, time, location, elevation) are NOT included.

    Raises KeyError if input_var is not a recognised ERA variable, has no GHCNh
    counterpart (rsds), or its GHCNh column is missing from ghcn_df.
    """
    # ERA variable name -> name of the first GHCNh column for that variable.
    era_to_ghcnh = {
        "tas": "temperature",
        "tdps": "dew_point_temperature",
        "tdps_derived": "dew_point_temperature",
        "ps": "station_level_pressure",
        "psl": "station_level_pressure",
        "sfcWind_dir": "wind_direction",
        "sfcWind": "wind_speed",
        "hurs": "relative_humidity",
        "pr": "precipitation",
        "pr_1h": "precipitation",
        "pr_5min": "precipitation",
    }
    # Width of the column window returned per variable.
    # NOTE(review): presumably the value column plus its companion
    # code/metadata columns -- confirm against the GHCNh column layout.
    cols_per_var = 6

    if input_var == "rsds":
        raise KeyError(
            "GHCNh data does not have solar radiation data to evaluate against."
        )
    if input_var not in era_to_ghcnh:
        raise KeyError(
            "Unrecognised ERA variable {!r}; expected one of {}".format(
                input_var, sorted(era_to_ghcnh)
            )
        )

    start = ghcn_df.columns.get_loc(era_to_ghcnh[input_var])
    return ghcn_df.iloc[:, start : start + cols_per_var]

In [None]:
return_ghcn_vars(ghcnh.station_data, "tas").head(3)