## Setup

In [1]:
"""
This script performs qa/qc protocols for cleaned station data for ingestion into the Historical Observations Platform, and is
independent of network.
Approach:
(1) Remove duplicate stations
(2) Handle variables that report at different intervals and/or change frequency over time (convert to hourly?)
(3) QA/QC testing, including consistency checks, gaps, checks against climatological distributions, and cross variable checks.
(4) Case study analysis for accuracy -- SHOULD THIS BE A SEPARATE SCRIPT/PROCESS?

Inputs: Cleaned data for an individual network
Outputs: QA/QC-processed data for an individual network, priority variables, all times. Organized by station as .nc file.
"""

# Step 0: Environment set-up
# Import libraries
import datetime
import logging
import os
import tempfile
import time
import warnings
from io import StringIO

import boto3
import numpy as np
import pandas as pd
import s3fs
import xarray as xr
from mpi4py import MPI
from simplempi import simpleMPI

# Import all qaqc script functions
try:
    from qaqc_plot import *
    from qaqc_utils import *
    from qaqc_wholestation import *
    from qaqc_logic_checks import *
    from qaqc_buoy_check import *
    from qaqc_frequent import *
    from qaqc_unusual_gaps import *
    from qaqc_unusual_large_jumps import *
    from qaqc_climatological_outlier import *
    from qaqc_unusual_streaks import *
    from qaqc_deaccumulate import *
except Exception as e:
    print("Error importing qaqc script: {}".format(e))

from log_config import setup_logger

# ----------------------------------------------------------------------------
## Create AWS S3 handles (no credentials are passed explicitly here)
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")  # for lower-level processes

## Set relative paths to other folders and objects in repository.
bucket_name = "wecc-historical-wx"

# Define temporary directory in local drive for downloading data from S3 bucket
# If we used zarr, this wouldn't be necessary
temp_dir = "./tmp"
# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race of os.path.exists() + os.mkdir() when several
# processes start at the same time.
os.makedirs(temp_dir, exist_ok=True)

## Global variables and functions

In [10]:
# ----------------------------------------------------------------------------
def setup_error_handling():
    """Sets up the error-tracking container and the run timestamps.

    Returns
    -------
    errors : dict
        dictionary with empty "File", "Time", and "Error" lists
    end_api : str
        local time at call, formatted as "%Y%m%d%H%M" (used for the error handling csv)
    timestamp : str
        UTC time at call, formatted as "%m-%d-%Y, %H:%M:%S"
    """
    errors = {"File": [], "Time": [], "Error": []}  # Set up error handling

    # Set end time to be current time at beginning of download: for error handling csv
    end_api = datetime.datetime.now().strftime("%Y%m%d%H%M")

    # Timezone-aware replacement for the deprecated datetime.utcnow();
    # the formatted string is unchanged.
    timestamp = datetime.datetime.now(datetime.timezone.utc).strftime(
        "%m-%d-%Y, %H:%M:%S"
    )
    return errors, end_api, timestamp

In [11]:
# ----------------------------------------------------------------------------
def print_qaqc_failed(
    errors, station=None, end_api=None, message=None, test=None, verbose=False
):
    """QAQC failure messaging

    Logs that a station is being skipped and records the failure in `errors`.

    Parameters
    ----------
    errors : dict
        dictionary with "File", "Time", and "Error" lists; appended to in place
    station : str, optional
        station name
    end_api : str, optional
        time at beginning of data download
    message : str, optional
        error message (written to the log only)
    test : str, optional
        QAQC test name to include in error message
    verbose : bool, optional
        not used by this function

    Returns
    -------
    errors : dict
        the same dictionary that was passed in, with one entry appended to each list
    """
    # Use the module-level `logger` when the caller has set one up; otherwise
    # fall back to a standard logger so that reporting a failure does not
    # itself raise NameError (e.g. in a notebook where no logger was created).
    active_logger = globals().get("logger")
    if active_logger is None:
        active_logger = logging.getLogger(__name__)

    active_logger.info(
        "{0} {1}, skipping station".format(station, message),
    )
    errors["File"].append(station)
    errors["Time"].append(end_api)
    errors["Error"].append("Failure on {}".format(test))
    return errors

In [22]:
def read_nc_from_s3_clean(network_name, station_id, temp_dir):
    """Read netcdf file containing station data for a single station of interest from AWS s3 bucket

    Parameters
    ----------
    network_name: str
        Name of network (i.e. "ASOSAWOS")
        Must correspond with a valid directory in the s3 bucket (i.e. "CAHYDRO", "CDEC", "ASOSAWOS")
    station_id: str
        Station identifier; i.e. the name of the netcdf file in the bucket
        without the ".nc" extension, which is appended here (i.e. "ASOSAWOS_72012200114")
    temp_dir: str
        Existing local directory in which the temporary download file is created

    Returns
    -------
    station_data: xr.Dataset
        Dataset with its data loaded into memory

    Notes
    -----
    The data is first downloaded from AWS into a tempfile, which is then deleted after xarray reads in the file
    I'd like to see us use a zarr workflow if possible to avoid this.

    """

    # Get URL to netcdf in S3
    s3_url = "s3://wecc-historical-wx/2_clean_wx/{}/{}.nc".format(
        network_name, station_id
    )

    # Create s3 file system
    fs = s3fs.S3FileSystem(anon=False)

    # Both context managers guarantee cleanup even when the download or the
    # read fails: the temporary file is closed (and, with delete=True, removed)
    # and the xarray file handle is released.
    with tempfile.NamedTemporaryFile(
        dir=temp_dir, prefix="", suffix=".nc", delete=True
    ) as temp_file:
        fs.get(s3_url, temp_file.name)

        # load() pulls the data into memory so the dataset remains usable
        # after the underlying file has been closed and deleted
        with xr.open_dataset(temp_file.name, engine="h5netcdf") as opened:
            station_data = opened.load()

    return station_data

In [23]:
def qaqc_ds_to_df_other(ds, verbose=False):
    """Converts xarray ds for a station to pandas df in the format needed for the pipeline

    Parameters
    ----------
    ds : xr.Dataset
        input data from the clean step; object-dtype variables that hold str
        values are cast to str on this dataset in place
    verbose : bool, optional
        not used by this function

    Returns
    -------
    df : pd.DataFrame
        converted xr.Dataset with `time` and `station` as columns, a
        `<var>_eraqc` column for each variable that gets a QA/QC flag,
        instrument-height columns, and hour/day/month/year/date columns

    Notes
    -----
    This is the notebook friendly version (no logger statements).
    Only the dataframe is returned.
    """
    # Cast object-dtype variables holding python strings to a str dtype.
    # Iterate over a snapshot because `ds` is assigned to inside the loop.
    for key, val in list(ds.variables.items()):
        if val.dtype != object:
            continue
        values = ds[key].values if key == "station" else ds.isel(station=0)[key].values
        if any(type(v) is str for v in values):
            ds[key] = ds[key].astype(str)

    exclude_qaqc = [
        "time",
        "station",
        "lat",
        "lon",
        "qaqc_process",
        "sfcWind_method",
        "pr_duration",
        "pr_depth",
        "PREC_flag",
        "rsds_duration",
        "rsds_flag",
        "anemometer_height_m",
        "thermometer_height_m",
    ]  # lat, lon have different qc check

    ## Add qc_flag variable for all variables, including elevation;
    ## defaulting to nan for fill value that will be replaced with qc flag

    # raw (network-provided) qc variables vary station to station and never get an ERA flag
    raw_qc_vars = [var for var in ds.data_vars if "q_code" in var or "_qc" in var]
    # ERA qc variables that are already present in the input
    old_era_qc_vars = [var for var in ds.data_vars if "_eraqc" in var]
    era_qc_vars = list(old_era_qc_vars)

    print(f"era_qc existing variables:\n{era_qc_vars}")

    for var in list(ds.data_vars):
        if var in exclude_qaqc or var in raw_qc_vars or "_eraqc" in var:
            continue
        qc_var = var + "_eraqc"  # variable/column label

        # if qaqc var does not exist, adds new variable in shape of original variable with designated nan fill value
        if qc_var not in era_qc_vars:
            print(f"nans created for {qc_var}")
            ds = ds.assign({qc_var: xr.ones_like(ds[var]) * np.nan})
            era_qc_vars.append(qc_var)

    print("{} created era_qc variables".format(len(era_qc_vars) - len(old_era_qc_vars)))
    if len(era_qc_vars) != len(old_era_qc_vars):
        # new minus old, i.e. the variables created above
        print("{}".format(np.setdiff1d(era_qc_vars, old_era_qc_vars)))

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        df = ds.to_dataframe()

    # instrumentation heights: taken from the dataset when not already a column,
    # otherwise filled with NaN
    for height_var in ("anemometer_height_m", "thermometer_height_m"):
        if height_var in df.columns:
            continue
        try:
            df[height_var] = np.ones(ds["time"].shape) * getattr(ds, height_var)
        except Exception:
            print("Filling {} with NaN.".format(height_var), flush=True)
            df[height_var] = np.ones(len(df)) * np.nan

    # De-duplicate time axis
    df = df[~df.index.duplicated()].sort_index()

    # Keep the station label (index level 0) as a regular column
    df["station"] = df.index.get_level_values(0)

    # Convert time/station index to columns and reset index
    df = df.droplevel(0).reset_index()

    # Add time variables needed by multiple functions
    time_values = pd.to_datetime(df["time"])
    df["hour"] = time_values.dt.hour
    df["day"] = time_values.dt.day
    df["month"] = time_values.dt.month
    df["year"] = time_values.dt.year
    df["date"] = time_values.dt.date

    return df

In [13]:
# --------------------------------------------------------------------------------
def qaqc_ds_to_df(ds, verbose=False):
    """Converts xarray ds for a station to pandas df in the format needed for the pipeline

    Parameters
    ----------
    ds : xr.Dataset
        input data from the clean step; object-dtype variables that hold str
        values are cast to str on this dataset in place
    verbose : bool, optional
        not used by this function

    Returns
    -------
    df : pd.DataFrame
        converted xr.Dataset into dataframe, with `time` and `station` as columns
    MultiIndex : pd.Index
        multi-index of station and time (after de-duplication and sorting)
    attrs : dict
        global attributes from xr.Dataset
    var_attrs : dict
        variable attributes from xr.Dataset, keyed by data variable name
    era_qc_vars : list of str
        names of the `<var>_eraqc` QAQC variables created here
    """
    # Use the module-level `logger` when one has been set up; otherwise fall
    # back to a standard logger so the conversion also runs where no pipeline
    # logger exists (e.g. in a notebook).
    log = globals().get("logger")
    if log is None:
        log = logging.getLogger(__name__)

    # Cast object-dtype variables holding python strings to a str dtype.
    # Iterate over a snapshot because `ds` is assigned to inside the loop.
    for key, val in list(ds.variables.items()):
        if val.dtype != object:
            continue
        values = ds[key].values if key == "station" else ds.isel(station=0)[key].values
        if any(type(v) is str for v in values):
            ds[key] = ds[key].astype(str)

    exclude_qaqc = [
        "time",
        "station",
        "lat",
        "lon",
        "qaqc_process",
        "sfcWind_method",
        "pr_duration",
        "pr_depth",
        "PREC_flag",
        "rsds_duration",
        "rsds_flag",
        "anemometer_height_m",
        "thermometer_height_m",
    ]  # lat, lon have different qc check

    ## Add qc_flag variable for all variables, including elevation;
    ## defaulting to nan for fill value that will be replaced with qc flag

    # raw (network-provided) qc variables vary station to station and never get an ERA flag
    raw_qc_vars = [var for var in ds.data_vars if "q_code" in var or "_qc" in var]

    # Report what is actually present in the input.
    # NOTE(review): an existing `<var>_eraqc` is re-created as NaN below whenever
    # `<var>` itself is processed -- confirm that resetting is intended.
    existing_era_qc_vars = [var for var in ds.data_vars if "_eraqc" in var]
    log.info("Existing era_qc variables: {}".format(existing_era_qc_vars))

    era_qc_vars = []  # our ERA qc variables

    # only in-fill nans for valid variables
    for var in list(ds.data_vars):
        if var in exclude_qaqc or var in raw_qc_vars or "_eraqc" in var:
            continue
        qc_var = var + "_eraqc"  # variable/column label

        # adds new variable in shape of original variable with designated nan fill value
        if qc_var not in era_qc_vars:
            ds = ds.assign({qc_var: xr.ones_like(ds[var]) * np.nan})
            era_qc_vars.append(qc_var)
            log.info(
                "nans created for {}".format(qc_var),
            )

    n_qc = len(era_qc_vars)  # determine length of eraqc variables per station
    log.info("Created {0} era_qc variables: {1}".format(n_qc, era_qc_vars))

    # Save attributes to inherit them to the QAQC'ed file
    attrs = ds.attrs
    var_attrs = {var: ds[var].attrs for var in list(ds.data_vars.keys())}

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        df = ds.to_dataframe()

    # instrumentation heights: taken from the dataset when not already a column,
    # otherwise filled with NaN
    for height_var in ("anemometer_height_m", "thermometer_height_m"):
        if height_var in df.columns:
            continue
        try:
            df[height_var] = np.ones(ds["time"].shape) * getattr(ds, height_var)
        except Exception:
            log.info("Filling {} with NaN.".format(height_var))
            df[height_var] = np.ones(len(df)) * np.nan

    # De-duplicate time axis
    df = df[~df.index.duplicated()].sort_index()

    # Save station/time multiindex
    MultiIndex = df.index

    # Keep the station label (index level 0) as a regular column
    df["station"] = df.index.get_level_values(0)

    # Convert time/station index to columns and reset index
    df = df.droplevel(0).reset_index()

    # Add time variables needed by multiple functions
    time_values = pd.to_datetime(df["time"])
    df["hour"] = time_values.dt.hour
    df["day"] = time_values.dt.day
    df["month"] = time_values.dt.month
    df["year"] = time_values.dt.year
    df["date"] = time_values.dt.date

    return df, MultiIndex, attrs, var_attrs, era_qc_vars

## Load test datasets

In [None]:
# Load test dataset
ds_bhcc1 = read_nc_from_s3_clean("CNRFC", "CNRFC_BHCC1", temp_dir)

# convert to formatted pandas dataframe
# qaqc_ds_to_df returns (df, MultiIndex, attrs, var_attrs, era_qc_vars); keep
# only the dataframe so that later cells can treat df_bhcc1 as a DataFrame
# (e.g. call .copy() on it).
df_bhcc1 = qaqc_ds_to_df(ds_bhcc1, verbose=False)[0]  # TODO: using ds to df in script

In [None]:
# Fetch the cleaned SAPC1 station file of the CNRFC network from S3
ds_sapc1 = read_nc_from_s3_clean(
    network_name="CNRFC", station_id="CNRFC_SAPC1", temp_dir=temp_dir
)

# Build the pipeline-formatted dataframe with the notebook-friendly converter,
# which returns the dataframe only
df_sapc1 = qaqc_ds_to_df_other(
    ds=ds_sapc1, verbose=False
)  # TODO: using ds to df that I have modified

In [53]:
# Use the BHCC1 conversion result as the working dataset for the QA/QC calls below
df = df_bhcc1

In [44]:
# Runtime switches forwarded to the QA/QC functions called in the next cell
verbose=False  # passed as `verbose=` to the checks that accept it
local=False  # passed as `local=` to qaqc_deaccumulate_precip

In [None]:
# Copy of the working frame, defined up front (needed before qaqc_pipeline, in case)
stn_to_qaqc = df.copy()

# Checks that take `verbose`, in execution order.
# nan infilling (qaqc_elev_infill) must be before the range check (qaqc_elev_range).
# Every step is handed stn_to_qaqc itself; new_df is rebound to each return value in turn.
verbose_steps = (
    qaqc_missing_vals,
    qaqc_missing_latlon,
    qaqc_within_wecc,
    qaqc_elev_infill,
    qaqc_elev_range,
    qaqc_pressure_units_fix,
)
for qaqc_step in verbose_steps:
    new_df = qaqc_step(stn_to_qaqc, verbose=verbose)

# Precipitation de-accumulation takes `local` instead of `verbose`
new_df = qaqc_deaccumulate_precip(stn_to_qaqc, local=local)

In [154]:
def qaqc_world_record(df, verbose=False):
    """
    Flags observations that fall outside the record limits defined for North America.
    Values below the minimum or above the maximum for a variable are flagged
    in that variable's `<var>_eraqc` column; `df` is modified in place.

    Parameters
    ----------
    df : pd.DataFrame
        station dataset converted to dataframe through QAQC pipeline
    verbose : bool, optional
        not used by this function

    Returns
    -------
    If QAQC is successful, returns the input dataframe with flagged values (see below for flag meaning)
    If QAQC fails, prints the exception and returns None

    Notes
    ------
    Flag meaning : 11,qaqc_world_record,Value outside of world record range

    References
    ----------
    [1] World records from HadISD protocol, cross-checked with WMO database
    [2] https://wmo.asu.edu/content/world-meteorological-organization-global-weather-climate-extremes-archive
    [3] Solar radiation specific: Rupp et al. 2022, Slater 2016
    [4] https://www.ncei.noaa.gov/access/monitoring/scec/records
    [5] https://www.weather.gov/media/owp/oh/hdsc/docs/TP2.pdf
    """

    print("Running: qaqc_world_record")

    try:
        # (minimum, maximum) per variable, listed in the order they are checked
        record_limits = {
            "tas": (210.15, 329.92),  # temperature, K
            "tdps": (173.15, 329.85),  # dewpoint temperature, K
            "tdps_derived": (173.15, 329.85),  # dewpoint temperature, K
            "sfcWind": (0.0, 113.2),  # wind speed, m/s
            "sfcWind_dir": (0, 360),  # wind direction, degrees (non-record variable)
            # non-sea level pressure, Pa, reduced min based on max elevation (6190 m)
            "ps": (45960, 108330),
            "psl": (87000, 108330),  # sea level pressure, Pa
            "ps_altimeter": (45960, 108330),  # same limits as ps
            "ps_derived": (45960, 108330),  # same limits as ps
            "rsds": (-5, 1500),  # solar radiation, W/m2
            "pr": (0, 656),  # precipitation, mm, max is 24-hr rainfall
            "pr_5min": (0, 31.8),  # precipitation, mm, 5-min rainfall, WECC-wide
            # precipitation, mm, 15-min rainfall, specific to VALLEYWATER
            "pr_15min": (0, 25.4),
            "pr_1h": (0, 656),  # precipitation, mm, max is 24-hr rainfall
            "pr_24h": (0, 656),  # precipitation, mm, 24-hr rainfall
            "pr_localmid": (0, 656),  # precipitation, mm, max is 24-hr rainfall
            # accumulated precipitation, mm, max arbitrarily set to a high value
            "accum_pr": (0, 10000),
            "hurs": (0, 100),  # humidity (non-record variable)
            "elevation": (-100, 6210.0),  # elevation, m
        }

        for var, (lower, upper) in record_limits.items():
            if var not in df.columns:
                continue

            valid_obs = grab_valid_obs(df, var)  # subset for valid obs
            values = valid_obs[var]
            off_record = (values < lower) | (values > upper)
            if not off_record.any():
                continue

            # index labels of the offending observations
            flagged_index = off_record[off_record].index
            # flag 11: see era_qaqc_flag_meanings.csv
            df.loc[df.index.isin(flagged_index), var + "_eraqc"] = 11
            print(
                "Flagging {} observations exceeding world/regional records: {}".format(
                    len(flagged_index), var
                )
            )

        return df
    except Exception as e:
        print(
            "qaqc_world_record failed with Exception: {}".format(e),
        )
        return None

In [157]:
# Work on a copy of the SAPC1 frame (note: this rebinds `df`, previously set to df_bhcc1)
df = df_sapc1.copy()

In [158]:
# Run the world-record range check; flags are written into `df` itself and the
# same frame is returned (None if the check fails)
test_df = qaqc_world_record(df)

Running: qaqc_world_record
Flagging 619 observations exceeding world/regional records: tas
Flagging 1307 observations exceeding world/regional records: tdps_derived
Flagging 11 observations exceeding world/regional records: sfcWind_dir
Flagging 491 observations exceeding world/regional records: pr


In [134]:
# tas range: 210.15 - 329.92
# Inspect the air temperature rows that received flag 11 (outside the record range)

check = test_df[test_df["tas_eraqc"]==11]
check[["tas", "tas_eraqc"]]

Unnamed: 0,tas,tas_eraqc
3794,349.26,11.0
37546,349.26,11.0
43930,332.59,11.0
47450,351.48,11.0
48940,344.26,11.0
...,...,...
338738,522.04,11.0
338739,338.15,11.0
338741,1368.15,11.0
338743,472.04,11.0


In [140]:
# Smallest flagged tas value; compare against the 329.92 upper limit
check["tas"].min()

331.47999999999996

In [137]:
# View tas next to its flag column in the checked frame
test_df[["tas", "tas_eraqc"]]

Unnamed: 0,tas,tas_eraqc
0,297.040,
1,297.590,
2,298.150,
3,299.820,
4,300.370,
...,...,...
559082,310.150,
559083,310.706,
559084,,
559085,,


In [135]:
# Same view on the source frame, for comparison: the check above ran on df_sapc1.copy()
df_sapc1[['tas','tas_eraqc']]

Unnamed: 0,tas,tas_eraqc
0,297.040,
1,297.590,
2,298.150,
3,299.820,
4,300.370,
...,...,...
559082,310.150,
559083,310.706,
559084,,
559085,,
