# QAQC flag counts 

## Environment set-up

In [1]:
import boto3
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO
from functools import reduce

import inspect

import logging
# Configure root logging at INFO level so messages print to the console,
# and keep a handle to the root logger for the functions defined below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

# Render figures at 300 dpi.
plt.rcParams["figure.dpi"] = 300

In [2]:
# Create boto3 S3 handles. No credentials are set here: boto3 resolves them
# itself (the log output of this cell shows the shared credentials file being used).
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")  # for lower-level processes

# Name of the S3 bucket that the station data is read from and results are written to.
bucket_name = "wecc-historical-wx"

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [3]:
def merge_ds_to_df(ds):
    """Converts xarray ds for a station to pandas df in the format needed for processing.

    Parameters
    ----------
    ds: xr.Dataset
        Data object with information about each network and station

    Returns
    -------
    df: pd.DataFrame
        Table object with information about each network and station. Duplicated
        index entries are dropped, rows are sorted by index, and the index levels
        are turned into ordinary columns ("station" plus the remaining level(s)).
    MultiIndex: pd.Index
        Index of the de-duplicated, sorted frame (station and time), to be used on
        conversion back to ds
    attrs: dict
        Dataset attributes, saved so the final merged file can inherit them
    var_attrs: dict
        Per-variable attributes, saved so the final merged file can inherit them
    """

    # Save attributes so the final merged file can inherit them
    attrs = ds.attrs
    var_attrs = {var: ds[var].attrs for var in ds.data_vars}

    df = ds.to_dataframe()

    # Save instrumentation heights. When a height is not already a column, try to
    # broadcast the value stored on the dataset along the time axis; if that is
    # not possible (e.g. the dataset does not carry the value), fill with NaN.
    for height_var in ("anemometer_height_m", "thermometer_height_m"):
        if height_var in df.columns:
            continue
        try:
            df[height_var] = np.ones(ds["time"].shape) * getattr(ds, height_var)
        except Exception:
            # Deliberate best-effort: a missing or unusable height must not stop
            # processing. `Exception` (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            logger.info(f"Filling {height_var} with NaN.")
            df[height_var] = np.full(len(df), np.nan)

    # De-duplicate time axis
    df = df[~df.index.duplicated()].sort_index()

    # Save station/time multiindex
    MultiIndex = df.index

    # Keep the station identifier (index level 0) as a regular column
    df["station"] = df.index.get_level_values(0)

    # Convert time/station index to columns and reset index
    df = df.droplevel(0).reset_index()

    return df, MultiIndex, attrs, var_attrs

## Set the stage

### Load Data

In [158]:
# Station zarr store to analyse; the alternative station is kept commented out for quick switching.
# url = "s3://wecc-historical-wx/3_qaqc_wx/VALLEYWATER/VALLEYWATER_6001.zarr"
url = "s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_72493023230.zarr"
ds = xr.open_zarr(url)

In [159]:
# Flatten the station dataset into a DataFrame; also keep the index and attributes it returns.
df, MultiIndex, attrs, var_attrs = merge_ds_to_df(ds)

### Perform hourly standardization

In [6]:
def my_func(x):
    """Collapse a Series of flag strings into one comma-separated string.

    Parameters
    ----------
    x : pd.Series
        String values observed within one resampling bin.

    Returns
    -------
    str
        The distinct values of ``x`` joined with "," in order of first
        appearance, or the string "nan" when ``x`` is empty.
    """
    return ",".join(x.unique()) if len(x) else "nan"

In [7]:
def merge_hourly_standardization(
    df: pd.DataFrame, var_attrs: dict, logger: logging.Logger
) -> tuple[pd.DataFrame, dict]:
    """Resamples meteorological variables to hourly timestep according to standard conventions.

    Parameters
    -----------
    df : pd.DataFrame
        station dataset converted to dataframe through QAQC pipeline; must contain a "time" column
    var_attrs : dict
        attributes for each variable, keyed by variable name. Updated in place and returned.
    logger : logging.Logger
        Logger instance for recording messages during processing.

    Returns
    -------
    df : pd.DataFrame
        dataframe with all columns resampled to one hour (column names retained), with
        "time" as a regular column
    var_attrs : dict
        variable attributes dictionary updated to note that sub-hourly variables are now hourly

    Raises
    ------
    Exception
        Any error raised during processing is logged and re-raised unchanged.

    Notes
    -----
    Rules:
    1. Top of the hour: take the first value in each hour. Standard convention for temperature, dewpoint, wind speed, direction, relative humidity, air pressure.
    2. Summation across the hour: sum observations within each hour. Standard convention for precipitation and solar radiation.
       An hour with no non-missing observations yields NaN rather than 0.
    3. Constant across the hour: take the first value in each hour. This applied to variables that do not change.
    4. QAQC/metadata columns: cast to string and reduced to the comma-joined unique values in each hour.

    Hours that fall inside a gap of the input still appear in the output (the resample
    produces a regular hourly axis); they hold missing values, and the string "nan" for
    QAQC columns.
    """

    func_name = inspect.currentframe().f_code.co_name
    logger.info(f"{func_name}: Starting...")

    # Variables that remain constant within each hour
    constant_vars = [
        "time",
        "station",
        "lat",
        "lon",
        "elevation",
        "anemometer_height_m",
        "thermometer_height_m",
    ]

    # Aggregation across hour variables, standard meteorological convention: precipitation and solar radiation
    sum_vars = [
        "time",
        "pr",
        "pr_localmid",
        "pr_24h",
        "pr_1h",
        "pr_15min",
        "pr_5min",
        "rsds",
    ]

    # Top of the hour variables, standard meteorological convention: temperature, dewpoint temperature, pressure, humidity, winds
    instant_vars = [
        "hurs_derived",
        "time",
        "tas",
        "tas_derived",
        "tdps",
        "tdps_derived",
        "ps",
        "psl",
        "ps_altimeter",
        "ps_derived",
        "hurs",
        "sfcWind",
        "sfcWind_dir",
    ]

    # Substrings identifying QAQC flag / metadata columns, which are aggregated as unique strings
    vars_to_remove = ["qc", "eraqc", "duration", "method", "flag", "depth", "process"]

    try:
        qaqc_vars = [
            var for var in df.columns if any(item in var for item in vars_to_remove)
        ]

        # Subset the dataframe according to rules
        constant_df = df[[col for col in constant_vars if col in df.columns]]

        # Flags are cast to str so they can be joined; "time" is re-attached for resampling
        qaqc_df = df[[col for col in qaqc_vars if col != "time"]].astype(str)
        qaqc_df.insert(0, "time", df["time"])

        sum_df = df[[col for col in sum_vars if col in df.columns]]

        instant_df = df[[col for col in instant_vars if col in df.columns]]

        # Performing hourly aggregation, only if subset contains more than one (ie more than the 'time') column
        # This is to account for input dataframes that do not contain ALL subsets of variables defined above - just a subset of them.
        result_list = []
        if len(constant_df.columns) > 1:
            result_list.append(constant_df.resample("1h", on="time").first())

        if len(instant_df.columns) > 1:
            result_list.append(instant_df.resample("1h", on="time").first())

        if len(sum_df.columns) > 1:
            # min_count=1 -> NaN (not 0) for hours without any valid observation
            result_list.append(sum_df.resample("1h", on="time").sum(min_count=1))

        if len(qaqc_df.columns) > 1:
            # adding unique flags
            result_list.append(qaqc_df.resample("1h", on="time").apply(my_func))

        # Aggregate and output reduced dataframe - this merges all dataframes defined
        # The resampled frames are indexed by "time"; reset index to return it to a column
        result = reduce(
            lambda left, right: pd.merge(left, right, on=["time"], how="outer"),
            result_list,
        )
        result = result.reset_index()  # Convert time index --> column

        # Update attributes for sub-hourly variables
        sub_hourly_vars = [
            col for col in df.columns if "min" in col and "qc" not in col
        ]
        for var in sub_hourly_vars:
            # setdefault: a sub-hourly column without an attribute entry gets one
            # instead of raising KeyError
            var_attrs.setdefault(var, {})["standardization"] = (
                f"{var} has been standardized to an hourly timestep, but will retain its original name"
            )
        logger.info(f"{func_name}: Completed successfully")

        return result, var_attrs

    except Exception:
        logger.error(f"{func_name}: Failed")
        raise

In [36]:
# subsetting data to speed things up
# NOTE(review): df_sub is not passed to the standardization call that follows, which uses the full df.
df_sub = df[['time','ps_eraqc','pr_eraqc']]

In [170]:
# Standardize to hourly; var_attrs is re-bound to the attribute dictionary the function returns.
df_st, var_attrs = merge_hourly_standardization(df, var_attrs, logger)

INFO:root:merge_hourly_standardization: Starting...
INFO:root:merge_hourly_standardization: Completed successfully


## Development

### Final Function

In [None]:
# include count of total number of observations

def eraqc_counts_hourly_timestep(
    df: pd.DataFrame, network: str, station: str
) -> pd.DataFrame:
    """
    Generates a dataframe of raw qaqc flag value counts for every variable,
    for the hourly timestep, after hourly standardization.
    Exports the dataframe as a csv to AWS.

    Parameters
    ----------
    df : pd.DataFrame
        station dataset converted to dataframe through QAQC pipeline
    network: str
        network name
    station: str
        station name

    Returns
    -------
    flag_counts : pd.DataFrame
        One column per variable (the "_eraqc" suffix removed) and one row per flag
        value. Rows "no_flag" (unflagged observations) and "non_nan_obs_count"
        (non-missing observations of the variable itself) are included.
    """
    # identify _eraqc variables
    eraqc_vars = [var for var in df.columns if "_eraqc" in var]

    # filter df for only qaqc columns
    # cast to str so that flags stored as numbers can be split too, and so that
    # missing flags become the string 'nan' (renamed to 'no_flag' below), which
    #   1. enables us to count total observations for the success report
    #   2. clarifies what the NaN value indicates
    df_qaqc = df[eraqc_vars].astype(str)

    # generate df of counts of each unique flag for each variable
    # (an entry may hold several comma-separated flags; each one is counted)
    # fill all NaN values with 0, since NaN = no observations counted
    # set all counts to integers, for readability
    flag_counts = (
        df_qaqc.apply(lambda x: x.str.split(",", expand=True).stack().value_counts())
        .fillna(0)
        .astype(int)
    )

    # rename columns ("_eraqc" is a literal suffix, no regex needed)
    flag_counts.columns = flag_counts.columns.str.replace("_eraqc", "", regex=False)

    # name the index (i.e. eraqc values)
    flag_counts = flag_counts.rename_axis("eraqc_flag_values")

    # replace 'nan' (a string) with 'no_flag', for clarity
    flag_counts = flag_counts.rename(index={"nan": "no_flag"})

    # add row with count of non_nan values per variable
    # reindex -> a flag column whose variable is absent from df counts as 0
    # instead of raising KeyError
    flag_vars = flag_counts.columns
    flag_counts.loc["non_nan_obs_count"] = df.reindex(columns=flag_vars).notna().sum()

    # send file to AWS
    csv_s3_filepath = f"s3://wecc-historical-wx/4_merge_wx/{network}/eraqc_counts/{station}_flag_counts_hourly_standardized.csv"
    flag_counts.to_csv(csv_s3_filepath, index=True)

    return flag_counts

### Sandbox

In [171]:
    # identify _eraqc variables
# Flag columns are the ones whose name carries the "_eraqc" suffix
eraqc_vars = [name for name in df_st.columns if "_eraqc" in name]
df_qaqc = df_st[eraqc_vars]

# Per variable: split the comma-joined flag strings and count every flag value.
# Missing counts become 0 (NaN = no observations counted), counts are shown as
# integers, the index is named, and the string 'nan' is relabelled 'no_flag'.
flag_counts = (
    df_qaqc.apply(
        lambda flags: flags.str.split(",", expand=True).stack().value_counts()
    )
    .fillna(0)
    .astype(int)
    .rename_axis("eraqc_flag_values")
    .rename(index={"nan": "no_flag"})
)

# Drop the "_eraqc" suffix so columns carry the plain variable names
flag_counts.columns = flag_counts.columns.str.replace("_eraqc", "", regex=True)

Add row with total obs per variable

In [None]:
# This row will look out of place in the table, but I'm including it to have the
# information I need for the success report.

# Variable names: the flag-count columns, i.e. the flag column names without "_eraqc".
flag_vars = flag_counts.columns

Unnamed: 0,elevation,pr,ps_altimeter,ps,psl,sfcWind_dir,sfcWind,tas,tdps
0,2.0,6.0,102100.0,102070.0,102090.0,,0.0,288.75,286.45
1,2.0,,102130.0,102080.0,102120.0,270.0,3.6,287.55,284.25
2,2.0,,102170.0,102120.0,102140.0,270.0,3.1,287.55,284.25
3,2.0,,102170.0,102130.0,102170.0,40.0,2.1,287.55,284.85
4,2.0,,102240.0,102200.0,102230.0,340.0,0.0,287.05,284.25
...,...,...,...,...,...,...,...,...,...
374011,27.0,0.0,101630.0,101300.0,101600.0,260.0,3.6,293.75,285.35
374012,27.0,0.0,101560.0,101230.0,101560.0,270.0,4.6,294.25,285.95
374013,27.0,0.0,101490.0,101170.0,101470.0,290.0,5.7,295.95,285.95
374014,27.0,0.0,101420.0,101100.0,101410.0,290.0,5.7,295.95,285.95


In [None]:
# Append a row holding, per variable, the number of non-missing observations in the hourly frame.
flag_counts.loc['non_nan_obs_count']= df_st[flag_vars].notna().sum()

In [195]:
flag_counts

Unnamed: 0_level_0,elevation,pr,ps_altimeter,ps,psl,sfcWind_dir,sfcWind,tas,tdps
eraqc_flag_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
21.0,0,0,0,7296,22079,0,0,0,0
23.0,0,0,24,3,23,0,0,10,200
26.0,0,0,0,0,0,0,0,94,332
27.0,0,0,0,0,0,0,16,0,14
28.0,0,0,0,0,0,0,0,0,25
no_flag,374016,374016,373994,366718,351914,374016,374000,373932,373508
test,338512,181934,329119,163315,172818,292987,333820,333437,332644


In [188]:
# Per variable, count the rows where the observation is missing (False) vs present (True).
test = df_st[flag_vars].notna().apply(pd.Series.value_counts)
test

Unnamed: 0,elevation,pr,ps_altimeter,ps,psl,sfcWind_dir,sfcWind,tas,tdps
False,35504,192082,44897,210701,201198,81029,40196,40579,41372
True,338512,181934,329119,163315,172818,292987,333820,333437,332644


In [175]:
len(df_st[flag_vars])

374016

Are any NaN rows flagged?

## Testing

In [120]:
# Network and station identifiers; the export function uses them to build the S3 output path.
network = "ASOSAWOS"
station = "ASOSAWOS_72493023230"

In [138]:
# Build the flag-count table (the function also writes it to S3 as a CSV) and display it.
flag_counts = eraqc_counts_hourly_timestep(df_st, network, station)
flag_counts

Unnamed: 0_level_0,elevation,pr,ps_altimeter,ps,psl,sfcWind_dir,sfcWind,tas,tdps
eraqc_flag_values,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
21.0,0,0,0,7296,22079,0,0,0,0
23.0,0,0,24,3,23,0,0,10,200
26.0,0,0,0,0,0,0,0,94,332
27.0,0,0,0,0,0,0,16,0,14
28.0,0,0,0,0,0,0,0,0,25
no_flag,374016,374016,373994,366718,351914,374016,374000,373932,373508


In [139]:
# Read the exported CSV back from S3 to check what was written.
key = f"4_merge_wx/{network}/eraqc_counts/{station}_flag_counts_hourly_standardized.csv"

list_import = s3_cl.get_object(
    Bucket=bucket_name,
    Key=key,
)

# The response body is a byte stream; wrap it in BytesIO so pandas can parse it.
flag_counts_table = pd.read_csv(BytesIO(list_import["Body"].read()))

In [140]:
flag_counts_table

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,ps,psl,sfcWind_dir,sfcWind,tas,tdps
0,21.0,0,0,0,7296,22079,0,0,0,0
1,23.0,0,0,24,3,23,0,0,10,200
2,26.0,0,0,0,0,0,0,0,94,332
3,27.0,0,0,0,0,0,0,16,0,14
4,28.0,0,0,0,0,0,0,0,0,25
5,no_flag,374016,374016,373994,366718,351914,374016,374000,373932,373508


# Revisiting flag counts at the original timestep

I need to add non-NaN observation counts for each variable.

In [None]:
def eraqc_counts_hourly_timestep(
    df: pd.DataFrame, network: str, station: str
) -> pd.DataFrame:
    """
    Generates a dataframe of raw qaqc flag value counts for every variable
    and exports the dataframe as a csv to AWS.

    Flag columns may hold numbers (as at the original timestep) or
    comma-joined strings (as after hourly standardization).

    Parameters
    ----------
    df : pd.DataFrame
        station dataset converted to dataframe through QAQC pipeline
    network: str
        network name
    station: str
        station name

    Returns
    -------
    flag_counts : pd.DataFrame
        One column per variable (the "_eraqc" suffix removed) and one row per flag
        value. Rows "no_flag" (unflagged observations) and "non_nan_obs_count"
        (non-missing observations of the variable itself) are included.
    """
    # NOTE(review): this definition reuses the name and the "hourly_standardized"
    # output file of the hourly version. If it is meant for the original timestep,
    # running it overwrites the hourly CSV -- confirm the intended name/path.

    # identify _eraqc variables
    eraqc_vars = [var for var in df.columns if "_eraqc" in var]

    # filter for only qaqc columns, keeping `df` itself intact: the observation
    # columns are still needed below for the non-NaN counts
    # cast to str so numeric flags can be split and missing flags become 'nan'
    df_qaqc = df[eraqc_vars].astype(str)

    # generate df of counts of each unique flag for each variable
    # fill all NaN values with 0, since NaN = no observations counted
    # set all counts to integers, for readability
    flag_counts = (
        df_qaqc.apply(lambda x: x.str.split(",", expand=True).stack().value_counts())
        .fillna(0)
        .astype(int)
    )

    # rename columns ("_eraqc" is a literal suffix, no regex needed)
    flag_counts.columns = flag_counts.columns.str.replace("_eraqc", "", regex=False)

    # name the index (i.e. eraqc values)
    flag_counts = flag_counts.rename_axis("eraqc_flag_values")

    # replace 'nan' (a string) with 'no_flag', for clarity
    flag_counts = flag_counts.rename(index={"nan": "no_flag"})

    # add row with count of non_nan values per variable
    # reindex -> a flag column whose variable is absent from df counts as 0
    flag_vars = flag_counts.columns
    flag_counts.loc["non_nan_obs_count"] = df.reindex(columns=flag_vars).notna().sum()

    # send file to AWS
    csv_s3_filepath = f"s3://wecc-historical-wx/4_merge_wx/{network}/eraqc_counts/{station}_flag_counts_hourly_standardized.csv"
    flag_counts.to_csv(csv_s3_filepath, index=True)

    return flag_counts

In [198]:
# Pre-standardization frame reduced to ps and its QAQC flag column.
df_filt = df[['time','ps','ps_eraqc']]

In [202]:
# Rows where the ps observation is missing.
na_vals = df_filt[df_filt['ps'].isna()]

In [203]:
# Distinct flag values found on those missing-observation rows.
na_vals['ps_eraqc'].unique()

array([nan, 21.])

Do we include NaN values in the total number of observations?
- NaN observations can still carry a flag: above, rows where `ps` is NaN have `ps_eraqc` values of both NaN and 21.

The hourly standardization process adds rows within time gaps.

In [208]:
# NOTE: these three lists repeat the ones defined inside merge_hourly_standardization
# so that the standardized output can be inspected by variable group in this notebook.
# They are separate copies and must be kept in sync by hand.

# Variables that remain constant within each hour
constant_vars = [
    "time",
    "station",
    "lat",
    "lon",
    "elevation",
    "anemometer_height_m",
    "thermometer_height_m",
]

# Aggregation across hour variables, standard meteorological convention: precipitation and solar radiation
sum_vars = [
    "time",
    "pr",
    "pr_localmid",
    "pr_24h",
    "pr_1h",
    "pr_15min",
    "pr_5min",
    "rsds",
]

# Top of the hour variables, standard meteorological convention: temperature, dewpoint temperature, pressure, humidity, winds
instant_vars = [
    "hurs_derived",
    "time",
    "tas",
    "tas_derived",
    "tdps",
    "tdps_derived",
    "ps",
    "psl",
    "ps_altimeter",
    "ps_derived",
    "hurs",
    "sfcWind",
    "sfcWind_dir",
]

In [212]:
# there is a time gap at 1981-02-05 07:00:00
# Select that day from the standardized frame (from 00:00 up to, but not including, 23:00).

df_st_time_filt = df_st.loc[
    (df_st["time"] >= "1981-02-05 00:00:00") & (df_st["time"] < "1981-02-05 23:00:00")
]

In [214]:
# Show only the constant-within-hour columns for the selected window.
df_st_time_filt[[col for col in constant_vars if col in df_st_time_filt.columns]]

Unnamed: 0,time,station,lat,lon,elevation,anemometer_height_m,thermometer_height_m
9624,1981-02-05 00:00:00,ASOSAWOS_72493023230,37.733,-122.2,2.0,10.06,
9625,1981-02-05 01:00:00,ASOSAWOS_72493023230,37.733,-122.2,2.0,10.06,
9626,1981-02-05 02:00:00,ASOSAWOS_72493023230,37.733,-122.2,2.0,10.06,
9627,1981-02-05 03:00:00,ASOSAWOS_72493023230,37.733,-122.2,2.0,10.06,
9628,1981-02-05 04:00:00,ASOSAWOS_72493023230,37.733,-122.2,2.0,10.06,
9629,1981-02-05 05:00:00,ASOSAWOS_72493023230,37.733,-122.2,2.0,10.06,
9630,1981-02-05 06:00:00,ASOSAWOS_72493023230,37.733,-122.2,2.0,10.06,
9631,1981-02-05 07:00:00,,,,,,
9632,1981-02-05 08:00:00,ASOSAWOS_72493023230,37.733,-122.2,2.0,10.06,
9633,1981-02-05 09:00:00,ASOSAWOS_72493023230,37.733,-122.2,2.0,10.06,


Within time gaps in the input dataframe, the standardization process does the following:
- inserts None for "station"
- inserts 'nan' for flag columns
- inserts NaN for all other columns

Is this what we want? Do we want to include data in the time gaps in the success report statistics?