# QAQC flag counts 

This function generates and exports a dataframe with counts of unique QAQC flag values per variable, in their native timestep, before hourly standardization.
These tables are used to produce the following QAQC flag statistics for the QAQC success report:

- % of all obs flagged

- % of obs per var flagged

- % of obs per network flagged

- % of flags per QA/QC


## Environment set-up

In [None]:
import boto3
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO

# New logger function
from merge_log_config import logger

plt.rcParams["figure.dpi"] = 300

In [None]:
# Create boto3 S3 handles; no credentials are passed explicitly here.
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")  # low-level client used for put_object / get_object below

# Name of the S3 bucket that the flag-count CSVs are written to and read back from.
bucket_name = "wecc-historical-wx"

In [None]:
def _fill_instrument_height(df, ds, name):
    """Add column ``name`` to ``df`` in place from ``ds.<name>``, or NaN if that fails."""
    if name in df.columns:
        return
    try:
        df[name] = np.ones(ds["time"].shape) * getattr(ds, name)
    except Exception:
        # Deliberate best-effort: a missing/non-numeric height or a length
        # mismatch falls back to NaN. `Exception` (rather than a bare except)
        # lets KeyboardInterrupt / SystemExit propagate.
        logger.info("Filling %s with NaN.", name)
        df[name] = np.full(len(df), np.nan)


def merge_ds_to_df(ds):
    """Convert the xarray dataset for one station to a flat pandas dataframe.

    Parameters
    ----------
    ds: xr.Dataset
        Dataset for a single station. Assumed to be indexed by
        (station, time), with station as the first index level.

    Returns
    -------
    df: pd.DataFrame
        De-duplicated, index-sorted table with the first index level copied
        into a ``station`` column and the remaining index level(s) reset to
        columns. ``anemometer_height_m`` and ``thermometer_height_m`` columns
        are always present (NaN when the dataset does not provide them).
    MultiIndex: pd.MultiIndex
        The de-duplicated, sorted index of the dataframe before it was reset,
        to be used on conversion back to a dataset.
    attrs: dict
        Dataset attributes, saved so the final merged file can inherit them.
    var_attrs: dict
        Per-variable attributes keyed by variable name, saved so the final
        merged file can inherit them.
    """
    # Save attributes so the final merged file can inherit them
    attrs = ds.attrs
    var_attrs = {var: ds[var].attrs for var in ds.data_vars}

    df = ds.to_dataframe()

    # Save instrumentation heights as columns
    for height_col in ("anemometer_height_m", "thermometer_height_m"):
        _fill_instrument_height(df, ds, height_col)

    # Drop rows with a duplicated index entry (first occurrence kept), then sort
    df = df[~df.index.duplicated()].sort_index()

    # Save the index before flattening it
    multi_index = df.index
    df["station"] = df.index.get_level_values(0)

    # Drop the first index level and turn the rest of the index into columns
    df = df.droplevel(0).reset_index()

    return df, multi_index, attrs, var_attrs

## Function

In [None]:
def eraqc_counts_original_timestep(
    df: pd.DataFrame, network: str, station: str
) -> None:
    """
    Tabulate raw QAQC flag counts per variable and upload them to S3 as a csv.

    Counts are taken on the dataframe as passed in, i.e. at the station's
    native timestep, before any hourly standardization.

    Parameters
    ----------
    df : pd.DataFrame
        station dataframe; every column whose name contains "_eraqc" is counted
    network: str
        network name, used as a folder in the S3 key
    station: str
        station name, used in the csv file name

    Returns
    -------
    None
    """
    flag_columns = [name for name in df.columns if "_eraqc" in name]

    # NaN means "no flag was set". Labelling it explicitly keeps those rows in
    # the tally, so the total observation count can be recovered for the
    # success report, and makes the meaning of the value unambiguous.
    flags_only = df[flag_columns].fillna("no_flag")

    # One row per unique flag value, one column per variable; a flag value
    # that never occurs for a variable comes back as NaN and is set to 0.
    tally = flags_only.apply(pd.Series.value_counts)
    tally = tally.fillna(0)

    # Strip the "_eraqc" suffix so columns carry the plain variable names,
    # then expose the flag values as an ordinary column.
    tally.columns = tally.columns.str.replace("_eraqc", "", regex=True)
    tally = tally.rename_axis("eraqc_flag_values").reset_index()

    # Serialize to csv text (to_csv returns a string when given no target)
    # and upload it to the bucket.
    s3_cl.put_object(
        Bucket=bucket_name,
        Body=tally.to_csv(index=False),
        Key=f"4_merge_wx/{network}/eraqc_counts/original_timestep_{station}.csv",
    )

    return None

### Run the function

In [None]:
# Alternate station to test with (swap the commented line to use it):
# url = "s3://wecc-historical-wx/3_qaqc_wx/VALLEYWATER/VALLEYWATER_6001.zarr"
url = "s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_72493023230.zarr"
# Open the station's zarr store from S3 as an xarray dataset
ds = xr.open_zarr(url)

In [None]:
# Flatten the station dataset to a dataframe; the original index and the
# dataset/variable attributes are returned alongside it.
df, MultiIndex, attrs, var_attrs = merge_ds_to_df(ds)

In [None]:
# Network and station names used to build the S3 key of the exported csv;
# keep them consistent with the zarr store opened above.
network = 'ASOSAWOS'
station = "ASOSAWOS_72493023230"

In [None]:
# Run this cell if the function returns None (its current behavior)
# eraqc_counts_original_timestep(df, network, station)

In [None]:
# Run this cell only if the function's final statement is changed to
# `return flag_counts`; as written it returns None, so `flag_counts` is None.
flag_counts = eraqc_counts_original_timestep(df, network, station)
flag_counts

### CHECK: let's look at the output csv

In [None]:
# Same key that eraqc_counts_original_timestep writes to
key = f"4_merge_wx/{network}/eraqc_counts/original_timestep_{station}.csv"

# Fetch the uploaded object from S3
list_import = s3_cl.get_object(
    Bucket=bucket_name,
    Key=key,
)

# Read the response body into a dataframe
flag_counts_table = pd.read_csv(BytesIO(list_import["Body"].read()))

In [None]:
flag_counts_table