# Flag summary development

## Environment set-up

In [None]:
import boto3
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt

# New logger function
from merge_log_config import logger

# Silence warnings
import warnings
from shapely.errors import ShapelyDeprecationWarning

# Suppress noisy warning categories for the whole notebook session.
# ShapelyDeprecationWarning is raised when creating a Point object from coords
# (cause not yet identified).
for _ignored_category in (RuntimeWarning, ShapelyDeprecationWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Render figures at high resolution
plt.rcParams["figure.dpi"] = 300

In [2]:
# AWS S3 handles: `s3` is the boto3 resource interface, `s3_cl` the boto3 client
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")

# S3 bucket name and the per-stage directory names
# (presumably key prefixes inside `bucket`, one per pipeline stage — confirm)
bucket = "wecc-historical-wx"
cleandir = "2_clean_wx/"
qaqcdir = "3_qaqc_wx/"
mergedir = "4_merge_wx/"

## Function

In [None]:
def flag_summary(df: pd.DataFrame, timestep: str) -> None:
    """
    Logs a summary of the flags set by the QAQC tests and plots the flagged
    timeseries of each observed variable.

    For every column whose name contains "_eraqc", the following is logged:
    - unique flag values present in the column
    - number of flagged (non-null) observations
    - total number of observations
    - % of total observations that were flagged

    Afterwards, flagged_timeseries_plot is called once per observed variable
    (the "_eraqc" column name with the suffix removed, skipping columns whose
    name contains "accum"). A failure in one plot is logged and does not stop
    the remaining plots.

    Parameters
    ----------
    df : pd.DataFrame
        station dataset converted to dataframe through QAQC pipeline
    timestep : str
        can be set to "native" or "hourly"; currently not used by this function

    Returns
    -------
    None
        The summary is only written to the logger; nothing is returned.
    """

    logger.info("Running: flag_summary")

    flag_suffix = "_eraqc"

    # identify _eraqc variables (non-string column labels cannot carry the suffix)
    eraqc_vars = [
        var for var in df.columns if isinstance(var, str) and flag_suffix in var
    ]

    # Select obs variables to plot and ignore originally accumulated variables (pr).
    # Split on the full suffix so names that contain "_e" elsewhere
    # (e.g. "pr_elev_eraqc") are not truncated.
    obs_vars = [
        var.split(flag_suffix)[0] for var in eraqc_vars if "accum" not in var
    ]

    n_obs = len(df)

    for var in eraqc_vars:
        logger.info(
            "Flags set on {}: {}".format(var, df[var].unique()),
        )  # unique flag values

        # A non-null entry in an _eraqc column means a flag was set on that obs
        n_flagged = int(df[var].notna().sum())
        # Guard against an empty dataframe, which would otherwise divide by zero
        pct_flagged = round((n_flagged / n_obs) * 100, 3) if n_obs else 0.0

        logger.info(
            "Coverage of {} obs flagged: {} of {} obs ({}%)".format(
                var,
                n_flagged,
                n_obs,
                pct_flagged,
            )
        )  # % of coverage flagged

    for var in obs_vars:
        # Deliberately best-effort: one failing plot must not abort the others.
        # flagged_timeseries_plot is defined outside this block.
        try:
            flagged_timeseries_plot(df, var)
        except Exception as e:
            logger.info(
                "flagged_timeseries_plot failed for {} with Exception: {}".format(
                    var, e
                )
            )