# Flag summary development

## Environment set-up

In [3]:
import boto3
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt

# New logger function
from merge_log_config import logger

# Silence warnings
import warnings
from shapely.errors import ShapelyDeprecationWarning

# Silence two warning categories for the whole session.
# ShapelyDeprecationWarning is raised when creating a Point object from coords;
# the cause has not been identified yet.
warnings.simplefilter("ignore", category=RuntimeWarning)
warnings.simplefilter("ignore", category=ShapelyDeprecationWarning)

# Render figures at high resolution
plt.rcParams["figure.dpi"] = 300

In [4]:
# S3 handles: a resource object and a low-level client
# (no credentials are passed here; presumably boto3 resolves them from the environment -- confirm)
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")

## Bucket name and key prefixes
## (presumably one prefix per pipeline stage: clean, QAQC, merge -- confirm)
bucket = "wecc-historical-wx"
cleandir = "2_clean_wx/"
qaqcdir = "3_qaqc_wx/"
mergedir = "4_merge_wx/"

In [5]:
def _fill_instrument_height(df, ds, column):
    """Add ``column`` to ``df`` in place from the same-named attribute of ``ds``, or NaN if unavailable."""
    if column in df.columns:
        return

    try:
        # Broadcast the single height value along the time axis
        df[column] = np.ones(ds["time"].shape) * getattr(ds, column)
    except Exception:
        # Best effort: a missing attribute or a value that cannot be broadcast
        # onto the dataframe falls back to an all-NaN column.
        logger.info("Filling {} with NaN.".format(column))
        df[column] = np.ones(len(df)) * np.nan


def merge_ds_to_df(ds):
    """Converts xarray ds for a station to pandas df in the format needed for processing.

    Dataset-level and per-variable attributes are captured before conversion.
    Instrument-height columns are added when the converted dataframe lacks them,
    duplicated index entries are dropped (first occurrence kept), the index is
    sorted, and the first index level is copied into a ``station`` column before
    the index is flattened.

    Parameters
    ----------
    ds: xr.Dataset
        Data object with information about each network and station

    Returns
    -------
    df: pd.DataFrame
        Table object with the first index level dropped, the remaining index
        reset to columns, and a ``station`` column added
    MultiIndex: pd.Index
        De-duplicated, sorted index of the dataframe before flattening
        (presumably station and time), to be used on conversion back to ds
    attrs:
        ds attributes, saved to be inherited by the final merged file
    var_attrs: dict
        Variable attributes keyed by variable name, saved to be inherited by
        the final merged file
    """

    # Save attributes so the final merged file can inherit them
    attrs = ds.attrs
    var_attrs = {var: ds[var].attrs for var in ds.data_vars}

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        df = ds.to_dataframe()

    # Save instrumentation heights
    for height_column in ("anemometer_height_m", "thermometer_height_m"):
        _fill_instrument_height(df, ds, height_column)

    # De-duplicate time axis
    df = df[~df.index.duplicated()].sort_index()

    # Save station/time multiindex
    MultiIndex = df.index
    df["station"] = df.index.get_level_values(0)

    # Convert time/station index to columns and reset index
    df = df.droplevel(0).reset_index()

    return df, MultiIndex, attrs, var_attrs

## Function

In [None]:
def flag_summary_native(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generates summary of flags set on all QAQC tests.

    For every column whose name contains ``_eraqc`` the function logs, and
    collects into the returned table:
    - list of unique flag values for the variable
    - total number of observations (rows in ``df``)
    - number of flagged observations (non-null entries in the flag column)
    - % of total obs for the variable that was flagged

    Parameters
    ----------
    df : pd.DataFrame
        station dataset converted to dataframe through QAQC pipeline

    Returns
    -------
    pd.DataFrame
        One row per ``_eraqc`` column with the columns ``variable``, ``flags``
        (unique non-null flag values), ``n_obs``, ``n_flagged`` and
        ``pct_flagged``. Empty (columns only) when ``df`` has no ``_eraqc``
        columns.
    """

    logger.info("Running: flag_summary_native")

    # identify _eraqc variables
    eraqc_vars = [var for var in df.columns if "_eraqc" in var]

    # number of total observations in station
    n_total = len(df)
    logger.info("Total observations: {}".format(n_total))

    summary_rows = []
    for var in eraqc_vars:
        # list of unique flags (NaN is included in the log when unflagged rows exist)
        logger.info(
            "Flags set on {}: {}".format(var, df[var].unique()),
        )

        # a non-null entry in the flag column counts as a flagged observation
        n_flagged = int(df[var].notnull().sum())

        # guard against an empty dataframe, which would otherwise divide by zero
        pct_flagged = round((n_flagged / n_total) * 100, 3) if n_total else 0.0

        # % of observations flagged per variable
        logger.info(
            "Coverage of {} obs flagged: {} of {} obs ({}%)".format(
                var,
                n_flagged,
                n_total,
                pct_flagged,
            )
        )

        summary_rows.append(
            {
                "variable": var,
                "flags": df[var].dropna().unique().tolist(),
                "n_obs": n_total,
                "n_flagged": n_flagged,
                "pct_flagged": pct_flagged,
            }
        )

    # Construct dataframe of raw counts
    return pd.DataFrame(
        summary_rows,
        columns=["variable", "flags", "n_obs", "n_flagged", "pct_flagged"],
    )

    

In [9]:
# Open one station's zarr store from the QAQC prefix as a test case
# (an alternate station is kept commented out for quick switching)
#url = "s3://wecc-historical-wx/3_qaqc_wx/VALLEYWATER/VALLEYWATER_6001.zarr"
url = "s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_69007093217.zarr"
ds = xr.open_zarr(url)

In [10]:
# Flatten the station dataset to a dataframe; the saved index and attributes are returned alongside it
df, MultiIndex, attrs, var_attrs = merge_ds_to_df(ds)

In [12]:
# QAQC flag columns: every column whose name contains "_eraqc"
eraqc_vars = [var for var in df.columns if "_eraqc" in var]

In [13]:
# Display only the flag columns
df[eraqc_vars]

Unnamed: 0,elevation_eraqc,pr_eraqc,ps_altimeter_eraqc,psl_eraqc,sfcWind_dir_eraqc,sfcWind_eraqc,tas_eraqc,tdps_eraqc
0,,,28.0,28.0,,,,28.0
1,,,28.0,28.0,,,,28.0
2,,,28.0,28.0,,,,28.0
3,,,28.0,28.0,,,,28.0
4,,,28.0,28.0,,,,28.0
...,...,...,...,...,...,...,...,...
14264,,,28.0,28.0,,,,28.0
14265,,,28.0,28.0,,,,28.0
14266,,,28.0,28.0,,,,28.0
14267,,,28.0,28.0,,,,28.0


In [15]:
# For every QAQC flag column, print which flag values occur and what share
# of the rows carries a flag (non-null entries are counted as flagged).
for var in eraqc_vars:
    flag_values = df[var]

    # distinct flag values present in this column
    print("Flags set on {}: {}".format(var, flag_values.unique()))

    # flagged rows as a count and as a percentage of all rows
    n_flagged = len(flag_values[flag_values.notnull()])
    coverage_pct = round((n_flagged / len(df)) * 100, 3)
    print(
        "Coverage of {} obs flagged: {} of {} obs ({}%)".format(
            var, n_flagged, len(df), coverage_pct
        )
    )

Flags set on elevation_eraqc: [nan]
Coverage of elevation_eraqc obs flagged: 0 of 14269 obs (0.0%)
Flags set on pr_eraqc: [nan]
Coverage of pr_eraqc obs flagged: 0 of 14269 obs (0.0%)
Flags set on ps_altimeter_eraqc: [28.]
Coverage of ps_altimeter_eraqc obs flagged: 14269 of 14269 obs (100.0%)
Flags set on psl_eraqc: [28.]
Coverage of psl_eraqc obs flagged: 14269 of 14269 obs (100.0%)
Flags set on sfcWind_dir_eraqc: [nan]
Coverage of sfcWind_dir_eraqc obs flagged: 0 of 14269 obs (0.0%)
Flags set on sfcWind_eraqc: [nan]
Coverage of sfcWind_eraqc obs flagged: 0 of 14269 obs (0.0%)
Flags set on tas_eraqc: [nan]
Coverage of tas_eraqc obs flagged: 0 of 14269 obs (0.0%)
Flags set on tdps_eraqc: [28.]
Coverage of tdps_eraqc obs flagged: 14269 of 14269 obs (100.0%)


In [None]:
def flag_summary_hourly(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generates summary of flags set on all QAQC tests.

    For every column whose name contains ``_eraqc`` the function logs, and
    collects into the returned table:
    - list of unique flag values for the variable
    - total number of observations (rows in ``df``)
    - number of flagged observations (non-null entries in the flag column)
    - % of total obs for the variable that was flagged

    Parameters
    ----------
    df : pd.DataFrame
        station dataset converted to dataframe through QAQC pipeline

    Returns
    -------
    pd.DataFrame
        One row per ``_eraqc`` column with the columns ``variable``, ``flags``
        (unique non-null flag values), ``n_obs``, ``n_flagged`` and
        ``pct_flagged``. Empty (columns only) when ``df`` has no ``_eraqc``
        columns.
    """

    logger.info("Running: flag_summary_hourly")

    # identify _eraqc variables
    eraqc_vars = [var for var in df.columns if "_eraqc" in var]

    # number of total observations in station
    n_total = len(df)
    logger.info("Total observations: {}".format(n_total))

    summary_rows = []
    for var in eraqc_vars:
        # number of total observations per variable
        # NOTE(review): this is the row count of df, not the number of non-missing
        # measurements of the underlying variable -- confirm which one is wanted.
        logger.info("Total observations for {}: {}".format(var, n_total))

        # list of unique flags (NaN is included in the log when unflagged rows exist)
        logger.info(
            "Flags set on {}: {}".format(var, df[var].unique()),
        )

        # count of flagged observations: a non-null flag entry counts as flagged
        n_flagged = int(df[var].notnull().sum())

        # guard against an empty dataframe, which would otherwise divide by zero
        pct_flagged = round((n_flagged / n_total) * 100, 3) if n_total else 0.0

        # % of observations flagged per variable
        logger.info(
            "Coverage of {} obs flagged: {} of {} obs ({}%)".format(
                var,
                n_flagged,
                n_total,
                pct_flagged,
            )
        )

        summary_rows.append(
            {
                "variable": var,
                "flags": df[var].dropna().unique().tolist(),
                "n_obs": n_total,
                "n_flagged": n_flagged,
                "pct_flagged": pct_flagged,
            }
        )

    return pd.DataFrame(
        summary_rows,
        columns=["variable", "flags", "n_obs", "n_flagged", "pct_flagged"],
    )