# QAQC flag counts 

## Environment set-up

In [None]:
import boto3
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO
from functools import reduce

import inspect

import logging
# Create a simple logger that just prints to the console.
# basicConfig configures the root logger at INFO level; `logger` is the
# module-level handle that the functions in this notebook log through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

# Render figures at 300 dpi.
plt.rcParams["figure.dpi"] = 300

In [53]:
# Create S3 handles. No credentials are set here explicitly;
# presumably boto3 resolves them from its default credential chain — confirm.
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")  # low-level client, used by the CSV check cell (get_object)

# Name of the S3 bucket that holds the station data and the exported counts.
bucket_name = "wecc-historical-wx"

In [54]:
def merge_ds_to_df(ds):
    """Convert the xarray Dataset of one station into a flat pandas DataFrame.

    Parameters
    ----------
    ds : xr.Dataset
        Station dataset. It is expected to convert (via ``to_dataframe``) to a
        frame indexed by a two-level station/time index, station first.

    Returns
    -------
    df : pd.DataFrame
        De-duplicated, index-sorted table with the station level dropped and
        the remaining index level(s) (``time``) moved into columns. A
        ``station`` column and the two instrument-height columns
        (``anemometer_height_m``, ``thermometer_height_m``) are always present.
    MultiIndex : pd.Index
        The de-duplicated, sorted station/time index of the frame before it was
        flattened, to be used on conversion back to a Dataset.
    attrs : dict
        Dataset-level attributes, saved so the final merged file can inherit them.
    var_attrs : dict
        Mapping of data-variable name to that variable's attributes, saved so
        the final merged file can inherit them.
    """

    # Save attributes so the final merged file can inherit them
    attrs = ds.attrs
    var_attrs = {var: ds[var].attrs for var in ds.data_vars}

    df = ds.to_dataframe()

    # Save instrumentation heights. When a height is not already a column,
    # broadcast the dataset-level value along time; if that is unavailable
    # (or cannot be broadcast) fall back to NaN. This is deliberately
    # best-effort, but `except Exception` no longer swallows
    # KeyboardInterrupt/SystemExit the way the bare `except:` did.
    for height_var in ("anemometer_height_m", "thermometer_height_m"):
        if height_var in df.columns:
            continue
        try:
            df[height_var] = np.ones(ds["time"].shape) * getattr(ds, height_var)
        except Exception:
            logger.info(f"Filling {height_var} with NaN.")
            df[height_var] = np.full(len(df), np.nan)

    # De-duplicate time axis
    df = df[~df.index.duplicated()].sort_index()

    # Save station/time multiindex and keep the station id as a regular column
    MultiIndex = df.index
    df["station"] = df.index.get_level_values(0)

    # Drop the station level and convert the time index to a column
    df = df.droplevel(0).reset_index()

    return df, MultiIndex, attrs, var_attrs

## Function

In [55]:
# include count of total number of observations

def eraqc_counts_hourly_timestep(
    df: pd.DataFrame, network: str, station: str, *, export: bool = True
) -> pd.DataFrame:
    """
    Generates a dataframe of raw qaqc flag value counts for every variable,
    for the hourly timestep, after hourly standardization.
    Optionally exports the dataframe as a csv to AWS.

    Parameters
    ----------
    df : pd.DataFrame
        station dataset converted to dataframe through QAQC pipeline
    network: str
        network name (used in the export path)
    station: str
        station name (used in the export file name)
    export: bool
        If True (default), write the counts table as a csv to
        ``s3://wecc-historical-wx/4_merge_wx/{network}/eraqc_counts/``.
        Set to False to only compute and return the counts.

    Returns
    -------
    flag_counts : pd.DataFrame
        One row per distinct flag value (column ``eraqc_flag_values``) and one
        column per ``*_eraqc`` variable (with the ``_eraqc`` suffix removed)
        holding how many times that flag value occurs. Flag values that never
        occur for a variable are counted as 0.

    Notes
    -----
    Only true missing values are relabelled ``no_flag``. Flag strings that
    already read ``"nan"`` or ``""`` are counted under those labels unchanged.
    """
    # identify _eraqc variables
    eraqc_vars = [var for var in df.columns if "_eraqc" in var]

    # filter df for only qaqc columns
    # also replace Nan values with 'no_flag' for two reasons:
    #   1. to enable us to count total observations for the success report
    #   2. to clarify what the Nan value indicates
    # Cast to object first so the string fill value is accepted by any column
    # dtype, then to str so numeric flags (e.g. 21.0) are counted as well:
    # `.str.split` returns NaN for non-string values, which silently dropped them.
    flags = df[eraqc_vars].astype(object).fillna("no_flag").astype(str)

    # generate df of counts of each unique flag for each variable
    # (an hourly value may hold several comma-separated flags; each one is counted)
    # fill all Nan values with 0, since Nan = no observations counted
    flag_counts = flags.apply(
        lambda col: col.str.split(",", expand=True).stack().value_counts()
    ).fillna(0)

    # TODO: add a row with total counts per variable

    # rename columns (plain substring removal, no regex needed)
    flag_counts.columns = flag_counts.columns.str.replace("_eraqc", "", regex=False)

    # rename index (i.e. eraqc values) and then reset index
    flag_counts = flag_counts.rename_axis("eraqc_flag_values").reset_index()

    # send file to AWS
    if export:
        csv_s3_filepath = f"s3://wecc-historical-wx/4_merge_wx/{network}/eraqc_counts/{station}_flag_counts_hourly_standardized.csv"
        flag_counts.to_csv(csv_s3_filepath, index=False)

    return flag_counts

### Load Data

In [56]:
# Alternative test station (kept for quick switching):
# url = "s3://wecc-historical-wx/3_qaqc_wx/VALLEYWATER/VALLEYWATER_6001.zarr"
# Station store used for the rest of this notebook.
url = "s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_72493023230.zarr"
ds = xr.open_zarr(url)

In [57]:
# Flatten the station dataset into a dataframe; keep the original index and
# the dataset/variable attributes alongside it.
df, MultiIndex, attrs, var_attrs = merge_ds_to_df(ds)

### Perform hourly standardization

In [None]:
def my_func(x):
    """Join the unique values of ``x`` with commas.

    Returns the string "nan" instead when ``x`` has no elements, so that an
    empty group does not turn into an empty string.
    """
    if len(x):
        return ",".join(x.unique())
    return "nan"

In [None]:
def merge_hourly_standardization(
    df: pd.DataFrame, var_attrs: dict, logger: logging.Logger
) -> tuple[pd.DataFrame, dict]:
    """Resamples meteorological variables to hourly timestep according to standard conventions.

    Parameters
    -----------
    df : pd.DataFrame
        station dataset converted to dataframe through QAQC pipeline;
        must contain a ``time`` column
    var_attrs : dict
        attributes for each variable, keyed by variable name. Updated in place
        for sub-hourly variables.
    logger : logging.Logger
        Logger instance for recording messages during processing.

    Returns
    -------
    result : pd.DataFrame
        dataframe with all recognised columns resampled to one hour (column name retained).
        Columns that match none of the variable groups below are not carried over.
    var_attrs : dict
        the same dictionary that was passed in, updated to note that sub-hourly variables are now hourly

    Raises
    ------
    Exception
        Any error raised during processing is logged and re-raised unchanged.

    Notes
    -----
    Rules:
    1. Top of the hour: take the first value in each hour. Standard convention for temperature, dewpoint, wind speed, direction, relative humidity, air pressure.
    2. Summation across the hour: sum observations within each hour. Standard convention for precipitation and solar radiation.
    3. Constant across the hour: take the first value in each hour. This applied to variables that do not change.
    4. QAQC-type columns are cast to strings and the unique values within each hour are joined with commas.
       Hourly bins that contain no rows appear to come back as an empty string
       (see the time-gap investigation in this notebook).
    """

    func_name = inspect.currentframe().f_code.co_name
    logger.info(f"{func_name}: Starting...")

    # Variables that remain constant within each hour
    constant_vars = [
        "time",
        "station",
        "lat",
        "lon",
        "elevation",
        "anemometer_height_m",
        "thermometer_height_m",
    ]

    # Aggregation across hour variables, standard meteorological convention: precipitation and solar radiation
    sum_vars = [
        "time",
        "pr",
        "pr_localmid",
        "pr_24h",
        "pr_1h",
        "pr_15min",
        "pr_5min",
        "rsds",
    ]

    # Top of the hour variables, standard meteorological convention: temperature, dewpoint temperature, pressure, humidity, winds
    instant_vars = [
        "hurs_derived",
        "time",
        "tas",
        "tas_derived",
        "tdps",
        "tdps_derived",
        "ps",
        "psl",
        "ps_altimeter",
        "ps_derived",
        "hurs",
        "sfcWind",
        "sfcWind_dir",
    ]

    # Substrings identifying QAQC-type columns (flags, methods, durations, ...)
    vars_to_remove = ["qc", "eraqc", "duration", "method", "flag", "depth", "process"]

    try:

        qaqc_vars = [
            var for var in df.columns if any(item in var for item in vars_to_remove)
        ]

        # Subset the dataframe according to rules
        constant_df = df[[col for col in constant_vars if col in df.columns]]

        # Flags are handled as strings so they can be concatenated per hour;
        # "time" is re-attached as the first column for resampling.
        qaqc_df = df[[col for col in qaqc_vars if col != "time"]]
        qaqc_df = qaqc_df.astype(str)
        qaqc_df.insert(0, "time", df["time"])

        sum_df = df[[col for col in sum_vars if col in df.columns]]

        instant_df = df[[col for col in instant_vars if col in df.columns]]

        # Performing hourly aggregation, only if subset contains more than one (ie more than the 'time' time) column
        # This is to account for input dataframes that do not contain ALL subsets of variables defined above - just a subset of them.
        result_list = []
        if len(constant_df.columns) > 1:
            constant_result = constant_df.resample("1h", on="time").first()
            result_list.append(constant_result)

        if len(instant_df.columns) > 1:
            instant_result = instant_df.resample("1h", on="time").first()
            result_list.append(instant_result)

        if len(sum_df.columns) > 1:
            # An hour whose values are all missing stays NaN instead of summing to 0
            sum_result = sum_df.resample("1h", on="time").apply(
                lambda x: np.nan if x.isna().all() else x.sum(skipna=True)
            )
            result_list.append(sum_result)

        if len(qaqc_df.columns) > 1:
            qaqc_result = qaqc_df.resample("1h", on="time").apply(
                lambda x: ",".join(x.unique())
            )  # adding unique flags
            result_list.append(qaqc_result)

        # Aggregate and output reduced dataframe - this merges all dataframes defined
        # This function sets "time" to the index; reset index to return to original index
        # NOTE: if no subset had data columns, result_list is empty and reduce() raises TypeError.
        result = reduce(
            lambda left, right: pd.merge(left, right, on=["time"], how="outer"),
            result_list,
        )
        result.reset_index(inplace=True)  # Convert time index --> column

        # Update attributes for sub-hourly variables.
        # setdefault: a sub-hourly column without an attribute entry gets one
        # created instead of raising KeyError after all the resampling work.
        sub_hourly_vars = [i for i in df.columns if "min" in i and "qc" not in i]
        for var in sub_hourly_vars:
            var_attrs.setdefault(var, {})["standardization"] = (
                "{} has been standardized to an hourly timestep, but will retain its original name".format(
                    var
                )
            )
        logger.info(f"{func_name}: Completed successfully")

        return result, var_attrs

    except Exception:
        # Log with traceback, then re-raise the original exception untouched
        logger.exception(f"{func_name}: Failed")
        raise

### Development

https://stackoverflow.com/questions/54544084/how-to-apply-series-value-counts-on-dataframe

https://note.nkmk.me/en/python-pandas-value-counts/

df['X'].apply(lambda x : len(np.unique(x.split(','))))

df_qc["ps_altimeter_eraqc"].str.split(",", expand=True).nunique(1)

business['Category'].str.split(',').apply(len)

https://stackoverflow.com/questions/60143292/at-column-count-word-in-comma-separated-sentence 

Why is there a space??

In [13]:
# Distinct flag strings for ps_altimeter in the hourly-standardized frame.
# NOTE(review): `df_st` is not defined in any visible cell — presumably the
# frame returned by merge_hourly_standardization; confirm.
df_st['ps_altimeter_eraqc'].unique()

array(['nan', '23.0', '', 'nan,23.0', '23.0,nan'], dtype=object)

In [None]:
# List the flag columns available.
# NOTE(review): `df_qc` is not defined in any visible cell — confirm its origin.
df_qc.columns
# Columns holding more than just nan:
#   ps_altimeter_eraqc
#   ps_eraqc

Index(['elevation_eraqc', 'pr_eraqc', 'ps_altimeter_eraqc', 'ps_eraqc',
       'psl_eraqc', 'sfcWind_dir_eraqc', 'sfcWind_eraqc', 'tas_eraqc',
       'tdps_eraqc'],
      dtype='object')

In [84]:
# Distinct ps flag values in df_qc.
df_qc["ps_eraqc"].unique()

array([nan, 21., 23.])

In [10]:
# QAQC flags, which remain constants within each hour
vars_to_remove = ["qc", "eraqc", "duration", "method", "flag", "depth", "process"]
qaqc_vars = [
    var for var in df.columns if any(True for item in vars_to_remove if item in var)
]
qaqc_df = df[[col for col in qaqc_vars if col in df.columns if col != "time"]]
qaqc_df = qaqc_df.astype(str)
qaqc_df.insert(0, "time", df["time"])

In [12]:
# Distinct ps flag values after the cast to strings.
qaqc_df["ps_eraqc"].unique()

array(['nan', '21.0', '23.0'], dtype=object)

In [13]:
# Keep only the time column and the ps flag column for the test.
qaqc_df_filt = qaqc_df[['time','ps_eraqc']]

In [38]:
# Restrict to rows from 1981-02-04 up to (but not including) 1981-02-06.
df_time_filt = qaqc_df_filt.loc[
    (qaqc_df_filt["time"] >= "1981-02-04") & (qaqc_df_filt["time"] < "1981-02-06")
]

In [40]:
# Distinct ps flag values inside the selected window.
df_time_filt['ps_eraqc'].unique()

array(['nan'], dtype=object)

In [58]:
# Display the windowed rows.
df_time_filt

Unnamed: 0,time,ps_eraqc
10328,1981-02-04 00:00:00,
10329,1981-02-04 01:00:00,
10330,1981-02-04 02:00:00,
10331,1981-02-04 03:00:00,
10332,1981-02-04 04:00:00,
10333,1981-02-04 05:00:00,
10334,1981-02-04 06:00:00,
10335,1981-02-04 07:00:00,
10336,1981-02-04 08:00:00,
10337,1981-02-04 09:00:00,


In [49]:
# Resample the window to hourly bins using my_func, which returns "nan"
# for an empty bin; the plain join it replaces is kept below for comparison.
qaqc_result_test = df_time_filt.resample("1h", on="time").apply(
    lambda x: my_func(x)
    # lambda x: ",".join(x.unique())
)

In [50]:
# Distinct ps flag values after hourly resampling.
qaqc_result_test["ps_eraqc"].unique()

# empty string shows up
# NOTE(review): presumably this refers to the plain-join variant; the output
# recorded for this cell only contains 'nan' — confirm.

array(['nan'], dtype=object)

In [51]:
# Display the resampled test frame.
qaqc_result_test

Unnamed: 0_level_0,ps_eraqc
time,Unnamed: 1_level_1
1981-02-04 00:00:00,
1981-02-04 01:00:00,
1981-02-04 02:00:00,
1981-02-04 03:00:00,
1981-02-04 04:00:00,
1981-02-04 05:00:00,
1981-02-04 06:00:00,
1981-02-04 07:00:00,
1981-02-04 08:00:00,
1981-02-04 09:00:00,


In [21]:
# Same time window, taken from the original (pre-standardization) dataframe.
df_original_time_filt = df.loc[
    (df["time"] >= "1981-02-04") & (df["time"] < "1981-02-06")
]

In [None]:
# Show time and ps flag for the window in the original dataframe.
df_original_time_filt[["time", "ps_eraqc"]]

In [26]:
# Distinct ps flag values in the original dataframe (numeric, before the cast to strings).
df['ps_eraqc'].unique()

array([nan, 21., 23.])

Oh! I think I figured out why that space is appearing: time gaps! When the data skips a few hours, or even days, there are no flags to concatenate, so they show up as spaces (empty strings). Now let us confirm this by resampling over a region with a time gap, and check whether there is a time gap in the region I specified above.

Might work better if we convert flag values to numeric.

change our wording:
resampling or aggregating?
do we want to preserve the gaps but fill in gaps at the top of the hour?
looks like the resampler preserves gaps

clean up this notebook to be a demo of this situation/artefact/potential issue

Why are we standardizing to an hourly timestep? We SHOULD do a TRUE resampling.

### Run the function

In [82]:
# Network and station identifiers; they are interpolated into the S3 export path
# and match the zarr store opened in the Load Data section.
network = "ASOSAWOS"
station = "ASOSAWOS_72493023230"

In [84]:
# Run if set to flag_counts
flag_counts = eraqc_counts_hourly_timestep(df_st, network, station)
flag_counts

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,ps,psl,sfcWind_dir,sfcWind,tas,tdps
0,,35504.0,35504.0,35504.0,35504.0,35504.0,35504.0,35504.0,35504.0,35504.0
1,21.0,0.0,0.0,0.0,7296.0,22079.0,0.0,0.0,0.0,0.0
2,23.0,0.0,0.0,24.0,3.0,23.0,0.0,0.0,10.0,200.0
3,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94.0,332.0
4,27.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,14.0
5,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0
6,,338512.0,338512.0,338490.0,331214.0,316410.0,338512.0,338496.0,338428.0,338004.0


There are empty strings in the flag columns.
We need a function to take care of those empty strings.

### CHECK: let's look at the output csv

In [None]:
# Read a counts CSV back from S3 to check it.
# NOTE(review): this key ("original_timestep_{station}.csv") differs from the file name
# written by eraqc_counts_hourly_timestep ("{station}_flag_counts_hourly_standardized.csv")
# — confirm which file is meant to be checked here.
key = f"4_merge_wx/{network}/eraqc_counts/original_timestep_{station}.csv"

list_import = s3_cl.get_object(
    Bucket=bucket_name,
    Key=key,
)

# Parse the downloaded bytes as CSV
flag_counts_table = pd.read_csv(BytesIO(list_import["Body"].read()))

In [None]:
# Display the table read back from S3.
flag_counts_table