# Hourly Standardization Update

## Environment set-up

In [None]:
import boto3
import numpy as np
import pandas as pd
import xarray as xr
from functools import reduce

In [None]:
# Create the boto3 S3 resource and client handles.
# No credentials are passed explicitly here.
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")  # for lower-level processes

# Name of the S3 bucket (the same bucket that appears in the zarr URL
# loaded in the "Load Data" section).
bucket_name = "wecc-historical-wx"

In [None]:
def merge_ds_to_df(ds):
    """Convert the xarray dataset of one station to a pandas dataframe for processing.

    Parameters
    ----------
    ds : xr.Dataset
        Data object with information about each network and station.

    Returns
    -------
    df : pd.DataFrame
        Table with one row per unique index entry, sorted by index. The first
        index level is dropped and exposed as the "station" column; the
        remaining index level(s) are reset into regular columns.
    MultiIndex : pd.Index
        Index of the de-duplicated, sorted table before it was reset
        (expected to be the station/time multi-index), to be used on
        conversion back to a dataset.
    attrs : dict
        Dataset attributes, saved so the final merged file can inherit them.
    var_attrs : dict
        Per-variable attributes keyed by variable name, saved so the final
        merged file can inherit them.
    """

    # Save attributes so the final merged file can inherit them
    attrs = ds.attrs
    var_attrs = {var: ds[var].attrs for var in ds.data_vars}

    df = ds.to_dataframe()

    # Save instrumentation heights as columns when they are not data variables
    for height_col in ("anemometer_height_m", "thermometer_height_m"):
        if height_col in df.columns:
            continue
        try:
            df[height_col] = np.ones(ds["time"].shape) * getattr(ds, height_col)
        except Exception:
            # Best effort: the height may be missing, non-numeric, or not
            # line up with the table length. Fall back to NaN rather than
            # failing, but let KeyboardInterrupt/SystemExit propagate.
            df[height_col] = np.ones(len(df)) * np.nan

    # De-duplicate the index (first occurrence is kept) and sort it
    df = df[~df.index.duplicated()].sort_index()

    # Save station/time multiindex
    MultiIndex = df.index

    # Expose the first index level (station) as a regular column
    df["station"] = df.index.get_level_values(0)

    # Drop the station level and turn the remaining index into columns
    df = df.droplevel(0).reset_index()

    return df, MultiIndex, attrs, var_attrs

## Load Data

In [None]:
# Alternative station store, kept commented out for quick switching:
# url = "s3://wecc-historical-wx/3_qaqc_wx/VALLEYWATER/VALLEYWATER_6001.zarr"
url = "s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_72493023230.zarr"
# Open the station's zarr store as an xarray dataset
ds = xr.open_zarr(url)

In [None]:
# Convert the dataset to a dataframe; the index and the attributes are
# returned alongside it.
df, MultiIndex, attrs, var_attrs = merge_ds_to_df(ds)

## Functions

In [None]:
# -----------------------------------------------------------------------------
def qaqc_flag_fcn(x: pd.Series) -> str:
    """
    Used for resampling QAQC flag columns. Ensures that the final standardized dataframe
    does not contain any empty strings by returning 'nan' when given an empty input (i.e. in time gaps).

    Parameters
    -----------
    x : pd.Series
        flag values of one column within a single resampling bin

    Returns
    -------
    str
        'nan' for an empty bin; otherwise the unique values of the bin, in order of
        first appearance, joined by commas

    """
    # Truthiness of a Series is ambiguous, so test the length explicitly.
    if len(x) == 0:
        return "nan"

    # Stringify each value so that non-string flags (numbers, NaN) do not make
    # str.join raise a TypeError; string flags are left unchanged.
    return ",".join(map(str, x.unique()))

In [None]:
# -----------------------------------------------------------------------------
def _modify_infill(df: pd.DataFrame, constant_vars: list) -> pd.DataFrame:
    """
    This function does two things:
    1. Flags rows that were infilled by resampling in the hourly standardization process, where
        there were time gaps in the input dataframe. These infilled rows will NOT count towards
        the total observations count when calculating flag rates for the success report
    2. Infills constant variables (ie those in "constant_vars") observations that were left empty because 
        they were in a time gap. They are infilled with the first non-nan value of each column, and set to
        np.nan if there are no non-nan values.

    Parameters
    -----------
    df : pd.Dataframe
        hourly standardized dataframe
    constant_vars: list
        variables that are constant throughout time

    Returns
    -------
    df : pd.Dataframe
        dataframe with updates added to rows infilled by hourly standardization

    """
    # Mask for rows where station is None (or np.nan)
    mask = df["station"].isnull()

    # Initialize dict to hold first non-NaN values
    first_valids = {}

    # Populate first_valids only for existing columns
    for col in constant_vars:
        if col in df.columns and col != "time":
            first_valids[col] = (
                df[col].dropna().iloc[0] if df[col].notna().any() else np.nan
            )

    # Update values in masked rows for existing columns
    for col, val in first_valids.items():
        df.loc[mask, col] = val
    
    # Add or update 'standardized_infill' column
    df["standardized_infill"] = np.where(mask, "y", "n")

    return df

In [None]:
# -----------------------------------------------------------------------------
def merge_hourly_standardization(
    df: pd.DataFrame, var_attrs: dict
) -> tuple[pd.DataFrame, dict]:
    """Resamples meteorological variables to hourly timestep according to standard conventions.

    Parameters
    -----------
    df : pd.DataFrame
        station dataset converted to dataframe through QAQC pipeline; must contain a "time" column
    var_attrs : dict
        attributes for each variable, keyed by variable name; updated in place

    Returns
    -------
    df : pd.DataFrame
        dataframe with all recognized columns resampled to one hour (column name retained),
        plus the "standardized_infill" marker column
    var_attrs : dict
        variable attributes dictionary updated to note that sub-hourly variables are now hourly

    Raises
    ------
    ValueError
        if the dataframe contains none of the recognized variable columns besides "time"

    Notes
    -----
    Rules:
    1. Top of the hour: take the first value in each hour. Standard convention for temperature, dewpoint, wind speed, direction, relative humidity, air pressure.
    2. Summation across the hour: sum observations within each hour. Standard convention for precipitation and solar radiation.
    3. Constant across the hour: take the first value in each hour. This applied to variables that do not change.
    4. QAQC flags: the unique flag values within each hour are joined by commas.

    Any failure prints "Failed" and re-raises the original exception.
    """

    # Variables that remain constant within each hour
    constant_vars = [
        "time",
        "station",
        "lat",
        "lon",
        "elevation",
        "anemometer_height_m",
        "thermometer_height_m",
    ]

    # Aggregation across hour variables, standard meteorological convention: precipitation and solar radiation
    sum_vars = [
        "time",
        "pr",
        "pr_localmid",
        "pr_24h",
        "pr_1h",
        "pr_15min",
        "pr_5min",
        "rsds",
    ]

    # Top of the hour variables, standard meteorological convention: temperature, dewpoint temperature, pressure, humidity, winds
    instant_vars = [
        "hurs_derived",
        "time",
        "tas",
        "tas_derived",
        "tdps",
        "tdps_derived",
        "ps",
        "psl",
        "ps_altimeter",
        "ps_derived",
        "hurs",
        "sfcWind",
        "sfcWind_dir",
    ]

    # QAQC flags, which remain constants within each hour.
    # Columns are matched by substring, so any column whose name contains one
    # of these pieces is treated as a QAQC column.
    qaqc_var_pieces = ["qc", "eraqc", "duration", "method", "flag", "depth", "process"]

    try:

        qaqc_vars = [
            var for var in df.columns if any(item in var for item in qaqc_var_pieces)
        ]

        # Subset the dataframe according to rules
        constant_df = df[[col for col in constant_vars if col in df.columns]]

        # Flags are cast to str so that they can be joined per hour
        qaqc_df = df[[col for col in qaqc_vars if col in df.columns if col != "time"]]
        qaqc_df = qaqc_df.astype(str)
        qaqc_df.insert(0, "time", df["time"])

        sum_df = df[[col for col in sum_vars if col in df.columns]]

        instant_df = df[[col for col in instant_vars if col in df.columns]]

        # Performing hourly aggregation, only if subset contains more than one (ie more than the 'time') column
        # This is to account for input dataframes that do not contain ALL subsets of variables defined above - just a subset of them.
        result_list = []
        if len(constant_df.columns) > 1:
            constant_result = constant_df.resample("1h", on="time").first()
            result_list.append(constant_result)

        if len(instant_df.columns) > 1:
            instant_result = instant_df.resample("1h", on="time").first()
            result_list.append(instant_result)

        if len(sum_df.columns) > 1:
            # An hour with no valid observation stays NaN instead of summing to 0
            sum_result = sum_df.resample("1h", on="time").apply(
                lambda x: np.nan if x.isna().all() else x.sum(skipna=True)
            )
            result_list.append(sum_result)

        if len(qaqc_df.columns) > 1:
            qaqc_result = qaqc_df.resample("1h", on="time").apply(
                lambda x: qaqc_flag_fcn(x)
            )  # concatenating unique flags
            result_list.append(qaqc_result)

        # reduce() on an empty list fails with an opaque TypeError, so report
        # the actual problem instead.
        if not result_list:
            raise ValueError(
                "No recognized variable columns to standardize besides 'time'"
            )

        # Aggregate and output reduced dataframe - this merges all dataframes defined
        # Resampling sets "time" as the index of each piece; reset index to turn it back into a column
        result = reduce(
            lambda left, right: pd.merge(left, right, on=["time"], how="outer"),
            result_list,
        )
        result.reset_index(inplace=True)  # Convert time index --> column

        # Infill constant values and flag rows added through resampling
        result = _modify_infill(result, constant_vars)

        # Update attributes for sub-hourly variables
        sub_hourly_vars = [i for i in df.columns if "min" in i and "qc" not in i]
        for var in sub_hourly_vars:
            # setdefault: a sub-hourly column without an attrs entry gets one
            # instead of raising a KeyError
            var_attrs.setdefault(var, {})["standardization"] = (
                "{} has been standardized to an hourly timestep, but will retain its original name".format(
                    var
                )
            )

        return result, var_attrs

    except Exception:
        print("Failed")
        # Bare raise re-raises the active exception unchanged
        raise

## Testing

In [None]:
# Run the hourly standardization on the loaded station dataframe
df_test, var_attrs_test = merge_hourly_standardization(df, var_attrs)

In [None]:
# No need to change this: this dataframe is only used to find time gaps,
# and it is easier to view with fewer columns.
# Take an explicit copy so that adding columns to it later does not write
# into a slice of `df` (which triggers pandas' SettingWithCopyWarning).
df_gaps = df[["time", "station"]].copy()

In [None]:
# Calculate time differences between consecutive rows.
# NOTE: the extra "timestamp" column is probably unnecessary; it was added to
# work around an error that was not investigated, since this code is only
# used for testing.

df_gaps["timestamp"] = pd.to_datetime(df_gaps["time"])
df_gaps["time_diff"] = df_gaps["timestamp"].diff()
threshold = pd.Timedelta(minutes=60)
# Keep the rows that come more than 60 minutes after the previous row
gaps = df_gaps[df_gaps["time_diff"] > threshold]

gaps

In [None]:
# Now filter the standardized and original dataframes to a time gap.
# Window bounds are defined once (start inclusive, end exclusive); the stray
# trailing tab characters from the pasted timestamps have been removed.
gap_start = "1981-02-05 05:00:00"
gap_end = "1981-02-05 10:00:00"

df_time_filt_test = df_test.loc[
    (df_test["time"] >= gap_start) & (df_test["time"] < gap_end)
]

df_time_filt = df.loc[(df["time"] >= gap_start) & (df["time"] < gap_end)]

In [None]:
# Display the rows of the original dataframe inside the selected time window
df_time_filt

In [None]:
# Display the rows of the standardized dataframe inside the same time window
df_time_filt_test

In [None]:
# Display the variable attributes returned by the standardization
var_attrs_test