# merge_hourly_standardization debugging 
Demonstrate issues with merge_hourly_standardization function. <br>Date: May 18, 2025

In [36]:
import xarray as xr 
import pandas as pd
import numpy as np

from merge_hourly_standardization import merge_hourly_standardization
from merge_derive_missing_vars import merge_derive_missing_vars, _input_var_check, _calc_dewpointtemp, _add_derived_var_attrs
from merge_clean_vars import merge_reorder_vars, merge_drop_vars

### 1. Read in a sample zarr, setup for input to function
1. Make a simple logger that just prints to the console (required input to function)
2. Read in zarr as xr.Dataset 
3. Convert xr.Dataset --> pd.DataFrame (required input to function)
4. Change MultiIndex DataFrame to a flat (single-level) DataFrame, with `time` and `station` as normal columns 
5. Input DataFrame, variable attributes, and logger to `merge_hourly_standardization`

In [2]:
# Create a simple logger that just prints to the console
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()  

# Read in zarr as xarray object 
s3_path = "s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_69007093217.zarr/"
ds = xr.open_zarr(s3_path)
var_attrs = {
        var: ds[var].attrs for var in list(ds.data_vars.keys())
    }  # Attributes from each variable

# Convert to dataframe 
df = ds.to_dataframe()

# Set index MultiIndex --> normal df 
df.reset_index(inplace=True)

# Input dataframe to merge_hourly_standardization function 
df_after_merge, var_attrs = merge_hourly_standardization(df, var_attrs, logger)

# Derive missing vars 
df_after_missing_vars, var_attrs = merge_derive_missing_vars(df_after_merge, var_attrs)

# Part 5: Drops raw _qc variables (DECISION TO MAKE) or provide code to filter 
df_final, var_attrs = merge_drop_vars(df_after_missing_vars, var_attrs)

# Part 6: Re-orders variables into final preferred order
df_final = merge_reorder_vars(df_after_missing_vars)

INFO:aiobotocore.credentials:Found credentials in environment variables.
  base = data.astype(np.int64)
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
INFO:root:merge_hourly_standardization: Completed successfully


Running merge_derive_missing_vars...
tdps_derived is present in station, no derivation necessary.
Calculating hurs_derived...
merge_derive_missing_vars failed with exception: '>' not supported between instances of 'str' and 'int'


TypeError: cannot unpack non-iterable NoneType object

In [17]:
len(df["hurs_derived" + "_eraqc"].unique()) > 1

False

In [None]:
df = df_after_merge.copy()
new_var_attrs = var_attrs.copy()
df_columns = list(df.columns)

def _is_sublist(l1, l2):
    """
    Check if all elements of l1 are present in l2.

    Parameters
    ----------
    l1 : list
        List of elements to check.
    l2 : list
        List to check against.

    Returns
    -------
    bool
        True if all elements of l1 are in l2, False otherwise.
    """
    return all(item in l2 for item in l1)

if not _is_sublist(["tdps","tdps_derived"], df_columns) and _is_sublist(["tas","hurs"], df_columns):
    print(f"Calculating tdps_derived...")  # convert to logger when set-up
    df["tdps_derived"] = _calc_dewpointtemp(df["tas"], df["hurs"])
    # synergistic flag check
    #df = derive_synergistic_flag(df, "tdps_derived", "tas", "hurs")
    # add new variable attributes
    new_var_attrs = _add_derived_var_attrs(
        derived_var="tdps_derived",
        source_var="tdps",
        input_vars=["tas", "hurs"],
        var_attrs=new_var_attrs,
    )

else:
    print("tdps_derived is present in station, no derivation necessary.")

tdps_derived is present in station, no derivation necessary.


In [None]:
#derive_synergistic_flag(df, "hurs_derived", "tas", "tdps_derived")
var1 = "tas"
var_to_flag = "hurs_derived"
#df.loc[df[var1 + "_eraqc"] > 0]

In [None]:
df2 = df.copy()
df2[var1 + "_eraqc"] = df[var1 + "_eraqc"].replace("nan", np.nan).replace(["nan", ""], np.nan)
df2.loc[df2[var1 + "_eraqc"] > 0, var_to_flag + "_eraqc"] = 38
#df2[var1 + "_eraqc"] = pd.to_numeric(df2[var1 + "_eraqc"])
#df2[var1 + "_eraqc"].dropna()
#print(df2[var1 + "_eraqc"].dropna().unique())

In [None]:
# set up _eraqc variable for new derived variable
df[var_to_flag + "_eraqc"] = np.nan

# identify if var 1 has flags
if len(df[var1 + "_eraqc"].unique()) > 1:
    # flags are present
    df.loc[df[var1 + "_eraqc"] > 0, var_to_flag + "_eraqc"] = (
        38  # see 3_qaqc_data/era_qaqc_flag_meanings.csv
    )

if len(df[var2 + "_eraqc"].unique()) > 1:
    df.loc[df[var2 + "_eraqc"] > 0, var_to_flag + "_eraqc"] = (
        38  # see 3_qaqc_data/era_qaqc_flag_meanings.csv
    )

return df

In [None]:
## Identify vars that can be derived
def merge_derive_missing_vars(
    df: pd.DataFrame, var_attrs: dict
) -> tuple[pd.DataFrame, dict] | None:
    """
    Identifies if any variables can be derived with other input variables.
    If success, variable is derived in the correct unit, attribtues are updated,
    and any flags from the input variables are synergistically flagged.
    If failure, variable is not derived.

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe
    var_attrs : dict
        attributes of input variables

    Returns
    -------
    If success: pd.DataFrame with newly added derived variable, and updated variable attributes
    If failure: None
    """
    print("Running merge_derive_missing_vars...")  # conver to logger when ready

    # vars that can be derived
    derive_vars = [
        "hurs",
        "tas",
    ]  # tdps not included here, kept separate because of tdps_derived

    # initialize update vars dictionary
    new_var_attrs = var_attrs.copy()

    try:
        # var is missing
        # check if required inputs are available
        if "tdps" not in df.columns and "tdps_derived" not in df.columns:
            if _input_var_check(df, var1="tas", var2="hurs") == True:
                print(f"Calculating tdps_derived...")  # convert to logger when set-up
                df["tdps_derived"] = _calc_dewpointtemp(df["tas"], df["hurs"])
                # synergistic flag check
                df = derive_synergistic_flag(df, "tdps_derived", "tas", "hurs")
                # add new variable attributes
                new_var_attrs = _add_derived_var_attrs(
                    derived_var="tdps_derived",
                    source_var="tdps",
                    input_vars=["tas", "hurs"],
                    var_attrs=new_var_attrs,
                )

        else:
            print("tdps_derived is present in station, no derivation necessary.")

        # first check if station has any vars that can be derived
        for item in derive_vars:
            if item in df.columns:
                print(
                    f"{item} is present in station, no derivation necessary."
                )  # convert to logger when set-up
                continue

            if item == "hurs" and _input_var_check(df, var1="tas", var2="tdps") == True:
                print(f"Calculating {item}_derived...")  # convert to logger when set-up
                df["hurs_derived"] = _calc_relhumid(df["tas"], df["tdps"])
                # synergistic flag check
                df = derive_synergistic_flag(df, "hurs_derived", "tas", "tdps")
                # add new variable attributes
                new_var_attrs = _add_derived_var_attrs(
                    derived_var="hurs_derived",
                    source_var="hurs",
                    input_vars=["tas", "tdps"],
                    var_attrs=new_var_attrs,
                )

            elif (
                item == "hurs"
                and _input_var_check(df, var1="tas", var2="tdps_derived") == True
            ):
                print(
                    f"Calculating {item}_derived ..."
                )  # convert to logger when set-up
                df["hurs_derived"] = _calc_relhumid(df["tas"], df["tdps_derived"])
                # synergistic flag check
                df = derive_synergistic_flag(df, "hurs_derived", "tas", "tdps_derived")
                # add new variable attributes
                new_var_attrs = _add_derived_var_attrs(
                    derived_var="tdps_derived",
                    source_var="tdps",
                    input_vars=["tas", "hurs"],
                    var_attrs=new_var_attrs,
                )

            elif (
                item == "tas" and _input_var_check(df, var1="hurs", var2="tdps") == True
            ):
                print(f"Calculating {item}_derived...")  # convert to logger when set-up
                df["tas_derived"] = _calc_airtemp(df["hurs"], df["tdps"])
                # synergistic flag check
                df = derive_synergistic_flag(df, "tas_derived", "hurs", "tdps")
                # add new variable attributes
                new_var_attrs = _add_derived_var_attrs(
                    derived_var="tas_derived",
                    source_var="tas",
                    input_vars=["hurs", "tdps"],
                    var_attrs=new_var_attrs,
                )

            elif (
                item == "tas"
                and _input_var_check(df, var1="hurs", var2="tdps_derived") == True
            ):
                print(
                    f"Calculating {item}_derived ...."
                )  # convert to logger when set-up
                df["tas_derived"] = _calc_airtemp(df["hurs"], df["tdps_derived"])
                # synergistic flag check
                df = derive_synergistic_flag(df, "tas_derived", "tas", "tdps_derived")
                # add new variable attributes
                new_var_attrs = _add_derived_var_attrs(
                    derived_var="tas_derived",
                    source_var="tas",
                    input_vars=["hurs", "tdps_derived"],
                    var_attrs=new_var_attrs,
                )

            else:
                print(
                    f"{item} is missing the required input variables. {item}_derived not calculated."
                )  # convert to logger when set-up

        return df, new_var_attrs

    except Exception as e:
        print(
            f"merge_derive_missing_vars failed with exception: {e}"
        )  # convert to logger version when ready
        return None


def _input_var_check(df: pd.DataFrame, var1: str, var2: str) -> bool:
    """
    Flexible check if required secondary input variables are available to derive a primary variable.

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe to check against
    var1 : str
        name of secondary input var 1
    var2 : str
        name of secondary input var 2

    Returns
    -------
    bool
        True if all required input vars present; False if not
    """

    if var1 in df.columns and var2 in df.columns:
        return True
    else:
        return False


def derive_synergistic_flag(
    df: pd.DataFrame, var_to_flag: str, var1: str, var2: str
) -> pd.DataFrame:
    """Synergistically flags the derived variable if the input variables also have flags.

    Parameters
    ----------
    df : pd.DataFrame
        input df to identify flags
    var_to_flag : str
        name of variable to check and flag
    var1 : str
        name of secondary input var 1
    var2 : str
        name of secondary input var 2

    Returns
    -------
    df : pd.DataFrame
        df with synergistic flags applied, if applicable

    Notes
    -----
    Flag meaning : 38,derive_synergistic_flag,At least one input variable to derived variable has a flag placed. Input variable and derived variable are synergistically flagged

    """
    # set up _eraqc variable for new derived variable
    df[var_to_flag + "_eraqc"] = np.nan

    # identify if var 1 has flags
    if len(df[var1 + "_eraqc"].unique()) > 1:
        # flags are present
        df.loc[df[var1 + "_eraqc"] > 0, var_to_flag + "_eraqc"] = (
            38  # see 3_qaqc_data/era_qaqc_flag_meanings.csv
        )

    if len(df[var2 + "_eraqc"].unique()) > 1:
        df.loc[df[var2 + "_eraqc"] > 0, var_to_flag + "_eraqc"] = (
            38  # see 3_qaqc_data/era_qaqc_flag_meanings.csv
        )

    return df


def _add_derived_var_attrs(
    derived_var: str, source_var: str, input_vars: list[str], var_attrs: dict
) -> dict:
    """Creates data attributes for new derived variable and adds to var_attrs.

    Parameters
    ----------
    derived_var : str
        variable name of new derived variable
    source_var : str
        variable name of the variable it "derives"
    input_vars : list[str]
        variable names of input variable
    var_attrs : dict
        attributes for all variables

    Returns
    -------
    var_attrs : dict
        updated variable attributes dictionary with new vars
    """

    # support for naming, units
    if source_var == "tdps":
        long_name = "derived_dew_point_temperature"
        units = "K"
    elif source_var == "tas":
        long_name = "derived_air_temperature"
        units = "K"
    elif source_var == "hurs":
        long_name = "derived_relative_humidity"
        units = "percent"

    # add new attributes -- var_attrs are stored as dict of each var dict
    derived_var_dict = {
        "long_name": long_name,
        "units": units,
        "ancillary_variables": f"{input_vars[0]}, {input_vars[1]}",
        "comment": "Derived in merge_derive_missing_vars.",
    }

    # add new var dictionary to existing var_attrs dict
    var_attrs[derived_var] = derived_var_dict
    return var_attrs


## Derived variable calculations
def _calc_dewpointtemp(tas: pd.Series, hurs: pd.Series) -> pd.Series:
    """Calculates dew point temperature, method 1

    Parameters
    ----------
    tas : pd.Series
        air temperature, K
    hurs: pd.Series
        relative humidity, % or 0-100

    Returns
    -------
    tdps : pd.Series
        dewpoint temperature, K

    Notes
    -----
    Rounded to 3 decimal places to be consistent with input raw data sig figs
    """
    es = 0.611 * np.exp(
        5423 * ((1 / 273) - (1 / tas))
    )  # calculates saturation vapor pressure
    e_vap = (
        es * hurs
    ) / 100.0  # calculates vapor pressure, IF NOT ALREADY OBSERVED -- will need ifelse statement
    tdps = (
        (1 / 273) - 0.0001844 * np.log(e_vap / 0.611)
    ) ** -1  # calculates dew point temperature, units = K
    return np.round(tdps, decimals=3)


def _calc_airtemp(hurs: pd.Series, tdps: pd.Series) -> pd.Series:
    """Calculate air temperature

    Parameters
    ----------
    hurs : pd.Series
        relative humidity, % or 0-100
    tdps : pd.Series
        dewpoint temperature, K

    Returns
    -------
    tas : pd.Series
        air temperature, K

    References
    ----------
    [1] August-Roche-Magnus Approximation

    Notes
    -----
    Rounded to 3 decimal places to be consistent with input raw data sig figs
    """

    # tdps must be in degC, not K for this equation
    tdps_degC = tdps - 273.15

    # apply approximation to calculate tas in degC
    tas_degC = (
        243.04
        * (((17.625 * tdps_degC) / (243.04 + tdps_degC)) - np.log(hurs / 100))
        / (17.625 + np.log(hurs / 100) - ((17.625 * tdps_degC) / (243.04 + tdps_degC)))
    )

    # convert back to K
    tas_K = tas_degC + 273.15

    return np.round(tas_K, decimals=3)


def _calc_relhumid(tas: pd.Series, tdps: pd.Series) -> pd.Series:
    """Calculate relative humidity

    Parameters
    ----------
    tas : pd.Series
        air temperature, K
    tdps : pd.Series
        dewpoint temperature, K

    Returns
    -------
    hurs : pd.Series
        relative humidity, % (0-100)

    Notes
    -----
    Rounded to 3 decimal places to be consistent with input raw data sig figs
    """

    es = 0.611 * np.exp(
        5423 * ((1 / 273) - (1 / tas))
    )  # calculates saturation vapor pressure using air temp
    e_vap = 0.611 * np.exp(
        5423 * ((1 / 273) - (1 / tdps))
    )  # calculates vapor pressure using dew point temp
    hurs = 100 * (e_vap / es)
    return np.round(hurs, decimals=3)

In [None]:
df_after_merge

### 2. Observe the original DataFrame

In [None]:
df.head()

### 3. Obeserve the output DataFrames

In [None]:
# df after merge_hourly_standardization function 
df_after_merge.head()

In [None]:
# df after merge_derive_missing_vars function 
df_after_merge.head()

In [None]:
df_final.head()

In [None]:
list(var_attrs.keys())

In [None]:
df_final.columns