In [1]:
import pandas as pd
import numpy as np
import xarray as xr
import boto3

In [2]:
# S3 handles: a boto3 resource and a boto3 client; no credentials are passed explicitly here
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")

## AWS bucket name and directory prefixes within it
## (presumably one prefix per processing stage: clean -> qaqc -> merge -- confirm)
bucket = "wecc-historical-wx"
cleandir = '2_clean_wx/'
qaqcdir = "3_qaqc_wx/"
mergedir = "4_merge_wx/"

In [3]:
# Build the zarr store path from the bucket / QAQC prefix defined in the config cell
# instead of repeating them in a hard-coded string (resolves to the same URL).
url = f"s3://{bucket}/{qaqcdir}ASOSAWOS/ASOSAWOS_72681024131.zarr"
ds = xr.open_zarr(url)

# per-variable attribute dictionaries, keyed by data variable name
var_attrs = {name: data_array.attrs for name, data_array in ds.data_vars.items()}

{'anemometer_height_m': {},
 'elevation': {'ancillary_variables': 'elevation_eraqc',
  'long_name': 'station_elevation',
  'positive': 'up',
  'standard_name': 'height_above_mean_sea_level',
  'units': 'meter'},
 'elevation_eraqc': {},
 'lat': {},
 'lon': {},
 'pr': {'ancillary_variables': 'pr_qc pr_depth_qc pr_duration, pr_eraqc',
  'comment': '',
  'long_name': 'precipitation_accumuation',
  'units': 'mm/?'},
 'pr_depth_qc': {'flag_meanings': 'See QA/QC csv for network.',
  'flag_values': '1 2 3 4 5 6 7 8 E I J 9'},
 'pr_duration': {'ancillary_variables': 'pr pr_qc pr_depth_qc',
  'long_name': 'precipitation measurement interval'},
 'pr_eraqc': {},
 'pr_qc': {'flag_meanings': 'See QA/QC csv for network.',
  'flag_values': '0 1 2 3 4 5 6 7 9 A I M P R U'},
 'ps': {'ancillary_variables': 'ps_qc ps_altimeter ps_altimeter_qc, ps_eraqc',
  'comment': 'Converted from hPa to Pa.',
  'long_name': 'station_air_pressure',
  'standard_name': 'air_pressure',
  'units': 'Pa'},
 'ps_altimeter': {'

In [4]:
# Flatten the dataset into a table; reset_index() turns the index levels into ordinary
# columns (station and time appear as columns in the displayed output).
df = ds.to_dataframe().reset_index()
df.head(3)

  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
  base = data.astype(np.int64)


Unnamed: 0,station,time,anemometer_height_m,elevation,elevation_eraqc,lat,lon,pr,pr_depth_qc,pr_duration,...,sfcWind_eraqc,sfcWind_method,sfcWind_qc,tas,tas_eraqc,tas_qc,tdps,tdps_eraqc,tdps_qc,thermometer_height_m
0,ASOSAWOS_72681024131,1980-01-01 00:00:00,10.06,865.0,,43.567,-116.217,0.0,9.0,0 days 01:00:00,...,,N,5,275.35,,5,272.05,,5,
1,ASOSAWOS_72681024131,1980-01-01 01:00:00,10.06,865.0,,43.567,-116.217,0.0,9.0,0 days 01:00:00,...,,N,5,274.85,,5,272.55,,5,
2,ASOSAWOS_72681024131,1980-01-01 01:05:00,10.06,871.0,,43.567,-116.217,,,NaT,...,,N,1,,,9,,,9,


In [39]:
## Identify vars that can be derived
def merge_derive_missing_vars(
    df: pd.DataFrame, var_attrs: dict
) -> tuple[pd.DataFrame, dict] | None:
    """
    Identifies if any variables can be derived with other input variables.
    If success, variable is derived in the correct unit, attribtues are updated,
    and any flags from the input variables are synergistically flagged.
    If failure, variable is not derived.

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe
    var_attrs : dict
        attributes of input variables

    Returns
    -------
    If success: pd.DataFrame with newly added derived variable, and updated variable attributes
    If failure: None
    """
    print("Running merge_derive_missing_vars...")  # conver to logger when ready

    # vars that can be derived
    derive_vars = ["hurs", "tas"]  # only tdps, not tdps_derived

    new_var_attrs = var_attrs.copy()

    try:
        # var is missing
        # check if required inputs are available
        if "tdps" not in df.columns and "tdps_derived" not in df.columns:
            if _input_var_check(df, var1="tas", var2="hurs") == True:
                print(
                    f"Calculating {item}_derived..."
                )  # convert to logger when set-up
                df["tdps_derived"] = _calc_dewpointtemp(df["tas"], df["hurs"])
                # synergistic flag check
                df = derive_synergistic_flag(df, "tdps_derived", "tas", "hurs")
                # add new variable attributes
                new_var_attrs = _add_derived_var_attrs(
                    derived_var="tdps_derived",
                    source_var="tdps",
                    input_vars=["tas", "hurs"],
                    var_attrs=var_attrs,
                )
    
        else:
            print("tdps_derived is present in station, no derivation necessary.")

        # first check if station has any vars that can be derived, excluding tdps
        for item in derive_vars:
            if item in df.columns:
                print(
                    f"{item} is present in station, no derivation necessary."
                )  # convert to logger when set-up
                continue

            elif item == "hurs" and _input_var_check(df, var1="tas", var2="tdps") == True:
                print(f"Calculating {item}_derived...")  # convert to logger when set-up
                df["hurs_derived"] = _calc_relhumid(df["tas"], df["tdps"])
                # synergistic flag check
                df = derive_synergistic_flag(df, "hurs_derived", "tas", "tdps")
                # add new variable attributes
                new_var_attrs = _add_derived_var_attrs(
                    derived_var="hurs_derived",
                    source_var="hurs",
                    input_vars=["tas", "tdps"],
                    var_attrs=new_var_attrs,
                )

            elif (
                item == "hurs"
                and _input_var_check(df, var1="tas", var2="tdps_derived") == True
            ):
                print(
                    f"Calculating {item}_derived ..."
                )  # convert to logger when set-up
                df["hurs_derived"] = _calc_relhumid(df["tas"], df["tdps_derived"])
                # synergistic flag check
                df = derive_synergistic_flag(df, "hurs_derived", "tas", "tdps_derived")
                # add new variable attributes
                new_var_attrs = _add_derived_var_attrs(
                    derived_var="tdps_derived",
                    source_var="tdps",
                    input_vars=["tas", "hurs"],
                    var_attrs=new_var_attrs,
                )

            elif (
                item == "tas" and _input_var_check(df, var1="hurs", var2="tdps") == True
            ):
                print(f"Calculating {item}_derived...")  # convert to logger when set-up
                df["tas_derived"] = _calc_airtemp(df["hurs"], df["tdps"])
                # synergistic flag check
                df = derive_synergistic_flag(df, "tas_derived", "hurs", "tdps")
                # add new variable attributes
                new_var_attrs = _add_derived_var_attrs(
                    derived_var="tas_derived",
                    source_var="tas",
                    input_vars=["hurs", "tdps"],
                    var_attrs=new_var_attrs,
                )

            elif (
                item == "tas"
                and _input_var_check(df, var1="hurs", var2="tdps_derived") == True
            ):
                print(
                    f"Calculating {item}_derived ...."
                )  # convert to logger when set-up
                df["tas_derived"] = _calc_airtemp(df["hurs"], df["tdps_derived"])
                # synergistic flag check
                df = derive_synergistic_flag(df, "tas_derived", "tas", "tdps_derived")
                # add new variable attributes
                new_var_attrs = _add_derived_var_attrs(
                    derived_var="tas_derived",
                    source_var="tas",
                    input_vars=["hurs", "tdps_derived"],
                    var_attrs=new_var_attrs,
                )

            else:
                print(
                    f"{item} is missing the required input variables. {item}_derived not calculated."
                )  # convert to logger when set-up

        return df, new_var_attrs

    except Exception as e:
        print(
            f"merge_derive_missing_vars failed with exception: {e}"
        )  # convert to logger version when ready
        return None

In [51]:
def _add_derived_var_attrs(
    derived_var: str, source_var: str, input_vars: list[str], var_attrs: dict
) -> dict:
    """Creates data attributes for new derived variable and adds to var_attrs.

    Parameters
    ----------
    derived_var : str
        variable name of new derived variable
    source_var : str
        variable name of the variable it "derives"
    input_vars : list[str]
        variable names of input variable
    var_attrs : dict
        attributes for all variables

    Returns
    -------
    var_attrs : dict
        updated variable attributes dictionary with new vars
    """

    # support for naming, units
    if source_var == "tdps":
        long_name = "derived_dew_point_temperature"
        units = "K"
    elif source_var == "tas":
        long_name = "derived_air_temperature"
        units = "K"
    elif source_var == "hurs":
        long_name = "derived_relative_humidity"
        units = "percent"

    print(long_name, units)

    # add new attributes -- var_attrs are stored as dict of each var dict
    derived_var_dict = {
        "long_name" : long_name,
        "units" : units,
        "ancillary_variables": f"{input_vars[0]}, {input_vars[1]}",
        "comment" : "Derived in merge_derive_missing_vars."
    }

    # add new var dictionary to existing var_attrs dict
    var_attrs[derived_var] = derived_var_dict

    return var_attrs

In [52]:
def _input_var_check(df: pd.DataFrame, var1: str, var2: str) -> bool:
    """
    Flexible check if required secondary input variables are available to derive a primary variable.

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe to check against
    var1 : str
        name of secondary input var 1
    var2 : str
        name of secondary input var 2

    Returns
    -------
    bool
        True if all required input vars present; False if not
    """

    if var1 in df.columns and var2 in df.columns:
        return True
    else:
        return False

def derive_synergistic_flag(df: pd.DataFrame, var_to_flag: str, var1: str, var2: str) -> pd.DataFrame:
    """Synergistically flags the derived variable if the input variables also have flags.

    A new ``<var_to_flag>_eraqc`` column is created (all NaN), and every row where
    either input variable's ``_eraqc`` value is greater than 0 is set to 38.

    Parameters
    ----------
    df : pd.DataFrame
        input df to identify flags; the new flag column is added to this dataframe
    var_to_flag : str
        name of variable to check and flag
    var1 : str
        name of secondary input var 1
    var2 : str
        name of secondary input var 2

    Returns
    -------
    df : pd.DataFrame
        df with synergistic flags applied, if applicable
    """
    synergistic_flag = 38  # see qaqc flag meanings

    # set up _eraqc variable for new derived variable
    derived_flag_col = var_to_flag + "_eraqc"
    df[derived_flag_col] = np.nan

    for input_var in (var1, var2):
        input_flag_col = input_var + "_eraqc"

        if input_flag_col not in df.columns:
            # input has no QA/QC column, so there is nothing to carry over from it
            print(
                f"{input_flag_col} not found, no flags carried over from {input_var}."
            )  # convert to logger when set-up
            continue

        # NaN > 0 is False, so unflagged rows are left untouched. No "more than one
        # unique value" pre-check: that would skip a column in which every row
        # carries the same flag.
        df.loc[df[input_flag_col] > 0, derived_flag_col] = synergistic_flag

    return df

## Derived variable calculations
def _calc_dewpointtemp(tas: pd.Series, hurs: pd.Series) -> pd.Series:
    """Calculates dew point temperature, method 1

    Parameters
    ----------
    tas : pd.Series
        air temperature, K
    hurs: pd.Series
        relative humidity, % or 0-100

    Returns
    -------
    tdps : pd.Series
        dewpoint temperature, K
    """
    es = 0.611 * np.exp(
        5423 * ((1 / 273) - (1 / tas))
    )  # calculates saturation vapor pressure
    e_vap = (
        es * hurs
    ) / 100.0  # calculates vapor pressure, IF NOT ALREADY OBSERVED -- will need ifelse statement
    tdps = (
        (1 / 273) - 0.0001844 * np.log(e_vap / 0.611)
    ) ** -1  # calculates dew point temperature, units = K
    return np.round(tdps, decimals=3)


def _calc_airtemp(hurs: pd.Series, tdps: pd.Series) -> pd.Series:
    """Calculate air temperature

    Parameters
    ----------
    hurs : pd.Series
        relative humidity, % or 0-100
    tdps : pd.Series
        dewpoint temperature, K

    Returns
    -------
    tas : pd.Series
        air temperature, K

    Notes
    ------
    [1] August-Roche-Magnus Approximation
    """

    # tdps must be in degC, not K for this equation
    tdps_degC = tdps - 273.15

    # apply approximation to calculate tas in degC
    tas_degC = (
        243.04
        * (((17.625 * tdps_degC) / (243.04 + tdps_degC)) - np.log(hurs / 100))
        / (17.625 + np.log(hurs / 100) - ((17.625 * tdps_degC) / (243.04 + tdps_degC)))
    )

    # convert back to K
    tas_K = tas_degC + 273.15

    return np.round(tas_K, decimals=3)


def _calc_relhumid(tas: pd.Series, tdps: pd.Series) -> pd.Series:
    """Calculate relative humidity

    Parameters
    ----------
    tas : pd.Series
        air temperature, K
    tdps : pd.Series
        dewpoint temperature, K

    Returns
    -------
    hurs : pd.Series
        relative humidity, % (0-100)
    """

    es = 0.611 * np.exp(
        5423 * ((1 / 273) - (1 / tas))
    )  # calculates saturation vapor pressure using air temp
    e_vap = 0.611 * np.exp(
        5423 * ((1 / 273) - (1 / tdps))
    )  # calculates vapor pressure using dew point temp
    hurs = 100 * (e_vap / es)
    return np.round(hurs, decimals=3)

In [53]:
# merge_derive_missing_vars returns None on failure, so check before unpacking;
# otherwise a failure surfaces as an unrelated "cannot unpack NoneType" TypeError.
derive_result = merge_derive_missing_vars(df, var_attrs)
if derive_result is None:
    raise RuntimeError(
        "merge_derive_missing_vars failed; see the exception message printed above."
    )

df2, new_attrs = derive_result
df2

Running merge_derive_missing_vars...
tdps_derived is present in station, no derivation necessary.
Calculating hurs_derived...
1
2
derived_relative_humidity percent
3
tas is present in station, no derivation necessary.


Unnamed: 0,station,time,anemometer_height_m,elevation,elevation_eraqc,lat,lon,pr,pr_depth_qc,pr_duration,...,sfcWind_qc,tas,tas_eraqc,tas_qc,tdps,tdps_eraqc,tdps_qc,thermometer_height_m,hurs_derived,hurs_derived_eraqc
0,ASOSAWOS_72681024131,1980-01-01 00:00:00,10.06,865.0,,43.567,-116.217,0.0,9.0,0 days 01:00:00,...,5,275.35,,5,272.05,,5,,78.749,
1,ASOSAWOS_72681024131,1980-01-01 01:00:00,10.06,865.0,,43.567,-116.217,0.0,9.0,0 days 01:00:00,...,5,274.85,,5,272.55,,5,,84.662,
2,ASOSAWOS_72681024131,1980-01-01 01:05:00,10.06,871.0,,43.567,-116.217,,,NaT,...,1,,,9,,,9,,,
3,ASOSAWOS_72681024131,1980-01-01 02:00:00,10.06,865.0,,43.567,-116.217,0.0,9.0,0 days 01:00:00,...,5,274.25,,5,272.55,,5,,88.397,
4,ASOSAWOS_72681024131,1980-01-01 03:00:00,10.06,865.0,,43.567,-116.217,0.0,9.0,0 days 01:00:00,...,5,274.85,,5,273.15,,5,,88.444,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470204,ASOSAWOS_72681024131,2022-08-31 19:53:00,10.06,874.0,,43.567,-116.240,0.0,9.0,0 days 01:00:00,...,5,310.95,,5,273.75,,5,,9.349,
470205,ASOSAWOS_72681024131,2022-08-31 20:53:00,10.06,874.0,,43.567,-116.240,0.0,9.0,0 days 01:00:00,...,5,313.15,,5,273.75,,5,,8.271,
470206,ASOSAWOS_72681024131,2022-08-31 21:53:00,10.06,874.0,,43.567,-116.240,0.0,9.0,0 days 01:00:00,...,5,313.15,,5,272.05,,5,,7.308,
470207,ASOSAWOS_72681024131,2022-08-31 22:53:00,10.06,874.0,,43.567,-116.240,0.0,9.0,0 days 01:00:00,...,5,313.75,,5,274.25,,5,,8.295,


In [54]:
# inspect the variable attributes returned by merge_derive_missing_vars
new_attrs

{'anemometer_height_m': {},
 'elevation': {'ancillary_variables': 'elevation_eraqc',
  'long_name': 'station_elevation',
  'positive': 'up',
  'standard_name': 'height_above_mean_sea_level',
  'units': 'meter'},
 'elevation_eraqc': {},
 'lat': {},
 'lon': {},
 'pr': {'ancillary_variables': 'pr_qc pr_depth_qc pr_duration, pr_eraqc',
  'comment': '',
  'long_name': 'precipitation_accumuation',
  'units': 'mm/?'},
 'pr_depth_qc': {'flag_meanings': 'See QA/QC csv for network.',
  'flag_values': '1 2 3 4 5 6 7 8 E I J 9'},
 'pr_duration': {'ancillary_variables': 'pr pr_qc pr_depth_qc',
  'long_name': 'precipitation measurement interval'},
 'pr_eraqc': {},
 'pr_qc': {'flag_meanings': 'See QA/QC csv for network.',
  'flag_values': '0 1 2 3 4 5 6 7 9 A I M P R U'},
 'ps': {'ancillary_variables': 'ps_qc ps_altimeter ps_altimeter_qc, ps_eraqc',
  'comment': 'Converted from hPa to Pa.',
  'long_name': 'station_air_pressure',
  'standard_name': 'air_pressure',
  'units': 'Pa'},
 'ps_altimeter': {'