# Updating attributes


## Environment set-up

In [None]:
import datetime
import boto3
import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO

from merge_log_config import logger

# Silence warnings
import warnings
from shapely.errors import ShapelyDeprecationWarning

# Silence noisy warning categories for the whole session.
# ShapelyDeprecationWarning is raised when creating a Point object from coords;
# the root cause has not been identified.
for _ignored_category in (RuntimeWarning, ShapelyDeprecationWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Render figures at high resolution
plt.rcParams.update({"figure.dpi": 300})

In [6]:
## AWS buckets
bucket = "wecc-historical-wx"
qaqcdir, mergedir = "3_qaqc_wx/", "4_merge_wx/"

# S3 access handles: a boto3 resource and a boto3 client
s3, s3_cl = boto3.resource("s3"), boto3.client("s3")

## Investigate

In [10]:
def _fill_instrument_height(df, ds, name):
    """Add column `name` to df from the same-named attribute of ds, or NaN if that fails."""
    if name in df.columns:
        return

    try:
        # Broadcast the scalar attribute along the time axis
        df[name] = np.ones(ds["time"].shape) * getattr(ds, name)
    except Exception:
        # Deliberate best-effort: a missing or unusable attribute must not
        # stop the conversion. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt / SystemExit propagate.
        logger.info(f"Filling {name} with NaN.")
        df[name] = np.full(len(df), np.nan)


def merge_ds_to_df(ds):
    """Converts xarray ds for a station to pandas df in the format needed for processing.

    Parameters
    ----------
    ds: xr.Dataset
        Data object with information about each network and station

    Returns
    -------
    df: pd.DataFrame
        Table object with information about each network and station. The
        first index level is dropped and stored in a "station" column; the
        remaining index level(s) are reset into regular columns.
    MultiIndex: pd.MultiIndex
        De-duplicated, sorted index of the converted table (station and time),
        to be used on conversion back to ds
    attrs: dict
        ds attributes, saved so the final merged file can inherit them
    var_attrs: dict
        Variable attributes keyed by variable name, saved so the final merged
        file can inherit them

    Notes
    -----
    `attrs` and the values of `var_attrs` are taken directly from `ds` without
    copying. NOTE(review): they appear to be live references, so editing them
    in place may also change `ds` -- confirm before relying on either behavior.
    """

    # Save attributes so the final merged file can inherit them
    attrs = ds.attrs
    var_attrs = {var: ds[var].attrs for var in ds.data_vars}

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        df = ds.to_dataframe()

    # Save instrumentation heights (only when not already present as columns)
    for height_name in ("anemometer_height_m", "thermometer_height_m"):
        _fill_instrument_height(df, ds, height_name)

    # De-duplicate time axis (first occurrence wins) and sort
    df = df[~df.index.duplicated()].sort_index()

    # Save station/time multiindex
    multi_index = df.index
    df["station"] = df.index.get_level_values(0)

    # Convert time/station index to columns and reset index
    df = df.droplevel(0).reset_index()

    return df, multi_index, attrs, var_attrs

In [40]:
# Build the store path from the bucket settings defined earlier in this
# notebook (`bucket`, `qaqcdir`) so it stays in step with them.
url = f"s3://{bucket}/{qaqcdir}VALLEYWATER/VALLEYWATER_6001.zarr"
# Alternative station:
# url = f"s3://{bucket}/{qaqcdir}ASOSAWOS/ASOSAWOS_69007093217.zarr"
ds = xr.open_zarr(url)

In [41]:
# Convert the station dataset to a DataFrame, keeping the index and attributes it returns
df, MultiIndex, attrs, var_attrs = merge_ds_to_df(ds)

In [42]:
# Inspect the per-variable attributes returned by merge_ds_to_df
var_attrs

{'anemometer_height_m': {},
 'elevation': {'ancillary_variables': 'elevation_eraqc',
  'long_name': 'station_elevation',
  'positive': 'up',
  'standard_name': 'height_above_mean_sea_level',
  'units': 'meter'},
 'elevation_eraqc': {},
 'lat': {},
 'lon': {},
 'pr_15min': {'ancillary_variables': 'pr_15min_eraqc',
  'comment': 'Precipitation accumulated in previous 15 minutes.',
  'long_name': '15_minute_precipitation_amount',
  'units': 'mm/15min'},
 'pr_15min_eraqc': {},
 'raw_qc': {},
 'thermometer_height_m': {}}

In [45]:
# Scratch check: add a key to one variable's attribute dictionary in place
var_attrs["pr_15min"].update({"name": "love"})

In [46]:
# Re-inspect the attributes after the in-place edit
var_attrs

{'anemometer_height_m': {},
 'elevation': {'ancillary_variables': 'elevation_eraqc',
  'long_name': 'station_elevation',
  'positive': 'up',
  'standard_name': 'height_above_mean_sea_level',
  'units': 'meter'},
 'elevation_eraqc': {},
 'lat': {},
 'lon': {},
 'pr_15min': {'ancillary_variables': 'pr_15min_eraqc',
  'comment': 'Precipitation accumulated in previous 15 minutes.',
  'long_name': '15_minute_precipitation_amount',
  'units': 'mm/15min',
  'name': 'love'},
 'pr_15min_eraqc': {},
 'raw_qc': {},
 'thermometer_height_m': {}}

In [43]:
# Columns whose name contains "min" but not "qc"
sub_hourly_vars = [
    column
    for column in df.columns
    if "min" in column and "qc" not in column
]

In [44]:
# Show which columns were selected
sub_hourly_vars

['pr_15min']

In [51]:
# Record on each selected variable that its timestep changed while its name did not
for var in sub_hourly_vars:
    var_attrs[var].update(
        new_comment="{} has been standardized to an hourly timestep, but will retain its original name".format(var)
    )

In [52]:
# Confirm the new_comment entry is present on the sub-hourly variables
var_attrs

{'anemometer_height_m': {},
 'elevation': {'ancillary_variables': 'elevation_eraqc',
  'long_name': 'station_elevation',
  'positive': 'up',
  'standard_name': 'height_above_mean_sea_level',
  'units': 'meter'},
 'elevation_eraqc': {},
 'lat': {},
 'lon': {},
 'pr_15min': {'ancillary_variables': 'pr_15min_eraqc',
  'comment': 'Precipitation accumulated in previous 15 minutes.',
  'long_name': '15_minute_precipitation_amount',
  'units': 'mm/15min',
  'name': 'love',
  'new_comment': 'pr_15min has been standardized to an hourly timestep, but will retain its original name'},
 'pr_15min_eraqc': {},
 'raw_qc': {},
 'thermometer_height_m': {}}