# Remove unnecesary variables


## Environment set-up

In [None]:
import boto3
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt

# Silence warnings
import warnings
from shapely.errors import ShapelyDeprecationWarning

warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings(
    "ignore", category=ShapelyDeprecationWarning
)  # Warning is raised when creating Point object from coords. Can't figure out why.

plt.rcParams["figure.dpi"] = 300

In [None]:
# AWS credentials
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")

## AWS buckets
bucket = "wecc-historical-wx"
cleandir = "2_clean_wx/"
qaqcdir = "3_qaqc_wx/"
mergedir = "4_merge_wx/"

In [None]:
def merge_ds_to_df(ds):
    """Converts xarray ds for a station to pandas df in the format needed for processing.

    Parameters
    ----------
    ds: xr.Dataset
        Data object with information about each network and station
    verbose: boolean
        Flag as to whether to print runtime statements to terminal. Default is False. Set in ALLNETWORKS_merge.py run.

    Returns
    -------
    df: pd.DataFrame
        Table object with information about each network and station
    MultiIndex: pd.DataFrame (I think)
        Original multi-index of station and time, to be used on conversion back to ds
    attrs:
        Save ds attributes to inherent to the final merged file
    var_attrs:
        Save variable attributes to inherent to the final merged file
    """

    # Save attributes to inherent them to the final merged file
    attrs = ds.attrs
    var_attrs = {var: ds[var].attrs for var in list(ds.data_vars.keys())}

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        df = ds.to_dataframe()

    # Save instrumentation heights
    if "anemometer_height_m" not in df.columns:
        try:
            df["anemometer_height_m"] = (
                np.ones(ds["time"].shape) * ds.anemometer_height_m
            )
        except:
            logger.info("Filling anemometer_height_m with NaN.")
            df["anemometer_height_m"] = np.ones(len(df)) * np.nan
        finally:
            pass
    if "thermometer_height_m" not in df.columns:
        try:
            df["thermometer_height_m"] = (
                np.ones(ds["time"].shape) * ds.thermometer_height_m
            )
        except:
            logger.info("Filling thermometer_height_m with NaN.")
            df["thermometer_height_m"] = np.ones(len(df)) * np.nan
        finally:
            pass

    # De-duplicate time axis
    df = df[~df.index.duplicated()].sort_index()

    # Save station/time multiindex
    MultiIndex = df.index
    station = df.index.get_level_values(0)
    df["station"] = station

    # Station pd.Series to str
    station = station.unique().values[0]

    # Convert time/station index to columns and reset index
    df = df.droplevel(0).reset_index()

    return df, MultiIndex, attrs, var_attrs

## Testing

### Load in datasets for testing

In [None]:
url = "s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_69007093217.zarr"
ds_a = xr.open_zarr(url)

In [None]:
url = "s3://wecc-historical-wx/3_qaqc_wx/MARITIME/MARITIME_ANVC1.zarr"
ds_m = xr.open_zarr(url)

In [None]:
url = "s3://wecc-historical-wx/3_qaqc_wx/VALLEYWATER/VALLEYWATER_6001.zarr"
ds_v = xr.open_zarr(url)

In [29]:
df, MultiIndex, attrs, var_attrs = merge_ds_to_df(ds_a)

### Define function

In [None]:
def delete_vars(df: pd.DataFrame, var_attrs: dict) -> tuple[pd.DataFrame, dict]:
    """
    Keep “_eraqc” vars and drop the following variables
    
        - qaqc_process
        - pr_duration
        - pr_depth
        - PREC_flag
        - rsds_duration
        - rsds_flag
        - q_code
        - any "_qc" or "method "variable

    Parameters
    ------
    df: pd.DataFrame
        station data
    var_attrs: dict
        variable attributes

    Returns
    -------
    if success:
        df: pd.DataFrame
        var_attrs: dict

    if failure:
        None
    """
    drop_vars_keywords = [
        "qaqc_process",
        "pr_duration",
        "pr_depth",
        "PREC_flag",
        "rsds_duration",
        "rsds_flag",
        "_qc",
        "method"
    ]

    # Select variables that contain the keywords defined above
    drop_vars = [
        i for keyword in drop_vars_keywords for i in df.columns if keyword in i
    ]

    # Remove those variables
    df = df.drop(columns=drop_vars)

    # Remove the attributes of the dropped variables
    for key in drop_vars_keywords:
        if key in var_attrs:
            del var_attrs[key]

    return df, var_attrs

### Test

In [33]:
df_test, attrs_test = delete_vars(df,var_attrs)

In [34]:
df_test.columns

Index(['time', 'anemometer_height_m', 'elevation', 'elevation_eraqc', 'lat',
       'lon', 'pr', 'pr_eraqc', 'ps_altimeter', 'ps_altimeter_eraqc', 'psl',
       'psl_eraqc', 'sfcWind', 'sfcWind_dir', 'sfcWind_dir_eraqc',
       'sfcWind_eraqc', 'tas', 'tas_eraqc', 'tdps', 'tdps_eraqc',
       'thermometer_height_m', 'station'],
      dtype='object')

In [35]:
attrs_test

{'anemometer_height_m': {},
 'elevation': {'ancillary_variables': 'elevation_eraqc',
  'long_name': 'station_elevation',
  'positive': 'up',
  'standard_name': 'height_above_mean_sea_level',
  'units': 'meter'},
 'elevation_eraqc': {},
 'lat': {},
 'lon': {},
 'pr': {'ancillary_variables': 'pr_qc pr_depth_qc pr_duration, pr_eraqc',
  'comment': '',
  'long_name': 'precipitation_accumuation',
  'units': 'mm/?'},
 'pr_depth_qc': {'flag_meanings': 'See QA/QC csv for network.',
  'flag_values': '1 2 3 4 5 6 7 8 E I J 9'},
 'pr_eraqc': {},
 'pr_qc': {'flag_meanings': 'See QA/QC csv for network.',
  'flag_values': '0 1 2 3 4 5 6 7 9 A I M P R U'},
 'ps_altimeter': {'ancillary_variables': 'ps ps_qc ps_altimeter ps_altimeter_qc, ps_altimeter_eraqc',
  'comment': 'Converted from hPa to Pa. The pressure value to which an aircraft altimeter is set so that it will indicate the altitude relative to mean sea level of an aircraft on the ground at the location for which the value was determined.',
  '