## Setup

In [1]:


# necessary for using for loop ending at last day of month
import calendar

import boto3
import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO
import scipy.stats as stats

import s3fs
import tempfile  # Used for downloading (and then deleting) netcdfs to local drive from s3 bucket
import os
from shapely.geometry import Point

# Silence warnings
import warnings
from shapely.errors import ShapelyDeprecationWarning

# New logger function
from log_config import logger

warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings(
    "ignore", category=ShapelyDeprecationWarning
)  # Warning is raised when creating Point object from coords. Can't figure out why.

In [2]:
# -----------------------------------------------------------------------------
## Set AWS credentials
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")  # for lower-level processes

# -----------------------------------------------------------------------------
## Set relative paths to other folders and objects in repository.
bucket_name = "wecc-historical-wx"
wecc_terr = (
    "s3://wecc-historical-wx/0_maps/WECC_Informational_MarineCoastal_Boundary_land.shp"
)
wecc_mar = "s3://wecc-historical-wx/0_maps/WECC_Informational_MarineCoastal_Boundary_marine.shp"
# Define temporary directory in local drive for downloading data from S3 bucket
# If the directory doesn't exist, it will be created
# If we used zarr, this wouldn't be neccessary
temp_dir = "./tmp"
if not os.path.exists(temp_dir):
    os.mkdir(temp_dir)
def open_log_file_merge(file):
    global log_file
    log_file = file
def read_nc_from_s3(network_name, station_id, temp_dir):
    """Read netcdf file containing station data for a single station of interest from AWS s3 bucket

    Parameters
    ----------
    network_name: str
        Name of network (i.e. "ASOSAWOS")
        Must correspond with a valid directory in the s3 bucket (i.e. "CAHYDRO", "CDEC", "ASOSAWOS")
    station_id: str
        Station identifier; i.e. the name of the netcdf file in the bucket (i.e. "ASOSAWOS_72012200114.nc")

    Returns
    -------
    station_data: xr.Dataset

    Notes
    -----
    The data is first downloaded from AWS into a tempfile, which is then deleted after xarray reads in the file
    I'd like to see us use a zarr workflow if possible to avoid this.

    """
# -----------------------------------------------------------------------------
    # Temp file for downloading from s3
    temp_file = tempfile.NamedTemporaryFile(
        dir=temp_dir, prefix="", suffix=".nc", delete=True
    )
# -----------------------------------------------------------------------------
    # Create s3 file system
    s3 = s3fs.S3FileSystem(anon=False)

    # Get URL to netcdf in S3
    s3_url = "s3://wecc-historical-wx/2_clean_wx/{}/{}.nc".format(
        network_name, station_id
    )

    # Read in the data using xarray
    s3_file_obj = s3.get(s3_url, temp_file.name)
    station_data = xr.open_dataset(temp_file.name, engine="h5netcdf").load()

    # Close temporary file
    temp_file.close()

    return station_data
# -----------------------------------------------------------------------------
def qaqc_ds_to_df(ds, verbose=False):
    ## Add qc_flag variable for all variables, including elevation;
    ## defaulting to nan for fill value that will be replaced with qc flag

    for key, val in ds.variables.items():
        if val.dtype == object:
            if key == "station":
                if str in [type(v) for v in ds[key].values]:
                    ds[key] = ds[key].astype(str)
            else:
                if str in [type(v) for v in ds.isel(station=0)[key].values]:
                    ds[key] = ds[key].astype(str)

    exclude_qaqc = [
        "time",
        "station",
        "lat",
        "lon",
        "qaqc_process",
        "sfcWind_method",
        "pr_duration",
        "pr_depth",
        "PREC_flag",
        "rsds_duration",
        "rsds_flag",
        "anemometer_height_m",
        "thermometer_height_m",
    ]  # lat, lon have different qc check

    raw_qc_vars = []  # qc_variable for each data variable, will vary station to station
    era_qc_vars = []  # our ERA qc variable
    old_era_qc_vars = []  # our ERA qc variable

    for var in ds.data_vars:
        if "q_code" in var:
            raw_qc_vars.append(
                var
            )  # raw qc variable, need to keep for comparison, then drop
        if "_qc" in var:
            raw_qc_vars.append(
                var
            )  # raw qc variables, need to keep for comparison, then drop
        if "_eraqc" in var:
            era_qc_vars.append(
                var
            )  # raw qc variables, need to keep for comparison, then drop
            old_era_qc_vars.append(var)

    print(f"era_qc existing variables:\n{era_qc_vars}")
    n_qc = len(era_qc_vars)

    for var in ds.data_vars:
        if var not in exclude_qaqc and var not in raw_qc_vars and "_eraqc" not in var:
            qc_var = var + "_eraqc"  # variable/column label

            # if qaqc var does not exist, adds new variable in shape of original variable with designated nan fill value
            if qc_var not in era_qc_vars:
                print(f"nans created for {qc_var}")
                ds = ds.assign({qc_var: xr.ones_like(ds[var]) * np.nan})
                era_qc_vars.append(qc_var)

    print("{} created era_qc variables".format(len(era_qc_vars) - len(old_era_qc_vars)))
    if len(era_qc_vars) != n_qc:
        print("{}".format(np.setdiff1d(old_era_qc_vars, era_qc_vars)))

    # Save attributes to inheret them to the QAQC'ed file
    attrs = ds.attrs
    var_attrs = {var: ds[var].attrs for var in list(ds.data_vars.keys())}

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        df = ds.to_dataframe()

    # instrumentation heights
    if "anemometer_height_m" not in df.columns:
        try:
            df["anemometer_height_m"] = (
                np.ones(ds["time"].shape) * ds.anemometer_height_m
            )
        except:
            print("Filling anemometer_height_m with NaN.", flush=True)
            df["anemometer_height_m"] = np.ones(len(df)) * np.nan
        finally:
            pass
    if "thermometer_height_m" not in df.columns:
        try:
            df["thermometer_height_m"] = (
                np.ones(ds["time"].shape) * ds.thermometer_height_m
            )
        except:
            print("Filling thermometer_height_m with NaN.", flush=True)
            df["thermometer_height_m"] = np.ones(len(df)) * np.nan
        finally:
            pass

    # De-duplicate time axis
    df = df[~df.index.duplicated()].sort_index()

    # Save station/time multiindex
    MultiIndex = df.index
    station = df.index.get_level_values(0)
    df["station"] = station

    # Station pd.Series to str
    station = station.unique().values[0]

    # Convert time/station index to columns and reset index
    df = df.droplevel(0).reset_index()

    # Add time variables needed by multiple functions
    df["hour"] = pd.to_datetime(df["time"]).dt.hour
    df["day"] = pd.to_datetime(df["time"]).dt.day
    df["month"] = pd.to_datetime(df["time"]).dt.month
    df["year"] = pd.to_datetime(df["time"]).dt.year
    df["date"] = pd.to_datetime(df["time"]).dt.date

    return df  # , MultiIndex, attrs, var_attrs, era_qc_vars

# -----------------------------------------------------------------------------
def printf(*args, verbose=True, log_file=None, **kwargs):
    import datetime

    tLog = lambda: datetime.datetime.utcnow().strftime("%m-%d-%Y %H:%M:%S") + " : \t"
    args = [str(a) for a in args]

    if verbose:
        if log_file is not None:
            print(" ".join([tLog(), *args]), **kwargs) or print(
                " ".join([tLog(), *args]), file=log_file, **kwargs
            )
        else:
            print(" ".join([tLog(), *args]), **kwargs)
    else:
        if log_file is not None:
            print(" ".join([tLog(), *args]), file=log_file, **kwargs)
        else:
            pass

## Load Data

In [184]:
# load in single dc file from AWS
ds = read_nc_from_s3("ASOSAWOS", "ASOSAWOS_72494523293", temp_dir)
# ["ASOSAWOS_74948400395", "ASOSAWOS_74509023244", "ASOSAWOS_72494523293"]

# convert to formatted pandas dataframe
df = qaqc_ds_to_df(ds, verbose=False)


era_qc existing variables:
[]
nans created for ps_eraqc
nans created for tas_eraqc
nans created for tdps_eraqc
nans created for pr_eraqc
nans created for sfcWind_eraqc
nans created for sfcWind_dir_eraqc
nans created for elevation_eraqc
nans created for ps_altimeter_eraqc
nans created for psl_eraqc
9 created era_qc variables
[]


In [86]:
df.columns

Index(['time', 'ps', 'tas', 'tdps', 'pr', 'sfcWind', 'sfcWind_dir',
       'elevation', 'qaqc_process', 'ps_qc', 'ps_altimeter', 'ps_altimeter_qc',
       'psl', 'psl_qc', 'tas_qc', 'tdps_qc', 'pr_qc', 'pr_duration',
       'pr_depth_qc', 'sfcWind_qc', 'sfcWind_method', 'sfcWind_dir_qc', 'lat',
       'lon', 'ps_eraqc', 'tas_eraqc', 'tdps_eraqc', 'pr_eraqc',
       'sfcWind_eraqc', 'sfcWind_dir_eraqc', 'elevation_eraqc',
       'ps_altimeter_eraqc', 'psl_eraqc', 'anemometer_height_m',
       'thermometer_height_m', 'station', 'hour', 'day', 'month', 'year',
       'date'],
      dtype='object')

In [22]:
# keep only the precipitation column(s) and time
keep_vars = ['time','pr','pr_5min','pr_1h']
df_sum = df[[col for col in keep_vars if col in df.columns]]

In [157]:
# sum daily precipitation values
daily_sum = df_sum.resample("1D", on="time").sum()
print(daily_sum.head())

             pr
time           
1980-01-01  0.0
1980-01-02  0.0
1980-01-03  0.0
1980-01-04  0.0
1980-01-05  0.0


In [158]:
# TODO: will need to define these columns in the function

daily_sum['year'] = daily_sum.index.year
daily_sum['month'] = daily_sum.index.month
daily_sum['day'] = daily_sum.index.day
print(daily_sum.head())

             pr  year  month  day
time                             
1980-01-01  0.0  1980      1    1
1980-01-02  0.0  1980      1    2
1980-01-03  0.0  1980      1    3
1980-01-04  0.0  1980      1    4
1980-01-05  0.0  1980      1    5


In [159]:
df_year = daily_sum.loc[daily_sum["year"] == 2016] 
print(df_year)

              pr  year  month  day
time                              
2016-01-01   0.0  2016      1    1
2016-01-02   0.0  2016      1    2
2016-01-03   0.0  2016      1    3
2016-01-04   0.3  2016      1    4
2016-01-05  28.4  2016      1    5
...          ...   ...    ...  ...
2016-12-27   0.0  2016     12   27
2016-12-28   0.0  2016     12   28
2016-12-29   0.0  2016     12   29
2016-12-30   0.0  2016     12   30
2016-12-31   0.0  2016     12   31

[366 rows x 4 columns]


In [169]:
month = 1
monthly_df = df_year.loc[df_year["month"] == month]
#print(monthly_df)


In [170]:
current_day = monthly_df.loc[monthly_df['day'] == 18]
current_obs = current_day['pr']
print(monthly_df)
#print(current_day)
#print(current_obs)

              pr  year  month  day
time                              
2016-01-01   0.0  2016      1    1
2016-01-02   0.0  2016      1    2
2016-01-03   0.0  2016      1    3
2016-01-04   0.3  2016      1    4
2016-01-05  28.4  2016      1    5
2016-01-06  48.1  2016      1    6
2016-01-07  18.4  2016      1    7
2016-01-08   0.8  2016      1    8
2016-01-09   0.3  2016      1    9
2016-01-10   0.3  2016      1   10
2016-01-11   0.6  2016      1   11
2016-01-12   0.0  2016      1   12
2016-01-13   2.8  2016      1   13
2016-01-14   2.3  2016      1   14
2016-01-15   0.3  2016      1   15
2016-01-16  13.2  2016      1   16
2016-01-17   3.0  2016      1   17
2016-01-18  62.7  2016      1   18
2016-01-19  29.4  2016      1   19
2016-01-20  11.2  2016      1   20
2016-01-21   0.0  2016      1   21
2016-01-22  19.5  2016      1   22
2016-01-23  22.3  2016      1   23
2016-01-24   1.5  2016      1   24
2016-01-25   0.0  2016      1   25
2016-01-26   0.0  2016      1   26
2016-01-27   0.0  20

In [181]:
diff_max = monthly_df['pr'].apply(lambda row: current_obs - row).max()
print(diff_max > 60)

time
2016-01-18    True
Freq: D, dtype: bool


In [216]:
def gap_check(df, vars_to_check, year, threshold, plot=True, verbose=False, local=False):
    """
    gap check
        - compare all precipitation obs in a single month, all years
        - sums observations to daily timestep, then checks each daily sum to every other sum in that month
        - flags days on which the sum is 300m more than any other daily observation in that month
    Goal: flags precipitation values that are at least 300 mm larger than all other precipitation totals 
          for a given station and calendar month.

    Input:
    ------
        df [pd.DataFrame]: station dataset converted to dataframe through QAQC pipeline
        vars_to_check [list]: list of variables to run test on

    Output:
    -------
        df [pd.DataFrame]: QAQC dataframe with flagged values (see below for flag meaning)

    Notes:
    ------
    PRELIMINARY: Thresholds/decisions may change with refinement.
    """
    ### Filter df to precipitation variables and sum daily observations
    # TODO: when is year specified in frequent gaps check?

    df = df.loc[df["year"] == year] 
    df_daily_sum = df.resample("1D", on="time").sum()

    ### For each variable, in each month, compare each daily sum to every other daily sum
    for var in vars_to_check:


        # if len(df) == 0 or df[var].isnull().all() == True:
        #     print('no precip in {}/{}'.format(month,year))
        # continue  # variable has no valid data

        df_var = df_daily_sum[var]

        for month in range(1, 13):
            # Select month data
            monthly_df = df_var.loc[df_var.index.month == month]
            print(monthly_df)

            # Now to iterate over each day in the current month
            end_day = calendar.monthrange(year, month)[1]

            for day in range(1,end_day):
                #print('Compare each day sum to every other day sum in a given month.')

                current_obs = monthly_df.loc[monthly_df.index.day == day]
                #print(monthly_df)
        
                diff = monthly_df.apply(lambda row: current_obs - row)#.max

                # flag = diff > threshold

    return diff

In [215]:
precip_vars = ['pr'] #,'pr_5min','pr_15min','pr_1h']
year = 2016 
threshold = 60 

output = gap_check(df ,precip_vars, year, threshold)

time
2016-01-01     0.0
2016-01-02     0.0
2016-01-03     0.0
2016-01-04     0.3
2016-01-05    28.4
2016-01-06    48.1
2016-01-07    18.4
2016-01-08     0.8
2016-01-09     0.3
2016-01-10     0.3
2016-01-11     0.6
2016-01-12     0.0
2016-01-13     2.8
2016-01-14     2.3
2016-01-15     0.3
2016-01-16    13.2
2016-01-17     3.0
2016-01-18    62.7
2016-01-19    29.4
2016-01-20    11.2
2016-01-21     0.0
2016-01-22    19.5
2016-01-23    22.3
2016-01-24     1.5
2016-01-25     0.0
2016-01-26     0.0
2016-01-27     0.0
2016-01-28     0.0
2016-01-29     0.0
2016-01-30     5.7
2016-01-31     3.4
Freq: D, Name: pr, dtype: float64


  df_daily_sum = df.resample("1D", on="time").sum()


KeyError: 'pr'

In [None]:
                # Index to flag finds where df_month is out of the distribution
                # index_to_flag = (df_month < low) | (df_month > high)

                # # Since grouping, the index of df_month is years
                # years_to_flag = df_month[index_to_flag].index

                # flag all obs in that day
                # bad = np.logical_and(df["month"] == month, df["year"].isin(years_to_flag))
                # df.loc[bad, var + "_eraqc"] = 21  