# Find weather stations that set flags
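From the AWS data catalog of station data, find the subset of weather stations that set QA/QC flags during a given event period.
<br>Output: a DataFrame listing each flagged station's network, station ID, and the unique flag values it set.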

In [1]:
import xarray as xr 
import pandas as pd 
import numpy as np
import s3fs
import tempfile # Used for downloading (and then deleting) netcdfs to local drive from s3 bucket
import os
import time # Used for progress bar 
import sys # Used for progress bar 

# Silence runtime warnings
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [2]:
def progressbar(it, prefix="", size=60, out=sys.stdout): # Python3.6+
    """
    Yield the items of `it` while drawing a progress bar with a time estimate.

    Parameters
    ----------
    it: sized iterable
        Items to iterate over. Must support len(); may be empty.
    prefix: str, optional
        Text printed in front of the bar
    size: int, optional
        Width of the bar in characters
    out: file-like, optional
        Stream the bar is written to (default sys.stdout)

    Yields
    ------
    The items of `it`, in order

    Notes
    -----
    The bar is redrawn in place (carriage return) after each item has been
    processed by the consumer, and a newline is printed once iteration ends.

    References
    ----------
    https://stackoverflow.com/questions/3160699/python-progress-bar

    """
    count = len(it)
    start = time.time() # time estimate start

    def show(done):
        # Guard against an empty input: there is nothing to fill
        filled = int(size * done / count) if count else 0
        if done:
            # Average time per finished item, extrapolated to the remaining items
            remaining = ((time.time() - start) / done) * (count - done)
            mins, sec = divmod(remaining, 60) # limited to minutes
            time_str = f"{int(mins):02}:{sec:04.1f}"
        else:
            # No item finished yet, so there is no basis for an estimate
            time_str = "--:--"
        bar = "█" * filled + "." * (size - filled)
        print(f"{prefix}[{bar}] {done}/{count} Est wait {time_str}", end='\r', file=out, flush=True)

    show(0)
    for done, item in enumerate(it, start=1):
        yield item
        show(done)
    print("\n", flush=True, file=out)

In [3]:
# Scratch directory for netcdf files downloaded from s3
# If we used zarr, this wouldn't be necessary
temp_dir = "./tmp"
# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir
os.makedirs(temp_dir, exist_ok=True)

In [None]:
# Load the table of training weather stations from a local CSV
# Only the station identifiers are used downstream, to choose which stations to scan
# Ideally this file would live in the AWS bucket instead of a relative local path
station_list_csv = '../qaqc_training_station_list_events.csv'
train_stns = pd.read_csv(station_list_csv)
train_stns.head()

In [5]:
def read_nc_from_s3(network_name, station_id, temp_dir):
    """Read netcdf file containing station data for a single station of interest from AWS s3 bucket 

    Parameters
    ----------
    network_name: str 
        Name of network (i.e. "ASOSAWOS")
        Must correspond with a valid directory in the s3 bucket (i.e. "CAHYDRO", "CDEC", "ASOSAWOS")
    station_id: str
        Station identifier; i.e. the name of the netcdf file in the bucket without
        its extension (i.e. "ASOSAWOS_72012200114"). The ".nc" suffix is appended here.
    temp_dir: str
        Existing local directory in which the temporary download file is created
    
    Returns 
    -------
    station_data: xr.Dataset 
        Station data, fully loaded into memory
    
    Notes
    -----
    The data is first downloaded from AWS into a tempfile, which is deleted once xarray has read the file,
    including when the download or the read fails. 
    I'd like to see us use a zarr workflow if possible to avoid this. 

    """

    # Create s3 file system 
    s3 = s3fs.S3FileSystem(anon=False)

    # Get URL to netcdf in S3
    s3_url = 's3://wecc-historical-wx/3_qaqc_wx_dev/{}/{}.nc'.format(network_name, station_id)

    # Temp file for downloading from s3
    # The context manager closes (and thereby deletes) the file even if an error is raised
    with tempfile.NamedTemporaryFile(
        dir = temp_dir, 
        prefix = "", 
        suffix = ".nc",
        delete = True
    ) as temp_file:
        # Download the netcdf over the temp file's path
        s3.get(s3_url, temp_file.name)

        # Read everything into memory, then release xarray's handle on the file
        # so nothing points at the temp file once it is deleted
        with xr.open_dataset(temp_file.name, engine='h5netcdf') as ds:
            station_data = ds.load()

    return station_data 

In [6]:
def find_flags(station_ds): 
    """Find unique flags in a Dataset.
    Filters through flag variables; assumes flag variables contain the substring '_eraqc'

    Parameters
    ----------
    station_ds: xr.Dataset 
        Dataset containing station data, with each variable as a unique data variable 
    
    Returns 
    -------
    unique_flags: list of int or None
        Sorted list of unique flag values found in station_ds 
        Returns None if no flags found for any variable, including when
        station_ds has no flag variables at all 
    
    """

    # Get the string names of the eraqc flag variables 
    era_flag_var_names = [var for var in station_ds.data_vars if '_eraqc' in str(var)]

    # No flag variables means no flags; return early rather than
    # calling to_array() on an empty selection
    if not era_flag_var_names: 
        return None

    # Flatten every flag variable into a single 1D array, since we don't care
    # which variable (or which dimension) the flags belong to 
    flag_values = station_ds[era_flag_var_names].to_array().values.ravel()

    # Drop all null values, keeping only the flags that were actually set 
    # i.e. [nan, nan, 23, nan, 17] --> [23, 17]
    flag_values = flag_values[~pd.isnull(flag_values)]

    # If everything was null, the station set no flags for this event! 
    if flag_values.size == 0: 
        return None

    # Get unique flag values as plain Python integers
    # Flags are stored as floats because nan is a float; astype(int) converts them back
    # i.e. [23.0, 23.0, 23.0, 17.0, 23.0, 19.0, 19.0] --> [17, 19, 23]
    unique_flags = np.unique(flag_values).astype(int).tolist()

    return unique_flags

In [None]:
# Time window of the event of interest (passed to a time slice below)
event_start_date = "2007-10-20"
event_end_date = "2007-10-24"

# Only scan the first few stations of the training list
stn_subset = train_stns[:5]

# Column-wise record of every station that set at least one flag
stations_with_flags = {"network":[],"era-id":[],"flags":[]}

# Pair up the network and station ID of each station 
# A list is needed because progressbar calls len() on its input
station_pairs = list(zip(stn_subset["network"], stn_subset["era-id"]))

# Visit each station and look for flags 
for network_name, station_id in progressbar(station_pairs):
    # Download the station from AWS, drop singleton dimensions
    # ("station" is a singleton dimension), and keep only the event time period 
    station_ds = (
        read_nc_from_s3(
            network_name=network_name, 
            station_id=station_id, 
            temp_dir=temp_dir
        )
        .squeeze()
        .sel(time=slice(event_start_date, event_end_date))
    )

    unique_flags = find_flags(station_ds)

    # Stations that set no flags are left out of the results 
    if unique_flags is None: 
        continue

    stations_with_flags["network"].append(network_name) 
    stations_with_flags["era-id"].append(station_id)
    stations_with_flags["flags"].append(unique_flags)

In [None]:
# Show the flagged stations as a table: one row per station, one column per dict key
pd.DataFrame.from_dict(stations_with_flags)