# Flag summary development

## Environment set-up

In [1]:
import boto3
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt

# New logger function
from merge_log_config import logger

# Silence warnings
import warnings
from shapely.errors import ShapelyDeprecationWarning

# Ignore every RuntimeWarning raised from here on in this session
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings(
    "ignore", category=ShapelyDeprecationWarning
)  # Raised when creating a Point object from coords; root cause not yet identified.

# Render matplotlib figures at 300 dpi
plt.rcParams["figure.dpi"] = 300

In [2]:
# boto3 S3 handles: high-level resource and low-level client
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")

# S3 bucket name and the key prefixes used inside it
bucket = "wecc-historical-wx"
cleandir = "2_clean_wx/"
qaqcdir = "3_qaqc_wx/"
mergedir = "4_merge_wx/"

In [3]:
def _fill_instrument_height(df, ds, name):
    """Add column `name` to df from the dataset attribute of the same name, or NaN if that fails."""
    if name in df.columns:
        return
    try:
        df[name] = np.ones(ds["time"].shape) * getattr(ds, name)
    except Exception:
        # Deliberate best-effort: a missing attribute, a non-numeric value or a
        # length mismatch all fall back to NaN. `Exception` (rather than a bare
        # `except`) lets KeyboardInterrupt / SystemExit propagate.
        logger.info("Filling {} with NaN.".format(name))
        df[name] = np.full(len(df), np.nan)


def merge_ds_to_df(ds: "xr.Dataset"):
    """Converts xarray ds for a station to pandas df in the format needed for processing.

    Parameters
    ----------
    ds: xr.Dataset
        Data object with information about each network and station

    Returns
    -------
    df: pd.DataFrame
        Table object with information about each network and station.
        The first index level is copied into a "station" column and then
        dropped; the remaining index is reset into columns.
    MultiIndex: pd.Index
        Original (de-duplicated, sorted) index of the dataframe, to be used on
        conversion back to ds. Level 0 is treated as the station level.
    attrs: dict
        ds attributes, saved so the final merged file can inherit them
    var_attrs: dict
        Per-variable attributes keyed by variable name, saved so the final
        merged file can inherit them
    """

    # Save attributes so the final merged file can inherit them
    attrs = ds.attrs
    var_attrs = {var: ds[var].attrs for var in ds.data_vars}

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        df = ds.to_dataframe()

    # Save instrumentation heights
    for height_var in ("anemometer_height_m", "thermometer_height_m"):
        _fill_instrument_height(df, ds, height_var)

    # De-duplicate the index (first occurrence wins) and sort it
    df = df[~df.index.duplicated()].sort_index()

    # Save station/time multiindex
    MultiIndex = df.index
    df["station"] = df.index.get_level_values(0)

    # Drop the station level and turn the remaining index into columns
    df = df.droplevel(0).reset_index()

    return df, MultiIndex, attrs, var_attrs

## Function

### Native timestep

In [None]:
    # NOTE(review): this cell contains an indented function body without its
    # `def` line, so it cannot run as written. `df` is presumably the station
    # dataframe -- confirm before reuse.
    logger.info("Running: succes_report_stats")

    # identify _eraqc variables
    eraqc_vars = [var for var in df.columns if "_eraqc" in var]

    # number of total observations in station
    logger.info("Total observations: {}".format(len(df)))

    # per-variable report: unique flag values, then share of non-null flag entries

    for var in eraqc_vars:
        # list of unique flags
        logger.info(
            "Flags set on {}: {}".format(var, df[var].unique()),
        ) 
        # % of observations flagged per variable (rows where the flag column is not null)
        logger.info(
            "Coverage of {} obs flagged: {} of {} obs ({}%)".format(
                var,
                len(df.loc[(df[var].isnull() == False)]),
                len(df),
                round((len(df.loc[(df[var].isnull() == False)]) / len(df)) * 100, 3),
            )
        )  


In [None]:
def flag_summary_native(df: pd.DataFrame, network: str, station: str) -> pd.DataFrame:
    """
    Counts how often each QAQC flag value is set on each `_eraqc` variable.

    Parameters
    ----------
    df : pd.DataFrame
        station dataset converted to dataframe through QAQC pipeline
    network : str
        network name; currently unused, reserved for the output file key
    station : str
        station name; currently unused, reserved for the output file key

    Returns
    -------
    pd.DataFrame
        One row per column of df whose name contains "_eraqc", one column per
        flag value (integers 1 through 37). Each cell is the number of rows in
        df where that variable equals that flag value. Has zero rows when df
        has no `_eraqc` columns.
    """
    # identify _eraqc variables
    eraqc_vars = [var for var in df.columns if "_eraqc" in var]

    # list all possible flag values
    eraqc_flags = list(range(1, 38))

    # one row of counts per variable; NaN never equals a flag, so unflagged
    # observations are simply not counted
    rows = [
        [int((df[var] == flag).sum()) for flag in eraqc_flags]
        for var in eraqc_vars
    ]
    summary = pd.DataFrame(rows, index=eraqc_vars, columns=eraqc_flags)

    # TODO: save the summary as a csv in the station's QAQC location on S3.
    # The earlier draft of that upload referenced names that are not defined
    # in this notebook (StringIO, BUCKET_NAME, QAQC_WX, stations, station_name).

    return summary

In [8]:
# Station store used for development (ASOSAWOS); the commented VALLEYWATER
# url is kept as an alternative test case.
#url = "s3://wecc-historical-wx/3_qaqc_wx/VALLEYWATER/VALLEYWATER_6001.zarr"
url = "s3://wecc-historical-wx/3_qaqc_wx/ASOSAWOS/ASOSAWOS_69007093217.zarr"
# Open the zarr store at `url` with xarray
ds = xr.open_zarr(url)

In [35]:
# Convert the opened dataset to a dataframe; also keep its index and attributes
df, MultiIndex, attrs, var_attrs = merge_ds_to_df(ds)

In [19]:
# Flag values to tabulate: integers 1 through 37
eraqc_flags = list(range(1,38))
# QAQC flag columns: every column whose name contains "_eraqc"
eraqc_vars = [var for var in df.columns if "_eraqc" in var]

In [48]:
# Empty table with one column per flag value; presumably meant to receive one row per variable
csv = pd.DataFrame(columns=eraqc_flags)

In [46]:
csv

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,28,29,30,31,32,33,34,35,36,37


In [None]:
# Count, for every QAQC flag column, how many rows carry each flag value,
# print the counts, and store them as one row of `csv` per variable.
for i in eraqc_vars:
    flag_counts = []
    for flag in eraqc_flags:
        # NaN never compares equal, so unflagged rows are not counted
        n_flagged = int((df[i] == flag).sum())
        print("flag {}".format(flag))
        print(n_flagged)
        flag_counts.append(n_flagged)

    # Row label is the variable name; columns are the flag values
    csv.loc[i] = flag_counts

flag 1
0
flag 2
0
flag 3
0
flag 4
0
flag 5
0
flag 6
0
flag 7
0
flag 8
0
flag 9
0
flag 10
0
flag 11
0
flag 12
0
flag 13
0
flag 14
0
flag 15
0
flag 16
0
flag 17
0
flag 18
0
flag 19
0
flag 20
0
flag 21
0
flag 22
0
flag 23
0
flag 24
0
flag 25
0
flag 26
0
flag 27
0
flag 28
0
flag 29
0
flag 30
0
flag 31
0
flag 32
0
flag 33
0
flag 34
0
flag 35
0
flag 36
0
flag 37
0
flag 1
0
flag 2
0
flag 3
0
flag 4
0
flag 5
0
flag 6
0
flag 7
0
flag 8
0
flag 9
0
flag 10
0
flag 11
0
flag 12
0
flag 13
0
flag 14
0
flag 15
0
flag 16
0
flag 17
0
flag 18
0
flag 19
0
flag 20
0
flag 21
0
flag 22
0
flag 23
0
flag 24
0
flag 25
0
flag 26
0
flag 27
0
flag 28
0
flag 29
0
flag 30
0
flag 31
0
flag 32
0
flag 33
0
flag 34
0
flag 35
0
flag 36
0
flag 37
0
flag 1
0
flag 2
0
flag 3
0
flag 4
0
flag 5
0
flag 6
0
flag 7
0
flag 8
0
flag 9
0
flag 10
0
flag 11
0
flag 12
0
flag 13
0
flag 14
0
flag 15
0
flag 16
0
flag 17
0
flag 18
0
flag 19
0
flag 20
0
flag 21
0
flag 22
0
flag 23
0
flag 24
0
flag 25
0
flag 26
0
flag 27
0
flag 28
14269
fla

### Hourly

In [None]:
def flag_summary_hourly(df: pd.DataFrame):
    """
    Logs a summary of the flags set by the QAQC tests on each `_eraqc` variable.

    For every column of df whose name contains "_eraqc" the logger receives
    - total number of non-null observations of the matching data variable
      (the column name with "_eraqc" removed), when that column exists
    - list of unique flag values
    - number and % of rows whose flag entry is not null

    NOTE(review): no resampling happens here; df is presumably already at
    hourly resolution -- confirm with the caller.

    Parameters
    ----------
    df : pd.DataFrame
        station dataset converted to dataframe through QAQC pipeline

    Returns
    -------
    None
        The summary is written to the logger only.
    """

    logger.info("Running: flag_summary_hourly")

    # identify _eraqc variables
    eraqc_vars = [var for var in df.columns if "_eraqc" in var]

    # number of total observations in station
    n_total = len(df)
    logger.info("Total observations: {}".format(n_total))

    for var in eraqc_vars:
        # number of total observations per variable
        data_var = var.replace("_eraqc", "")
        if data_var in df.columns:
            logger.info(
                "Total observations for {}: {}".format(
                    data_var, int(df[data_var].notna().sum())
                )
            )

        # list of unique flags
        logger.info(
            "Flags set on {}: {}".format(var, df[var].unique()),
        )

        # % of observations flagged per variable; an empty dataframe reports
        # 0.0% instead of dividing by zero
        n_flagged = int(df[var].notna().sum())
        pct_flagged = round((n_flagged / n_total) * 100, 3) if n_total else 0.0
        logger.info(
            "Coverage of {} obs flagged: {} of {} obs ({}%)".format(
                var,
                n_flagged,
                n_total,
                pct_flagged,
            )
        )