# Success Report Statistics

per station 
- total % flagged
- % flagged per variable
- % flagged per QAQC flag

NEED: per station raw counts tables

per network
- % flagged per variable (highest and lowest)
- % flagged per QAQC flag (highest and lowest)
- % flagged per station (highest and lowest)

NEED: raw counts per variable, raw counts per QAQC flag, raw counts per station

HOW: sum total and flagged per station, variable, and QAQC flag

total
- % flagged per network
- % flagged per variable (highest and lowest)
- % flagged per QAQC flag (highest and lowest)

NEED: raw counts per variable, raw counts per QAQC flag, raw counts per station

HOW: sum total and flagged per network, variable, and QAQC flag


## Environment set-up

In [None]:
import boto3
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO
from functools import reduce

import inspect

import logging
# Configure logging at INFO level and grab the root logger
# (getLogger() with no name returns the root logger).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

# Render matplotlib figures at 300 dpi
plt.rcParams["figure.dpi"] = 300

In [None]:
# Create the S3 handles used throughout the notebook. No credentials are passed
# explicitly here, so boto3 resolves them from its default configuration.
s3 = boto3.resource("s3")  # used to list objects in the bucket
s3_cl = boto3.client("s3")  # for lower-level processes (fetching individual objects)

# Bucket name and key prefixes for the pipeline stages referenced below.
bucket_name = "wecc-historical-wx"
# NOTE(review): stations_csv_path and qaqc_dir are not referenced in the visible cells.
stations_csv_path = f"s3://{bucket_name}/2_clean_wx/temp_clean_all_station_list.csv"
qaqc_dir = "3_qaqc_wx"
merge_dir = "4_merge_wx"

## Functions

In [None]:
def _pairwise_sum(flag_df_1, flag_df_2) -> pd.DataFrame:
    """
    Sums two input flag count dataframes. This is a helper function for sum_flag_counts(). 

    Parameters
    ----------
    flag_df_1: pd.DataFrame
        dataframe of previously summed station flag counts
    flag_df_2: pd.DataFrame
        flag counts dataframes for next station

    Returns
    -------
    summed_df: pd.DataFrame

    """
    if len(flag_df_1) == 0:
        return flag_df_2
    else:
        total_df = pd.concat([flag_df_1, flag_df_2])

        summed_df = total_df.groupby('eraqc_flag_values', as_index=False).sum()
        return summed_df

In [None]:
def network_sum_flag_counts(network: str, timestep: str) -> "pd.DataFrame | None":
    """
    Sums all station QAQC flag counts in a network for a given timestep (hourly or native).
    These counts are used to generate statistics for the QAQC success report.

    The upload of the summed table to AWS is currently disabled (commented out below);
    the summed table is returned so it can be inspected in the notebook.

    Parameters
    ----------
    network: str
        network name
    timestep: str
        if set to 'hourly', merge all hourly QAQC flag count tables
        if set to 'native', merge all native timestep QAQC flag count tables

    Returns
    -------
    pd.DataFrame or None
        summed flag counts for the network (an empty dataframe if no non-empty
        station CSVs were found); None if 'timestep' is invalid

    """
    ## Setup

    if timestep not in ("hourly", "native"):
        print("invalid timestep: ", timestep)
        return None

    # the function iteratively adds flag counts to this dataframe; starting from an
    # empty dataframe (rather than a list) keeps the return type consistent even
    # when no station CSVs are found
    summed_counts_df = pd.DataFrame()

    # point to folder containing station flag count CSVs
    flags_prefix = f"{merge_dir}/{network}/eraqc_counts_{timestep}_timestep"

    ## Merge flag counts

    # loop through all CSVs at the given level
    for item in s3.Bucket(bucket_name).objects.filter(Prefix=flags_prefix):
        # skip folder placeholders and any other non-CSV objects under the prefix
        if not item.key.endswith(".csv"):
            continue

        obj = s3_cl.get_object(Bucket=bucket_name, Key=item.key)
        try:
            flags = pd.read_csv(obj["Body"])
        except pd.errors.EmptyDataError:
            # zero-byte file: nothing to add
            continue

        # header-only CSV: nothing to add
        if flags.empty:
            continue

        # send current dataframe and dataframe of previously summed counts to helper function
        summed_counts_df = _pairwise_sum(summed_counts_df, flags)

    ## Send final counts file to AWS as CSV

    csv_s3_filepath = f"s3://{bucket_name}/{merge_dir}/per_network_flag_counts/{network}_flag_counts_{timestep}_timestep.csv"
    # NOTE: upload intentionally disabled for now; uncomment to write the file
    # summed_counts_df.to_csv(csv_s3_filepath, index=False)
    print(
        f"Sending summed counts dataframe for {network} to: {csv_s3_filepath}"
    )

    return summed_counts_df

In [None]:
def total_sum_flag_counts(timestep: str) -> "pd.DataFrame | None":
    """
    Sums all network-level QAQC flag counts for a given timestep (hourly or native).
    These counts are used to generate statistics for the QAQC success report.

    Only the per-network files for the requested timestep are summed. The hourly and
    native per-network files live under the same prefix, so they are told apart by
    their '_flag_counts_{timestep}_timestep.csv' suffix.

    The upload of the summed table to AWS is currently disabled (commented out below);
    the summed table is returned so it can be inspected in the notebook.

    Parameters
    ----------
    timestep: str
        if set to 'hourly', merge all hourly QAQC flag count tables
        if set to 'native', merge all native timestep QAQC flag count tables

    Returns
    -------
    pd.DataFrame or None
        summed flag counts across networks (an empty dataframe if no non-empty
        network CSVs were found); None if 'timestep' is invalid

    """
    ## Setup

    if timestep not in ("hourly", "native"):
        print("invalid timestep: ", timestep)
        return None

    # the function iteratively adds flag counts to this dataframe; starting from an
    # empty dataframe (rather than a list) keeps the return type consistent even
    # when no network CSVs are found
    summed_counts_df = pd.DataFrame()

    # point to folder containing network-level flag count CSVs
    flags_prefix = f"{merge_dir}/per_network_flag_counts"
    # per-network files are named '{network}_flag_counts_{timestep}_timestep.csv'
    wanted_suffix = f"_flag_counts_{timestep}_timestep.csv"

    ## Merge flag counts

    # loop through all network CSVs
    for item in s3.Bucket(bucket_name).objects.filter(Prefix=flags_prefix):
        # skip files for the other timestep, folder placeholders and non-CSV objects;
        # without this, hourly and native counts would be summed together
        if not item.key.endswith(wanted_suffix):
            continue

        obj = s3_cl.get_object(Bucket=bucket_name, Key=item.key)
        try:
            flags = pd.read_csv(obj["Body"])
        except pd.errors.EmptyDataError:
            # zero-byte file: nothing to add
            continue

        # header-only CSV: nothing to add
        if flags.empty:
            continue

        # send current dataframe and dataframe of previously summed counts to helper function
        summed_counts_df = _pairwise_sum(summed_counts_df, flags)

    ## Send final counts file to AWS as CSV

    csv_s3_filepath = (
        f"s3://{bucket_name}/{merge_dir}/total_flag_counts_{timestep}_timestep.csv"
    )
    # NOTE: upload intentionally disabled for now; uncomment to write the file
    # summed_counts_df.to_csv(csv_s3_filepath, index=False)
    print(f"Sending final summed counts dataframe to: {csv_s3_filepath}")

    return summed_counts_df

## Run the function

After the merge step is complete, run network_sum_flag_counts() for all networks. Then run total_sum_flag_counts() a single time.

In [None]:
# Inputs for the run cells below: the network to sum and which timestep's tables to use
network = "ASOSAWOS"
timestep = "hourly"  # must be "hourly" or "native"

In [None]:
# Sum the per-station flag counts for the chosen network and timestep
network_result = network_sum_flag_counts(network, timestep)

In [None]:
# Sum the per-network flag counts for the chosen timestep
total_result = total_sum_flag_counts(timestep)

## Check

In [None]:
# Read back the summed tables from S3 for inspection.
# NOTE: these files only exist once the upload step in the summing functions has been run.

# per-network tables (ASOSAWOS)
counts_hourly_network = pd.read_csv(
    "s3://wecc-historical-wx/4_merge_wx/per_network_flag_counts/ASOSAWOS_flag_counts_hourly_timestep.csv"
)
counts_native_network = pd.read_csv(
    "s3://wecc-historical-wx/4_merge_wx/per_network_flag_counts/ASOSAWOS_flag_counts_native_timestep.csv"
)

# tables summed across all networks
counts_hourly_total = pd.read_csv(
    "s3://wecc-historical-wx/4_merge_wx/total_flag_counts_hourly_timestep.csv"
)

counts_native_total = pd.read_csv(
    "s3://wecc-historical-wx/4_merge_wx/total_flag_counts_native_timestep.csv"
)

Load in previously generated flag counts tables, as a reference.

In [None]:
# Two station IDs whose per-station flag count tables are loaded below as a reference
station1 = "ASOSAWOS_72493023230"
station2 = "ASOSAWOS_69007093217"

In [None]:
# S3 keys of the per-station flag count tables for each reference station.
# Note the differing file name endings: native tables end in
# "_flag_counts_native_timestep.csv", hourly tables in "_flag_counts_hourly_standardized.csv".
key1_native = f"4_merge_wx/{network}/eraqc_counts_native_timestep/{station1}_flag_counts_native_timestep.csv"
key1_hourly = f"4_merge_wx/{network}/eraqc_counts_hourly_timestep/{station1}_flag_counts_hourly_standardized.csv"

key2_native = f"4_merge_wx/{network}/eraqc_counts_native_timestep/{station2}_flag_counts_native_timestep.csv"
key2_hourly = f"4_merge_wx/{network}/eraqc_counts_hourly_timestep/{station2}_flag_counts_hourly_standardized.csv"

In [None]:
# Load the hourly and native flag count tables for the first reference station
flag_counts1_hourly = pd.read_csv(f"s3://wecc-historical-wx/{key1_hourly}")
flag_counts1_native = pd.read_csv(f"s3://wecc-historical-wx/{key1_native}")

# ... and for the second reference station
flag_counts2_hourly = pd.read_csv(f"s3://wecc-historical-wx/{key2_hourly}")
flag_counts2_native = pd.read_csv(f"s3://wecc-historical-wx/{key2_native}")

In [None]:
# Display the hourly flag counts table for station1
flag_counts1_hourly

In [None]:
# Display the native timestep flag counts table for station1
flag_counts1_native

# Scraps

In [None]:
def sum_flag_counts(network: str, timestep: str, level: str) -> "pd.DataFrame | None":
    """
    Sums all QAQC flag counts at a given level (across stations or across networks) and for a given timestep (hourly or native)
    and sends to AWS. These counts are used to generate statistics for the QAQC success report.

    Parameters
    ----------
    network: str
        network name, only used when 'level' = 'network'
    timestep: str
        if set to 'hourly', merge all hourly QAQC flag count tables
        if set to 'native', merge all native timestep QAQC flag count tables
    level: str
        if set to 'network', merge across all station flag count tables
        if set to 'total', merge across all network flag count tables

    Returns
    -------
    pd.DataFrame or None
        summed flag counts (an empty dataframe, with nothing uploaded, if no
        non-empty CSVs were found); None if 'timestep' or 'level' is invalid

    """
    ## Setup

    if timestep not in ("hourly", "native"):
        print("invalid timestep: ", timestep)
        return None

    ## Assign AWS level and final CSV destination depending on 'level' argument

    if level == "network":
        # AWS level to loop through
        flags_prefix = f"{merge_dir}/{network}/eraqc_counts_{timestep}_timestep"
        # every CSV under the station folder belongs to this timestep
        key_suffix = ".csv"
        # where to send the final CSV
        csv_s3_filepath = f"s3://{bucket_name}/{merge_dir}/per_network_flag_counts/{network}_flag_counts_{timestep}_timestep.csv"

    elif level == "total":
        # AWS level to loop through
        flags_prefix = f"{merge_dir}/per_network_flag_counts"
        # hourly and native per-network files share this folder, so select the
        # requested timestep by file name to avoid summing both together
        key_suffix = f"_flag_counts_{timestep}_timestep.csv"
        # where to send the final CSV
        csv_s3_filepath = f"s3://{bucket_name}/{merge_dir}/total_flag_counts_{timestep}_timestep.csv"

    else:
        print("invalid level: ", level)
        return None

    # the function iteratively adds flag counts to this dataframe; starting from an
    # empty dataframe (rather than a list) keeps the type consistent when nothing is found
    summed_counts_df = pd.DataFrame()

    ## Merge flag counts

    # loop through all CSVs at the given level
    for item in s3.Bucket(bucket_name).objects.filter(Prefix=flags_prefix):
        # skip folder placeholders and objects that do not match the expected file name
        if not item.key.endswith(key_suffix):
            continue

        obj = s3_cl.get_object(Bucket=bucket_name, Key=item.key)
        try:
            flags = pd.read_csv(obj["Body"])
        except pd.errors.EmptyDataError:
            # zero-byte file: nothing to add
            continue

        # header-only CSV: nothing to add
        if flags.empty:
            continue

        # send current dataframe and dataframe of previously summed counts to helper function
        summed_counts_df = _pairwise_sum(summed_counts_df, flags)

    ## Send final counts file to AWS as CSV

    # nothing was summed: do not upload an empty table
    if summed_counts_df.empty:
        print("no flag counts found under: ", flags_prefix)
        return summed_counts_df

    summed_counts_df.to_csv(csv_s3_filepath, index=False)
    print('Sending final summed counts dataframe to: ', csv_s3_filepath)

    return summed_counts_df