# Success Report Statistics

per station 
- total % flagged
- % flagged per variable
- % flagged per QAQC flag

NEED: per station raw counts tables

per network
- % flagged per variable (highest and lowest)
- % flagged per QAQC flag (highest and lowest)
- % flagged per station (highest and lowest)

NEED: raw counts per variable, raw counts per QAQC flag, raw counts per station

HOW: sum total and flagged per station, variable, and QAQC flag

total
- % flagged per network
- % flagged per variable (highest and lowest)
- % flagged per QAQC flag (highest and lowest)

NEED: raw counts per variable, raw counts per QAQC flag, raw counts per station

HOW: sum total and flagged per network, variable, and QAQC flag


## Environment set-up

In [None]:
import boto3
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO
from functools import reduce

import inspect

import logging
# Create a simple logger that just prints to the console
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

plt.rcParams["figure.dpi"] = 300

In [None]:
# Set AWS credentials
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")  # for lower-level processes

# Set relative paths to other folders and objects in repository.
bucket_name = "wecc-historical-wx"
stations_csv_path = f"s3://{bucket_name}/2_clean_wx/temp_clean_all_station_list.csv"
qaqc_dir = "3_qaqc_wx"
merge_dir = "4_merge_wx"

## Functions

In [None]:
def _pairwise_sum(flag_df_1, flag_df_2) -> pd.DataFrame:
    """
    Sums two input flag count dataframes. This is a helper function for sum_flag_counts(). 

    Parameters
    ----------
    flag_df_1: pd.DataFrame
        dataframe of previously summed station flag counts
    flag_df_2: pd.DataFrame
        flag counts dataframes for next station

    Returns
    -------
    summed_df: pd.DataFrame

    """
    if len(flag_df_1) == 0:
        return flag_df_2
    else:
        total_df = pd.concat([flag_df_1, flag_df_2])

        summed_df = total_df.groupby('eraqc_flag_values', as_index=False).sum()
        return summed_df

In [27]:
def sum_flag_counts(network: str, timestep:str, level: str) -> None:
    """
    Sums all QAQC flag counts at a given level (across stations or across networks) and for a given timestep (hourly or native)
    and sends to AWS. These counts are used to generate statistics for the QAQC success report.

    Parameters
    ----------
    network: str
        network name
    timestep: str
        if set to 'hourly', merge all hourly QAQC flag count tables
        if set to 'native', merge all native timestep QAQC flag count tables
    level: str
        if set to 'network', merge across all station flag count tables
        if set to 'total', merge across all network flag count tables

    Returns
    -------
    None
    
    """
    if timestep not in ('hourly','native'):
        print('invalid timestep: ',timestep) 
        return None

    # store summed flag counts here
    # the function iteratively adds in flag counts to this dataframe
    summed_counts_df = []

    if level == 'network':
        # AWS level to loop through
        flags_prefix = f"{merge_dir}/{network}/eraqc_counts_{timestep}_timestep"
        # where to send the final CSV
        csv_s3_filepath = f"s3://wecc-historical-wx/4_merge_wx/per_network_flag_counts/{network}_flag_counts_{timestep}_timestep.csv"
    
    elif level == 'total':
        # AWS level to loop through
        flags_prefix = f"{merge_dir}/per_network_flag_counts"
        # where to send the final CSV
        csv_s3_filepath = f"s3://wecc-historical-wx/4_merge_wx/total_flag_counts_{timestep}_timestep.csv"
    else: 
        print('invalid level: ', level)
        return None

    ## merge timestep flag counts

    # loop through all CSVs are the given level
    for item in s3.Bucket(bucket_name).objects.filter(Prefix=flags_prefix):
        obj = s3_cl.get_object(Bucket=bucket_name, Key=item.key)
        flags = pd.read_csv(obj["Body"])
        # the CSV is empty
        if flags.empty:
            continue
        # the CSV is not empty
        else:
            # send current dataframe and dataframe of previously summed counts to helper function
            summed_counts_df = _pairwise_sum(summed_counts_df, flags)

    ## send file to AWS
    summed_counts_df.to_csv(csv_s3_filepath, index=True)
    print('Sending final summed counts dataframe to: ', csv_s3_filepath)

    return summed_counts_df # None

## Sandbox

Run the function

In [34]:
network = "ASOSAWOS"
level = 'total'
timestep = 'native'

result = sum_flag_counts(network, timestep, level)

Sending final summed counts dataframe to:  s3://wecc-historical-wx/4_merge_wx/total_flag_counts_native_timestep.csv


In [35]:
result

Unnamed: 0.1,eraqc_flag_values,Unnamed: 0,elevation,pr,ps_altimeter,psl,sfcWind_dir,sfcWind,tas,tdps,hurs_derived,ps
0,21.0,0,0,0,0,44630,0,0,0,0,0.0,21849.0
1,23.0,2,0,0,49,51,0,0,20,401,0.0,7.0
2,26.0,4,0,0,0,0,0,0,193,664,0.0,0.0
3,27.0,6,0,0,0,0,0,32,0,30,0.0,0.0
4,28.0,8,0,0,25926,25926,0,0,0,25976,0.0,0.0
5,38.0,5,0,0,0,0,0,0,0,0,673.0,0.0
6,no_flag,11,780330,780330,754357,709723,780330,780298,780137,753322,349579.0,732549.0
7,total_obs_count,13,780330,780330,780330,780330,780330,780330,780330,780330,350169.0,754404.0


Troubleshoot issue at 'total' level, with 'Unnamed: 0' column

In [None]:
key_native = f"s3://wecc-historical-wx/4_merge_wx/per_network_flag_counts/{network}_flag_counts_hourly_timestep.csv"
key_hourly = f"s3://wecc-historical-wx/4_merge_wx/per_network_flag_counts/{network}_flag_counts_native_timestep.csv"

Load in previously generate flag counts tables, as a reference.

In [None]:
station1 = "ASOSAWOS_72493023230"
station2 = "ASOSAWOS_69007093217"

In [12]:
key1_native = f"4_merge_wx/{network}/eraqc_counts_native_timestep/{station1}_flag_counts_native_timestep.csv"
key1_hourly = f"4_merge_wx/{network}/eraqc_counts_hourly_timestep/{station1}_flag_counts_hourly_standardized.csv"

key2_native = f"4_merge_wx/{network}/eraqc_counts_native_timestep/{station2}_flag_counts_native_timestep.csv"
key2_hourly = f"4_merge_wx/{network}/eraqc_counts_hourly_timestep/{station2}_flag_counts_hourly_standardized.csv"

In [13]:
flag_counts1_hourly = pd.read_csv(f"s3://wecc-historical-wx/{key1_hourly}")
flag_counts1_native = pd.read_csv(f"s3://wecc-historical-wx/{key1_native}")

flag_counts2_hourly = pd.read_csv(f"s3://wecc-historical-wx/{key2_hourly}")
flag_counts2_native = pd.read_csv(f"s3://wecc-historical-wx/{key2_native}")

INFO:aiobotocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [18]:
flag_counts1_hourly

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,ps,psl,sfcWind_dir,sfcWind,tas,tdps,hurs_derived
0,21.0,0,0,0,7296,22079,0,0,0,0,0
1,23.0,0,0,24,3,23,0,0,10,200,0
2,26.0,0,0,0,0,0,0,0,94,332,0
3,27.0,0,0,0,0,0,0,16,0,14,0
4,28.0,0,0,0,0,0,0,0,0,25,0
5,38.0,0,0,0,0,0,0,0,0,0,673
6,no_flag,338512,338512,338490,331214,316410,338512,338496,338428,338004,337922
7,total_obs_count,338512,338512,338512,338512,338512,338512,338512,338512,338512,338512


In [21]:
flag_counts1_native

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,ps,psl,sfcWind_dir,sfcWind,tas,tdps
0,no_flag,415892,415892,415867,401335,393313,415892,415876,415783,415318
1,23.0,0,0,25,4,28,0,0,10,201
2,21.0,0,0,0,14553,22551,0,0,0,0
3,27.0,0,0,0,0,0,0,16,0,16
4,26.0,0,0,0,0,0,0,0,99,332
5,28.0,0,0,0,0,0,0,0,0,25
6,total_obs_count,415892,415892,415892,415892,415892,415892,415892,415892,415892
