# Generate summed flag count tables

This notebook creates per-network QAQC flag count CSV files from the corresponding eraqc_counts_timestep files that were generated as part of the final processing step for stations within the Historical Data Pipeline. These tables are then used to generate statistics for the QAQC success report.

This is carried out in two steps:

1. Generate the per-network QAQC flag count tables, at native and hourly timesteps

2. Generate one flag count table that sums all per-network tables, at native and hourly timesteps


Using the following functions:


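- _pairwise_sum(): helper function that sums two input flag tables, used by network_sum_flag_counts() and total_sum_flag_counts().

- _format_table(): helper function that merges a summed flag count table with the flag meanings table and orders the rows by flag value, used by network_sum_flag_counts().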

- network_sum_flag_counts(): sums all station flag count tables for a given network, creating one flag count table for that network

- generate_station_tables(): runs network_sum_flag_counts() for every network

- total_sum_flag_counts(): sums all network flag count tables, creating one final flag count table 

## Step 0: Environment set-up

In [1]:
import time
import boto3
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO
from functools import reduce

import inspect

import logging
# Configure root logging at INFO level so library messages (e.g. the botocore
# credentials notice in the next cell's output) are printed in the notebook.
logging.basicConfig(level=logging.INFO)
# Root logger handle (getLogger() is called without a name).
logger = logging.getLogger()

# Render matplotlib figures at 300 dpi.
plt.rcParams["figure.dpi"] = 300

In [2]:
# Bucket and folder layout used throughout this notebook.
bucket_name = "wecc-historical-wx"
qaqc_dir = "3_qaqc_wx"
merge_dir = "4_merge_wx"

# Station list CSV; its "network" column drives the per-network loop.
stations_csv_path = f"s3://{bucket_name}/2_clean_wx/temp_clean_all_station_list.csv"

# S3 handles: the resource is used for listing objects, the client for fetching them.
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")  # for lower-level processes

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


### The functions

In [3]:
def _pairwise_sum(flag_df_1, flag_df_2) -> pd.DataFrame:
    """
    Sums two input flag count dataframes. This is a helper function for sum_flag_counts(). 

    Parameters
    ----------
    flag_df_1: pd.DataFrame
        dataframe of previously summed station flag counts
    flag_df_2: pd.DataFrame
        flag counts dataframes for next station

    Returns
    -------
    summed_df: pd.DataFrame

    """
    if len(flag_df_1) == 0:
        return flag_df_2
    else:
        total_df = pd.concat([flag_df_1, flag_df_2])

        summed_df = total_df.groupby('eraqc_flag_values', as_index=False).sum()
        return summed_df

In [4]:
def _format_table(
    summed_counts: pd.DataFrame, flag_table: pd.DataFrame
) -> pd.DataFrame:
    """
    A helper function that sums

    Parameters
    ----------
    summed_counts: pd.DataFrame
        dataframe of summed station flag counts
    flag_table: pd.DataFrame
        flag counts dataframes for next station

    Returns
    -------
    summed_df: pd.DataFrame

    """
    ## Format flag meanings df
    flag_table = flag_table.rename(columns={"Flag_value": "eraqc_flag_values"})

    ## Format summed counts df
    summed_counts["eraqc_flag_values"] = summed_counts["eraqc_flag_values"].str.replace(
        ".0", "", regex=True
    )
    summed_counts["eraqc_flag_values"] = summed_counts["eraqc_flag_values"].apply(
        lambda x: int(x) if x not in ["no_flag", "total_obs_count"] else x
    )

    ## Merge the the counts and flag meanings dataframes

    merged_dfs = summed_counts.merge(flag_table, on="eraqc_flag_values", how="outer")

    ## Format final dataframe

    # order by flag value, in descending numerical order
    final_format = (
        merged_dfs.groupby(
            merged_dfs.eraqc_flag_values.apply(type) != str, group_keys=True
        )
        .apply(lambda g: g.sort_values("eraqc_flag_values"))
        .reset_index(drop=True)
    )

    # move string flag value entries to the bottom
    final_format = final_format.loc[
        pd.to_numeric(final_format["eraqc_flag_values"], errors="coerce").sort_values().index
    ]

    return final_format

In [5]:
def network_sum_flag_counts(network: str, timestep: str) -> None:
    """
    Sums all station QAQC flag counts in a network for a given timestep (hourly or native) and sends to AWS.
    These counts are used to generate statistics for the QAQC success report.

    Station tables are read from
    "{merge_dir}/{network}/eraqc_counts_{timestep}_timestep" in the bucket, and the
    summed table is written to
    "{merge_dir}/per_network_flag_counts_{timestep}_timestep/". Nothing is written
    when the timestep is invalid or no non-empty station table is found.

    Parameters
    ----------
    network: str
        network name
    timestep: str
        if set to 'hourly', merge all hourly QAQC flag count tables
        if set to 'native', merge all native timestep QAQC flag count tables

    Returns
    -------
    None

    """
    ## Setup

    # only run for a valid "timestep" input (checked before any file is read)
    if timestep not in ("hourly", "native"):
        print("invalid timestep: ", timestep)
        return None

    # read in flag meanings CSV (path is relative to the working directory)
    flag_meanings = pd.read_csv("era_qaqc_flag_meanings.csv")

    # the function iteratively adds in flag counts to this dataframe;
    # it stays an empty list until the first non-empty station table is read
    summed_counts_df = []

    # point to folder containing station flag count CSVs
    flags_prefix = f"{merge_dir}/{network}/eraqc_counts_{timestep}_timestep"

    ## Merge flag counts

    # loop through all CSVs at the given level
    for item in s3.Bucket(bucket_name).objects.filter(Prefix=flags_prefix):
        # skip folder placeholder keys and any other non-CSV object under the
        # prefix; pd.read_csv raises on a zero-byte body
        if not item.key.endswith(".csv"):
            continue

        obj = s3_cl.get_object(Bucket=bucket_name, Key=item.key)
        flags = pd.read_csv(obj["Body"])

        # the CSV has a header but no rows
        if flags.empty:
            continue

        # send current dataframe and dataframe of previously summed counts to helper function
        summed_counts_df = _pairwise_sum(summed_counts_df, flags)

    # nothing to format or upload if no station table contributed any rows
    if len(summed_counts_df) == 0:
        print(f"No flag counts found for {network} under {flags_prefix}; nothing sent")
        return None

    counts_final = _format_table(summed_counts_df, flag_meanings)

    ## Send final counts file to AWS as CSV

    csv_s3_filepath = f"s3://{bucket_name}/{merge_dir}/per_network_flag_counts_{timestep}_timestep/{network}_flag_counts_{timestep}_timestep.csv"
    counts_final.to_csv(csv_s3_filepath, index=False)
    print(
        f"Sending summed counts dataframe for {network} to: {csv_s3_filepath}"
    )

    return None

In [6]:
def total_sum_flag_counts(timestep: str, upload: bool = False) -> "pd.DataFrame | None":
    """
    Sums all network-level QAQC flag counts for a given timestep (hourly or native)
    and, when requested, sends the result to AWS.
    These counts are used to generate statistics for the QAQC success report.

    Network tables are read from
    "{merge_dir}/per_network_flag_counts_{timestep}_timestep" in the bucket.

    Parameters
    ----------
    timestep: str
        if set to 'hourly', merge all hourly QAQC flag count tables
        if set to 'native', merge all native timestep QAQC flag count tables
    upload: bool
        if True, write the summed table to
        "{merge_dir}/total_flag_counts_{timestep}_timestep.csv" in the bucket.
        Defaults to False, so the table is only returned for inspection.

    Returns
    -------
    summed_counts_df: pd.DataFrame or None
        the summed flag count table, or None if the timestep is invalid or no
        non-empty network table was found

    """
    ## Setup

    # only run for a valid "timestep" input
    if timestep not in ("hourly", "native"):
        print("invalid timestep: ", timestep)
        return None

    # the function iteratively adds in flag counts to this dataframe;
    # it stays an empty list until the first non-empty network table is read
    summed_counts_df = []

    # point to folder containing network-level flag count CSVs
    flags_prefix = f"{merge_dir}/per_network_flag_counts_{timestep}_timestep"

    ## Merge flag counts

    # loop through all networks CSVs
    for item in s3.Bucket(bucket_name).objects.filter(Prefix=flags_prefix):
        # skip folder placeholder keys and any other non-CSV object under the
        # prefix; pd.read_csv raises on a zero-byte body
        if not item.key.endswith(".csv"):
            continue

        obj = s3_cl.get_object(Bucket=bucket_name, Key=item.key)
        flags = pd.read_csv(obj["Body"])

        # the CSV has a header but no rows
        if flags.empty:
            continue

        # send current dataframe and dataframe of previously summed counts to helper function
        print(f'summing for {item.key}')
        summed_counts_df = _pairwise_sum(summed_counts_df, flags)

    # no network table contributed any rows
    if len(summed_counts_df) == 0:
        return None

    ## Send final counts file to AWS as CSV (only when explicitly requested)
    csv_s3_filepath = (
        f"s3://{bucket_name}/{merge_dir}/total_flag_counts_{timestep}_timestep.csv"
    )
    if upload:
        summed_counts_df.to_csv(csv_s3_filepath, index=False)
        print(f"Sending final summed counts dataframe to: {csv_s3_filepath}")
    else:
        # be explicit that nothing was written, so the message matches the behavior
        print(
            f"Upload disabled: final summed counts dataframe NOT sent to: {csv_s3_filepath}"
        )

    return summed_counts_df

In [7]:
def generate_station_tables(timestep: str) -> None:
    """
    Builds the per-network flag count table for every network by calling
    network_sum_flag_counts() once per network, then prints the elapsed minutes.

    Parameters
    ----------
    timestep: str
        'hourly' to merge the hourly QAQC flag count tables
        'native' to merge the native timestep QAQC flag count tables

    Returns
    -------
    None

    """
    valid_timesteps = ("hourly", "native")

    # timer starts before validation so the elapsed time covers the whole call
    started = time.time()

    # bail out early on an unsupported timestep
    if timestep not in valid_timesteps:
        print("invalid timestep: ", timestep)
        return None

    # unique network names, taken from the station list
    networks = pd.read_csv(stations_csv_path)["network"].unique()

    for network_name in networks:
        network_sum_flag_counts(network_name, timestep)

    # report elapsed wall-clock time in minutes
    time_elapsed = (time.time() - started) / 60
    print(f"{time_elapsed} minutes")

    return None

## Step 1: Generate flag sum tables for every network

First, loop through every network, combining each of their station flag count tables into one table. The result is one flag count table at each timestep - native and hourly - for every network.

This will take around 1 hour to run for both timesteps. 

began at 12:05

In [None]:
# Build and upload the per-network flag count tables at the native timestep.
generate_station_tables('native')

# Runtime noted from a previous run: 22 minutes for 27 networks

In [None]:
# Build and upload the per-network flag count tables at the hourly timestep.
generate_station_tables("hourly")

### Check

In [15]:
# Inputs for the spot check below: two networks, one example station, one timestep.
timestep = "native"
network_1 = "ASOSAWOS"
network_2 = "CW3E"
station = "ASOSAWOS_72479694128"

In [13]:
# Load the per-network flag count table written in Step 1 for network_1.
# (Station-level tables live under
#  {merge_dir}/{network}/eraqc_counts_{timestep}_timestep/{station}_flag_counts_{timestep}_timestep.csv)
key_1 = "/".join(
    [
        merge_dir,
        f"per_network_flag_counts_{timestep}_timestep",
        f"{network_1}_flag_counts_{timestep}_timestep.csv",
    ]
)
flag_table_1 = pd.read_csv("s3://wecc-historical-wx/" + key_1)

In [16]:
# Load the per-network flag count table written in Step 1 for network_2.
key_2 = "/".join(
    [
        merge_dir,
        f"per_network_flag_counts_{timestep}_timestep",
        f"{network_2}_flag_counts_{timestep}_timestep.csv",
    ]
)
flag_table_2 = pd.read_csv("s3://wecc-historical-wx/" + key_2)

In [18]:
# Peek at the first row to see which variable columns network_1's table carries.
flag_table_1.head(1)

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,psl,sfcWind_dir,sfcWind,tas,tdps,ps,accum_pr,rsds,hurs,QAQC_function,Flag_meaning
0,1,,,,,,,,,,,,,spurious_buoy_check,Suspect observation (i.e. buoy reports wind du...


In [19]:
# Same peek for network_2; its set of variable columns differs from network_1's.
flag_table_2.head(1)

Unnamed: 0,eraqc_flag_values,elevation,hurs,pr,psl,rsds,sfcWind_dir,sfcWind,tas,accum_pr,QAQC_function,Flag_meaning
0,1,,,,,,,,,,spurious_buoy_check,Suspect observation (i.e. buoy reports wind du...


In [47]:
# Sum the two network-level tables with the helper used by total_sum_flag_counts().
# NOTE(review): the message echoed below points at the groupby(...).sum() line;
# presumably a pandas warning about the text columns (QAQC_function, Flag_meaning),
# which are absent from the result — confirm.
what = _pairwise_sum(flag_table_1, flag_table_2)

  summed_df = total_df.groupby('eraqc_flag_values', as_index=False).sum()


In [48]:
# Inspect the summed table: it has the union of both networks' variable columns.
what.head(1)

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,psl,sfcWind_dir,sfcWind,tas,tdps,ps,accum_pr,rsds,hurs
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# order by flag value, in descending numerical order
final_format = (
    what.groupby(
        what.eraqc_flag_values.apply(type) != str, group_keys=True
    )
    .apply(lambda g: g.sort_values("eraqc_flag_values"))
    .reset_index(drop=True)
)

In [56]:
# order by flag value, in descending numerical order
test = (
    what.groupby(what.eraqc_flag_values.apply(type) != str, group_keys=True)
    .apply(lambda g: g+g)
    .reset_index(drop=True)
)

In [58]:
# Display the ordered table produced by the scratch ordering cell.
final_format

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,psl,sfcWind_dir,sfcWind,tas,tdps,ps,accum_pr,rsds,hurs
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11,0.0,0.0,45.0,1842.0,0.0,0.0,711141.0,647508.0,0.0,0.0,829883.0,58045.0
3,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61546.0,0.0,0.0,0.0,0.0
5,14,0.0,0.0,0.0,0.0,1023.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,15,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Step 2: Generate total flag sum table

Now combine all the network flag count tables generated in step 1 into one final flag count table. First at the native timestep, and then at the hourly timestep.

Step 1 must be complete before moving on to this step.

In [None]:
# Sum every per-network table at the native timestep; the returned table is displayed as the cell output.
total_sum_flag_counts('native')

In [None]:
# Sum every per-network table at the hourly timestep; the returned table is displayed as the cell output.
total_sum_flag_counts('hourly')