# Order Stations by Data Quality

This notebook sorts HDP stations by their QC percentage. It does this by calculating, for each station, the fraction of observations that have a QAQC flag set.
- High values indicate that a station has a large number of flagged observations.
- Low values indicate that a station has a small number of flagged observations.


In [None]:
import time
import boto3
import pandas as pd

# Create the S3 resource and client handles used below
# (no credentials are passed explicitly here)
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")  # for lower-level processes

# S3 bucket name, merge-stage folder within the bucket, and the S3 URI of the
# merged station list CSV that station_counts_table() reads its networks from.
BUCKET_NAME = "wecc-historical-wx"
MERGE_DIR = "4_merge_wx"
stations_csv_path = f"s3://{BUCKET_NAME}/{MERGE_DIR}/all_network_stationlist_merge.csv"

In [None]:
def _pairwise_count(
    flag_df: pd.DataFrame, running_count_df, station_name: str
) -> pd.DataFrame:
    """
    Collapses one station's per-variable flag counts into a single summary row and adds that row to the running flag count dataframe.
    Helper function for station_counts_table().

    Parameters
    ----------
    flag_df: pd.DataFrame
        flag counts dataframe for the next station; must contain an "eraqc_flag_values"
        column with "no_flag" and "total_obs_count" rows. All remaining columns are
        summed together.
    running_count_df: pd.DataFrame or empty sequence
        dataframe of previously added station flag counts; anything of length 0
        (e.g. an empty list) means "no stations added yet"
    station_name: str
        identifier stored in the "index" column of the new row

    Returns
    -------
    counts_df_merged: pd.DataFrame
        one row per station with columns "index" (station name), "no_flag",
        "total_obs_count" and "qc_percent" (fraction of flagged observations)

    Raises
    ------
    KeyError
        if "eraqc_flag_values", "no_flag" or "total_obs_count" is missing from flag_df
    """
    # Make the eraqc_flag_values column the index
    flag_df = flag_df.set_index("eraqc_flag_values")

    # Keep only the "no_flag" and "total_obs_count" rows and sum each of them
    # across all remaining columns to get station-wide totals.
    subset = flag_df[flag_df.index.isin(["no_flag", "total_obs_count"])]
    counts_sum = subset.sum(axis=1)

    # Transpose so the two totals become columns of a single-row dataframe
    counts_sum_df = counts_sum.to_frame().transpose()

    # Select only the two relevant columns, in a fixed order, so the output layout
    # does not depend on the row order of the input. The copy keeps the column
    # assignment below from writing into a slice of the transposed frame.
    counts_sum_df = counts_sum_df[["no_flag", "total_obs_count"]].copy()

    # Fraction of flagged observations: everything that is not "no_flag"
    counts_sum_df["qc_percent"] = (
        1 - counts_sum_df["no_flag"] / counts_sum_df["total_obs_count"]
    )

    # Label the row with the station name, then move that label into an "index" column
    counts_sum_df.index = [station_name]
    counts_sum_df = counts_sum_df.reset_index()

    # First station: nothing to merge with yet
    if len(running_count_df) == 0:
        return counts_sum_df

    # Outer merge on all shared columns appends the new station's row
    counts_df_merged = pd.merge(counts_sum_df, running_count_df, how="outer")
    return counts_df_merged

In [None]:
def station_counts_table(timestep: str) -> None:
    """
    Generates a table of total flag counts per station, sorted by "qc_percent"
    in descending order, and writes it to S3 as a CSV.

    Parameters
    ----------
    timestep: str
        if set to 'hourly', generate flag count table from hourly flag counts
        if set to 'native', generate flag count table from native flag counts

    Returns
    -------
    None
        Nothing is written when `timestep` is invalid or when no non-empty
        flag count CSV is found.
    """
    # Record start time
    start_time = time.time()

    ## Setup

    # Only run for a valid "timestep" input
    if timestep not in ("hourly", "native"):
        print("invalid timestep: ", timestep)
        return None

    # List of networks to iterate over
    station_list = pd.read_csv(stations_csv_path)
    network_list = station_list["network"].unique()

    # The function iteratively adds in flag counts to this dataframe.
    # It stays an empty list until the first non-empty CSV is processed.
    flag_count_df = []

    for network in network_list:
        # Point to folder containing station flag count CSVs. The trailing slash
        # keeps sibling prefixes that merely start with the same text out of the listing.
        flags_prefix = f"{MERGE_DIR}/{network}/eraqc_counts_{timestep}_timestep/"

        ## Merge flag counts

        # Loop through all CSVs at the given level
        for item in s3.Bucket(BUCKET_NAME).objects.filter(Prefix=flags_prefix):
            obj = s3_cl.get_object(Bucket=BUCKET_NAME, Key=item.key)
            try:
                flags = pd.read_csv(obj["Body"])
            except pd.errors.EmptyDataError:
                # Zero-byte object: treat it like an empty CSV
                continue

            # the CSV has a header but no rows
            if flags.empty:
                continue

            # Every listed key starts with flags_prefix, so the station name is
            # whatever follows the prefix, up to "_flag".
            station_name = item.key[len(flags_prefix):].split("_flag")[0]

            # Send current dataframe and dataframe of previously generated rates to helper function
            flag_count_df = _pairwise_count(flags, flag_count_df, station_name)

    # Nothing was collected: there is no dataframe to rename, sort or upload
    if len(flag_count_df) == 0:
        print(f"no {timestep} timestep flag counts found, nothing to write")
        return None

    # Rename "index" column as "station"
    flag_count_df = flag_count_df.rename(columns={"index": "station"})

    # Sort by "qc_percent"
    flag_counts_sorted = flag_count_df.sort_values(
        by="qc_percent", ascending=False
    )

    ## Send final flag rates file to AWS as CSV
    # Built from the same constants as the input locations
    csv_s3_filepath = (
        f"s3://{BUCKET_NAME}/{MERGE_DIR}/station_{timestep}_confidence.csv"
    )

    print(f"Sending {timestep} timestep station flag counts CSV to: {csv_s3_filepath}")
    flag_counts_sorted.to_csv(csv_s3_filepath, index=False)

    ## Output time elapsed
    end_time = time.time()
    time_elapsed = (end_time - start_time) / 60
    print(f"{time_elapsed} minutes")

    return None

In [None]:
# Build and upload the hourly-timestep station table
station_counts_table('hourly') # takes ~20-40 minutes

In [None]:
# Build and upload the native-timestep station table
station_counts_table("native")  # expected to take ~30 minutes