# Success Report Statistics

per station 
- total % flagged
- % flagged per variable
- % flagged per QAQC flag

NEED: per station raw counts tables

per network
- % flagged per variable (highest and lowest)
- % flagged per QAQC flag (highest and lowest)
- % flagged per station (highest and lowest)

NEED: raw counts per variable, raw counts per QAQC flag, raw counts per station
HOW: sum total and flagged per station, variable, and QAQC flag

total
- % flagged total
- % flagged per network
- % flagged per variable (highest and lowest)
- % flagged per QAQC flag (highest and lowest)

NEED: raw counts per variable, raw counts per QAQC flag, raw counts per station
HOW: sum total and flagged per network, variable, and QAQC flag


## Environment set-up

In [12]:
import boto3
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from io import BytesIO, StringIO
from functools import reduce

import inspect

import logging
# Create a simple logger that just prints to the console
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

plt.rcParams["figure.dpi"] = 300

In [13]:
# Set AWS credentials
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")  # for lower-level processes

# Set relative paths to other folders and objects in repository.
bucket_name = "wecc-historical-wx"
stations_csv_path = f"s3://{bucket_name}/2_clean_wx/temp_clean_all_station_list.csv"
qaqc_dir = "3_qaqc_wx"
merge_dir = "4_merge_wx"

## Setup

In [4]:
stations_df = pd.read_csv(stations_csv_path)

INFO:aiobotocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [14]:
network = "ASOSAWOS"
station1 = "ASOSAWOS_72493023230"
station2 = "ASOSAWOS_69007093217"

In [17]:
key1_native = f"4_merge_wx/{network}/eraqc_counts_native_timestep/{station1}_flag_counts_native_timestep.csv"
key1_hourly = f"4_merge_wx/{network}/eraqc_counts_hourly_timestep/{station1}_flag_counts_hourly_standardized.csv"

key2_native = f"4_merge_wx/{network}/eraqc_counts_native_timestep/{station2}_flag_counts_native_timestep.csv"
key2_hourly = f"4_merge_wx/{network}/eraqc_counts_hourly_timestep/{station2}_flag_counts_hourly_standardized.csv"

In [18]:
flag_counts1_hourly = pd.read_csv(f"s3://wecc-historical-wx/{key1_hourly}")
flag_counts1_native = pd.read_csv(f"s3://wecc-historical-wx/{key1_native}")

flag_counts2_hourly = pd.read_csv(f"s3://wecc-historical-wx/{key2_hourly}")
flag_counts2_native = pd.read_csv(f"s3://wecc-historical-wx/{key2_native}")

In [24]:
flag_counts2_native

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,psl,sfcWind_dir,sfcWind,tas,tdps
0,28.0,0,0,14269,14269,0,0,0,14269
1,no_flag,14269,14269,0,0,14269,14269,14269,0
2,total_obs_count,14269,14269,14269,14269,14269,14269,14269,14269


In [22]:
flag_counts1_native

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,ps,psl,sfcWind_dir,sfcWind,tas,tdps
0,no_flag,415892,415892,415867,401335,393313,415892,415876,415783,415318
1,23.0,0,0,25,4,28,0,0,10,201
2,21.0,0,0,0,14553,22551,0,0,0,0
3,27.0,0,0,0,0,0,0,16,0,16
4,26.0,0,0,0,0,0,0,0,99,332
5,28.0,0,0,0,0,0,0,0,0,25
6,total_obs_count,415892,415892,415892,415892,415892,415892,415892,415892,415892


In [None]:
df = pd.concat([
    flag_counts2_native,
    flag_counts1_native
])

result=df.groupby('eraqc_flag_values', as_index=False).sum()
result

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,psl,sfcWind_dir,sfcWind,tas,tdps,ps
0,21.0,0,0,0,22551,0,0,0,0,14553.0
1,23.0,0,0,25,28,0,0,10,201,4.0
2,26.0,0,0,0,0,0,0,99,332,0.0
3,27.0,0,0,0,0,0,16,0,16,0.0
4,28.0,0,0,14269,14269,0,0,0,14294,0.0
5,no_flag,430161,430161,415867,393313,430161,430145,430052,415318,401335.0
6,total_obs_count,430161,430161,430161,430161,430161,430161,430161,430161,415892.0


In [26]:
test = df_concat = pd.merge(flag_counts1_native, flag_counts2_native, how="outer")
test

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,ps,psl,sfcWind_dir,sfcWind,tas,tdps
0,no_flag,415892,415892,415867,401335.0,393313,415892,415876,415783,415318
1,23.0,0,0,25,4.0,28,0,0,10,201
2,21.0,0,0,0,14553.0,22551,0,0,0,0
3,27.0,0,0,0,0.0,0,0,16,0,16
4,26.0,0,0,0,0.0,0,0,0,99,332
5,28.0,0,0,0,0.0,0,0,0,0,25
6,total_obs_count,415892,415892,415892,415892.0,415892,415892,415892,415892,415892
7,28.0,0,0,14269,,14269,0,0,0,14269
8,no_flag,14269,14269,0,,0,14269,14269,14269,0
9,total_obs_count,14269,14269,14269,,14269,14269,14269,14269,14269


In [28]:
empty = []

In [31]:
len(empty)

0

Generate a network-level summary table

- same csv as station-level but sum of all stations (so variable vs. qaqc flag) -> collapse all csvs
- produce statistics (per station)
    - % flagged total
    - % flagged per variable
    - % flagged per QAQC flag

What will this second output look like?
table 1:
- station as columns
    - % flagged
    - most flagged QAQC flag
    - most flagged variable
table 2: 
- variables as columns
    - % flagged
    - most flagged QAQC flag
table 3: 
- QAQC flags as columns
    - % flagged

Those 3 tables are up for discussion - is there a cleaner way to do this?

But FOR SURE produce that table of sums across all stations

In [44]:
def _pairwise_sum(df_1, df_2) -> pd.DataFrame:
    """
    Sums all station flag count tables into one network-level raw flag count table.

    Parameters
    ----------
    network: str
        network name

    Returns
    -------
    None
    """
    if len(df_1)==0:
        return df_2
    else:
        df = pd.concat([
        flag_counts2_native,
        flag_counts1_native
        ])

        result=df.groupby('eraqc_flag_values', as_index=False).sum()
        return result

In [None]:
def network_sum_counts(network: str, timestep:str, level: str) -> None:
    """
    Sums all station flag count tables into one network-level raw flag count table.

    Parameters
    ----------
    network: str
        network name

    Returns
    -------
    None
    """

    flagsdf = []
    if level == 'network':
        flags_prefix = f"{merge_dir}/{network}/eraqc_counts_{timestep}_timestep"
    elif level == 'total':
        flags_prefix = (
            f"{merge_dir}/{network}/{network}_flag_counts_{timestep}_timestep.csv"
        )
    else: 
        print('invalid level')
        return None

    print('prefix: ', flags_prefix)

    ## merge native timestep flag counts
    for item in s3.Bucket(bucket_name).objects.filter(Prefix=flags_prefix):
        obj = s3_cl.get_object(Bucket=bucket_name, Key=item.key)
        flags = pd.read_csv(obj["Body"])
        if flags.empty:  # If file empty
            continue
        else:
            flagsdf = _pairwise_sum(flagsdf, flags)

    # send file to AWS
    # csv_s3_filepath = f"s3://wecc-historical-wx/4_merge_wx/{network}/{network}_flag_counts_{timestep}_timestep.csv"
    # flagsdf.to_csv(csv_s3_filepath, index=True)

    return flagsdf

In [50]:
timestep = 'hourly'
level = 'love'
sum_result = network_sum_counts(network,timestep,level)

invalid level


In [36]:
sum_result

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,psl,sfcWind_dir,sfcWind,tas,tdps,ps
0,21.0,0,0,0,22551,0,0,0,0,14553.0
1,23.0,0,0,25,28,0,0,10,201,4.0
2,26.0,0,0,0,0,0,0,99,332,0.0
3,27.0,0,0,0,0,0,16,0,16,0.0
4,28.0,0,0,14269,14269,0,0,0,14294,0.0
5,no_flag,430161,430161,415867,393313,430161,430145,430052,415318,401335.0
6,total_obs_count,430161,430161,430161,430161,430161,430161,430161,430161,415892.0


## useful bits of code

In [None]:
    # Read the CSV file containing station data
    csv_filepath = "s3://wecc-historical-wx/2_clean_wx/temp_clean_all_station_list.csv"
    stations_df = pd.read_csv(csv_filepath)

    # Filter the dataframe to only include rows corresponding to the specified network
    # And, only cleaned stations
    network_df = stations_df[
        (stations_df["network"] == network) & (stations_df["cleaned"] == "Y")
    ]

    # Check if nothing is returned. Raise ValueError and print useful message.
    if len(network_df) == 0:
        unique_networks = ", ".join(stations_df["network"].unique())  # Unique networks
        raise ValueError(
            f"No stations found for network: {network}. Available networks: {unique_networks}"
        )
    