# Data Quality Visualizations

In [36]:
import time
import boto3
import numpy as np
import pandas as pd
import xarray as xr
import geopandas as gpd
import contextily as cx

# Create the S3 handles used below (credentials are resolved by boto3 itself).
s3 = boto3.resource("s3")  # high-level resource API, used to list bucket objects
s3_cl = boto3.client("s3")  # for lower-level processes

# Bucket name and folder names inside the bucket.
BUCKET_NAME = "wecc-historical-wx"
QAQC_DIR = "3_qaqc_wx"
MERGE_DIR = "4_merge_wx"

# Full S3 paths, built from BUCKET_NAME so they stay in sync with it.
stations_csv_path = f"s3://{BUCKET_NAME}/{QAQC_DIR}/all_network_stationlist_qaqc.csv"
shapepath = f"s3://{BUCKET_NAME}/0_maps/tl_2021_us_state"

## Final functions

In [None]:
def _pairwise_rate(flag_df: pd.DataFrame, running_rate_df, station_name: str) -> pd.DataFrame:
    """
    Generates flag rates dataframe for input flag counts dataframe and then adds it to the running flag rate dataframe.
    Helper function for network_rate_tables() and station_rate_table().

    Parameters
    ----------
    flag_df: pd.DataFrame
        flag rates dataframe for next station
    running_rate_df: pd.DataFrame
        dataframe of previously added station flag rates

    Returns
    -------
    rates_df_merged: pd.DataFrame

    """
    # Make the eraqc_flag_values column the index
    flag_df = flag_df.set_index("eraqc_flag_values")

    # Count up the flagged observations - so counts in all but the "no_flag" and "total_obs_count" rows
    subset = flag_df[~flag_df.index.isin(["no_flag", "total_obs_count"])]
    totals = subset.sum(numeric_only=True)
    flag_df.loc["total_flag"] = pd.Series(totals)

    # And then use those total to calculate the per-variable flag rates
    frac = flag_df.loc["total_flag"] / flag_df.loc["total_obs_count"]
    flag_df.loc["flag_rate"] = pd.Series(frac)

    # Keep only the rate
    rates_df = flag_df.loc[["flag_rate"]]
    rates_df = rates_df.rename(index={"flag_rate": station_name})

    rates_df = rates_df.reset_index()

    # Finally, append column of total observation count
    flag_df = flag_df.reset_index()
    total_obs = flag_df[flag_df['eraqc_flag_values']=='total_obs_count'].iloc[0,1]
    rates_df['total_obs_count'] = total_obs

    if len(running_rate_df) == 0:
        return rates_df

    else:
        rates_df_merged = pd.merge(rates_df, running_rate_df, how="outer")
        return rates_df_merged

In [52]:
def network_rate_tables(timestep: str) -> None:
    """
    Generates a table of flag rates per network and uploads it to AWS.

    Reads every flag count CSV stored under
    "{MERGE_DIR}/per_network_flag_counts_{timestep}_timestep" in the bucket,
    turns each one into a row of flag rates with _pairwise_rate(), and writes
    the combined table to
    "s3://{BUCKET_NAME}/{MERGE_DIR}/network_{timestep}_flag_rates.csv".

    Parameters
    ----------
    timestep: str
        if set to 'hourly', merge all hourly QAQC flag count tables
        if set to 'native', merge all native timestep QAQC flag count tables

    Returns
    -------
    None
        A message is printed and nothing is uploaded if `timestep` is invalid
        or if no non-empty flag count table is found.

    """
    ## Setup

    # Only run for a valid "timestep" input
    if timestep not in ("hourly", "native"):
        print("invalid timestep: ", timestep)
        return None

    # Running table of flag rates. It stays an empty list until the first
    # network is added, which is how _pairwise_rate() recognises "nothing yet".
    flag_rate_df = []

    # Point to folder containing network flag count CSVs
    flags_prefix = f"{MERGE_DIR}/per_network_flag_counts_{timestep}_timestep"

    ## Merge flag counts

    # Loop through all CSVs at the given level
    for item in s3.Bucket(BUCKET_NAME).objects.filter(Prefix=flags_prefix):
        obj = s3_cl.get_object(Bucket=BUCKET_NAME, Key=item.key)
        flags = pd.read_csv(obj["Body"])

        # Nothing to add for a table without rows
        if flags.empty:
            continue

        # Network name is the part of the file name before "_flag"
        network_name = item.key.split(flags_prefix + "/")[1].split("_flag")[0]

        # Remove "QAQC_function" and "Flag_meaning" columns - we don't need these
        flags = flags.drop(["QAQC_function", "Flag_meaning"], axis=1)

        # Send current dataframe and dataframe of previously generated rates to helper function
        flag_rate_df = _pairwise_rate(flags, flag_rate_df, network_name)

    # Still a list: no table was found, so there is nothing to rename or upload
    if not isinstance(flag_rate_df, pd.DataFrame):
        print(f"No flag count tables found under: {flags_prefix}")
        return None

    # Change "eraqc_flag_values" to "networks"
    flag_rate_df = flag_rate_df.rename(columns={"eraqc_flag_values": "networks"})

    ## Send final flag rates file to AWS as CSV
    csv_s3_filepath = f"s3://{BUCKET_NAME}/{MERGE_DIR}/network_{timestep}_flag_rates.csv"
    flag_rate_df.to_csv(csv_s3_filepath, index=False)
    print(f"Sending network flag rates CSV to: {csv_s3_filepath}")

    return None

In [50]:
# Build and upload the per-network flag rate table for the hourly timestep.
network_rate_tables(timestep="hourly")

Sending station flag rates CSV to: s3://wecc-historical-wx/4_merge_wx/network_hourly_flag_rates.csv


In [51]:
def station_rate_tables(timestep: str) -> None:
    """
    Generates a table of flag rates per station and uploads it to AWS.

    For each network in the hard-coded network list, reads every flag count
    CSV stored under "{MERGE_DIR}/{network}/eraqc_counts_{timestep}_timestep"
    in the bucket and turns it into a row of flag rates with _pairwise_rate().
    Once all networks are processed, the combined table is written to
    "s3://{BUCKET_NAME}/{MERGE_DIR}/station_{timestep}_flag_rates.csv".

    Parameters
    ----------
    timestep: str
        if set to 'hourly', merge all hourly QAQC flag count tables
        if set to 'native', merge all native timestep QAQC flag count tables

    Returns
    -------
    None
        A message is printed and nothing is uploaded if `timestep` is invalid
        or if no non-empty flag count table is found.

    """
    ## Setup

    # Only run for a valid "timestep" input
    if timestep not in ("hourly", "native"):
        print("invalid timestep: ", timestep)
        return None

    # List of networks to iterate over
    network_list = ["VCAPCD", "CDEC"]

    # Running table of flag rates. It stays an empty list until the first
    # station is added, which is how _pairwise_rate() recognises "nothing yet".
    flag_rate_df = []

    ## Merge flag counts

    for network in network_list:
        # Point to folder containing station flag count CSVs
        flags_prefix = f"{MERGE_DIR}/{network}/eraqc_counts_{timestep}_timestep"

        # Loop through all CSVs at the given level
        for item in s3.Bucket(BUCKET_NAME).objects.filter(Prefix=flags_prefix):
            obj = s3_cl.get_object(Bucket=BUCKET_NAME, Key=item.key)
            flags = pd.read_csv(obj["Body"])

            # Nothing to add for a table without rows
            if flags.empty:
                continue

            # Station name is the part of the file name before "_flag"
            station_name = item.key.split(flags_prefix + "/")[1].split("_flag")[0]

            # Send current dataframe and dataframe of previously generated rates to helper function
            flag_rate_df = _pairwise_rate(flags, flag_rate_df, station_name)

    # Still a list: no table was found, so there is nothing to rename or upload
    if not isinstance(flag_rate_df, pd.DataFrame):
        print(f"No flag count tables found for networks: {network_list}")
        return None

    # Rename the identifier column only after ALL networks are merged.
    # _pairwise_rate() joins on the shared "eraqc_flag_values" column, so
    # renaming between networks would stop later stations from lining up with
    # earlier ones.
    # NOTE(review): the column holds station names but is labelled "networks";
    # the label is kept as-is for compatibility - confirm the intended name.
    flag_rate_df = flag_rate_df.rename(columns={"eraqc_flag_values": "networks"})

    ## Send final flag rates file to AWS as CSV (once, with every network included)
    csv_s3_filepath = (
        f"s3://{BUCKET_NAME}/{MERGE_DIR}/station_{timestep}_flag_rates.csv"
    )
    flag_rate_df.to_csv(csv_s3_filepath, index=False)
    print(f"Sending station flag rates CSV to: {csv_s3_filepath}")

    return None

In [None]:
# Build and upload the per-station flag rate table. `timestep` is a required
# argument ("hourly" or "native"); "hourly" matches the network table call above.
station_rate_tables("hourly")

## Map

### Merge with station list

In [None]:
# Load the station list CSV and keep only the rows belonging to one network.
# NOTE(review): `network` is not assigned at notebook scope in the visible
# code (it only exists as a loop variable inside station_rate_tables()), so it
# has to be set before this cell is run.
station_list = pd.read_csv(stations_csv_path)
sub_station_list = station_list.loc[station_list["network"] == network]

In [None]:
# Display the station list subset for inspection.
sub_station_list

In [None]:
# Join the station metadata with the flag rates on the shared "era-id" column.
# NOTE(review): `flag_rate_df` is not assigned at notebook scope in the visible
# code (it is local to the rate-table functions), and no visible code gives the
# rate tables an "era-id" column - confirm how it is loaded and renamed before
# this cell is run.
merged_list = pd.merge(sub_station_list, flag_rate_df, on="era-id")

In [None]:
# Display the merged table for inspection.
merged_list

In [None]:
# Same object as merged_list (not a copy), so the date parsing below is
# visible through both names.
map_list = merged_list

# Parse the start/end dates as UTC datetimes.
for date_col in ("start-date", "end-date"):
    map_list[date_col] = pd.to_datetime(map_list[date_col], utc=True)

# Turn the table into a geodataframe of points built from longitude/latitude,
# tagged as EPSG:4326.
gdf = gpd.GeoDataFrame(
    map_list,
    geometry=gpd.points_from_xy(map_list["longitude"], map_list["latitude"]),
)
gdf = gdf.set_crs(epsg=4326)

# Reproject to web mercator (EPSG:3857) to match the base tiles.
gdf_wm = gdf.to_crs(epsg=3857)

# Read in the US state geometries and drop territories, AK and HI.
us = gpd.read_file(shapepath)
rem_list = ["HI", "AK", "MP", "GU", "AS", "PR", "VI"]
us = us.loc[~us.STUSPS.isin(rem_list)]

# Clip the stations to the remaining states, in the same projection.
us = us.to_crs(epsg=3857)
gdf_us = gdf_wm.clip(us)



In [None]:
# Plot the clipped stations coloured by the "tas" column, with a legend.
ax = gdf_us.plot(
    column="tas",
    cmap="nipy_spectral",
    markersize=3,
    alpha=1,
    legend=True,
    figsize=(15, 15),
)

# Add the CartoDB Positron basemap underneath and hide the axes.
cx.add_basemap(ax, source=cx.providers.CartoDB.Positron)
ax.set_axis_off()