# Data Quality Visualizations

In [None]:
import time
import boto3
import numpy as np
import pandas as pd
import xarray as xr
import geopandas as gpd
import contextily as cx

# S3 handles (boto3 picks up credentials from its default configuration)
s3 = boto3.resource("s3")  # resource API, used to list objects in the bucket
s3_cl = boto3.client("s3")  # client API, used for lower-level object downloads

# Bucket and folder names from which the S3 paths in this notebook are built.
BUCKET_NAME = "wecc-historical-wx"
QAQC_DIR = "3_qaqc_wx"
MERGE_DIR = "4_merge_wx"

# Station list written by the QA/QC step
stations_csv_path = f"s3://{BUCKET_NAME}/{QAQC_DIR}/all_network_stationlist_qaqc.csv"
# US state boundaries shapefile, used to clip stations when mapping
shapepath = f"s3://{BUCKET_NAME}/0_maps/tl_2021_us_state"

## Station-wise flag rates

In [None]:
def _pairwise_rate(flag_df_1: pd.DataFrame, flag_df_2: pd.DataFrame,station_name: str) -> pd.DataFrame:
    """
    Sums two input flag count dataframes. This is a helper function for sum_flag_counts().

    Parameters
    ----------
    flag_df_1: pd.DataFrame
        dataframe of previously summed station flag counts
    flag_df_2: pd.DataFrame
        flag counts dataframes for next station

    Returns
    -------
    summed_df: pd.DataFrame

    """
    flag_df_1 = flag_df_1.set_index("eraqc_flag_values")
    subset = flag_df_1[~flag_df_1.index.isin(["no_flag", "total_obs_count"])]

    totals = subset.sum(numeric_only=True)
    flag_df_1.loc["total_flag"] = pd.Series(totals)

    frac = flag_df_1.loc["total_flag"] / flag_df_1.loc["total_obs_count"]
    flag_df_1.loc["frac"] = pd.Series(frac)

    rates_df = flag_df_1.loc[["frac"]]
    rates_df = rates_df.rename(index={"frac": station_name})

    rates_df = rates_df.reset_index()

    # append column of total observation count
    flag_df_1 = flag_df_1.reset_index()
    total_obs = flag_df_1[flag_df_1['eraqc_flag_values']=='total_obs_count'].iloc[0,1]
    rates_df['total_obs_count'] = total_obs

    if len(flag_df_2) == 0:
        return rates_df

    else:
        rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
        return rates_df_merge

In [None]:
# Network whose station flag counts and station list rows are used in the cells below
network = "VCAPCD"

vectorization, column-wise computation

include "total counts" column, counts for each station (can be referenced later)

use append for the station-wise rate tables
merge (?) for network-wise rate tables -> will take care of this for you
- if using append, we would need to be explicit about how to handle new columns (tell it to fill with NAs) -> perhaps more control

=> using merge for a single row should not be necessary

2D data may not be sufficient anymore
- we'll have x num of different variable names


include station type in map (buoy vs land (airport, mountain, etc.)) -> point shape


go with a sparse dataframe, with total counts included (i.e. merge)


color = flag rate
point size = total obs count

In [None]:
# Accumulator for the station flag rates: starts as an empty list and becomes
# a dataframe once the first non-empty CSV has been processed
flag_rate_df = []

# S3 folder holding the per-station flag count CSVs of the selected network
flags_prefix = f"{MERGE_DIR}/{network}/eraqc_counts_native_timestep"

## Merge flag counts

# Visit every object stored under the prefix
for item in s3.Bucket(BUCKET_NAME).objects.filter(Prefix=flags_prefix):
    obj = s3_cl.get_object(Bucket=BUCKET_NAME, Key=item.key)
    flags = pd.read_csv(obj["Body"])
    # Station name = the part of the file name that precedes "_flag"
    station_name = item.key.split(f"{flags_prefix}/")[1].partition("_flag")[0]
    # Only non-empty CSVs contribute a row: the helper turns the counts into
    # rates and merges them into the rates collected so far
    if not flags.empty:
        flag_rate_df = _pairwise_rate(flags, flag_rate_df, station_name)

In [None]:
# Label column -> "era-id", the key used to join with the station list
flag_rate_df = flag_rate_df.rename({"eraqc_flag_values": "era-id"}, axis="columns")

In [None]:
# Remove the "elevation" column from the rates table
flag_rate_df = flag_rate_df.drop(columns=["elevation"])

In [None]:
flag_rate_df

### Merge with station list

In [None]:
# Read the full station list, then keep only the rows of the selected network
station_list = pd.read_csv(stations_csv_path)
sub_station_list = station_list.loc[station_list["network"] == network]

In [None]:
sub_station_list

In [None]:
# Join station metadata with the flag rates on the shared "era-id" column
merged_list = pd.merge(sub_station_list, flag_rate_df, on="era-id")

In [None]:
merged_list

next steps: extract station name

## Network-wise flag rates

In [None]:
# Flag count CSV of the selected network (native timestep)
network_flag_path = f"s3://{BUCKET_NAME}/{MERGE_DIR}/per_network_flag_counts_native_timestep/{network}_flag_counts_native_timestep.csv"
flag_df = pd.read_csv(network_flag_path)

In [None]:
flag_df

In [None]:
# Accumulator for the network flag rates: starts as an empty list and becomes
# a dataframe once the first non-empty CSV has been processed
flag_rate_df = []

# S3 folder holding one flag count CSV per network
flags_prefix = f"{MERGE_DIR}/per_network_flag_counts_native_timestep"

## Merge flag counts

# Visit every object stored under the prefix
for item in s3.Bucket(BUCKET_NAME).objects.filter(Prefix=flags_prefix):
    obj = s3_cl.get_object(Bucket=BUCKET_NAME, Key=item.key)
    flags = pd.read_csv(obj["Body"])
    # Row label = the part of the file name that precedes "_flag"
    station_name = item.key.split(f"{flags_prefix}/")[1].partition("_flag")[0]
    # Only non-empty CSVs contribute a row: the helper turns the counts into
    # rates and merges them into the rates collected so far
    if not flags.empty:
        flag_rate_df = _pairwise_rate(flags, flag_rate_df, station_name)

In [None]:
flag_rate_df

## Main functions

In [121]:
def network_rate_tables() -> pd.DataFrame:
    """
    Generates a table of flag rates with one row per network.

    Reads every flag count CSV found under
    "{MERGE_DIR}/per_network_flag_counts_native_timestep" in the S3 bucket,
    converts each non-empty one into a row of flag rates with _pairwise_rate()
    and combines the rows into one dataframe.

    Parameters
    ----------
    None

    Returns
    -------
    flag_rate_df: pd.DataFrame
        flag rates, one row per network; an empty dataframe if no non-empty
        CSV was found

    Notes
    -----
    The upload of the table to S3 is currently commented out; only the
    destination path is printed.
    """

    # the function iteratively adds flag rates to this dataframe
    flag_rate_df = pd.DataFrame()

    # point to folder containing the per-network flag count CSVs
    flags_prefix = f"{MERGE_DIR}/per_network_flag_counts_native_timestep"

    ## Merge flag counts

    # loop through all CSVs at the given level
    for item in s3.Bucket(BUCKET_NAME).objects.filter(Prefix=flags_prefix):
        obj = s3_cl.get_object(Bucket=BUCKET_NAME, Key=item.key)
        flags = pd.read_csv(obj["Body"])

        # skip CSVs that contain no rows
        if flags.empty:
            continue

        # network name = the part of the file name that precedes "_flag"
        network_name = item.key.split(flags_prefix + "/")[1].split("_flag")[0]

        # add this network's rates to the rates collected so far
        flag_rate_df = _pairwise_rate(flags, flag_rate_df, network_name)

    ## Send final flag rates file to AWS as CSV
    csv_s3_filepath = f"s3://{BUCKET_NAME}/{MERGE_DIR}/network_flag_rates.csv"
    # NOTE: upload is disabled; uncomment to write the table to S3
    # flag_rate_df.to_csv(csv_s3_filepath, index=False)
    print(f"Sending network flag rates CSV to: {csv_s3_filepath}")

    return flag_rate_df

In [122]:
network_rate_tables()

  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
  rates_

Sending station flag rates CSV to: s3://wecc-historical-wx/4_merge_wx/network_flag_rates.csv


  rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")


Unnamed: 0,eraqc_flag_values,elevation,hurs,pr_1h,pr,ps,rsds,sfcWind_dir,sfcWind,tas,...,ps_altimeter,ps_derived,accum_pr_24h,accum_pr_localmid,pr_24h,pr_5min,pr_localmid,accum_pr_1h,accum_pr_5min,pvp_derived
0,VCAPCD,0.0,0.001586439,8.2e-05,0.0,0.0005987326,0.002429234,0.003154,0.001392,0.002116,...,,,,,,,,,,
1,SNOTEL,0.0,0.0001435126,,0.321078,,0.03075973,,,0.002551,...,,,,,,,,,,
2,SHASAVAL,0.0,0.0002139301,0.000131,,,,0.011551,0.007433,0.000178,...,0.002454,0.002454,,,,,,,,
3,SGXWFO,0.0,2.127545e-08,0.001365,5e-06,9.070196e-05,0.01120008,0.040221,0.093519,0.00141,...,0.353959,0.336662,1.0,1.0,0.060711,0.0,0.001603,1.0,,
4,SCAN,0.0,3.392482e-05,,0.006791,,,0.003057,,0.000539,...,,,,,,,,,,
5,RAWS,0.0,4.626154e-06,3.4e-05,0.017633,0.0485557,0.01626169,0.095958,0.000583,0.00211,...,0.066155,0.061133,1.0,,8.9e-05,0.0,,1.0,,
6,OtherISD,0.019282,,,0.001198,0.018517,0.0,0.000392,0.000506,0.000576,...,0.059712,,,,,,,,,
7,NOS-PORTS,0.0,,,0.0,,1.047805e-07,0.003711,0.155035,0.000288,...,0.084542,0.084627,,,0.0,0.0,,,,
8,NOS-NWLON,0.0,5.414964e-07,0.349142,8e-06,9.077145e-07,4.74667e-08,0.004627,0.073874,0.000129,...,0.06726,0.067319,,,0.000128,0.290638,,,,
9,NDBC,0.070599,,,,0.01042413,,0.006146,0.001205,0.032881,...,,,,,,,,,,


In [None]:
def station_rate_tables(network_list=None) -> pd.DataFrame:
    """
    Generates a table of flag rates with one row per station, across the
    requested networks.

    For each network, reads every station flag count CSV found under
    "{MERGE_DIR}/{network}/eraqc_counts_native_timestep" in the S3 bucket,
    converts each non-empty one into a row of flag rates with _pairwise_rate()
    and combines the rows into one dataframe.

    Parameters
    ----------
    network_list: list of str, optional
        names of the networks to process. Defaults to ["VCAPCD", "CDEC"]

    Returns
    -------
    flag_rate_df: pd.DataFrame
        flag rates, one row per station; an empty dataframe if no non-empty
        CSV was found

    Notes
    -----
    The upload of the table to S3 is currently commented out; only the
    destination path is printed.
    """
    if network_list is None:
        network_list = ["VCAPCD", "CDEC"]

    # the function iteratively adds flag rates to this dataframe
    flag_rate_df = pd.DataFrame()

    for network in network_list:
        # point to folder containing station flag count CSVs
        flags_prefix = f"{MERGE_DIR}/{network}/eraqc_counts_native_timestep"

        ## Merge flag counts

        # loop through all CSVs at the given level
        for item in s3.Bucket(BUCKET_NAME).objects.filter(Prefix=flags_prefix):
            obj = s3_cl.get_object(Bucket=BUCKET_NAME, Key=item.key)
            flags = pd.read_csv(obj["Body"])

            # skip CSVs that contain no rows
            if flags.empty:
                continue

            # station name = the part of the file name that precedes "_flag"
            station_name = item.key.split(flags_prefix + "/")[1].split("_flag")[0]

            # add this station's rates to the rates collected so far
            flag_rate_df = _pairwise_rate(flags, flag_rate_df, station_name)

    ## Send final flag rates file to AWS as CSV
    # Done once, after all networks have been processed, so that the file
    # holds the complete table
    csv_s3_filepath = f"s3://{BUCKET_NAME}/{MERGE_DIR}/station_flag_rates.csv"
    # NOTE: upload is disabled; uncomment to write the table to S3
    # flag_rate_df.to_csv(csv_s3_filepath, index=False)
    print(f"Sending station flag rates CSV to: {csv_s3_filepath}")

    return flag_rate_df

In [120]:
station_rate_tables()

Sending station flag rates CSV to: s3://wecc-historical-wx/4_merge_wx/station_flag_rates.csv
Sending station flag rates CSV to: s3://wecc-historical-wx/4_merge_wx/station_flag_rates.csv


Unnamed: 0,eraqc_flag_values,elevation,tas,total_obs_count,hurs,ps_altimeter,ps_derived,ps,tdps_derived,rsds,sfcWind_dir,sfcWind,accum_pr,pr,pr_1h
0,CDEC_STG,0.0,0.002821,97822.0,,,,,,,,,,,
1,CDEC_RBW,0.0,1e-05,97317.0,0.013739,0.0,0.0,0.0,0.016554,,,,,,
2,CDEC_PVP,0.0,0.0,102452.0,0.0,0.0,0.0,2e-05,0.013733,,,,,,
3,CDEC_EPK,0.0,0.0,102461.0,0.0,0.0,0.0,0.0,6.8e-05,0.0,0.000439,0.0,,,
4,CDEC_DPO,0.0,0.0,9904.0,0.0,,,1.0,1.0,,0.000606,0.0,1.0,0.759087,
5,CDEC_CWD,0.0,0.077066,12366.0,,,,,,,,,1.0,0.001132,
6,CDEC_CDW,0.0,0.000882,401502.0,0.0,0.174764,0.174764,0.000306,0.001557,0.008466,0.006575,0.084779,1.0,0.000478,
7,CDEC_BLB,0.0,,102083.0,,,,,,,,,1.0,0.019621,
8,VCAPCD_TO,0.0,0.003403,86112.0,0.001498,,,,0.009395,0.001347,0.002822,0.001486,,,0.000279
9,VCAPCD_SV,0.0,0.00178,102822.0,0.001683,,,0.000311,0.007226,0.000856,0.002733,0.001537,,,0.0


## Map

In [None]:
# Work on a copy so the conversions below do not modify merged_list
map_list = merged_list.copy()

# Format dates in datetime format (this gets lost in import).
map_list["start-date"] = pd.to_datetime(map_list["start-date"], utc=True)
map_list["end-date"] = pd.to_datetime(map_list["end-date"], utc=True)

# Make a geodataframe with one point per station.
gdf = gpd.GeoDataFrame(
    map_list,
    geometry=gpd.points_from_xy(map_list.longitude, map_list.latitude),
)
gdf.set_crs(epsg=4326, inplace=True)  # Set CRS

# Project data to match base tiles.
gdf_wm = gdf.to_crs(epsg=3857)  # Web mercator

# Read in the US state boundaries.
us = gpd.read_file(shapepath)

# Remove territories, AK, HI
rem_list = ["HI", "AK", "MP", "GU", "AS", "PR", "VI"]
us = us.loc[~us.STUSPS.isin(rem_list)]

# Use to clip stations (both layers must be in the same CRS)
us = us.to_crs(epsg=3857)
gdf_us = gdf_wm.clip(us)



In [None]:
# Plot
ax = gdf_us.plot(
    "tas",
    figsize=(15, 15),
    alpha=1,
    markersize=3,
    legend=True,
    cmap="nipy_spectral",
)
cx.add_basemap(ax, source=cx.providers.CartoDB.Positron)
ax.set_axis_off()