# Data Quality Visualizations

In [1]:
import time
import boto3
import numpy as np
import pandas as pd
import xarray as xr
import geopandas as gpd
import contextily as cx

# Create the S3 handles. No credentials are passed here, so boto3 uses its
# default credential lookup.
s3 = boto3.resource("s3")
s3_cl = boto3.client("s3")  # for lower-level processes

# Set relative paths to other folders and objects in repository.
BUCKET_NAME = "wecc-historical-wx"
QAQC_DIR = "3_qaqc_wx"
MERGE_DIR = "4_merge_wx"
stations_csv_path = f"s3://{BUCKET_NAME}/{QAQC_DIR}/all_network_stationlist_qaqc.csv"
# Built from BUCKET_NAME (like the other paths) so the bucket is named in one place only.
shapepath = f"s3://{BUCKET_NAME}/0_maps/tl_2021_us_state"

In [2]:
# Load the table of total flag counts (native timestep) stored at the top of the merge directory.
total_flag_path = f"s3://{BUCKET_NAME}/{MERGE_DIR}/total_flag_counts_native_timestep.csv"
total_flag_list = pd.read_csv(total_flag_path)

Append the flag rate of each target variable to `station_list`.
We want the flag rate per variable, per station. Let's test with ASOSAWOS.

In [19]:
# Flag counts (native timestep) for one ASOSAWOS station file; used as the first test case.
network_flag_path = f"s3://{BUCKET_NAME}/{MERGE_DIR}/ASOSAWOS/eraqc_counts_native_timestep/ASOSAWOS_A0705300346_flag_counts_native_timestep.csv"
flag_df_1_hello = pd.read_csv(network_flag_path)

In [20]:
# Flag counts (native timestep) for a second ASOSAWOS station file; used as the second test case.
network_flag_path_2 = f"s3://{BUCKET_NAME}/{MERGE_DIR}/ASOSAWOS/eraqc_counts_native_timestep/ASOSAWOS_A0685400115_flag_counts_native_timestep.csv"
flag_df_2_hello = pd.read_csv(network_flag_path_2)

In [21]:
# Inspect the raw flag counts of the first test station.
flag_df_1_hello

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,ps,sfcWind_dir,sfcWind,tas,tdps
0,no_flag,140066,140066,140066,140066,140066,135752,140066,135949
1,27.0,0,0,0,0,0,4314,0,1691
2,28.0,0,0,0,0,0,0,0,1733
3,13.0,0,0,0,0,0,0,0,693
4,total_obs_count,140066,140066,140066,140066,140066,140066,140066,140066


In [22]:
# Inspect the raw flag counts of the second test station.
flag_df_2_hello

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,ps,sfcWind_dir,sfcWind,tas,tdps
0,no_flag,205851,205851,205851,205851,205851,205851,205851,205387
1,27.0,0,0,0,0,0,0,0,464
2,total_obs_count,205851,205851,205851,205851,205851,205851,205851,205851


In [23]:
# Index the raw counts by flag value so that rows can be addressed by label.
flag_df_1 = flag_df_1_hello.set_index("eraqc_flag_values")

# Keep only the rows holding counts of individual flags
# (everything except the "no_flag" and "total_obs_count" rows).
subset = flag_df_1.loc[~flag_df_1.index.isin(["no_flag", "total_obs_count"])]

# Flagged observations per variable, stored as a new "total_flag" row.
totals = subset.sum(numeric_only=True)
flag_df_1.loc["total_flag"] = totals

# Flag rate = flagged observations / all observations, stored as a new "frac" row.
frac = flag_df_1.loc["total_flag"].div(flag_df_1.loc["total_obs_count"])
flag_df_1.loc["frac"] = frac

# Keep only the rate row and label it with a placeholder station name.
rates_df = flag_df_1.loc[["frac"]].rename(index={"frac": "station_name"})

In [58]:
# Index the raw counts by flag value so that rows can be addressed by label.
flag_df_2 = flag_df_2_hello.set_index("eraqc_flag_values")

# Keep only the rows holding counts of individual flags
# (everything except the "no_flag" and "total_obs_count" rows).
subset_2 = flag_df_2.loc[~flag_df_2.index.isin(["no_flag", "total_obs_count"])]

# Flagged observations per variable, stored as a new "total_flag" row.
# Note: `totals` is the same name the station-1 cell uses, so it is overwritten here.
totals = subset_2.sum(numeric_only=True)
flag_df_2.loc["total_flag"] = totals

# Flag rate = flagged observations / all observations, stored as a new "frac" row.
frac_2 = flag_df_2.loc["total_flag"].div(flag_df_2.loc["total_obs_count"])
flag_df_2.loc["frac"] = frac_2

# Keep only the rate row and label it with a placeholder station name.
rates_df_2 = flag_df_2.loc[["frac"]].rename(index={"frac": "station_name_2"})

In [25]:
# Turn the row label into a regular "eraqc_flag_values" column.
# NOTE: not idempotent - running this cell again adds a further index column.
rates_df = rates_df.reset_index()

In [26]:
# Inspect the flag-rate row of the first test station.
rates_df

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,ps,sfcWind_dir,sfcWind,tas,tdps
0,station_name,0.0,0.0,0.0,0.0,0.0,0.0308,0.0,0.029393


In [43]:
# Turn the row label into a regular "eraqc_flag_values" column.
# NOTE: not idempotent - running this cell again adds a further index column.
rates_df_2 = rates_df_2.reset_index()

In [50]:
# Inspect the flag-rate row of the second test station.
rates_df_2

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,ps,sfcWind_dir,sfcWind,tas,tdps
0,station_name_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002254


In [50]:
# Outer-merge the two single-row rate tables. No `on=` is given, so pandas
# joins on every column the two tables share.
test = pd.merge(rates_df,rates_df_2, how='outer')

In [51]:
# Inspect the merged rate table.
test

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,ps,sfcWind_dir,sfcWind,tas,tdps
0,station_name,0.0,0.0,0.0,0.0,0.0,0.0308,0.0,0.029393
1,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002254


In [27]:
# Flag counts (native timestep) for a third ASOSAWOS station file.
# NOTE(review): `flag_df_3_hello` is not referenced again in the cells shown here.
network_flag_path_3 = f"s3://{BUCKET_NAME}/{MERGE_DIR}/ASOSAWOS/eraqc_counts_native_timestep/ASOSAWOS_99999994176_flag_counts_native_timestep.csv"
flag_df_3_hello = pd.read_csv(network_flag_path_3)

In [75]:
# Keep what follows the folder prefix in the path. This is the CSV file name,
# not yet a bare station identifier.
station_name = network_flag_path.split(f"s3://{BUCKET_NAME}/{MERGE_DIR}/ASOSAWOS/eraqc_counts_native_timestep" + "/")[1]

In [76]:
# Inspect the extracted name (still the full CSV file name).
station_name

'ASOSAWOS_A0705300346_flag_counts_native_timestep.csv'

In [None]:
# Total observation count of the first test station: the value at column
# position 1 of the "total_obs_count" row of the raw counts.
# The row mask and the frame it filters must be the same dataframe; the mask
# was previously applied to `rates_df_2`, a one-row table of a different station.
total_obs = flag_df_1_hello.loc[flag_df_1_hello["eraqc_flag_values"] == "total_obs_count"].iloc[0, 1]

In [52]:
# Attach the station's total observation count to its rate row as an extra column.
rates_df["total_obs_count"] = total_obs

In [53]:
# Inspect the rate row with the total observation count attached.
rates_df

Unnamed: 0,eraqc_flag_values,elevation,pr,ps_altimeter,ps,sfcWind_dir,sfcWind,tas,tdps,total_obs_count
0,station_name,0.0,0.0,0.0,0.0,0.0,0.0308,0.0,0.029393,140066


In [None]:
def _pairwise_rate(flag_df_1: pd.DataFrame, flag_df_2: pd.DataFrame,station_name: str) -> pd.DataFrame:
    """
    Sums two input flag count dataframes. This is a helper function for sum_flag_counts().

    Parameters
    ----------
    flag_df_1: pd.DataFrame
        dataframe of previously summed station flag counts
    flag_df_2: pd.DataFrame
        flag counts dataframes for next station

    Returns
    -------
    summed_df: pd.DataFrame

    """
    flag_df_1 = flag_df_1.set_index("eraqc_flag_values")
    subset = flag_df_1[~flag_df_1.index.isin(["no_flag", "total_obs_count"])]

    totals = subset.sum(numeric_only=True)
    flag_df_1.loc["total_flag"] = pd.Series(totals)

    frac = flag_df_1.loc["total_flag"] / flag_df_1.loc["total_obs_count"]
    flag_df_1.loc["frac"] = pd.Series(frac)

    rates_df = flag_df_1.loc[["frac"]]
    rates_df = rates_df.rename(index={"frac": station_name})

    rates_df = rates_df.reset_index()

    # append column of total observation count
    flag_df_1 = flag_df_1.reset_index()
    total_obs = flag_df_1[flag_df_1['eraqc_flag_values']=='total_obs_count'].iloc[0,1]
    rates_df['total_obs_count'] = total_obs

    if len(flag_df_2) == 0:
        return rates_df

    else:
        rates_df_merge = pd.merge(rates_df, flag_df_2, how="outer")
        return rates_df_merge

In [5]:
# Network whose station flag-count CSVs are processed in the loop below.
network = "VCAPCD"

vectorization, column-wise computation

include "total counts" column, counts for each station (can be referenced later)

use append for the station-wise rate tables
merge (?) for network-wise rate tables -> will take care of this for you
- if we use append, we would need to be explicit about how to handle new columns (tell it to fill with NAs) -> perhaps more control

=> using merge for a single row should not be necessary

2D data may not be sufficient anymore
- we'll have x num of different variable names


include station type in map (buoy vs land (airport, mountain, etc.)) -> point shape


go with sparse dataframe, with total counts included (ie merge)


color = flag rate
point size = total obs count

In [64]:
# Running table of flag rates: the helper adds one row per station to it.
# It starts as an empty dataframe so that the name always refers to a dataframe,
# even when no station file is found.
flag_rate_df = pd.DataFrame()

# point to folder containing station flag count CSVs
flags_prefix = f"{MERGE_DIR}/{network}/eraqc_counts_native_timestep"  # /per_network_flag_counts_native_timestep/"

# Compute and merge the flag rates, one station file at a time.

# loop through all CSVs at the given level; the trailing "/" keeps the listing
# from also matching other prefixes that merely start with the same text
for item in s3.Bucket(BUCKET_NAME).objects.filter(Prefix=flags_prefix + "/"):
    # skip anything that is not a CSV file (e.g. a folder placeholder object)
    if not item.key.endswith(".csv"):
        continue

    obj = s3_cl.get_object(Bucket=BUCKET_NAME, Key=item.key)
    flags = pd.read_csv(obj["Body"])

    # file name relative to the folder; used as the station label for now
    station_name = item.key.split(flags_prefix + "/")[1]

    # the CSV has no rows: there is nothing to compute a rate from
    if flags.empty:
        continue

    # send current dataframe and the table of previously computed rates to helper function
    flag_rate_df = _pairwise_rate(flags, flag_rate_df, station_name)

flag_df_2:  []
flag_df_1:    eraqc_flag_values  elevation           hurs     pr_1h        pr  \
0           no_flag   103040.0  102868.000000  103040.0  103040.0   
1              11.0        0.0     172.000000       0.0       0.0   
2              23.0        0.0       0.000000       0.0       0.0   
3              15.0        0.0       0.000000       0.0       0.0   
4              26.0        0.0       0.000000       0.0       0.0   
5              25.0        0.0       0.000000       0.0       0.0   
6              12.0        0.0       0.000000       0.0       0.0   
7   total_obs_count   103040.0  103040.000000  103040.0  103040.0   
8        total_flag        0.0     172.000000       0.0       0.0   
9              frac        0.0       0.001669       0.0       0.0   

              ps           rsds    sfcWind_dir        sfcWind            tas  \
0  103035.000000  102976.000000  102394.000000  102961.000000  102806.000000   
1       2.000000      64.000000      79.000000      7

In [65]:
# Inspect the per-station flag rates of the selected network.
flag_rate_df

Unnamed: 0,eraqc_flag_values,elevation,hurs,pr_1h,rsds,sfcWind_dir,sfcWind,tas,tdps_derived,total_obs_count,ps,pr
0,VCAPCD_TO_flag_counts_native_timestep.csv,0.0,0.001498,0.000279,0.001347,0.002822,0.001486,0.003403,0.009395,86112.0,,
1,VCAPCD_SV_flag_counts_native_timestep.csv,0.0,0.001683,0.0,0.000856,0.002733,0.001537,0.00178,0.007226,102822.0,0.000311,
2,VCAPCD_SU_flag_counts_native_timestep.csv,0.0,0.002337,0.0,0.002046,0.00385,0.002308,0.004024,0.009144,103124.0,0.001435,0.0
3,VCAPCD_PU_flag_counts_native_timestep.csv,0.0,0.001378,0.0,0.001227,0.001644,0.001355,0.000706,0.006322,86368.0,,
4,VCAPCD_OJ_flag_counts_native_timestep.csv,0.0,0.000908,0.000232,0.008078,0.001314,0.000908,0.000502,0.003614,103492.0,,0.0
5,VCAPCD_ER_flag_counts_native_timestep.csv,0.0,0.001669,0.0,0.000621,0.006269,0.000767,0.002271,0.006871,103040.0,4.9e-05,0.0


Next steps: extract the station name from each file name (the `eraqc_flag_values` column currently holds the full CSV file name).

## Map

In [None]:
# Load the station list so that this cell runs on a fresh kernel.
# NOTE(review): assumed to be the CSV at `stations_csv_path`, which the setup cell
# defines but no cell shown here reads - confirm this is the intended source.
station_list = pd.read_csv(stations_csv_path)

# Format dates in datetime format (this gets lost in import).
station_list["start-date"] = pd.to_datetime(station_list["start-date"], utc=True)
station_list["end-date"] = pd.to_datetime(station_list["end-date"], utc=True)

# Make a geodataframe; the CRS is set at construction instead of in place afterwards.
gdf = gpd.GeoDataFrame(
    station_list,
    geometry=gpd.points_from_xy(station_list.longitude, station_list.latitude),
    crs="EPSG:4326",
)

# Project data to match base tiles.
gdf_wm = gdf.to_crs(epsg=3857)  # Web mercator

# Read in the state geometries.
us = gpd.read_file(shapepath)

# Remove territories, AK, HI
rem_list = ["HI", "AK", "MP", "GU", "AS", "PR", "VI"]
us = us.loc[~us.STUSPS.isin(rem_list)]

# Use to clip stations
us = us.to_crs(epsg=3857)
gdf_us = gdf_wm.clip(us)

# Plot the clipped stations, colored by network.
ax = gdf_us.plot(
    "network",
    figsize=(15, 15),
    alpha=1,
    markersize=3,
    legend=True,
    cmap="nipy_spectral",
)
cx.add_basemap(ax, source=cx.providers.CartoDB.Positron)
ax.set_axis_off()

