# QA/QC pipeline testing

### Development of qaqc pipeline using xarray and pandas

- Functions tested for now:
    - qaqc_missing_latlon
    - qaqc_within_wecc
    - qaqc_elev_infill
    - qaqc_elev_range
    - qaqc_precip_logic_nonegvals
    - qaqc_precip_logic_accum_amounts
    - spurious_buoy_check
    - qaqc_sensor_height_w
    - qaqc_sensor_height_t
    - qaqc_world_record
    - qaqc_crossvar_logic_tdps_to_tas
    - qaqc_crossvar_logic_calm_wind_dir

In [7]:
import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import glob
import sys
import os
import warnings
from IPython.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Timing original/xarray/pandas version

- Saving files into temp/{station}_{version}.nc
- Testing the first 4 stations in a network (RAWS for now)

In [3]:
# Python executable with era_py39 kernel (shown as the cell output)
PYTHON = sys.executable
PYTHON

'/Users/hector/anaconda3/envs/era_py39/bin/python3.9'

### Original version

`which python` ALLNETWORKS_qaqc.py --network RAWS --sample 4

- real	10m10.727s
- user	0m42.571s
- sys	0m26.752s

### xarray version

`which python` ALLNETWORKS_qaqc_xarray.py --network RAWS --sample 4

- real	10m49.677s
- user	0m45.524s
- sys	0m24.015s

### pandas (optimized) version

`which python` ALLNETWORKS_qaqc_pandas.py --network RAWS --sample 4

- real	8m49.695s
- user	0m34.030s
- sys	0m16.152s

### Conclusions on timing

- Most of the time is spent reading in from s3 bucket and writing back to aws.
- The optimized pandas functions run faster and are clearer and more consistent with one another.
- Only pandas (not mix of pandas and xarray) is used.
- The real difference with the optimized pandas version will be noticeable once everything is run on an AWS server.

In [4]:
import calc_qaqc_pandas as cpd
import QAQC_pipeline_pandas as qpd 

import calc_qaqc_xarray as cxr
import QAQC_pipeline_xarray as qxr

import QAQC_pipeline_original as qor

In [5]:
errors, end_api, timestamp = qpd.setup_error_handling()

In [6]:
ds = xr.open_dataset("train_files/RAWS_NWRU1.nc")


KeyboardInterrupt



In [None]:
# Prepare `ds` for the QA/QC functions: collect the raw QC variable names, add
# an all-NaN "<var>_eraqc" flag variable for every remaining variable, and
# flatten the result into the DataFrame `df`.
exclude_qaqc = ["time", "station", "lat", "lon", "qaqc_process", "sfcWind_method"]  # lat and lon have a different qc check

# Snapshot the names first: the loop below adds variables to `ds`, and
# iterating a mapping while it is being extended is fragile.
var_names = list(ds.variables)

# Raw qc variables (vary station to station); need to keep for comparison,
# then drop. One combined test records each name once even when it matches
# both patterns.
raw_qc_vars = [var for var in var_names if "q_code" in var or "_qc" in var]

era_qc_vars = []  # our qc variables
for var in var_names:
    if var in exclude_qaqc or var in raw_qc_vars:
        continue
    qc_var = var + "_eraqc"  # variable/column label
    era_qc_vars.append(qc_var)
    # New variable in the shape of the original one, filled with NaN.
    ds[qc_var] = xr.full_like(ds[var], np.nan)
df = ds.to_dataframe().reset_index()

In [None]:
# Run the pandas pipeline on the single RAWS_NWRU1 timing file.
# NOTE: `control` is only bound in the later cell that opens this file with
# netCDF4 (`control = nc.Dataset(...)`), so that cell has to be run first.
# NOTE(review): the batch-comparison cells pass an xarray dataset as the first
# argument instead -- confirm which type the pipeline expects.
test_pd = qpd.run_qaqc_pipeline(control, 
                                "RAWS", 
                                "timing_files/RAWS_NWRU1.nc",
                                errors,
                                "RAWS_NWRU1",
                                end_api,
                                verbose=True)

In [None]:
# Run the xarray pipeline on the single RAWS_NWRU1 timing file.
# NOTE: `control` is only bound in the later cell that opens this file with
# netCDF4 (`control = nc.Dataset(...)`), so that cell has to be run first.
# NOTE(review): the batch-comparison cells pass an xarray dataset as the first
# argument instead -- confirm which type the pipeline expects.
test_xr = qxr.run_qaqc_pipeline(control, 
                                "RAWS", 
                                "timing_files/RAWS_NWRU1.nc",
                                errors,
                                "RAWS_NWRU1",
                                end_api,
                                verbose=True)

In [None]:
# Run the original pipeline; `control` is only bound in the later netCDF4 cell.
# NOTE(review): the argument list used for qor.run_qaqc_pipeline differs between
# cells of this notebook -- confirm the current signature.
test_or = qor.run_qaqc_pipeline(control, network="RAWS", verbose=True)

In [None]:
# Compare the "*_eraqc" flags produced by the original, xarray and pandas
# pipelines on every 100th training file. For each station and flag variable,
# `differences[station][var]` holds the "mean +- std" of (original - xarray),
# followed by the same summary for (original - pandas) when it can be computed.
differences = {}

# Errors tolerated while comparing one variable. `era_qc_vars` was built from
# another dataset, so a name may be absent for this station (KeyError); the
# remaining types cover results that lack the accessors used below.
compare_errors = (KeyError, AttributeError, TypeError, ValueError)


def _mean_std_summary(reference, candidate):
    """Return "mean +- std" of the element-wise difference reference - candidate."""
    delta = reference - candidate
    return "{} +- {}".format(delta.mean(), delta.std())


train_files = np.sort(glob.glob("train_files/*.nc"))
for f in train_files[::100]:
    station = f.split("/")[-1].split(".nc")[0]
    differences[station] = {}
    network = station.split("_")[0]
    with xr.open_dataset(f, engine="h5netcdf") as ds:
        try:
            test_xr = qxr.run_qaqc_pipeline(ds, network, f, errors, station, end_api, verbose=False)
            test_pd = qpd.run_qaqc_pipeline(ds, network, f, errors, station, end_api, verbose=False)
            # NOTE(review): another cell of this notebook also passes `errors`
            # and `end_api` to the original pipeline -- confirm the signature.
            test_or = qor.run_qaqc_pipeline(ds, network, verbose=False)
        except Exception as err:
            # Best effort: report the failure and move on, so results of a
            # previous station are never compared under this station's name.
            print("Skipping {}: {!r}".format(station, err))
            continue

        for var in era_qc_vars:
            try:
                original = test_or[var].reset_index()[var]
                xr_summary = _mean_std_summary(
                    original, test_xr[var].to_dataframe().reset_index()[var]
                )
            except compare_errors:
                # Without the xarray entry the list positions would be ambiguous.
                continue
            differences[station][var] = [xr_summary]

            try:
                pd_summary = _mean_std_summary(original, test_pd[var].reset_index()[var])
            except compare_errors:
                continue
            differences[station][var].append(pd_summary)

In [None]:
# Run the three pipelines on every 100th training file after dropping
# duplicated time stamps. `differences` only receives an empty dict per
# station here; no statistics are computed in this cell.
differences = {}

train_files = np.sort(glob.glob("train_files/*.nc"))
for f in train_files[::100]:
    station = f.split("/")[-1].split(".nc")[0]
    differences[station] = {}
    network = station.split("_")[0]
    with xr.open_dataset(f, engine="h5netcdf") as ds:
        ds = ds.drop_duplicates(dim="time")
        test_xr = qxr.run_qaqc_pipeline(ds, network, f, errors, station, end_api, verbose=False)
        # The pandas result has to come from the pandas pipeline module (qpd).
        test_pd = qpd.run_qaqc_pipeline(ds, network, f, errors, station, end_api, verbose=False)
        test_or = qor.run_qaqc_pipeline(ds, network, errors, end_api, verbose=False)


In [None]:
differences['ASOSAWOS_72041100137']

In [None]:
differences

In [None]:
ds

In [None]:
ds.drop_duplicates(dim="time")

In [None]:
a = [1,2]

In [None]:
# NOTE: list.extend() expects an iterable, so passing the int 2 raises
# TypeError; a.append(2) or a.extend([2]) would add a single element.
a.extend(2)

In [None]:
# Point-wise difference of the wind-direction flags: test_pd minus test_xr.
fig, ax = plt.subplots()

pd_flags = test_pd["sfcWind_dir_eraqc"].reset_index()["sfcWind_dir_eraqc"]
xr_flags = test_xr["sfcWind_dir_eraqc"].to_dataframe().reset_index()["sfcWind_dir_eraqc"]
(pd_flags - xr_flags).plot(ax=ax, marker='.', ls='')
    

In [None]:
# Point-wise difference of the wind-direction flags: test_pd minus test_or.
fig, ax = plt.subplots()

pd_flags = test_pd["sfcWind_dir_eraqc"].reset_index()["sfcWind_dir_eraqc"]
or_flags = test_or["sfcWind_dir_eraqc"].reset_index()["sfcWind_dir_eraqc"]
(pd_flags - or_flags).plot(ax=ax, marker='.', ls='')

In [None]:
# Open the RAWS_NWRU1 timing file directly with netCDF4 and bind it to
# `control`, the name passed as first argument by the single-station
# run_qaqc_pipeline cells earlier in the notebook. The last line lists the
# indices at which the "time" values are NaN.
import netCDF4 as nc
control = nc.Dataset("timing_files/RAWS_NWRU1.nc")
np.where(np.isnan(control["time"][:]))[0]

In [None]:
def plot_flag_scatter(xrf, pdf):
    """Plot the difference between the ``eraqc`` flags stored in two files.

    Both inputs are opened with ``xr.open_dataset``. For every data variable
    of ``xrf`` whose name contains ``"eraqc"`` the difference
    ``xrf[var][0, :] - pdf[var][0, :]`` is drawn on one shared axis. Each
    series is shifted up by its position in the variable list so that the
    lines of different variables do not overlap.

    Parameters
    ----------
    xrf
        First file, passed straight to ``xr.open_dataset``. Its variable
        names drive the comparison and its source name provides the title
        (everything after the last "/" and before "_xar").
    pdf
        Second file, passed straight to ``xr.open_dataset``. It must contain
        the same ``eraqc`` variables as ``xrf``.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    # Context managers make sure both files are closed again; everything that
    # reads data from them (the plotting) therefore happens inside the block.
    with xr.open_dataset(xrf) as xr_ds, xr.open_dataset(pdf) as pd_ds:
        eraqc_vars = [var for var in xr_ds.data_vars if "eraqc" in var]

        _, ax = plt.subplots(figsize=(10, 3))

        for offset, ev in enumerate(eraqc_vars):
            # The [0, :] indexing assumes two-dimensional flag variables and
            # takes the first entry along the leading dimension.
            diff = xr_ds[ev][0, :] - pd_ds[ev][0, :]
            ax.plot(offset + diff, label=ev)

        source = xr_ds.encoding['source']

    # Shrink current axis by 20%
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

    # Put a legend to the right of the current axis
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=10)

    station = source.split("/")[-1].split("_xar")[0]
    ax.set_title(station)

#### Plot the difference between xarray and pandas flag vars

In [None]:
# One figure per file pair, for the first six entries of xr_files / pd_files.
for idx in range(6):
    plot_flag_scatter(xr_files[idx], pd_files[idx])

# Timing individual functions

In [None]:
# Load RAWS_BCFO3, add an all-NaN "<var>_eraqc" flag variable for every
# variable that is neither excluded nor a raw QC variable, and flatten the
# dataset into the DataFrame `df`.
ds = xr.open_dataset("train_files/RAWS_BCFO3.nc")
exclude_qaqc = ["time", "station", "lat", "lon", "qaqc_process", "sfcWind_method"]  # lat and lon have a different qc check

# Snapshot the names first: the loop below adds variables to `ds`, and
# iterating a mapping while it is being extended is fragile.
var_names = list(ds.variables)

# Raw qc variables (vary station to station); need to keep for comparison,
# then drop. One combined test records each name once even when it matches
# both patterns.
raw_qc_vars = [var for var in var_names if "q_code" in var or "_qc" in var]

era_qc_vars = []  # our qc variables
for var in var_names:
    if var in exclude_qaqc or var in raw_qc_vars:
        continue
    qc_var = var + "_eraqc"  # variable/column label
    era_qc_vars.append(qc_var)
    # New variable in the shape of the original one, filled with NaN.
    ds[qc_var] = xr.full_like(ds[var], np.nan)
df = ds.to_dataframe().reset_index()

In [None]:
# Load CIMIS_75, add an all-NaN "<var>_eraqc" flag variable for every
# variable that is neither excluded nor a raw QC variable, and flatten the
# dataset into the DataFrame `df1`. The last line displays the columns whose
# name contains "pr".
ds1 = xr.open_dataset("train_files/CIMIS_75.nc")
exclude_qaqc = ["time", "station", "lat", "lon", "qaqc_process", "sfcWind_method"]  # lat and lon have a different qc check

# Snapshot the names first: the loop below adds variables to `ds1`, and
# iterating a mapping while it is being extended is fragile.
var_names = list(ds1.variables)

# Raw qc variables (vary station to station); need to keep for comparison,
# then drop. One combined test records each name once even when it matches
# both patterns.
raw_qc_vars = [var for var in var_names if "q_code" in var or "_qc" in var]

era_qc_vars = []  # our qc variables
for var in var_names:
    if var in exclude_qaqc or var in raw_qc_vars:
        continue
    qc_var = var + "_eraqc"  # variable/column label
    era_qc_vars.append(qc_var)
    # New variable in the shape of the original one, filled with NaN.
    ds1[qc_var] = xr.full_like(ds1[var], np.nan)
df1 = ds1.to_dataframe().reset_index()
[var for var in df1.columns if 'pr' in var]

In [None]:
# Load NDBC_46023, add an all-NaN "<var>_eraqc" flag variable for every
# variable that is neither excluded nor a raw QC variable, and flatten the
# dataset into the DataFrame `df2`.
ds2 = xr.open_dataset("train_files/NDBC_46023.nc")

exclude_qaqc = ["time", "station", "lat", "lon", "qaqc_process", "sfcWind_method"]  # lat and lon have a different qc check

# Snapshot the names first: the loop below adds variables to `ds2`, and
# iterating a mapping while it is being extended is fragile.
var_names = list(ds2.variables)

# Raw qc variables (vary station to station); need to keep for comparison,
# then drop. One combined test records each name once even when it matches
# both patterns.
raw_qc_vars = [var for var in var_names if "q_code" in var or "_qc" in var]

era_qc_vars = []  # our qc variables
for var in var_names:
    if var in exclude_qaqc or var in raw_qc_vars:
        continue
    qc_var = var + "_eraqc"  # variable/column label
    era_qc_vars.append(qc_var)
    # New variable in the shape of the original one, filled with NaN.
    ds2[qc_var] = xr.full_like(ds2[var], np.nan)
df2 = ds2.to_dataframe().reset_index()

# NOTE: `tmp` is an alias of `era_qc_vars`, not a copy, so dropping the
# elevation flag below removes it from `era_qc_vars` as well.
tmp = era_qc_vars
if "elevation_eraqc" in tmp:
    tmp.remove("elevation_eraqc")

### Original

In [None]:
import ALLNETWORKS_qaqc as oo_functions

### xarray

In [None]:
import ALLNETWORKS_qaqc_xarray as xr_functions

### Pandas optimized

In [None]:
import ALLNETWORKS_qaqc_pandas as pd_functions

### Timing functions

In [None]:
# Time every QA/QC function in its three implementations: original (oo_*),
# xarray (xr_*) and optimized pandas (pd_*). `%timeit -o` returns the timing
# result object, whose `.average` attribute is read in a later cell.
# Inputs: `df`/`ds` for most checks, `df1`/`ds1` for the precipitation
# accumulation check and `df2` for the buoy check.
# NOTE(review): these names are rebound in several cells of this notebook --
# make sure the intended setup cells were the last ones run before timing.
oo_t0  = %timeit -o oo_functions.qaqc_missing_latlon(df)
xr_t0 = %timeit  -o xr_functions.qaqc_missing_latlon(ds)
pd_t0 = %timeit  -o pd_functions.qaqc_missing_latlon(df)

oo_t1 = %timeit -o oo_functions.qaqc_within_wecc(df)
xr_t1 = %timeit -o xr_functions.qaqc_within_wecc(ds)
pd_t1 = %timeit -o pd_functions.qaqc_within_wecc(df)

oo_t2 = %timeit -o oo_functions.qaqc_elev_infill(df)
xr_t2 = %timeit -o xr_functions.qaqc_elev_infill(ds, verbose=False)
pd_t2 = %timeit -o pd_functions.qaqc_elev_infill(df, verbose=False)

oo_t3 = %timeit -o oo_functions.qaqc_elev_range(df)
xr_t3 = %timeit -o xr_functions.qaqc_elev_range(ds, verbose=False)
pd_t3 = %timeit -o pd_functions.qaqc_elev_range(df, verbose=False)

oo_t4 = %timeit -o oo_functions.qaqc_precip_logic_nonegvals(df)
xr_t4 = %timeit -o xr_functions.qaqc_precip_logic_nonegvals(ds, verbose=False)
pd_t4 = %timeit -o pd_functions.qaqc_precip_logic_nonegvals(df, verbose=False)

oo_t5 = %timeit -o oo_functions.qaqc_precip_logic_accum_amounts(df1)
xr_t5 = %timeit -o xr_functions.qaqc_precip_logic_accum_amounts(ds1, verbose=False)
pd_t5 = %timeit -o pd_functions.qaqc_precip_logic_accum_amounts(df1, verbose=False)

# The buoy check is only timed for the pandas version; the original and xarray
# calls are commented out and replaced by NaN placeholders so that the three
# timing lists keep the same length. The pandas call receives a copy of df2.
# oo_t6 = %timeit -o oo_functions.spurious_buoy_check("NDBC_46023", df2, tmp)
oo_t6 = np.nan
# xr_t6 = %timeit -o xr_functions.spurious_buoy_check(ds2, verbose=False)
xr_t6 = np.nan
pd_t6 = %timeit -o pd_functions.spurious_buoy_check(df2.copy(), era_qc_vars, verbose=False)

oo_t7 = %timeit -o oo_functions.qaqc_sensor_height_w(ds, df)
xr_t7 = %timeit -o xr_functions.qaqc_sensor_height_w(ds, verbose=False)
pd_t7 = %timeit -o pd_functions.qaqc_sensor_height_w(df, verbose=False)

oo_t8 = %timeit -o oo_functions.qaqc_sensor_height_t(ds, df)
xr_t8 = %timeit -o xr_functions.qaqc_sensor_height_t(ds, verbose=False)
pd_t8 = %timeit -o pd_functions.qaqc_sensor_height_t(df, verbose=False)

oo_t9 = %timeit -o oo_functions.qaqc_world_record(df)
xr_t9 = %timeit -o xr_functions.qaqc_world_record(ds, verbose=False)
pd_t9 = %timeit -o pd_functions.qaqc_world_record(df, verbose=False)

oo_t10 = %timeit -o oo_functions.qaqc_crossvar_logic_tdps_to_tas(df)
xr_t10 = %timeit -o xr_functions.qaqc_crossvar_logic_tdps_to_tas(ds, verbose=False)
pd_t10 = %timeit -o pd_functions.qaqc_crossvar_logic_tdps_to_tas(df, verbose=False)

oo_t11 = %timeit -o oo_functions.qaqc_crossvar_logic_calm_wind_dir(df)
xr_t11 = %timeit -o xr_functions.qaqc_crossvar_logic_calm_wind_dir(ds, verbose=False)
pd_t11 = %timeit -o pd_functions.qaqc_crossvar_logic_calm_wind_dir(df, verbose=False)

In [None]:
# Collect the twelve timing results per implementation, in the order in which
# the functions were timed (index 6 is the buoy check).
oo_timing = [
    oo_t0, oo_t1, oo_t2, oo_t3,
    oo_t4, oo_t5, oo_t6, oo_t7,
    oo_t8, oo_t9, oo_t10, oo_t11,
]

xr_timing = [
    xr_t0, xr_t1, xr_t2, xr_t3,
    xr_t4, xr_t5, xr_t6, xr_t7,
    xr_t8, xr_t9, xr_t10, xr_t11,
]

pd_timing = [
    pd_t0, pd_t1, pd_t2, pd_t3,
    pd_t4, pd_t5, pd_t6, pd_t7,
    pd_t8, pd_t9, pd_t10, pd_t11,
]

In [None]:
# Names of the timed functions, in the same order as the timing lists; used
# as x tick labels of the timing plot.
functions = [
    # location and elevation checks
    "qaqc_missing_latlon",
    "qaqc_within_wecc",
    "qaqc_elev_infill",
    "qaqc_elev_range",
    # precipitation logic
    "qaqc_precip_logic_nonegvals",
    "qaqc_precip_logic_accum_amounts",
    # buoy and sensor-height checks
    "spurious_buoy_check",
    "qaqc_sensor_height_w",
    "qaqc_sensor_height_t",
    # world records and cross-variable logic
    "qaqc_world_record",
    "qaqc_crossvar_logic_tdps_to_tas",
    "qaqc_crossvar_logic_calm_wind_dir",
]

In [None]:
# Average run time per function and implementation. Entries that are not
# timing results (the NaN placeholders used for calls that were not timed)
# have no `.average` attribute and are mapped to NaN.
odt = np.array([getattr(result, "average", np.nan) for result in oo_timing])
xrt = np.array([getattr(result, "average", np.nan) for result in xr_timing])
pdt = np.array([getattr(result, "average", np.nan) for result in pd_timing])

In [None]:
# Average timing of every function for the three implementations, on a
# logarithmic y axis, with the function names as x tick labels.
fig, ax = plt.subplots(figsize=(7, 3))

for series, label in ((odt, "original"), (xrt, "xarray"), (pdt, "pandas opt")):
    ax.plot(series, '.:', label=label)

ax.set_yscale("log")
ax.set_ylabel("Timing [s]")
leg = ax.legend()

tick_positions = np.arange(len(pd_timing))
ax.set_xticks(tick_positions)
_ = ax.set_xticklabels(functions, rotation = 45, ha="right")

NameError: name 'df' is not defined

In [21]:
# Scratch: inspect the `station` coordinate of RAWS_BCHN5 and select the first
# entry along the station dimension.
# NOTE: this rebinds `ds` and `ds2`, which other cells of this notebook use
# for the RAWS_BCFO3 and NDBC_46023 datasets.
ds = xr.open_dataset("train_files/RAWS_BCHN5.nc")
print(ds.station)
ds2 = ds.isel(station=0)
ds2

<xarray.DataArray 'station' (station: 1)>
array(['RAWS_BCHN5'], dtype=object)
Coordinates:
  * station  (station) object 'RAWS_BCHN5'
Attributes:
    long_name:  station_id
    comment:    Unique ID created by Eagle Rock Analytics. Includes network n...


In [28]:
ds2.assign_coords({"station":"RAWS_BCHN5"}).expand_dims({"station":1})
# ds2.assign_coords({"station":"RAWS_BCHN5"})

In [20]:
# ds2.assign_coords(coords={"station":"RAWS_BCHN5"}).expand_dims(dim={"station":"RAWS_BCHN5"})