# QAQC order and single function testing

## Import libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import matplotlib as mpl
import glob

%matplotlib inline
%load_ext autoreload
%autoreload 2

## Import QAQC libraries

In [2]:
import os
import tempfile
import argparse 

# Import all qaqc script functions
try:
    from qaqc_plot import *
    from qaqc_utils import *
    from qaqc_wholestation import *
    from qaqc_logic_checks import *
    from qaqc_buoy_check import *
    from qaqc_frequent import *
    from qaqc_unusual_gaps import *
    from qaqc_unusual_large_jumps import *
    from qaqc_climatological_outlier import *
    from qaqc_unusual_streaks import *
except Exception as e:
    print("Error importing qaqc script: {}".format(e))

# Import qaqc stage calc functions
try:
    from QAQC_pipeline import *
except:
    print("Error importing QAQC_pipeline.py")
    
if not os.path.exists("./qaqc_figs"):
    os.makedirs("./qaqc_figs")

## Testing on a single network / station

### Load file and convert to pandas df 

In [3]:
""" Aqaqc_world_recordtions from QAQC libraries can be used here.
    For the qaqc pipeline, we need the pandas df in the same format used in it
"""

network = "ASOSAWOS"
rawdir, cleandir, qaqcdir, mergedir = get_file_paths(network)
print(rawdir)
print(cleandir)
print(qaqcdir)
print(mergedir)

1_raw_wx/ASOSAWOS/
2_clean_wx/ASOSAWOS/
3_qaqc_wx/ASOSAWOS/
4_merge_wx/ASOSAWOS/


In [4]:
""" We can test in the same way that the pipeline, 
    take a network and subsample an station
"""

files, stations = read_network_files(network, cleandir)
stations_sample = stations.sample(1)
station = stations_sample.iloc[0]
print(station)

ASOSAWOS_72677624036


- ASOSAWOS_74718503144
- ASOSAWOS_74917900392 

In [5]:
""" Or we can test on a locally stored station 
    file directly
"""
ds = xr.open_dataset('Train_Files/ASOSAWOS_74718503144.nc')
# ds = xr.open_dataset('Train_Files/RAWS_ZMBA2.nc')
ds

In [6]:
""" Then, the idea is to convert to pandas df in the 
    format needed for the qaqc pipeline
"""
df, MultiIndex, attrs, var_attrs = qaqc_ds_to_df(ds)
df.head(2)

  base = data.astype(np.int64)
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")


Unnamed: 0,time,ps,tas,tdps,pr,sfcWind,sfcWind_dir,elevation,qaqc_process,ps_qc,...,pr_eraqc,sfcWind_eraqc,sfcWind_dir_eraqc,elevation_eraqc,ps_altimeter_eraqc,psl_eraqc,pr_duration_eraqc,anemometer_height_m,thermometer_height_m,station
0,2006-01-01 00:53:00,101280.0,292.55,279.85,0.0,10.8,240.0,-16.0,V020,5,...,,,,,,,NaT,10.06,,ASOSAWOS_74718503144
1,2006-01-01 01:53:00,101310.0,292.05,281.45,0.0,4.1,270.0,-16.0,V020,5,...,,,,,,,NaT,10.06,,ASOSAWOS_74718503144


In [7]:
# new_df = qaqc_unusual_gaps(df, plots=True, verbose=True)

In [8]:
# import cProfile

In [9]:
def whole_station_checks(df):
    t0 = time.time()

    #---------------------------------------------------------
    ## Missing values -- does not proceed through qaqc if failure
    stn_to_qaqc = df.copy()  # Need to define before qaqc_pipeline, in case 
    new_df = qaqc_missing_vals(stn_to_qaqc, verbose=verbose)
    if new_df is None:
        errors = print_qaqc_failed(errors, station, end_api,
                                message="has an unchecked missing value",
                                test="qaqc_missing_vals",
                                verbose=verbose)
        return None # whole station failure, skip to next station
    else:
        stn_to_qaqc = new_df
        printf('pass qaqc_missing_vals', log_file=log_file, verbose=verbose)

    #---------------------------------------------------------
    ## Lat-lon -- does not proceed through qaqc if failure
    new_df = qaqc_missing_latlon(stn_to_qaqc, verbose=verbose)
    if new_df is None:
        errors = print_qaqc_failed(errors, station, end_api, 
                                message="missing lat-lon", 
                                test="qaqc_missing_latlon",
                                verbose=verbose)
        return None # whole station failure, skip to next station
    else:
        stn_to_qaqc = new_df
        printf('pass qaqc_missing_latlon', log_file=log_file, verbose=verbose)
    #---------------------------------------------------------
    ## Within WECC -- does not proceed through qaqc if failure
    new_df = qaqc_within_wecc(stn_to_qaqc, verbose=verbose)
    if new_df is None:
        errors = print_qaqc_failed(errors, station, end_api, 
                                message="lat-lon is out of range for WECC", 
                                test="qaqc_within_wecc",
                                verbose=verbose)
        return None # whole station failure, skip to next station
    else:
        stn_to_qaqc = new_df
        printf('pass qaqc_within_wecc', log_file=log_file, verbose=verbose)

    #---------------------------------------------------------
    ## Elevation -- if DEM in-filling fails, does not proceed through qaqc
    new_df = qaqc_elev_infill(stn_to_qaqc, verbose=verbose) # nan infilling must be before range check
    if new_df is None:
        errors = print_qaqc_failed(errors, station, end_api, 
                                message="DEM in-filling failed", 
                                test="DEM in-filling, may not mean station does not pass qa/qc -- check",
                                verbose=verbose)
    else:
        stn_to_qaqc = new_df
        printf('pass qaqc_elev_infill', log_file=log_file, verbose=verbose)
            
    #---------------------------------------------------------
    ## Elevation -- range within WECC
    new_df = qaqc_elev_range(stn_to_qaqc, verbose=verbose)
    if new_df is None:
        errors = print_qaqc_failed(errors, station, end_api, 
                                message="elevation out of range for WECC", 
                                test="qaqc_elev_range",
                                verbose=verbose)
        return None # whole station failure, skip to next station
    else:
        stn_to_qaqc = new_df
        printf('pass qaqc_elev_range', log_file=log_file, verbose=verbose)
    
    #=========================================================
    ## Part 1b: Whole station checks - if failure, entire station does proceed through QA/QC

#     #---------------------------------------------------------
#     ## Sensor height: air temperature
#     new_df = qaqc_sensor_height_t(stn_to_qaqc, verbose=verbose)
#     if new_df is None:
#         errors = print_qaqc_failed(errors, station, end_api, 
#                                 message="Flagging problem with thermometer sensor height", 
#                                 test="qaqc_sensor_height_t",
#                                 verbose=verbose)
#     else:
#         stn_to_qaqc = new_df
#         printf('pass qaqc_sensor_height_t', log_file=log_file, verbose=verbose)

#     #---------------------------------------------------------
#     ## Sensor height: wind
#     new_df = qaqc_sensor_height_w(stn_to_qaqc, verbose=verbose)
#     if new_df is None:
#         errors = print_qaqc_failed(errors, station, end_api, 
#                                 message="Flagging problem with anemometer sensor height", 
#                                 test="qaqc_sensor_height_w",
#                                 verbose=verbose)
#     else:
#         stn_to_qaqc = new_df
#         printf('pass qaqc_sensor_height_w', log_file=log_file, verbose=verbose)

    #---------------------------------------------------------
    ## World record checks: air temperature, dewpoint, wind, pressure
    new_df = qaqc_world_record(stn_to_qaqc, verbose=verbose)
    if new_df is None:
        errors = print_qaqc_failed(errors, station, end_api, 
                                message="Flagging problem with world record check", 
                                test="qaqc_world_record",
                                verbose=verbose)
    else:
        stn_to_qaqc = new_df
        printf('pass qaqc_world_record', log_file=log_file, verbose=verbose)

    printf("Done whole station tests, Ellapsed time: {:.2f} s.\n".format(time.time()-t0), log_file=log_file, verbose=verbose)
    #=========================================================
    ## Part 2: Variable logic checks
    
    t0 = time.time()
    printf("QA/QC logic checks", file=log_file, verbose=verbose)
    #---------------------------------------------------------
    ## dew point temp cannot exceed air temperature
    new_df = qaqc_crossvar_logic_tdps_to_tas_supersat(stn_to_qaqc, verbose=verbose)
    if new_df is None:
        errors = print_qaqc_failed(errors, station, end_api, 
                                message="Flagging problem with temperature cross-variable logic check", 
                                test="qaqc_crossvar_logic_tdps_to_tas_supersat",
                                verbose=verbose)
    else:
        stn_to_qaqc = new_df
        printf('pass qaqc_crossvar_logic_tdps_to_tas_supersat', log_file=log_file, verbose=verbose)

    #---------------------------------------------------------
    ## dew point temp cannot exceed air temperature (wet bulb drying)  
    new_df = qaqc_crossvar_logic_tdps_to_tas_wetbulb(stn_to_qaqc, verbose=verbose)
    if new_df is None:
        errors = print_qaqc_failed(errors, station, end_api, 
                                message="Flagging problem with temperature cross-variable logic check", 
                                test="qaqc_crossvar_logic_tdps_to_tas_wetbulb",
                                verbose=verbose)
    else:
        stn_to_qaqc = new_df
        printf('pass qaqc_crossvar_logic_tdps_to_tas_wetbulb', log_file=log_file, verbose=verbose)

    #---------------------------------------------------------
    ## precipitation is not negative
    new_df = qaqc_precip_logic_nonegvals(stn_to_qaqc, verbose=verbose)
    if new_df is None:
        errors = print_qaqc_failed(errors, station, end_api, 
                                message="Flagging problem with negative precipitation values", 
                                test="qaqc_precip_logic_nonegvals",
                                verbose=verbose)
    else:
        stn_to_qaqc = new_df
        printf('pass qaqc_precip_logic_nonegvals', log_file=log_file, verbose=verbose)

    #---------------------------------------------------------
    ## precipitation duration logic
    new_df = qaqc_precip_logic_accum_amounts(stn_to_qaqc, verbose=verbose)
    if new_df is None:
        errors = print_qaqc_failed(errors, station, end_api, 
                                message="Flagging problem with precip duration logic check", 
                                test="qaqc_precip_logic_accum_amounts",
                                verbose=verbose)
    else:
        stn_to_qaqc = new_df
        printf('pass qaqc_precip_logic_accum_amounts', log_file=log_file, verbose=verbose)      

    #---------------------------------------------------------
    ## wind direction should be 0 when wind speed is also 0
    new_df = qaqc_crossvar_logic_calm_wind_dir(stn_to_qaqc, verbose=verbose)
    if new_df is None:
        errors = print_qaqc_failed(errors, station, end_api, 
                                message="Flagging problem with wind cross-variable logic check", 
                                test="qaqc_crossvar_logic_calm_wind_dir",
                                verbose=verbose, 
                                file=file)
    else:
        stn_to_qaqc = new_df
        printf('pass qaqc_crossvar_logic_calm_wind_dir', log_file=log_file, verbose=verbose)

    printf("Done logic checks, Ellapsed time: {:.2f} s.\n".format(time.time()-t0), log_file=log_file, verbose=verbose)
    
    return stn_to_qaqc

In [10]:
iqr_thresh=5

# bypass check
vars_to_remove = ['index','station','qc','duration','method',
                  'anemometer_height_m','thermometer_height_m',
                  'lat','lon','elevation','time','month','year',
                  'sfcWind_dir','hurs', 
                  'pr', 'pr_qc', 'pr_depth_qc', 'pr_duration'
                 ] # list of var substrings to exclude if present in var
vars_to_check = [var for var in df.columns if not any(True for item in vars_to_remove if item in var)] # remove all non-primary variables
vars_to_check

['ps', 'tas', 'tdps', 'sfcWind', 'psl']

In [11]:
# in order to grab the time information more easily -- would prefer not to do this
df['month'] = pd.to_datetime(df['time']).dt.month # sets month to new variable
df['year'] = pd.to_datetime(df['time']).dt.year # sets year to new variable
global log_file
log_file = open("logtest.log","w")
verbose=True

In [12]:
newdf = whole_station_checks(df)

02-27-2024 15:21:16 : 	 Running: qaqc_missing_vals
02-27-2024 15:21:17 : 	 Updating missing values for: ps
02-27-2024 15:21:17 : 	 Updating missing values for: tas
02-27-2024 15:21:17 : 	 Updating missing values for: tdps
02-27-2024 15:21:17 : 	 Updating missing values for: pr
02-27-2024 15:21:17 : 	 Updating missing values for: sfcWind
02-27-2024 15:21:17 : 	 Updating missing values for: sfcWind_dir
02-27-2024 15:21:17 : 	 Updating missing values for: ps_altimeter
02-27-2024 15:21:17 : 	 Updating missing values for: psl
02-27-2024 15:21:17 : 	 Updating missing values for: month
02-27-2024 15:21:17 : 	 Updating missing values for: year
02-27-2024 15:21:17 : 	 pass qaqc_missing_vals
02-27-2024 15:21:17 : 	 Running: qaqc_missing_latlon
02-27-2024 15:21:17 : 	 pass qaqc_missing_latlon
02-27-2024 15:21:17 : 	 Running: qaqc_within_wecc
02-27-2024 15:21:20 : 	 pass qaqc_within_wecc
02-27-2024 15:21:20 : 	 Running: qaqc_elev_infill
02-27-2024 15:21:20 : 	 pass qaqc_elev_infill
02-27-2024 15:2

In [13]:
def median_clim(df, month, var):
    '''Part 2: Calculate climatological median for a specific month and variable'''
    clim = df[var].median(numeric_only=True)
    return clim

In [21]:
var = "tas"

In [22]:
meds = []
for month in range(1,12):
    meds.append(median_clim(df, month, var))
meds

[296.45,
 296.45,
 296.45,
 296.45,
 296.45,
 296.45,
 296.45,
 296.45,
 296.45,
 296.45,
 296.45]

In [23]:
meds = []
for month in range(1,12):
    monthly_df = df.loc[df['month']==month]
    df_valid = monthly_df[monthly_df[var+'_eraqc'].isnull()]
    # calculate monthly median per month
    meds.append(df_valid[var].median())
meds

[285.95,
 287.54999999999995,
 291.45,
 295.34999999999997,
 298.75,
 304.25,
 307.54999999999995,
 307.04999999999995,
 304.25,
 297.04999999999995,
 290.34999999999997]

In [25]:
q1 = df_valid[var].quantile(0.25)
q3 = df_valid[var].quantile(0.75)
iqr = q3 - q1
iqr

9.5

In [27]:
df_valid[var].quantile([0.25, 0.75]).diff().iloc[-1]

9.5

In [29]:
iqr_range(df_valid, var)

9.5

In [22]:
%%time
# df_part1 = gaps.qaqc_dist_gap_part1(df, vars_to_check, iqr_thresh, False, verbose=True)

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.81 µs


In [24]:
%%time
df_part1 = qaqc_dist_gap_part1(df, vars_to_check, iqr_thresh, plot=True, verbose=True)

Variable = ps
Variable = tas
Variable = tdps
Variable = sfcWind
Variable = psl
CPU times: user 427 ms, sys: 28.5 ms, total: 456 ms
Wall time: 461 ms


In [27]:
%%time
df_part2 = qaqc_dist_gap_part2(df_part1, vars_to_check, plot=True, verbose=True)

In [16]:
cProfile.run('qaqc_unusual_gaps(df, plots=True, verbose=True)')

         100859647 function calls (99167604 primitive calls) in 471.108 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    41472    0.022    0.000    0.257    0.000 <__array_function__ internals>:177(all)
        4    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(append)
    10370    0.007    0.000    0.045    0.000 <__array_function__ internals>:177(argsort)
    20740    0.014    0.000    0.119    0.000 <__array_function__ internals>:177(array_equal)
    10370    0.005    0.000    0.039    0.000 <__array_function__ internals>:177(atleast_2d)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(bincount)
    41472    0.019    0.000    0.310    0.000 <__array_function__ internals>:177(broadcast_to)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(can_cast)
    62217    0.036    0.000    0.192    0.000 <__array_function__ internals>:177(concate

In [19]:
from pycallgraph2 import PyCallGraph
from pycallgraph2.output import GraphvizOutput

with PyCallGraph(output=GraphvizOutput()):
    qaqc_unusual_gaps(df, plots=True, verbose=True)


KeyboardInterrupt



In [54]:
#-----------------------------------------------------------------------------
def qaqc_dist_var_bypass_check_profile(df, vars_to_check, min_num_months=5):     
    for var in vars_to_check:
        for month in range(1,13):
            print(var, month)
    return df

In [56]:
#-----------------------------------------------------------------------------
def qaqc_dist_gap_part1_profile(df, vars_to_check, iqr_thresh, plot=True, verbose=False):
    
    for var in vars_to_check:
        for month in range(1,13):
            print(var, month)
            # per variable bypass check
            df = qaqc_dist_var_bypass_check_profile(df, vars_to_check) # flag here is 20
            # if 20 in df[var+'_eraqc']:
            #     # print("skip")
            #     continue # skip variable

            # station has above min_num_months number of valid observations, proceed with dist gap check
            # else:
            if True:
                # valid obs only
                df_valid = df.loc[df[var+'_eraqc'].isnull() == True]

                # calculate monthly climatological median, and bounds
                # mid, low, high = standardized_median_bounds(df_valid, month, var, iqr_thresh=iqr_thresh)

                # calculate monthly median per month
                df_month = monthly_med(df_valid)

                for i in df_month.loc[df_month['month'] == month][var]:
                    print(i,month)
                    # if (i < low) or (i > high):
                    #     year_to_flag = (df_month.loc[(df_month[var]==i) & 
                    #                        (df_month['month']==month)]['year'].values[0])
                    #     printf('Median {} value for {}-{} is beyond the {}*IQR limits -- flagging month'.format(
                    #         var,
                    #         month, 
                    #         int(year_to_flag),
                    #         iqr_thresh), log_file=log_file, verbose=verbose
                    #     )

                        # flag all obs in that month
                        # df.loc[(df_valid['time'].dt.month == month) & 
                        #        (df_valid['time'].dt.year == year_to_flag), var+'_eraqc'] = 21 # see era_qaqc_flag_meanings.csv
                        #           network=df['station'].unique()[0].split('_')[0])
                
    return df


In [57]:
qaqc_dist_gap_part1_profile(df, vars_to_check, iqr_thresh)

tas 1
tas 1
tas 2
tas 3
tas 4
tas 5
tas 6
tas 7
tas 8
tas 9
tas 10
tas 11
tas 12
pr 1
pr 2
pr 3
pr 4
pr 5
pr 6
pr 7
pr 8
pr 9
pr 10
pr 11
pr 12
rsds 1
rsds 2
rsds 3
rsds 4
rsds 5
rsds 6
rsds 7
rsds 8
rsds 9
rsds 10
rsds 11
rsds 12
sfcWind 1
sfcWind 2
sfcWind 3
sfcWind 4
sfcWind 5
sfcWind 6
sfcWind 7
sfcWind 8
sfcWind 9
sfcWind 10
sfcWind 11
sfcWind 12
tdps_derived 1
tdps_derived 2
tdps_derived 3
tdps_derived 4
tdps_derived 5
tdps_derived 6
tdps_derived 7
tdps_derived 8
tdps_derived 9
tdps_derived 10
tdps_derived 11
tdps_derived 12
276.47999999999996 1
273.71 1
272.59 1
275.37 1
273.15 1
273.71 1
274.26 1
275.37 1
271.47999999999996 1
274.81699999999995 1
tas 2
tas 1
tas 2
tas 3
tas 4
tas 5
tas 6
tas 7
tas 8
tas 9
tas 10
tas 11
tas 12
pr 1
pr 2
pr 3
pr 4
pr 5
pr 6
pr 7
pr 8
pr 9
pr 10
pr 11
pr 12
rsds 1
rsds 2
rsds 3
rsds 4
rsds 5
rsds 6
rsds 7
rsds 8
rsds 9
rsds 10
rsds 11
rsds 12
sfcWind 1
sfcWind 2
sfcWind 3
sfcWind 4
sfcWind 5
sfcWind 6
sfcWind 7
sfcWind 8
sfcWind 9
sfcWind 10
sfcWi

AttributeError: 'Int64Index' object has no attribute '_with_freq'

In [112]:
df.groupby(by=["month"])['year'].unique().apply(len)

month
1     11
2     11
3     12
4     14
5     15
6     17
7     17
8     19
9     19
10    18
11    13
12    13
Name: year, dtype: int64

In [266]:
var = "tas"
df[[var, "month","year"]].groupby(by=[ "month","year"]).count().groupby("month").count()

Unnamed: 0_level_0,tas
month,Unnamed: 1_level_1
1,11
2,11
3,12
4,14
5,15
6,17
7,17
8,19
9,19
10,18


In [222]:
def whole_stn_bypass_check(df, var):
    return df.loc[:,[var, "month","year"]].groupby(by=["month"])['year'].unique().apply(len)

In [223]:
%timeit whole_stn_bypass_check(df, var="tas")

2.72 ms ± 57.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [225]:
%timeit map(whole_stn_bypass_check, [df]*len(vars_to_check), vars_to_check)

152 ns ± 1.93 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [246]:
np.array(list(map(whole_stn_bypass_check, [df]*len(vars_to_check), vars_to_check)))

array([[11, 11, 12, 14, 15, 17, 17, 19, 19, 18, 13, 13],
       [11, 11, 12, 14, 15, 17, 17, 19, 19, 18, 13, 13],
       [11, 11, 12, 14, 15, 17, 17, 19, 19, 18, 13, 13],
       [11, 11, 12, 14, 15, 17, 17, 19, 19, 18, 13, 13],
       [11, 11, 12, 14, 15, 17, 17, 19, 19, 18, 13, 13]])

In [251]:
[1,2,3].count()

TypeError: list.count() takes exactly one argument (0 given)

In [244]:
stn_length = map(whole_stn_bypass_check, [df]*len(vars_to_check), vars_to_check)
stn_length = {k:v for k,v in zip(vars_to_check, stn_length)}
stn_length


{'tas': month
 1     11
 2     11
 3     12
 4     14
 5     15
 6     17
 7     17
 8     19
 9     19
 10    18
 11    13
 12    13
 Name: year, dtype: int64,
 'pr': month
 1     11
 2     11
 3     12
 4     14
 5     15
 6     17
 7     17
 8     19
 9     19
 10    18
 11    13
 12    13
 Name: year, dtype: int64,
 'rsds': month
 1     11
 2     11
 3     12
 4     14
 5     15
 6     17
 7     17
 8     19
 9     19
 10    18
 11    13
 12    13
 Name: year, dtype: int64,
 'sfcWind': month
 1     11
 2     11
 3     12
 4     14
 5     15
 6     17
 7     17
 8     19
 9     19
 10    18
 11    13
 12    13
 Name: year, dtype: int64,
 'tdps_derived': month
 1     11
 2     11
 3     12
 4     14
 5     15
 6     17
 7     17
 8     19
 9     19
 10    18
 11    13
 12    13
 Name: year, dtype: int64}

In [243]:
tmp

{'tas': month
 1     11
 2     11
 3     12
 4     14
 5     15
 6     17
 7     17
 8     19
 9     19
 10    18
 11    13
 12    13
 Name: year, dtype: int64,
 'pr': month
 1     11
 2     11
 3     12
 4     14
 5     15
 6     17
 7     17
 8     19
 9     19
 10    18
 11    13
 12    13
 Name: year, dtype: int64,
 'rsds': month
 1     11
 2     11
 3     12
 4     14
 5     15
 6     17
 7     17
 8     19
 9     19
 10    18
 11    13
 12    13
 Name: year, dtype: int64,
 'sfcWind': month
 1     11
 2     11
 3     12
 4     14
 5     15
 6     17
 7     17
 8     19
 9     19
 10    18
 11    13
 12    13
 Name: year, dtype: int64,
 'tdps_derived': month
 1     11
 2     11
 3     12
 4     14
 5     15
 6     17
 7     17
 8     19
 9     19
 10    18
 11    13
 12    13
 Name: year, dtype: int64}