Motivation: individual gross outliers from the general station distribution are a common error in observational data, caused by random recording, reporting, formatting, or instrumentation errors

Process:
1. uses individual observation deviations derived from monthly mean climatology calculated for each hour of the day
2. climatologies calculated using winsorised data to remove initial effect of outliers
    - Winsorising: all values beyond a threshold value from the mean are set to that threshold value
    - 5 and 95% for hadisd
    - number of data values in population remains the same, not trimmed
3. raw unwinsorised observations are anomalised using these climatologies
4. standardized by IQR for that month and hour
    - IQR cannot be less than 1.5degC
5. values are low-pass filtered to remove any climate change signal causing overzealous removal at ends of time series
6. gaussian is fitted to the histogram of anomalies for each month
7. threshold value is set where the fitted gaussian crosses the y=0.1 line, rounded outwards
8. distribution beyond the threshold value is scanned for a gap equal to the bin width or more
9. all values beyond gap are flagged
10. obs that fall between critical threshold value and gap or critical threshold and end of distribution are tentatively flagged
    - these may be later reinstated on comparison with good data from neighboring stations

Notes:
- when applied to SLP, frequently flags storm signals, which may be of high interest, so this test is not applied to pressure data
- hadisd only applies to temp and dewpoint temp

In [1]:
import pandas as pd
import numpy as np
import xarray as xr

from scipy.stats.mstats import winsorize
import matplotlib.pyplot as plt
import scipy.stats as stats



In [2]:
# NOTE(review): absolute path to a local development copy of a single station file
station_file = '/Users/victoriaford/Desktop/eaglerock/Historical Data Platform/Train_Files/ASOSAWOS_72051724165.nc'

# open through a context manager so the netCDF file handle is released
# as soon as the contents have been copied into the dataframe
with xr.open_dataset(station_file) as ds:
    df = ds.to_dataframe()

# the index reset and the month/year helper columns are added inside
# qaqc_climatological_outlier, so they are not repeated here

  base = data.astype(np.int64)
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")


In [3]:
def clim_mon_mean_hourly(df, var, month, hour):
    '''
    Monthly mean climatology of one variable for a single hour of the day.

    Input:
    ------
        df [pd.DataFrame]: observations with a datetime-like `time` column
        var [str]: name of the column to average
        month [int]: calendar month, compared against df.time.dt.month
        hour [int]: hour of day, compared against df.time.dt.hour

    Returns:
    --------
        mean of `var` over the rows matching both month and hour;
        the result may be NaN and no special handling is applied to it
    '''

    in_month = df.time.dt.month == month
    in_hour = df.time.dt.hour == hour
    selected = df.loc[in_month & in_hour, var]

    return selected.mean(numeric_only=True)

def iqr_range_monhour(df, var, month, hour, min_iqr=1.5):
    '''
    Interquartile range of one variable for a given month and hour of day.

    Input:
    ------
        df [pd.DataFrame]: observations with a datetime-like `time` column
        var [str]: name of the column to evaluate
        month [int]: calendar month, compared against df.time.dt.month
        hour [int]: hour of day, compared against df.time.dt.hour
        min_iqr [float]: lower bound applied to the result (default 1.5,
            used to preserve low variance stations)

    Returns:
    --------
        q3 - q1 of the selected values, or min_iqr when the range is smaller.
        NaN is passed through unchanged when no quantiles can be computed
        (e.g. no rows match), because NaN < min_iqr is False.
    '''

    # build the month/hour selection once and only look at the requested column
    in_group = (df.time.dt.month == month) & (df.time.dt.hour == hour)
    values = df.loc[in_group, var]

    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1

    # iqr cannot be less than the floor in order to preserve low variance stations
    if iqr < min_iqr:
        return min_iqr

    return iqr


def clim_standardized_anom(df, vars_to_anom):
    '''
    First anomalizes data by monthly climatology for each hour, then
    standardizes by the monthly climatological anomaly IQR for each hour

    Input:
    ------
        df [pd.DataFrame]: observations with a datetime-like `time` column
        vars_to_anom [list]: names of the columns to standardize

    Returns:
    --------
        copy of df in which every column in vars_to_anom is replaced by
        (value - month/hour mean) / month/hour IQR; df itself is not modified
    '''

    df2 = df.copy()

    # month and hour of every row are loop invariant, so derive them once
    months = df.time.dt.month
    hours = df.time.dt.hour

    for m in range(1, 13):
        in_month = months == m

        for h in range(24):
            in_group = in_month & (hours == h)

            # nothing to standardize when no obs fall in this month/hour
            if not in_group.any():
                continue

            # the helpers filter on month/hour themselves; handing them the
            # already-selected rows gives the same answer without rescanning
            # the full frame for every variable
            df_m_h = df.loc[in_group]

            for var in vars_to_anom:
                clim_value = clim_mon_mean_hourly(df_m_h, var, month=m, hour=h)
                iqr_value = iqr_range_monhour(df_m_h, var, month=m, hour=h)

                # monthly climatological anomaly by hour, standardized by iqr
                df2.loc[in_group, var] = (df_m_h[var] - clim_value) / iqr_value

    return df2

def winsorize_temps(df, vars_to_anom, winz_limits=(0.05, 0.05)):
    '''
    Replaces potential spurious outliers by limiting the extreme values
    using the winz_limits set (default is 5% and 95% percentiles)

    Input:
    ------
        df [pd.DataFrame]: observations with a datetime-like `time` column
        vars_to_anom [list]: names of the columns to winsorize
        winz_limits [tuple or list]: fractions to clip at the low and high end,
            passed to scipy.stats.mstats.winsorize as `limits`

    Returns:
    --------
        copy of df in which each column in vars_to_anom has been winsorized
        separately within every month / hour-of-day group; the number of
        values is unchanged and df itself is not modified
    '''

    df2 = df.copy()

    # month and hour of every row are loop invariant, so derive them once
    months = df.time.dt.month
    hours = df.time.dt.hour

    for m in range(1, 13):
        in_month = months == m

        for h in range(24):
            in_group = in_month & (hours == h)

            # winsorize indexes into the sorted values and raises on an
            # empty array, so month/hour groups without obs are skipped
            if not in_group.any():
                continue

            # winsorize only vars in vars_to_anom
            for var in vars_to_anom:
                clipped = winsorize(df.loc[in_group, var].to_numpy(),
                                    limits=winz_limits, nan_policy='omit')

                # store the plain data rather than the masked-array wrapper
                df2.loc[in_group, var] = np.ma.getdata(clipped)

    return df2

def median_yr_anom(df, var):
    '''
    Median of one variable for every year present in the data.

    Input:
    ------
        df [pd.DataFrame]: observations with a datetime-like `time` column
        var [str]: name of the column to summarise

    Returns:
    --------
        list with one median per year, ordered like df.time.dt.year.unique()
    '''

    obs_years = df.time.dt.year

    # one median per distinct year, in order of first appearance
    return [df.loc[obs_years == yr][var].median() for yr in obs_years.unique()]

def low_pass_filter_weights(median_anoms, month_low, month_high, filter_low, filter_high):
    '''
    Calculates weights for low pass filter

    The result is the weighted mean of median_anoms[month_low:month_high]
    using the matching slice of the triangular filter [1, 2, 3, 2, 1].

    Input:
    ------
        median_anoms [list or array]: median anomalies, one per year
        month_low, month_high [int]: slice bounds into median_anoms
        filter_low, filter_high [int]: slice bounds into the filter weights;
            the slice must have the same length as the median_anoms slice

    Returns:
    --------
        weighted mean of the selected anomalies, or 0 when no selected
        anomaly carries weight
    '''

    filter_wgts = np.array([1., 2., 3., 2., 1.])[filter_low:filter_high]
    anoms = np.asarray(median_anoms, dtype=float)[month_low:month_high]

    # NaN medians (years without data) must not turn the whole weight into NaN
    valid = ~np.isnan(anoms)

    # 1 where the anomaly has a fractional part, 0 where it is a whole number
    # (including exactly 0); only the former count towards the total weight
    has_weight = np.where(valid, np.ceil(anoms - np.floor(anoms)), 0.)
    total_weight = np.sum(filter_wgts * has_weight)

    if total_weight == 0:
        return 0

    # the numerator uses the anomalies themselves; rounding them up with
    # np.ceil here would no longer give a weighted mean
    return np.sum(filter_wgts * np.where(valid, anoms, 0.)) / total_weight

def low_pass_filter(df, vars_to_anom):
    '''
    Low pass filtering on observations to remove any climate change signal 
    causing overzealous removal at ends of time series

    For every year, a weighted mean of the yearly medians in a window of up
    to two years either side is subtracted from that year's observations.

    Input:
    ------
        df [pd.DataFrame]: observations with a datetime-like `time` column;
            modified in place
        vars_to_anom [list]: names of the columns to filter

    Returns:
    --------
        df, with the filtered values written back into vars_to_anom

    NOTE(review): the window is taken over years in the order returned by
    df.time.dt.year.unique(), so rows are assumed to be in chronological order
    '''
    # identify years in data
    obs_years = df.time.dt.year
    years = obs_years.unique()
    n_years = len(years)

    for var in vars_to_anom:

        median_anoms = median_yr_anom(df, var)

        for i, year in enumerate(years):
            # five-year window centred on year i, truncated at either end of
            # the record; the filter slice is truncated by the same amount so
            # the centre weight always lines up with year i
            month_low = max(i - 2, 0)
            month_high = min(i + 3, n_years)
            filter_low = month_low - (i - 2)
            filter_high = 5 - ((i + 3) - month_high)

            window = np.asarray(median_anoms[month_low:month_high], dtype=float)

            # no signal to remove; leave this year untouched instead of
            # re-using a weight left over from a previous year
            if np.nansum(np.abs(window)) == 0:
                continue

            weights = low_pass_filter_weights(median_anoms, month_low, month_high, filter_low, filter_high)

            # a non-finite weight would blank out the whole year
            if not np.isfinite(weights):
                continue

            # want to return specific year of data at a specific variable, the variable minus weight value
            in_year = obs_years == year
            df.loc[in_year, var] = df.loc[in_year, var] - weights

    return df


## distribution gap plotting helpers
def create_bins(data, bin_size=0.25):
    '''
    Build evenly spaced bin edges spanning the full range of the data.

    Input:
    ------
        data [array-like]: values to bin; NaNs are ignored when locating the range
        bin_size [float]: width of each bin

    Returns:
    --------
        array of edges starting one bin below floor(min) and stopping
        before ceil(max) + 3 * bin_size
    '''

    # pad the rounded-out data range: one bin below, up to three bins above
    first_edge = np.floor(np.nanmin(data)) - bin_size
    stop = np.ceil(np.nanmax(data)) + (3. * bin_size)

    return np.arange(first_edge, stop, bin_size)

def pdf_bounds(df, mu, sigma, bins):
    '''
    Calculate pdf distribution, return pdf and threshold bounds

    Input:
    ------
        df: not used; kept so existing calls keep working
        mu [float]: mean of the normal distribution
        sigma [float]: standard deviation of the normal distribution
        bins [array-like]: bin edges at which the pdf is evaluated

    Returns:
    --------
        (y, left, right): the pdf evaluated at bins, and the left/right
        thresholds. Each threshold is the bin edge just outside the region
        where pdf > 0.1, rounded to the nearest integer and then moved one
        further unit outwards.

    Raises:
    -------
        ValueError: if the pdf does not exceed 0.1 at any bin edge
    '''

    bins = np.asarray(bins)
    y = stats.norm.pdf(bins, mu, sigma)

    # indices of the bin edges where the pdf is above y=0.1
    above = np.flatnonzero(y > 0.1)
    if above.size == 0:
        raise ValueError(
            'pdf never exceeds 0.1 for mu={0}, sigma={1}; no thresholds can be set'.format(mu, sigma))

    # step one edge outwards, but stay inside the array: an index of -1 would
    # silently wrap around to the last edge and len(bins) would be out of range
    left_idx = max(above[0] - 1, 0)
    right_idx = min(above[-1] + 1, len(bins) - 1)

    left_bnd = round(bins[left_idx])
    right_bnd = round(bins[right_idx])

    return (y, left_bnd - 1, right_bnd + 1)

In [4]:
def clim_outlier_plot(df, var, month, network):
    '''
    Produces a histogram of monthly standardized distribution
    with PDF overlay and threshold lines where pdf falls below y=0.1.
    Any bin that is outside of the threshold is visually flagged.
    
    Differs from dist_gap_part2_plot for the climatological outlier
    as IQR standardization does not occur within plotting

    Input:
    ------
        df [pd.DataFrame]: already standardized data with `time` and `station` columns
        var [str]: name of the column to plot
        month [int]: calendar month to plot
        network [str]: used as the folder name in the S3 key of the saved figure

    Returns:
    --------
        None; the figure is uploaded as a png to the wecc-historical-wx bucket
    '''

    # imported here because the notebook's import cell does not provide them
    from io import BytesIO
    import boto3

    # select month
    df = df.loc[df.time.dt.month == month]
    values = df[var].dropna()

    # nothing to draw when the month has no valid obs for this variable
    if values.empty:
        return

    station = df['station'].unique()[0]

    # determine number of bins
    bins = create_bins(values)

    # pdf and thresholds where pdf y=0.1, shared with the flagging routine
    mu = np.nanmean(values)
    sigma = np.nanstd(values)
    y, left_thresh, right_thresh = pdf_bounds(values, mu, sigma, bins)

    fig, ax = plt.subplots()
    try:
        # plot histogram of the selected variable only
        _, _, bars = ax.hist(values, bins=bins, log=False, density=True, alpha=0.3)
        ax.set_ylim(bottom=0.1)

        # plot pdf
        ax.plot(bins, y, 'k--', linewidth=1)

        # add vertical lines to indicate thresholds
        ax.axvline(right_thresh, color='r') # right tail
        ax.axvline(left_thresh, color='r') # left tail

        # flag visually obs that are beyond threshold
        for bar in bars:
            x = bar.get_x() + 0.5 * bar.get_width()
            if x > right_thresh or x < left_thresh:
                bar.set_color('r')

        # title and useful annotations
        ax.set_title('Climatological outlier check, {0}: {1}'.format(station, var), fontsize=10)
        ax.annotate('Month: {}'.format(month), xy=(0.025, 0.95), xycoords='axes fraction', fontsize=8)
        ax.annotate('Mean: {}'.format(round(mu,3)), xy=(0.025, 0.9), xycoords='axes fraction', fontsize=8)
        ax.annotate('Std.Dev: {}'.format(round(sigma,3)), xy=(0.025, 0.85), xycoords='axes fraction', fontsize=8)
        ax.set_ylabel('Frequency (obs)')

        # save figure to AWS
        bucket_name = 'wecc-historical-wx'
        directory = '3_qaqc_wx'
        img_data = BytesIO()
        fig.savefig(img_data, format='png')
        img_data.seek(0)

        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        figname = 'qaqc_climatological_outlier_{0}_{1}_{2}'.format(station, var, month)
        bucket.put_object(Body=img_data, ContentType='image/png',
                         Key='{0}/{1}/qaqc_figs/{2}.png'.format(
                         directory, network, figname))
    finally:
        # close figures to save memory, also when plotting or the upload fails
        plt.close(fig)

In [10]:
def _standardize_with_climatology(df_obs, df_clim, vars_to_anom):
    '''
    Anomalise and standardize the values in df_obs using the month/hour
    mean and IQR calculated from df_clim (same index and columns as df_obs).
    '''

    df_out = df_obs.copy()
    months = df_obs.time.dt.month
    hours = df_obs.time.dt.hour

    for m in range(1, 13):
        in_month = months == m

        for h in range(24):
            in_group = in_month & (hours == h)
            if not in_group.any():
                continue

            clim_m_h = df_clim.loc[in_group]

            for var in vars_to_anom:
                clim_value = clim_mon_mean_hourly(clim_m_h, var, month=m, hour=h)
                iqr_value = iqr_range_monhour(clim_m_h, var, month=m, hour=h)
                df_out.loc[in_group, var] = (df_obs.loc[in_group, var] - clim_value) / iqr_value

    return df_out


def qaqc_climatological_outlier(df, winsorize=True, winz_limits=(0.05, 0.05), plot=True, verbose=True):
    '''
    Flags individual gross outliers from climatological distribution.
    Only applied to air temperature and dew point temperature
    
    Input:
    ------
        df [pd.DataFrame]: station dataset converted to dataframe through QAQC pipeline
        winsorize [bool]: if True, the month/hour climatology is calculated from
            winsorized observations so spurious outliers do not distort it
        winz_limits [tuple or list]: if winsorize is True, fractions clipped at the
            low and high end of each month/hour distribution
        plot [bool]: if True, produces plots of any flagged data and saved to AWS
        verbose [bool]: if True, progress messages are printed
            
    Returns:
    --------
        df [pd.DataFrame]: QAQC dataframe with flagged values (see below for flag meaning)
            
    Flag meaning:
    -------------
        25,qaqc_climatological_outlier,Value flagged as a climatological outlier
    '''
    
    #### ONLY IN NOTEBOOK DEVELOPMENT, REMOVED IN CODE FOR PIPELINE #####
    df = df.reset_index()
    df['month'] = pd.to_datetime(df['time']).dt.month # sets month to new variable
    df['year'] = pd.to_datetime(df['time']).dt.year # sets year to new variable  
    
    vars_to_check = ['tas', 'tdps', 'tdps_derived']
    vars_to_anom = [v for v in vars_to_check if v in df.columns]
    
    ## ONLY IN NOTEBOOK FOR DEVELOPMENT
    for var in vars_to_anom:      
        df[var+'_eraqc'] = np.nan
    
    # TO DO: filter to only use non-flagged data

    # standardize data by monthly climatological anomalies by hour
    # (the `winsorize` argument only hides the imported function of the same
    # name inside this function; winsorize_temps still reaches the real one)
    if winsorize:
        # the climatology comes from the winsorized data, but it is the raw
        # observations that are anomalised: standardizing the winsorized
        # values instead would clip the outliers before they can be detected
        df_clim = winsorize_temps(df, vars_to_anom, winz_limits)
        df_std = _standardize_with_climatology(df, df_clim, vars_to_anom)
    else:
        df_std = clim_standardized_anom(df, vars_to_anom)

    # apply low pass filter
    df_std = low_pass_filter(df_std, vars_to_anom)
        
    # gaussian is fitted to the histogram of anomalies for each month
        # threshold value, rounded outwards where crosses y=0.1 line
        # distribution beyond threshold value is scanned for gap, equal to bin width or more
        # all values beyond gap are flagged
            # HadISD: obs that fall between critical threshold value and gap or 
            # critical threshold and end of distribution are tentatively flagged
            # these may be later reinstated on comparison with good data from neighboring stations
            # into v2 of data product

    std_months = df_std.time.dt.month
            
    for var in vars_to_anom:
        
        for month in range(1, 13):
            if verbose:
                print('Searching for outliers in {0} in month {1}...'.format(var, month))
            
            # standardized values for this month; the index is shared with df
            values = df_std.loc[std_months == month, var].dropna()
            if values.empty:
                continue

            # pdf
            mu = np.nanmean(values)
            sigma = np.nanstd(values)

            # a gaussian cannot be fitted to a constant or undefined distribution
            if not np.isfinite(sigma) or sigma == 0:
                continue

            # determine number of bins
            bins = create_bins(values)

            y, left_bnd, right_bnd = pdf_bounds(values, mu, sigma, bins)

            # identify gaps as below y=0.1 from histogram, not pdf
            y_hist, bins = np.histogram(values, bins=bins, density=True)

            # a bin is checked only when it lies completely outside the thresholds;
            # it is flagged when its histogram density is above 0.1
            # NOTE(review): confirm the density > 0.1 criterion against the gap
            # search described for this check
            flagged_bins = []
            for b, density in enumerate(y_hist):
                lower, upper = bins[b], bins[b+1]
                outside = upper <= left_bnd or lower >= right_bnd
                if outside and density > 0.1:
                    flagged_bins.append(b)

                    # index of the obs in this bin; select on the standardized
                    # values, then flag the same rows in the full df
                    idx_to_flag = values.index[(values >= lower) & (values < upper)]
                    df.loc[idx_to_flag, var+'_eraqc'] = 25 # see era_qaqc_flag_meanings.csv 

            if flagged_bins and verbose:
                print('Flagging {0} bins in {1}'.format(len(flagged_bins), var))
                
    if plot:
        for var in vars_to_anom:
            flagged = df[var+'_eraqc'] == 25
            if not flagged.any(): # only plot a figure if flag is present
                continue

            # one figure per month that contains flagged obs; the plot expects
            # the standardized data
            # NOTE(review): `network` receives the station id here, as before —
            # confirm whether the network name should be passed instead
            for month in sorted(df.loc[flagged, 'month'].unique()):
                clim_outlier_plot(df_std, var, month, network=df['station'].unique()[0])
                
    return df

In [13]:
# run the climatological outlier check on the station dataframe loaded above;
# plot=True requests figures for any variable that ends up with a flag
test_df = qaqc_climatological_outlier(df, plot=True)
# bare expression: displays the returned dataframe as the notebook cell output
test_df

Searching for outliers in tas in month 1...
Searching for outliers in tas in month 2...
Searching for outliers in tas in month 3...
Searching for outliers in tas in month 4...
Searching for outliers in tas in month 5...
Searching for outliers in tas in month 6...
Searching for outliers in tas in month 7...
Searching for outliers in tas in month 8...
Searching for outliers in tas in month 9...
Searching for outliers in tas in month 10...
Searching for outliers in tas in month 11...
Searching for outliers in tas in month 12...
Searching for outliers in tdps in month 1...
Searching for outliers in tdps in month 2...
Searching for outliers in tdps in month 3...
Searching for outliers in tdps in month 4...
Searching for outliers in tdps in month 5...
Searching for outliers in tdps in month 6...
Searching for outliers in tdps in month 7...
Searching for outliers in tdps in month 8...
Searching for outliers in tdps in month 9...
Searching for outliers in tdps in month 10...
Searching for outl

Unnamed: 0,station,time,ps,tas,tdps,pr,sfcWind,sfcWind_dir,elevation,qaqc_process,...,pr_depth_qc,sfcWind_qc,sfcWind_method,sfcWind_dir_qc,lat,lon,month,year,tas_eraqc,tdps_eraqc
0,ASOSAWOS_72051724165,2013-06-27 12:55:00,78690.0,284.45,277.95,,5.7,350.0,2220.0,V020,...,,5,N,5,41.824,-110.556,6,2013,,
1,ASOSAWOS_72051724165,2013-06-27 13:15:00,,285.65,277.95,,5.7,350.0,2220.0,V020,...,,1,N,1,41.824,-110.557,6,2013,,
2,ASOSAWOS_72051724165,2013-06-27 13:35:00,,287.05,278.15,,4.6,350.0,2220.0,V020,...,,1,N,1,41.824,-110.557,6,2013,,
3,ASOSAWOS_72051724165,2013-06-27 13:55:00,78750.0,289.05,278.65,,4.6,350.0,2220.0,V020,...,,5,N,5,41.824,-110.556,6,2013,,
4,ASOSAWOS_72051724165,2013-06-27 14:15:00,,289.95,278.85,,4.6,360.0,2220.0,V020,...,,1,N,1,41.824,-110.557,6,2013,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178389,ASOSAWOS_72051724165,2022-08-11 20:15:00,78640.0,300.15,276.15,,7.2,260.0,2220.0,V020,...,,5,N,5,41.824,-110.556,8,2022,,
178390,ASOSAWOS_72051724165,2022-08-11 20:35:00,78580.0,301.15,279.15,,9.8,240.0,2220.0,V020,...,,5,N,5,41.824,-110.556,8,2022,,
178391,ASOSAWOS_72051724165,2022-08-11 20:55:00,78580.0,301.15,279.15,,5.7,290.0,2220.0,V020,...,,5,V,5,41.824,-110.556,8,2022,,
178392,ASOSAWOS_72051724165,2022-08-11 21:15:00,78530.0,301.15,279.15,,10.3,250.0,2220.0,V020,...,,5,N,5,41.824,-110.556,8,2022,,


In [14]:
# show only the rows whose dew point flag was set to 25 (climatological outlier)
test_df.loc[test_df['tdps_eraqc'] == 25]

Unnamed: 0,station,time,ps,tas,tdps,pr,sfcWind,sfcWind_dir,elevation,qaqc_process,...,pr_depth_qc,sfcWind_qc,sfcWind_method,sfcWind_dir_qc,lat,lon,month,year,tas_eraqc,tdps_eraqc
