Motivation: individual gross outliers from general station distribution are a common error in obs data by random recording, reporting, formatting, or instrumentation errors

Process:
1. uses individual observation deviations derived from monthly mean climatology calculated for each hour of the day
2. climatologies calculated using winsorised data to remove initial effect of outliers
    - Winsorising: all values beyond threhsold value from mean are set to that threshold value
    - 5 and 95% for hadisd
    - number of data values in population remains the same, not trimmed
3. raw unwinsorised observations are anomalised using these climatologies
4. standardized by IQR for that month and hour
    - IQR cannot be less than 1.5degC
5. values are low-pass filtered to remove any climate change signal causing overzealous removal at ends of time series
6. gaussian is fitted to the histogram of anomalies for each month
7. threshold value, rounded outwards where crosses y=0.1 line
8. distribution beyond threhsold value is scanned for gap, equal to bin width or more
9. all values beyond gap are flagged
10. obs that fall between critical threshold value and gap or critical threshold and end of distribution are tentatively flagged
    - these may be later reinstated on comparison with good data from neighboring stations

Notes:
- when applied to SLP, frequently flags storm signals, which may be of high interest, so this test is not applied to pressure data
- hadisd only applies to temp and dewpoint temp

In [1]:
import pandas as pd
import numpy as np
import xarray as xr

from scipy.stats.mstats import winsorize
import matplotlib.pyplot as plt
import scipy.stats as stats



In [45]:
ds = xr.open_dataset('/Users/victoriaford/Desktop/eaglerock/Historical Data Platform/Train_Files/ASOSAWOS_72051724165.nc')

df = ds.to_dataframe()
# df = df.reset_index()
# df['month'] = pd.to_datetime(df['time']).dt.month # sets month to new variable
# df['year'] = pd.to_datetime(df['time']).dt.year # sets year to new variable

  base = data.astype(np.int64)
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")


In [3]:
def clim_mon_mean_hourly(df, var, month, hour):
    '''Calculate the monthly mean climatology for each of the day'''
    
    df_m_h = df.loc[(df.time.dt.month == month) & (df.time.dt.hour == hour)]
    clim_value = df_m_h[var].mean(numeric_only = True)
    
    # special handling if value is nan? 
    
    return clim_value

def iqr_range_monhour(df, var, month, hour):
    '''Calculates the monthly interquartile range per hour'''
    
    q1 = df.loc[(df.time.dt.month == month) & (df.time.dt.hour == hour)].quantile(0.25, numeric_only=True)
    q3 = df.loc[(df.time.dt.month == month) & (df.time.dt.hour == hour)].quantile(0.75, numeric_only=True)
    
    iqr_df = q3 - q1
    iqr_df_val = iqr_df[var]
    
    # iqr cannot be less than 1.5°C in order to preserve low variance stations
    if iqr_df_val < 1.5:
        iqr_df_val = 1.5
    else:
        iqr_df_val = iqr_df_val
            
    return iqr_df_val


def clim_standardized_anom(df, vars_to_anom):
    '''
    First anomalizes data by monthly climatology for each hour, then
    standardizes by the monthly climatological anomaly IQR for each hour
    '''
    
    df2 = df.copy()
    
    for var in vars_to_anom:
        for m in range(1,13,1):
            for h in range(0,24,1):
                # each hour in each month
                anom_value = clim_mon_mean_hourly(df, var, month=m, hour=h)
                iqr_value = iqr_range_monhour(df, var, month=m, hour=h)
                
                # locate obs within specific month/hour
                df_m_h = df.loc[(df.time.dt.month == m) & (df.time.dt.hour == h)]
                
                # calculate the monthly climatological anomaly by hour and standardize by iqr
                df2.loc[(df.time.dt.month == m) & 
                        (df.time.dt.hour == h), 
                        var] = (df_m_h[var] - anom_value) / iqr_value
                
    return df2

def winsorize_temps(df, vars_to_anom, winz_limits):
    '''
    Replaces potential spurious outliers by limiting the extreme values
    using the winz_limits set (default is 5% and 95% percentiles)
    '''
    
    df2 = df.copy()
    
    for var in vars_to_anom:
        for m in range(1,13,1):
            for h in range(0,24,1):
                
                df_m_h = df.loc[(df.time.dt.month == m) & (df.time.dt.hour == h)]
                
                # winsorize only vars in vars_to_anom
                df_w = winsorize(df_m_h[var], limits=winz_limits, nan_policy='omit')
                
                df2.loc[(df.time.dt.month == m) & (df.time.dt.hour == h),
                       var] = df_w
                
    return df2

# get average anomaly per year
def median_yr_anom(df, var):
    '''Get median anomaly per year'''
    
    monthly_anoms = []
    
    # identify years in data
    years = df.time.dt.year.unique()
    
    for yr in years:
        df_yr = df.loc[df.time.dt.year == yr]

        ann_anom = df_yr[var].median()
        monthly_anoms.append(ann_anom)
        
    return monthly_anoms

def low_pass_filter_weights(median_anoms, month_low, month_high, filter_low, filter_high):
    '''Calculates weights for low pass filter'''
    
    filter_wgts = [1, 2, 3, 2, 1]
    
    if np.sum(filter_wgts[filter_low:filter_high] * 
              np.ceil(median_anoms[month_low:month_high] - 
                      np.floor(median_anoms[month_low:month_high]))) == 0:
        weight = 0
    
    else:
        weight = (
            np.sum(filter_wgts[filter_low:filter_high] * np.ceil(median_anoms[month_low:month_high])) / 
            np.sum(filter_wgts[filter_low:filter_high] * np.ceil(median_anoms[month_low:month_high] - 
                                                                 np.floor(median_anoms[month_low:month_high])))
        )
        
    return weight

def low_pass_filter(df, vars_to_anom):
    '''
    Low pass filtering on observations to remove any climate change signal 
    causing overzealous removal at ends of time series
    '''
    # identify years in data
    years = df.time.dt.year.unique()
    
    for var in vars_to_anom:
        
        median_anoms = median_yr_anom(df, var)
    
        for yr in range(len(years)):
            if yr == 0:
                month_low, month_high = 0, 3
                filter_low, filter_high = 2, 5
                
            elif yr == 1:
                month_low, month_high = 0, 4
                filter_low, filter_high = 1, 5
                
            elif yr == len(years)-2:
                month_low, month_high = -4, -1
                filter_low, filter_high = 0, 3

            elif yr == len(years)-1:
                month_low, month_high = -3, -1
                filter_low, filter_high = 0, 2

            else:
                month_low, month_high = yr-2, yr+3
                filter_low, filter_high = 0, 5
                            
            if np.sum(np.abs(median_anoms[month_low:month_high])) != 0:
                weights = low_pass_filter_weights(median_anoms, month_low, month_high, filter_low, filter_high)
                      
            # want to return specific year of data at a specific variable, the variable minus weight value
            df.loc[(df.time.dt.year == years[yr]), var] = df.loc[df.time.dt.year == years[yr]][var] - weights
            
    return df


## distribution gap plotting helpers
def create_bins(data, bin_size=0.25):
    '''Create bins from data covering entire data range'''

    # set up bins
    b_min = np.floor(np.nanmin(data))
    b_max = np.ceil(np.nanmax(data))
    bins = np.arange(b_min - bin_size, b_max + (3. * bin_size), bin_size)

    return bins

def pdf_bounds(df, mu, sigma, bins):
    '''Calculate pdf distribution, return pdf and threshold bounds'''

    y = stats.norm.pdf(bins, mu, sigma)
    
    # add vertical lines to indicate thresholds where pdf y=0.1
    pdf_bounds = np.argwhere(y > 0.1)

    # find first index
    left_bnd = round(bins[pdf_bounds[0][0] -1])
    right_bnd = round(bins[pdf_bounds[-1][0] + 1])
    thresholds = (left_bnd - 1, right_bnd + 1)
    
    return (y, left_bnd - 1, right_bnd + 1)

In [69]:
def qaqc_climatological_outlier(df, winsorize=True, winz_limits=[0.05,0.05], plot=True, verbose=True):
    '''
    Flags individual gross outliers from climatological distribution.
    Only applied to air temperature and dew point temperature
    
    Input:
    ------
        df [pd.DataFrame]: station dataset converted to dataframe through QAQC pipeline
        plots [bool]: if True, produces plots of any flagged data and saved to AWS
        winsorize [bool]: if True, raw observations are winsorized to remove spurious outliers first
        winz_limits [list]: if winsorize is True, values represent the low and high percentiles to standardize to
            
    Returns:
    --------
        qaqc success:
            df [pd.DataFrame]: QAQC dataframe with flagged values (see below for flag meaning)
        qaqc failure:
            None
            
    Flag meaning:
    -------------
        25,qaqc_climatological_outlier,Value flagged as a climatological outlier
    '''
    
    
    #### ONLY IN NOTEBOOK DEVELOPMENT, REMOVED IN CODE FOR PIPELINE #####
    df = df.reset_index()
    df['month'] = pd.to_datetime(df['time']).dt.month # sets month to new variable
    df['year'] = pd.to_datetime(df['time']).dt.year # sets year to new variable
    
    df2 = df.copy()
    
    vars_to_check = ['tas', 'tdps', 'tdps_derived']
    vars_to_anom = [v for v in vars_to_check if v in df.columns]
    
    # TO DO: filter to only use non-flagged data

    # winsorize data by percentiles
    if winsorize == True:
        df = winsorize_temps(df, vars_to_anom, winz_limits)
    else:
        df = df
        
    # standardize data by monthly climatological anomalies by hour
    df = clim_standardized_anom(df, vars_to_anom)

    # apply low pass filter
    df = low_pass_filter(df, vars_to_anom)
        
    # gaussian is fitted to the histogram of anomalies for each month
        # threshold value, rounded outwards where crosses y=0.1 line
        # distribution beyond threhsold value is scanned for gap, equal to bin width or more
        # all values beyond gap are flagged
            # HadISD: obs that fall between critical threshold value and gap or 
            # critical threshold and end of distribution are tentatively flagged
            # these may be later reinstated on comparison with good data from neighboring stations
            # into v2 of data product
            
    for var in vars_to_anom:
        for month in range(1,13):
            print(var, month)
            
            df = df.loc[df.time.dt.month == month]
            
            # determine number of bins
            bins = create_bins(df[var])

            # pdf
            mu = np.nanmean(df[var])
            sigma = np.nanstd(df[var])

            y, left_bnd, right_bnd = pdf_bounds(df[var], mu, sigma, bins)
            
            # df index of where each obs falls in which hist bin
            d = np.digitize(df[var], bins, right=True)
            d = d-1 # start index at 0

            # identify gaps as below y=0.1 from histogram, not pdf
            y_hist, bins = np.histogram(df[var], bins=bins, density=True)
            print(len(bins))
            # bins are flagged for values beyond left_bnd, right_bnd
            bins_beyond_left_bnd = np.argwhere(bins <= left_bnd)
            if len(bins_beyond_left_bnd) != 0:
                for data in bins_beyond_left_bnd:
                    if y_hist[data] > 0.1: # bins with data > 0.1 beyond left_bnd
                        # identify values to flag
                        print(data)
                        idx_to_flag = np.argwhere(d == data)
                        for idx in idx_to_flag:
                            df.iloc[idx, var+'_eraqc'] = 25 # see era_qaqc_flag_meanings.csv

            bins_beyond_right_bnd = np.argwhere(bins >= right_bnd)
            print(bins_beyond_right_bnd)
            if len(bins_beyond_right_bnd) != 0:
                for data in bins_beyond_right_bnd:
                    if y_hist[data] > 0.1: # bins with data > 0.1 beyond right_bnd
                        # identify values to flag
                        print(data)
                        idx_to_flag = np.argwhere(d == data)
                        for idx in idx_to_flag:
                            df.iloc[idx, var+'_eraqc'] = 25 # see era_qaqc_flag_meanings.csv
                
    if plot == True:
        for var in vars_to_anom:
            if 25 in df2[var+'_eraqc'].values: # only plot a figure if flag is present
                clim_outlier_plot(df2, var, network=df['station'].unique())
                
    return df2

In [70]:
test_df = qaqc_climatological_outlier(df, plot=False)
test_df

tas 1
20
[[17]
 [18]
 [19]]


IndexError: index 19 is out of bounds for axis 0 with size 19

In [71]:
df = df.reset_index()
df['month'] = pd.to_datetime(df['time']).dt.month # sets month to new variable
df['year'] = pd.to_datetime(df['time']).dt.year # sets year to new variable

vars_to_check = ['tas', 'tdps', 'tdps_derived']
vars_to_anom = [v for v in vars_to_check if v in df.columns]

# TO DO: filter to only use non-flagged data

# winsorize data by percentiles
df = winsorize_temps(df, vars_to_anom, winz_limits=[0.05, 0.05])

# standardize data by monthly climatological anomalies by hour
df = clim_standardized_anom(df, vars_to_anom)

# apply low pass filter
df = low_pass_filter(df, vars_to_anom)


month=1
var='tas'

dfm = df.loc[df.time.dt.month==month]
# determine number of bins
bins = create_bins(dfm[var])

# pdf
mu = np.nanmean(dfm[var])
sigma = np.nanstd(dfm[var])

y, left_bnd, right_bnd = pdf_bounds(dfm[var], mu, sigma, bins)

In [72]:
left_bnd, right_bnd

(-3, 1)

In [73]:
y_hist, bins = np.histogram(dfm[var], bins=bins, density=True)
y_hist, bins

(array([0.        , 0.        , 0.        , 0.        , 0.01553716,
        0.09485843, 0.34099969, 0.4644792 , 0.58427885, 0.73556169,
        0.73474394, 0.73924154, 0.24205254, 0.03884289, 0.00940407,
        0.        , 0.        , 0.        , 0.        ]),
 array([-3.25, -3.  , -2.75, -2.5 , -2.25, -2.  , -1.75, -1.5 , -1.25,
        -1.  , -0.75, -0.5 , -0.25,  0.  ,  0.25,  0.5 ,  0.75,  1.  ,
         1.25,  1.5 ]))

In [75]:
d = np.digitize(dfm[var], bins)
d

array([12, 12, 12, ...,  9,  8,  8])

In [76]:
np.min(d), np.max(d)

(5, 20)

In [None]:
print(len(bins))
# bins are flagged for values beyond left_bnd, right_bnd
bins_beyond_left_bnd = np.argwhere(bins <= left_bnd)
if len(bins_beyond_left_bnd) != 0:
    for data in bins_beyond_left_bnd:
        if y_hist[data] > 0.1: # bins with data > 0.1 beyond left_bnd

In [77]:
idx = np.argwhere(d == 20)

for i in idx:
    print(dfm.iloc[i]['tas'])

9003   NaN
Name: tas, dtype: float64
9101   NaN
Name: tas, dtype: float64
25330   NaN
Name: tas, dtype: float64
25376   NaN
Name: tas, dtype: float64
25424   NaN
Name: tas, dtype: float64
25425   NaN
Name: tas, dtype: float64
25449   NaN
Name: tas, dtype: float64
25682   NaN
Name: tas, dtype: float64
25683   NaN
Name: tas, dtype: float64
25684   NaN
Name: tas, dtype: float64
25685   NaN
Name: tas, dtype: float64
25686   NaN
Name: tas, dtype: float64
25687   NaN
Name: tas, dtype: float64
25688   NaN
Name: tas, dtype: float64
25689   NaN
Name: tas, dtype: float64
25690   NaN
Name: tas, dtype: float64
25691   NaN
Name: tas, dtype: float64
25692   NaN
Name: tas, dtype: float64
25693   NaN
Name: tas, dtype: float64
25694   NaN
Name: tas, dtype: float64
25695   NaN
Name: tas, dtype: float64
25696   NaN
Name: tas, dtype: float64
25697   NaN
Name: tas, dtype: float64
25698   NaN
Name: tas, dtype: float64
25699   NaN
Name: tas, dtype: float64
25700   NaN
Name: tas, dtype: float64
25701   NaN
Na

In [61]:
dfm.iloc[225]

station            ASOSAWOS_72051724165
time                2015-01-01 19:50:00
ps                              77550.0
tas                           -2.765625
tdps                          -2.685268
pr                                  NaN
sfcWind                             0.0
sfcWind_dir                         NaN
elevation                        2220.0
qaqc_process                       V020
ps_qc                                 5
ps_altimeter                   101590.0
ps_altimeter_qc                       5
psl_qc                                9
tas_qc                                7
tdps_qc                               7
pr_qc                                  
pr_duration                         NaT
pr_depth_qc                         NaN
sfcWind_qc                            5
sfcWind_method                        C
sfcWind_dir_qc                        9
lat                              41.824
lon                            -110.556
month                                 1


In [50]:
np.histogram(dfm['tas'], bins=bins, density=True)

(array([0.        , 0.00245324, 0.00858632, 0.04252274, 0.11530205,
        0.11775529, 0.15169171, 0.24900337, 0.30338342, 0.49432689,
        0.53480527, 0.69344782, 0.67954615, 0.42113871, 0.14310539,
        0.03598078, 0.00531534, 0.00163549, 0.        , 0.        ,
        0.        , 0.        , 0.        ]),
 array([-3.25, -3.  , -2.75, -2.5 , -2.25, -2.  , -1.75, -1.5 , -1.25,
        -1.  , -0.75, -0.5 , -0.25,  0.  ,  0.25,  0.5 ,  0.75,  1.  ,
         1.25,  1.5 ,  1.75,  2.  ,  2.25,  2.5 ]))

In [51]:
bins

array([-3.25, -3.  , -2.75, -2.5 , -2.25, -2.  , -1.75, -1.5 , -1.25,
       -1.  , -0.75, -0.5 , -0.25,  0.  ,  0.25,  0.5 ,  0.75,  1.  ,
        1.25,  1.5 ,  1.75,  2.  ,  2.25,  2.5 ])

In [52]:
bins[23]

2.5