Motivation: individual gross outliers from the general station distribution are a common error in obs data, caused by random recording, reporting, formatting, or instrumentation errors

Process:
1. uses individual observation deviations derived from monthly mean climatology calculated for each hour of the day
2. climatologies calculated using winsorised data to remove initial effect of outliers
    - Winsorising: all values beyond threshold value from mean are set to that threshold value
    - 5 and 95% for hadisd
    - number of data values in population remains the same, not trimmed
3. raw unwinsorised observations are anomalised using these climatologies
4. standardized by IQR for that month and hour
    - IQR cannot be less than 1.5degC
5. values are low-pass filtered to remove any climate change signal causing overzealous removal at ends of time series
6. gaussian is fitted to the histogram of anomalies for each month
7. threshold value is set where the fitted gaussian crosses the y=0.1 line, rounded outwards
8. distribution beyond threshold value is scanned for gap, equal to bin width or more
9. all values beyond gap are flagged
10. obs that fall between critical threshold value and gap or critical threshold and end of distribution are tentatively flagged
    - these may be later reinstated on comparison with good data from neighboring stations

Notes:
- when applied to SLP, frequently flags storm signals, which may be of high interest, so this test is not applied to pressure data
- hadisd only applies to temp and dewpoint temp

In [1]:
import pandas as pd
import numpy as np
import xarray as xr



In [2]:
# Open one station's netCDF file and flatten it into a plain (non-indexed) table.
ds = xr.open_dataset('/Users/victoriaford/Desktop/eaglerock/Historical Data Platform/Train_Files/ASOSAWOS_72051724165.nc')

df = ds.to_dataframe().reset_index()

# Append calendar helper columns derived from the observation timestamp.
df = df.assign(
    month=lambda frame: pd.to_datetime(frame['time']).dt.month,
    year=lambda frame: pd.to_datetime(frame['time']).dt.year,
)

  base = data.astype(np.int64)
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")


In [3]:
# display the loaded station dataframe
df

Unnamed: 0,station,time,ps,tas,tdps,pr,sfcWind,sfcWind_dir,elevation,qaqc_process,...,pr_qc,pr_duration,pr_depth_qc,sfcWind_qc,sfcWind_method,sfcWind_dir_qc,lat,lon,month,year
0,ASOSAWOS_72051724165,2013-06-27 12:55:00,78690.0,284.45,277.95,,5.7,350.0,2220.0,V020,...,,NaT,,5,N,5,41.824,-110.556,6,2013
1,ASOSAWOS_72051724165,2013-06-27 13:15:00,,285.65,277.95,,5.7,350.0,2220.0,V020,...,,NaT,,1,N,1,41.824,-110.557,6,2013
2,ASOSAWOS_72051724165,2013-06-27 13:35:00,,287.05,278.15,,4.6,350.0,2220.0,V020,...,,NaT,,1,N,1,41.824,-110.557,6,2013
3,ASOSAWOS_72051724165,2013-06-27 13:55:00,78750.0,289.05,278.65,,4.6,350.0,2220.0,V020,...,,NaT,,5,N,5,41.824,-110.556,6,2013
4,ASOSAWOS_72051724165,2013-06-27 14:15:00,,289.95,278.85,,4.6,360.0,2220.0,V020,...,,NaT,,1,N,1,41.824,-110.557,6,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178389,ASOSAWOS_72051724165,2022-08-11 20:15:00,78640.0,300.15,276.15,,7.2,260.0,2220.0,V020,...,,NaT,,5,N,5,41.824,-110.556,8,2022
178390,ASOSAWOS_72051724165,2022-08-11 20:35:00,78580.0,301.15,279.15,,9.8,240.0,2220.0,V020,...,,NaT,,5,N,5,41.824,-110.556,8,2022
178391,ASOSAWOS_72051724165,2022-08-11 20:55:00,78580.0,301.15,279.15,,5.7,290.0,2220.0,V020,...,,NaT,,5,V,5,41.824,-110.556,8,2022
178392,ASOSAWOS_72051724165,2022-08-11 21:15:00,78530.0,301.15,279.15,,10.3,250.0,2220.0,V020,...,,NaT,,5,N,5,41.824,-110.556,8,2022


In [None]:
# calculate monthly mean climatology for each hour of the day
# raw data is then anomalized by climatology for that month/hour
# standardized by iqr for month and hour

In [52]:
def clim_mon_mean_hourly(df, var, month, hour):
    '''Return the climatological mean of `var` for one calendar month and hour of day.

    All rows of `df` whose `time` falls in `month` (1-12) and `hour` (0-23),
    across every year present, are averaged together. The result is NaN when
    no row matches or every matching value is missing.
    '''

    in_month = df.time.dt.month == month
    in_hour = df.time.dt.hour == hour

    # pick out just the requested variable for the matching rows
    selected = df.loc[in_month & in_hour, var]

    return selected.mean(numeric_only=True)

def clim_anom_monhour(df):
    '''Anomalize data by monthly climatology for each hour.

    For every variable in ('tas', 'tdps', 'tdps_derived') that is present in
    `df`, subtract the mean of that variable over all rows sharing the same
    calendar month and hour of day (taken from the `time` column).

    Input:
    ------
        df [pd.DataFrame]: must contain a datetime `time` column

    Returns:
    --------
        df2 [pd.DataFrame]: copy of `df` with the checked variables replaced by
            their anomalies; `df` itself is not modified. Rows whose `time` is
            missing belong to no month/hour group and keep their original value.
    '''

    vars_to_check = ['tas', 'tdps', 'tdps_derived']
    vars_to_anom = [v for v in vars_to_check if v in df.columns]

    df2 = df.copy()
    if not vars_to_anom:
        return df2

    # Build the grouping keys once instead of re-deriving a full boolean mask
    # for each of the 12 x 24 month/hour combinations.
    has_time = df['time'].notna()
    months = df['time'].dt.month[has_time].rename('month')
    hours = df['time'].dt.hour[has_time].rename('hour')

    for var in vars_to_anom:
        obs = df.loc[has_time, var]

        # per-row climatology: mean of the row's own month/hour group (NaNs skipped)
        clim = obs.groupby([months, hours]).transform('mean')

        # calculate the monthly climatological anomaly by hour
        df2.loc[has_time, var] = obs - clim

    return df2

def iqr_range_monhour(df, month, hour, var):
    '''Calculates the monthly interquartile range per hour.

    Input:
    ------
        df [pd.DataFrame]: must contain a datetime `time` column and column `var`
        month [int]: calendar month to select (compared against `time.dt.month`)
        hour [int]: hour of day to select (compared against `time.dt.hour`)
        var [str]: name of the column to summarise

    Returns:
    --------
        Interquartile range (75th minus 25th percentile) of `var` over all rows
        in the requested month and hour; NaN when no row matches.
    '''

    in_group = (df.time.dt.month == month) & (df.time.dt.hour == hour)

    # Only the requested column is needed, so take both quartiles of that one
    # column in a single call rather than of every numeric column in the frame.
    q1, q3 = df.loc[in_group, var].quantile([0.25, 0.75])

    return q3 - q1

def standardize_iqr_monhour(df):
    '''Scale each month/hour climatological anomaly by that month/hour's IQR.

    Works on 'tas', 'tdps' and 'tdps_derived' (whichever are present in `df`).
    Anomalies come from `clim_anom_monhour` and the IQR from
    `iqr_range_monhour`; a copy of `df` holding the standardized values is
    returned and `df` is left as it was.

    NOTE(review): the HadISD-style procedure this follows states the IQR should
    not be less than 1.5 degC; no such floor is applied here -- TODO confirm
    whether it belongs in this step.
    '''

    candidates = ['tas', 'tdps', 'tdps_derived']
    present = [name for name in candidates if name in df.columns]

    anomalies = clim_anom_monhour(df)  # monthly climatological anomalies per hour
    standardized = df.copy()

    # month and hour of every observation, derived once for all groups
    obs_month = df.time.dt.month
    obs_hour = df.time.dt.hour

    for var in present:
        for month in range(1, 13):
            for hour in range(24):
                in_group = (obs_month == month) & (obs_hour == hour)

                spread = iqr_range_monhour(df, month, hour, var)

                standardized.loc[in_group, var] = anomalies.loc[in_group, var] / spread

    return standardized

In [54]:
# preview the IQR-standardized anomalies for the loaded station
standardize_iqr_monhour(df)

Unnamed: 0,station,time,ps,tas,tdps,pr,sfcWind,sfcWind_dir,elevation,qaqc_process,...,pr_qc,pr_duration,pr_depth_qc,sfcWind_qc,sfcWind_method,sfcWind_dir_qc,lat,lon,month,year
0,ASOSAWOS_72051724165,2013-06-27 12:55:00,78690.0,0.955325,0.746228,,5.7,350.0,2220.0,V020,...,,NaT,,5,N,5,41.824,-110.556,6,2013
1,ASOSAWOS_72051724165,2013-06-27 13:15:00,,0.650294,0.551395,,5.7,350.0,2220.0,V020,...,,NaT,,1,N,1,41.824,-110.557,6,2013
2,ASOSAWOS_72051724165,2013-06-27 13:35:00,,1.000294,0.601395,,4.6,350.0,2220.0,V020,...,,NaT,,1,N,1,41.824,-110.557,6,2013
3,ASOSAWOS_72051724165,2013-06-27 13:55:00,78750.0,1.500294,0.726395,,4.6,350.0,2220.0,V020,...,,NaT,,5,N,5,41.824,-110.556,6,2013
4,ASOSAWOS_72051724165,2013-06-27 14:15:00,,0.769324,0.562647,,4.6,360.0,2220.0,V020,...,,NaT,,1,N,1,41.824,-110.557,6,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178389,ASOSAWOS_72051724165,2022-08-11 20:15:00,78640.0,0.537597,0.468912,,7.2,260.0,2220.0,V020,...,,NaT,,5,N,5,41.824,-110.556,8,2022
178390,ASOSAWOS_72051724165,2022-08-11 20:35:00,78580.0,0.737597,0.843912,,9.8,240.0,2220.0,V020,...,,NaT,,5,N,5,41.824,-110.556,8,2022
178391,ASOSAWOS_72051724165,2022-08-11 20:55:00,78580.0,0.737597,0.843912,,5.7,290.0,2220.0,V020,...,,NaT,,5,V,5,41.824,-110.556,8,2022
178392,ASOSAWOS_72051724165,2022-08-11 21:15:00,78530.0,0.764889,0.846517,,10.3,250.0,2220.0,V020,...,,NaT,,5,N,5,41.824,-110.556,8,2022


In [40]:
# preview the month/hour climatological anomalies for the loaded station
clim_anom_monhour(df)

['tas', 'tdps']


Unnamed: 0,station,time,ps,tas,tdps,pr,sfcWind,sfcWind_dir,elevation,qaqc_process,...,pr_qc,pr_duration,pr_depth_qc,sfcWind_qc,sfcWind_method,sfcWind_dir_qc,lat,lon,month,year
0,ASOSAWOS_72051724165,2013-06-27 12:55:00,78690.0,3.821302,2.984911,,5.7,350.0,2220.0,V020,...,,NaT,,5,N,5,41.824,-110.556,6,2013
1,ASOSAWOS_72051724165,2013-06-27 13:15:00,,2.601175,2.205580,,5.7,350.0,2220.0,V020,...,,NaT,,1,N,1,41.824,-110.557,6,2013
2,ASOSAWOS_72051724165,2013-06-27 13:35:00,,4.001175,2.405580,,4.6,350.0,2220.0,V020,...,,NaT,,1,N,1,41.824,-110.557,6,2013
3,ASOSAWOS_72051724165,2013-06-27 13:55:00,78750.0,6.001175,2.905580,,4.6,350.0,2220.0,V020,...,,NaT,,5,N,5,41.824,-110.556,6,2013
4,ASOSAWOS_72051724165,2013-06-27 14:15:00,,3.846618,2.813235,,4.6,360.0,2220.0,V020,...,,NaT,,1,N,1,41.824,-110.557,6,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178389,ASOSAWOS_72051724165,2022-08-11 20:15:00,78640.0,2.687987,3.751299,,7.2,260.0,2220.0,V020,...,,NaT,,5,N,5,41.824,-110.556,8,2022
178390,ASOSAWOS_72051724165,2022-08-11 20:35:00,78580.0,3.687987,6.751299,,9.8,240.0,2220.0,V020,...,,NaT,,5,N,5,41.824,-110.556,8,2022
178391,ASOSAWOS_72051724165,2022-08-11 20:55:00,78580.0,3.687987,6.751299,,5.7,290.0,2220.0,V020,...,,NaT,,5,V,5,41.824,-110.556,8,2022
178392,ASOSAWOS_72051724165,2022-08-11 21:15:00,78530.0,3.824444,6.772137,,10.3,250.0,2220.0,V020,...,,NaT,,5,N,5,41.824,-110.556,8,2022


In [None]:
def qaqc_climatological_outlier(df, plot=True, verbose=True):
    '''
    Flags individual gross outliers from climatological distribution.
    Only applied to air temperature and dew point temperature

    NOTE(review): work in progress -- as written, the body only defines
    `vars_to_check` and the function implicitly returns None. The inputs,
    returns and flags below describe the intended behavior.
    
    Input:
    ------
        df [pd.DataFrame]: station dataset converted to dataframe through QAQC pipeline
        plot [bool]: if True, produces plots of any flagged data and saved to AWS
        verbose [bool]: not used yet; presumably toggles diagnostic output -- TODO confirm
            
    Returns:
    --------
        qaqc success:
            df [pd.DataFrame]: QAQC dataframe with flagged values (see below for flag meaning)
        qaqc failure:
            None
            
    Flag meaning:
    -------------
        25,qaqc_climatological_outlier,Value flagged as a climatological outlier
        26,qaqc_climatological_outlier,Value flagged as a tentative climatological outlier. Review in neighboring stations check.
    '''
    
    # candidate column names for this check (air temperature and dew point temperature)
    vars_to_check = ['tas', 'tdps', 'tdps_derived']
    

In [None]:
def winsorise_data(df, var, percent=0.05):
    '''
    Winsorising: all values beyond a threshold value from the mean are set to that threshold value
        - Removes initial effect of outliers
        - HadISD uses 5% and 95%
        - Result: Population size remains the same, instead of trimming those observations from data

    Input:
    ------
        df [pd.DataFrame]: data containing column `var`
        var [str]: name of the column to winsorise
        percent [float]: tail fraction clipped on EACH side, given as a fraction
            in [0, 0.5] (0.05 clips below the 5th and above the 95th percentile)

    Returns:
    --------
        pd.DataFrame: copy of `df` with `var` winsorised. `df` is not modified, so
            the raw observations stay available. Missing values remain missing.

    Raises:
    -------
        ValueError: if `percent` is outside [0, 0.5]
    '''

    if not 0 <= percent <= 0.5:
        raise ValueError(f"percent must be a fraction between 0 and 0.5, got {percent}")

    values = df[var]

    # `percent` is a fraction, so use nanquantile (expects 0-1). nanpercentile
    # expects 0-100 and would read 0.05 as the 0.05th percentile, collapsing
    # nearly every value onto the extreme low end of the distribution.
    p_low = np.nanquantile(values, percent)
    p_high = np.nanquantile(values, 1 - percent)

    # set observations beyond the thresholds to the threshold value
    winsorised = df.copy()
    winsorised[var] = values.clip(lower=p_low, upper=p_high)

    return winsorised

### Does not match the output from scipy.stats.mstats.winsorize

In [None]:
# try winsorising air temperature ('tas') with the default tail fraction
winsorise_data(df, 'tas')