In [1]:
import pandas as pd
import numpy as np
import xarray as xr

import scipy.stats as stats
import matplotlib.pyplot as plt

import boto3
from io import BytesIO



In [2]:
def _plot_format_helper(var):
    """Helper function for plots"""

    pr_vars = ['pr', 'pr_5min', 'pr_1h', 'pr_24h', 'pr_localmid']
    ps_vars = ['ps', 'psl', 'psl_altimeter']
    
    if var == 'tas':
        ylab = 'Air Temperature at 2m'
        unit = 'K'
        
    elif var == 'tdps' or var == 'tdps_derived':
        ylab = 'Dewpoint Temperature'
        unit = 'K'
        
    elif var == 'sfcWind':
        ylab = 'Surface Wind Speed'
        unit = '${m s^-1}$'
        
    elif var == 'sfcWind_dir':
        ylab = 'Surface Wind Direction'
        unit = 'degrees'
        
    elif var == 'rsds':
        ylab = 'Surface Radiation'
        unit = '${W m^-2}$'
        
    elif var in pr_vars:
        ylab = 'Precipitation' # should be which precip var it is
        unit = 'mm'

    elif var in ps_vars:
        ylab = 'Pressure' # should eventually be what pressure var it is
        unit = 'Pa'
        
    return (ylab, unit)


def monthly_med(df):
    """Calculates the monthly median.

    Resamples `df` on its 'time' column into calendar-month bins (labelled by
    month end) and returns the median of every numeric column per bin.

    NOTE: `monthly_med` is defined twice in this cell; the later definition
    is the one in effect after the cell runs.
    """
    # The MonthEnd offset object is what the 'M' string alias resolves to.
    # Passing the object avoids the alias itself, which is deprecated in
    # favour of 'ME' from pandas 2.2 on.
    return df.resample(pd.offsets.MonthEnd(), on='time').median(numeric_only=True)

def create_bins(data, bin_size=0.25):
    '''Build evenly spaced bin edges that span the full (NaN-ignoring) data range.

    The range is widened to whole numbers (floor of the minimum, ceil of the
    maximum) and padded by one bin width below and three bin widths above
    before being passed to np.arange (whose stop value is exclusive).
    '''

    lowest = np.floor(np.nanmin(data))
    highest = np.ceil(np.nanmax(data))

    start = lowest - bin_size
    stop = highest + (3. * bin_size)

    return np.arange(start, stop, bin_size)

def iqr_standardize(df, var):
    """Centre df[var] on its median and scale it by its interquartile range.

    Returns a numpy array of (value - median) / (q75 - q25).
    """
    series = df[var]
    lower_q, upper_q = series.quantile(0.25), series.quantile(0.75)
    centred = series.values - series.median()

    return centred / (upper_q - lower_q)

def pdf_bounds(df, mu, sigma, bins):
    '''Calculate pdf distribution, return pdf and threshold bounds

    Parameters
    ----------
    df : unused; kept so existing callers keep working
    mu, sigma : mean and standard deviation of the normal distribution
    bins : array of bin edges at which the pdf is evaluated

    Returns
    -------
    (y, left, right)
        y is the normal pdf evaluated at `bins`. left/right are the bin values
        one step outside the region where y > 0.1, rounded to an integer and
        pushed outwards by one more.
    '''

    bins = np.asarray(bins)
    y = stats.norm.pdf(bins, mu, sigma)

    # positions where the fitted pdf is above y=0.1
    above = np.flatnonzero(y > 0.1)

    if above.size == 0:
        # pdf never exceeds 0.1 (e.g. very wide or degenerate sigma):
        # fall back to the outermost bins instead of raising IndexError
        left_idx, right_idx = 0, len(bins) - 1
    else:
        # step one bin outwards, clamped to the bin range; an unclamped
        # "first - 1" of -1 would silently wrap around to the last bin
        left_idx = max(above[0] - 1, 0)
        right_idx = min(above[-1] + 1, len(bins) - 1)

    left_bnd = round(float(bins[left_idx]))
    right_bnd = round(float(bins[right_idx]))

    return (y, left_bnd - 1, right_bnd + 1)

def monthly_med(df):
    """Calculates the monthly median.

    Resamples `df` on its 'time' column into calendar-month bins (labelled by
    month end) and returns the median of every numeric column per bin.

    NOTE: `monthly_med` is defined twice in this cell; this later definition
    is the one in effect after the cell runs.
    """
    # The MonthEnd offset object is what the 'M' string alias resolves to.
    # Passing the object avoids the alias itself, which is deprecated in
    # favour of 'ME' from pandas 2.2 on.
    return df.resample(pd.offsets.MonthEnd(), on='time').median(numeric_only=True)

def iqr_range(df, month, var):
    """Calculates the monthly interquartile range

    Groups `df` by its 'month' column, takes q75 - q25 of every numeric column
    and returns the value(s) for `month` / `var` as a numpy array.

    For temperature, dewpoint and pressure variables an IQR below 4 is
    inflated to 4 so that months with very little spread do not produce
    overly tight thresholds.

    NOTE: `iqr_range` is defined again further down this cell; that later
    definition (without the inflation) is the one in effect after the cell runs.
    """
    q1 = df.groupby('month').quantile(0.25, numeric_only=True)
    q3 = df.groupby('month').quantile(0.75, numeric_only=True)
    iqr_df = q3 - q1

    iqr_vals = iqr_df.loc[iqr_df.index == month, var].to_numpy(dtype=float)

    # inflated to 4°C or 4 hPa for months with very small IQR
    # NOTE(review): pressure appears to be handled in Pa elsewhere in this
    # notebook, where 4 hPa would be 400 -- confirm the intended units
    var_check = ['tas', 'tdps', 'tdps_derived', 'ps', 'psl', 'psl_altimeter']
    if var in var_check:
        # np.where builds a new array; assigning to `Series.values` is not
        # possible. NaN compares False and is therefore left untouched.
        iqr_vals = np.where(iqr_vals < 4, 4.0, iqr_vals)

    return iqr_vals

def standardized_anom(df, month, var):
    """
    Monthly median anomalies for one calendar month, scaled by that month's IQR.

    The anomaly is the per-(year, month) median of `var` minus the value that
    `clim_med` holds for `month`; the result is divided by iqr_range().

    Returns:
        arr_std_anom: array of monthly standardized anomalies for var
    """

    monthly = monthly_med(df)
    # NOTE(review): clim_med is not defined in the visible part of this
    # notebook; from its use below it is expected to return a frame indexed
    # by month -- confirm it exists before calling this function
    climatology = clim_med(df)

    month_medians = monthly.loc[monthly['month'] == month][var].values
    clim_value = climatology.loc[climatology.index == month][var].values
    anomalies = month_medians - clim_value

    return anomalies / iqr_range(df, month, var)
    
def standardized_median_bounds(df, month, var, iqr_thresh=5):
    """Climatological median of `var` for `month` and the +/- iqr_thresh * IQR band.

    Parameters
    ----------
    df : DataFrame with a 'month' column and the column `var`
    month : calendar month to select
    var : variable/column name
    iqr_thresh : multiple of the monthly IQR used as the half-width of the band

    Returns
    -------
    (median, lower bound, upper bound)
    """
    std_med = df.loc[df['month'] == month][var].median() # climatological median for that month

    # iqr_range runs two groupby-quantiles over the whole frame, so compute it once
    half_width = iqr_thresh * iqr_range(df, month, var)

    lower_bnd = std_med - half_width
    upper_bnd = std_med + half_width

    return (std_med, lower_bnd[0], upper_bnd[0])

def qaqc_dist_whole_stn_bypass_check(df, vars_to_check, min_num_months=5):
    """
    Checks the number of valid observation months in order to proceed through
    monthly distribution checks. Identifies whether a station record has too
    few months and produces a fail pass flag.

    Parameters
    ----------
    df : DataFrame holding 'time' and 'station' (as columns after reset_index)
    vars_to_check : variable names that get a `<var>_eraqc` flag column
    min_num_months : minimum number of distinct years every calendar month
        (1-12) must have observations in; fewer in any month fails the station

    Returns
    -------
    (df, pass_flag)
        df is a copy with the index reset and 'month', 'year' and
        `<var>_eraqc` columns added; pass_flag is 'pass' or 'fail'.
        On 'fail' every `<var>_eraqc` column is set to 18.
    """

    # set up df
    df = df.reset_index()
    obs_time = pd.to_datetime(df['time'])
    df['month'] = obs_time.dt.month # sets month to new variable
    df['year'] = obs_time.dt.year # sets year to new variable

    # number of distinct observation years per calendar month; months with no
    # observations at all count as zero years
    years_per_month = (df.groupby('month')['year'].nunique()
                         .reindex(range(1, 13), fill_value=0))
    too_short = bool((years_per_month < min_num_months).any())

    # "pass_flag" determines if station proceeds through distribution function
    pass_flag = 'fail' if too_short else 'pass'

    for var in vars_to_check:
        flag_col = var + '_eraqc'
        if too_short:
            df[flag_col] = 18 # see era_qaqc_flag_meanings.csv
        elif flag_col not in df.columns:
            # only create the column when missing, so flags set by an earlier
            # check survive a repeated call of this function
            df[flag_col] = np.nan # default value of nan

    if too_short:
        print('{} has too short of an observation record to proceed through the monthly distribution qa/qc checks -- bypassing station'.format(
              df['station'].unique()[0]))

    return (df, pass_flag)


def qaqc_dist_var_bypass_check(df, vars_to_check, min_num_months=5):
    """
    Checks the number of valid observation months per variable
    to proceed through monthly distribution checks.
    Primarily assesses whether if null values persist for a month

    A variable's whole `<var>_eraqc` column is set to 19 when, for any
    calendar month, either every observation of that month is null or more
    than `min_num_months` of its monthly medians are null.

    `df` needs a 'month' column and is modified in place; it is also returned.
    """

    # monthly medians depend only on the observations (not on var, month or
    # the flag columns), so compute them once instead of twice per iteration
    df_monthly = monthly_med(df)

    for var in vars_to_check:
        for month in range(1, 13):
            # if all values are null for that month across years
            all_null = df.loc[df['month'] == month, var].isnull().all()

            # if not all months have nans, need to assess how many years do
            num_null_medians = df_monthly.loc[df_monthly['month'] == month, var].isna().sum()

            if all_null or num_null_medians > min_num_months:
                df[var+'_eraqc'] = 19 # see era_qaqc_flag_meanings.csv
                break # whole variable is flagged; remaining months cannot change that

    return df

def iqr_range(df, month, var):
    """Interquartile range (q75 - q25) of `var` for one calendar month.

    Quantiles are taken per 'month' group over all numeric columns; the
    entry for `month` and `var` is returned as a numpy array.
    """
    by_month = df.groupby('month')
    spread = (by_month.quantile(0.75, numeric_only=True)
              - by_month.quantile(0.25, numeric_only=True))

    # TODO: come back to this -- inflating very small IQRs to 4 (°C / hPa) for
    # 'tas', 'tdps', 'tdps_derived', 'ps', 'psl', 'psl_altimeter' was breaking
    # and is currently disabled

    month_rows = spread.loc[spread.index == month]
    return month_rows[var].values


def median_clim(df, month, var):
    '''Calculate climatological median for a specific month and variable

    When `df` has a 'month' column only the rows of `month` are used, so the
    result is the same whether the caller passes the full record or a frame
    already subset to that month. Without a 'month' column the median of the
    whole column is returned.
    '''

    if 'month' in df.columns:
        df = df.loc[df['month'] == month]

    # plain Series.median(): the numeric_only keyword is not accepted for a
    # Series by every pandas version and adds nothing for a numeric column
    clim = df[var].median()

    return clim

In [19]:
def qaqc_dist_gap_part1(df, iqr_thresh=5, plot=True):
    """
    Part 1 / monthly check
        - compare anomalies of monthly median values
        - standardize against interquartile range
        - compare stepwise from the middle of the distribution outwards
        - asymmetries are identified and flagged if severe
    Goal: identifies suspect months and flags all obs within month

    NOTE: Code is preliminary at present, and does not necessarily reflect the final IQR threshold

    Parameters
    ----------
    df : DataFrame with 'time', 'station', 'month', 'year' and a
        `<var>_eraqc` column for every checked variable
    iqr_thresh : a monthly median further than iqr_thresh * IQR from the
        climatological median of its calendar month is flagged
    plot : when truthy, one figure per month and variable is produced

    Returns
    -------
    df with flag 19 (variable bypassed) and flag 20 (suspect month) set.
    """

    # run through every var, excluding qaqc/duration/method vars
    vars_to_remove = ['index','station','qc','duration','method','lat','lon','elevation','time','month','year','sfcWind_dir'] # list of var substrings to exclude if present in var
    vars_to_check = [var for var in df.columns if not any(True for item in vars_to_remove if item in var)] # remove all non-primary variables

    # per variable bypass check -- flag here is 19
    # its outcome does not depend on the var/month being processed, so run it once
    df = qaqc_dist_var_bypass_check(df, vars_to_check)

    # median per (year, month); depends only on the observations, not on the flags
    df_month = monthly_med(df)

    for var in vars_to_check:
        flag_col = var + '_eraqc'

        # compare the flag values explicitly: `19 in series` would test the
        # index labels rather than the values
        if (df[flag_col] == 19).any():
            continue # skip variable

        # station has above min_num_months number of valid observations, proceed with dist gap check
        for month in range(1,13):
            # calculate monthly climatological median, and bounds
            mid, low, high = standardized_median_bounds(df, month, var, iqr_thresh=iqr_thresh)

            # monthly medians of this calendar month, one row per year
            month_meds = df_month.loc[df_month['month'] == month]
            outside = (month_meds[var] < low) | (month_meds[var] > high)

            # take the year from the same row as the offending median, so two
            # years sharing the same median value are both flagged
            for year_to_flag in month_meds.loc[outside, 'year'].values:
                print('Median {} value for {}-{} is beyond the {}*IQR limits -- flagging month'.format(
                    var,
                    month,
                    int(year_to_flag),
                    iqr_thresh)
                )

                # flag all obs in that month
                df.loc[(df['month']==month) &
                       (df['year']==year_to_flag), flag_col] = 20 # see era_qaqc_flag_meanings.csv

    # plot once, after every variable has been checked
    if plot:
        for month in range(1,13):
            for var in vars_to_check:
                if 19 not in df[var+'_eraqc'].values: # don't plot a figure if it's all nans/not enough months
                    dist_gap_part1_plot(df, month, var, flagval=20, iqr_thresh=iqr_thresh,
                                        network=df['station'].unique()[0].split('_')[0])

    return df

def dist_gap_part1_plot(df, month, var, flagval, iqr_thresh, network):
    """Scatter plot of one calendar month of `var`, uploaded to S3 as a PNG.

    Shows all observations of `month`, the observations whose `<var>_eraqc`
    equals `flagval` in red, the climatological monthly median and the
    +/- iqr_thresh * IQR band. The figure is written to the
    'wecc-historical-wx' bucket under 3_qaqc_wx/<network>/qaqc_figs/.
    """

    # grab data by months
    df = df.loc[df['month'] == month]

    # grab flagged data
    flag_vals = df.loc[df[var + '_eraqc'] == flagval]

    # plot valid data
    ax = df.plot.scatter(x='time', y=var, label='Pass')
    fig = ax.get_figure()

    try:
        # plot flagged data
        flag_vals.plot.scatter(ax=ax, x='time', y=var, color='r', label='Flagged')
        # should be consistent with other plots - I like Hector's open circles around flagged values

        # plot climatological median and threshold * IQR range
        # use the caller's threshold so the shaded band matches its legend label
        mid, low_bnd, high_bnd = standardized_median_bounds(df, month, var, iqr_thresh=iqr_thresh)

        ax.axhline(y=mid, color='k', lw=0.5, label='Climatological monthly median')
        ax.fill_between(x=df['time'],
                        y1=low_bnd,
                        y2=high_bnd,
                        alpha=0.25, color='0.75',
                        label='{} * IQR range'.format(iqr_thresh))

        # plot aesthetics
        ax.legend(loc='best')
        ylab = _plot_format_helper(var)
        ax.set_ylabel('{} [{}]'.format(ylab[0], ylab[1]))
        ax.set_xlabel('')
        ax.set_title('Distribution gap check pt 1: {0} / month: {1}'.format(
            df['station'].unique()[0],
            month),
                  fontsize=10)

        # save to AWS
        bucket_name = 'wecc-historical-wx'
        directory = '3_qaqc_wx'
        img_data = BytesIO()
        fig.savefig(img_data, format='png')
        img_data.seek(0)

        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        figname = 'qaqc_dist_gap_check_part1_{0}_{1}_{2}'.format(df['station'].unique()[0], var, month)
        bucket.put_object(Body=img_data, ContentType='image/png',
                     Key='{0}/{1}/qaqc_figs/{2}.png'.format(
                     directory, network, figname))
    finally:
        # close figures to save memory, also when plotting or the upload fails
        plt.close(fig)
    
def qaqc_dist_gap_part2(df, plot=True):
    """
    Part 2 / monthly check
        - compare all obs in a single month, all years
        - obs are standardized by the monthly median and IQR
        - histogram created from all obs and gaussian distribution fitted
        - threshold values determined using positions where fitted freq falls below y=0.1
        - rounds outwards to next integer plus one
        - histogram bins lying entirely beyond a threshold that still hold
          data (density > 0.1) mark the obs beyond that threshold as suspect
    Goal: identifies individual suspect observations (flag 21)

    Parameters
    ----------
    df : station DataFrame with 'time' and 'station' available after reset_index
    plot : when truthy, one histogram figure per month and variable is produced

    Returns
    -------
    The full station DataFrame (all months) with flags 18 / 19 / 21 set.
    """
    # run through every var, excluding qaqc/duration/method vars
    vars_to_remove = ['index','station','qc','duration','method','lat','lon','elevation','time','month','year','sfcWind_dir'] # list of var substrings to exclude if present in var
    vars_to_check = [var for var in df.columns if not any(True for item in vars_to_remove if item in var)] # remove all non-primary variables

    # whole station bypass check first
    df, pass_flag = qaqc_dist_whole_stn_bypass_check(df, vars_to_check)

    if pass_flag != 'fail':

        # per variable bypass check -- flag here is 19
        # its outcome does not depend on the var/month being processed, so run it once
        df = qaqc_dist_var_bypass_check(df, vars_to_check)

        for var in vars_to_check:
            flag_col = var + '_eraqc'

            # compare the flag values explicitly: `19 in series` would test the
            # index labels rather than the values
            if (df[flag_col] == 19).any():
                continue # skip variable

            for month in range(1,13):

                # subset by month -- into a separate frame, `df` must keep
                # every month for the following iterations and the return value
                in_month = df['month'] == month
                df_m = df.loc[in_month]

                # identify climatology and iqr baselines in order to flag
                iqr_baseline = iqr_range(df_m, month=month, var=var)
                clim = median_clim(df_m, month=month, var=var)

                # without a positive, finite IQR (e.g. a month of constant
                # values) the obs cannot be standardized
                if len(iqr_baseline) == 0 or not np.isfinite(iqr_baseline[0]) or iqr_baseline[0] == 0:
                    continue

                # standardize against IQR range, dropping missing obs
                df_month_iqr = iqr_standardize(df_m, var)
                df_month_iqr = df_month_iqr[np.isfinite(df_month_iqr)]
                if df_month_iqr.size == 0:
                    continue

                # determine number of bins
                bins = create_bins(df_month_iqr)

                # pdf
                mu = np.nanmean(df_month_iqr)
                sigma = np.nanstd(df_month_iqr)

                y, left_bnd, right_bnd = pdf_bounds(df_month_iqr, mu, sigma, bins)

                # identify gaps as below y=0.1 from histogram, not pdf
                y_hist, bins = np.histogram(df_month_iqr, bins=bins, density=True)

                # y_hist has one entry per bin while `bins` holds the
                # len(y_hist) + 1 edges: select bins through their edges
                lower_edges = bins[:-1]
                upper_edges = bins[1:]

                # gaps are only flagged for values beyond left_bnd, right_bnd, as long as gap is 2*bin_width (2*0.25)
                # considering that the # of bins for threshold is (4,7) from y=0.1
                # safe to assume that gap is present if values >0.1 outside of left_bnd, right_bnd

                # bins entirely beyond left_bnd holding data > 0.1
                if np.any(y_hist[upper_edges <= left_bnd] > 0.1):
                    # convert the standardized bound back to data units
                    lower_limit = clim + (left_bnd * iqr_baseline[0]) # left_bnd is negative
                    df.loc[in_month & (df[var] <= lower_limit), flag_col] = 21 # see era_qaqc_flag_meanings.csv

                # bins entirely beyond right_bnd holding data > 0.1
                if np.any(y_hist[lower_edges >= right_bnd] > 0.1):
                    upper_limit = clim + (right_bnd * iqr_baseline[0]) # upper limit threshold
                    df.loc[in_month & (df[var] >= upper_limit), flag_col] = 21 # see era_qaqc_flag_meanings.csv

        ## Question: Do we need "all", "flagged_only", "none" options instead?
        # only stations that passed the whole-station check are plotted
        if plot:
            for month in range(1,13):
                for var in vars_to_check:
                    if 19 not in df[var+'_eraqc'].values: # don't plot a figure if it's all nans/not enough months
                        dist_gap_part2_plot(df, month, var,
                                            network=df['station'].unique()[0].split('_')[0])

    return df
    
def dist_gap_part2_plot(df, month, var, network):
    """Histogram of the IQR-standardized obs of one month, uploaded to S3 as a PNG.

    Draws the density histogram of `var` for `month`, the fitted normal pdf,
    the left/right threshold lines from pdf_bounds and colours the bars
    beyond the thresholds red. The figure is written to the
    'wecc-historical-wx' bucket under 3_qaqc_wx/<network>/qaqc_figs/.
    Nothing is drawn when the month has no finite standardized values.
    """

    # subset by month
    df = df.loc[df['month'] == month]

    # standardize against IQR range, dropping missing / non-finite values
    df_month_iqr = iqr_standardize(df, var)
    df_month_iqr = df_month_iqr[np.isfinite(df_month_iqr)]
    if df_month_iqr.size == 0:
        return

    # determine number of bins
    bins = create_bins(df_month_iqr)

    # every call gets its own figure, so repeated calls never draw on top of
    # each other
    fig, ax = plt.subplots()

    try:
        # plot histogram
        _, _, bars = ax.hist(df_month_iqr, bins=bins, log=False, density=True, alpha=0.3)
        ax.set_ylim(bottom=0.1)

        # pdf
        mu = np.nanmean(df_month_iqr)
        sigma = np.nanstd(df_month_iqr)
        y, left_bnd, right_bnd = pdf_bounds(df_month_iqr, mu, sigma, bins)
        ax.plot(bins, y, 'k--', linewidth=1)

        # bounds from distribution rounded up to nearest integer + 1
        ax.axvline(right_bnd, color='r') # right tail
        ax.axvline(left_bnd, color='r') # left tail

        # flag (visually) obs that are beyond threshold
        for bar in bars:
            x = bar.get_x() + 0.5 * bar.get_width()
            if x > right_bnd or x < left_bnd: # right tail / left tail
                bar.set_color('r')

        # title and useful annotations
        ax.set_title('Distribution gap check pt 2: {0}: {1}'.format(df['station'].unique()[0], var), fontsize=10)
        ax.annotate('Month: {}'.format(month), xy=(0.025, 0.95), xycoords='axes fraction', fontsize=8)
        ax.annotate('Mean: {}'.format(round(mu,3)), xy=(0.025, 0.9), xycoords='axes fraction', fontsize=8)
        ax.annotate('Std.Dev: {}'.format(round(sigma,3)), xy=(0.025, 0.85), xycoords='axes fraction', fontsize=8)
        ax.set_ylabel('Frequency (obs)')

        # save figure to AWS
        bucket_name = 'wecc-historical-wx'
        directory = '3_qaqc_wx'
        img_data = BytesIO()
        fig.savefig(img_data, format='png')
        img_data.seek(0)

        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        figname = 'qaqc_dist_gap_check_part2_{0}_{1}_{2}'.format(df['station'].unique()[0], var, month)
        bucket.put_object(Body=img_data, ContentType='image/png',
                         Key='{0}/{1}/qaqc_figs/{2}.png'.format(
                         directory, network, figname))
    finally:
        # close figures to save memory, also when plotting or the upload fails
        plt.close(fig)


def flagged_timeseries_plot(df, flag_to_viz):
    """Time series plot of every variable that carries one of the given flags.

    For each flag in `flag_to_viz` and each primary variable with at least one
    observation whose `<var>_eraqc` equals that flag, one figure (all obs in
    black, flagged obs in red) is uploaded as PNG to the 'wecc-historical-wx'
    bucket under 3_qaqc_wx/<network>/qaqc_figs/.
    """

    station = df['station'].unique()[0]
    network = station.split('_')[0]

    vars_to_remove = ['index','station','qc','duration','method','lat','lon','elevation','time','month','year','sfcWind_dir'] # list of var substrings to exclude if present in var
    vars_to_check = [var for var in df.columns if not any(True for item in vars_to_remove if item in var)] # remove all non-primary variables

    bucket_name = 'wecc-historical-wx'
    directory = '3_qaqc_wx'

    for flag in flag_to_viz:

        # assess where each variable has flagged values
        for var in vars_to_check:
            flagged_data = df.loc[df[var+'_eraqc'] == flag]

            # only produce a plot if there is flagged values
            if len(flagged_data) == 0:
                continue

            # plot
            ax = df.plot.scatter(x='time', y=var, color='k', s=0.8, label='Valid')
            fig = ax.get_figure()

            try:
                # plot flagged data
                flagged_data.plot.scatter(ax=ax, x='time', y=var, color='r', s=0.9, label='Flag: {}'.format(flag))

                # plot aesthetics
                ax.legend(loc='best', ncol=2)
                ylab = _plot_format_helper(var)
                ax.set_ylabel('{} [{}]'.format(ylab[0], ylab[1]))
                ax.set_xlabel('')
                ax.set_title('{0}'.format(station), fontsize=10)

                # save to AWS -- once per figure, i.e. inside the variable loop
                img_data = BytesIO()
                fig.savefig(img_data, format='png')
                img_data.seek(0)

                s3 = boto3.resource('s3')
                bucket = s3.Bucket(bucket_name)
                # the flag is part of the name so figures of different flags
                # for the same variable do not overwrite each other
                figname = 'flagged_timeseries_{0}_{1}_flag{2}'.format(station, var, flag)
                bucket.put_object(Body=img_data, ContentType='image/png',
                             Key='{0}/{1}/qaqc_figs/{2}.png'.format(
                             directory, network, figname))
            finally:
                # close figures to save memory
                plt.close(fig)

def qaqc_unusual_gaps(df, iqr_thresh=5, plots=True):
    """Run the distribution-gap QA/QC (part 1 then part 2) for one station.

    The whole-station bypass check runs first; when it fails, the frame it
    returned is handed back without further checks. Otherwise part 1 and
    part 2 run in sequence and, with `plots` enabled, time series of the
    flags 19, 20 and 21 are plotted. Returns the flagged DataFrame.
    """

    # primary variables only: drop any column whose name contains one of these substrings
    excluded = ['index','station','qc','duration','method','lat','lon','elevation','time','month','year','sfcWind_dir']
    vars_to_check = [col for col in df.columns
                     if not any(item in col for item in excluded)]

    # whole station bypass check first
    df, pass_flag = qaqc_dist_whole_stn_bypass_check(df, vars_to_check)

    # record too short: nothing more to do
    if pass_flag == 'fail':
        return df

    df_part1 = qaqc_dist_gap_part1(df, iqr_thresh, plots)
    df_part2 = qaqc_dist_gap_part2(df_part1, plots)

    if plots == True:
        flagged_timeseries_plot(df_part2, flag_to_viz = [19, 20, 21])

    return df_part2

In [24]:
# testing file
# NOTE(review): absolute local path -- this cell only runs where that file exists
# the context manager closes the dataset once its contents have been copied
# into the dataframe
with xr.open_dataset('/Users/victoriaford/Desktop/Train_Files/CDEC_CWD.nc') as file:
    df = file.to_dataframe()

In [25]:
# run the unusual-gaps QA/QC on the test station with default settings
# (iqr_thresh=5, plots=True); the returned dataframe is shown as the cell output
qaqc_unusual_gaps(df)

CDEC_CWD has too short of an observation record to proceed through the monthly distribution qa/qc checks -- bypassing station


Unnamed: 0,station,time,tas,pr,tas_qc,elevation,lat,lon,month,year,tas_eraqc,pr_eraqc
0,CDEC_CWD,2017-04-13 19:00:00,274.82,0.0,,3093.72,36.48383,-118.17755,4,2017,18,18
1,CDEC_CWD,2017-04-13 20:00:00,274.82,0.0,,3093.72,36.48383,-118.17755,4,2017,18,18
2,CDEC_CWD,2017-04-13 21:00:00,274.26,0.0,,3093.72,36.48383,-118.17755,4,2017,18,18
3,CDEC_CWD,2017-04-13 22:00:00,273.15,0.0,,3093.72,36.48383,-118.17755,4,2017,18,18
4,CDEC_CWD,2017-04-13 23:00:00,270.93,0.0,,3093.72,36.48383,-118.17755,4,2017,18,18
...,...,...,...,...,...,...,...,...,...,...,...,...
12361,CDEC_CWD,2018-09-12 10:00:00,255.37,,78,3093.72,36.48383,-118.17755,9,2018,18,18
12362,CDEC_CWD,2018-09-12 11:00:00,255.37,,78,3093.72,36.48383,-118.17755,9,2018,18,18
12363,CDEC_CWD,2018-09-12 12:00:00,255.37,,78,3093.72,36.48383,-118.17755,9,2018,18,18
12364,CDEC_CWD,2018-09-12 14:00:00,255.37,,78,3093.72,36.48383,-118.17755,9,2018,18,18
