**Step 2: Perform validation on hourly or daily data.**

In [None]:
%matplotlib inline
import xarray as xr
import numpy as np 
import matplotlib.pyplot as plt
import os
import pandas as pd
from datetime import datetime, timedelta, date

import dask
import dask.array as dda
import dask.distributed as dd

# rhodium-specific kubernetes cluster configuration
import rhg_compute_tools.kubernetes as rhgk

In [None]:
# request a cluster through the rhodium kubernetes helpers; `client` is used
# further down to submit the validation jobs and gather their results
client, cluster = rhgk.get_big_cluster()

In [None]:
# show the cluster object as this cell's output
cluster

In [None]:
# open one hourly file to try the validation functions on
# NOTE: the triple-quoted string below is never assigned, so it only keeps the
# temperature (t2m) variant of the same call around for reference
'''test_file = xr.open_dataset(os.path.join('/gcs/impactlab-data/climate/source_data/ERA-5/{}/{}/netcdf/F320/'.format('tas', 'hourly'), 
                                         't2m_1998_12_20.nc'))'''
# precipitation file that is actually opened
test_file = xr.open_dataset(os.path.join('/gcs/impactlab-data/climate/source_data/ERA-5/{}/{}/netcdf/F320/'.format('pr', 'hourly'), 
                                         'total_precip_1998_12_20.nc'))
# small subsets for quick experiments (disabled)
# test = test_file['t2m'][:6, :2, :2]
# ds = test_file['tp'][:6, :2, :2]

Functions for validation

In [None]:
# validation functions
def test_for_nans(ds, var):
    """
    test for presence of NaNs
    """
    assert ds[var].isnull().sum() == 0, "there are nans!"

def test_timesteps(ds, timestep):
    """
    correct number of timesteps 
    """
    if timestep == 'hourly':
        assert (len(ds.time) == 24), "there are not 24 hours in this file!"
    elif timestep == 'daily':
        assert (len(ds.time) == 365 or len(ds.time) == 366), "there are not 365 or 366 days in this file!"
    
def test_temp_range(ds, var):
    """
    make sure temp values are in a valid range. valid for tas, tasmin, tasmax 
    """
    assert (ds[var].min() > 150) and (ds[var].max() < 350), "{} values are invalid".format(var) 
    
def test_dtr_range(ds, var):
    """
    make sure DTR values are in a valid range
    """
    assert (ds[var].min() > 0) and (ds[var].max() < 40), "diurnal temperature range values are invalid" 

def test_low_temp_range(ds, var): 
    """
    if we have some really low temp values, we want to know. valid for tas, tasmin, tasmax  
    """
    threshold = 180 # K
    return ds[var].where(ds[var] < threshold).count()

def test_high_temp_range(ds, var):
    """
    if we have some really high temp values, we want to know. valid for tas, tasmin, tasmax  
    """
    threshold = 330 # K
    return ds[var].where(ds[var] > threshold).count()

def test_polar_high_temp(ds, var):
    """
    count unusually high temperature values (above 317 K) poleward of 50
    degrees latitude in each hemisphere. valid for tas, tasmin, tasmax

    returns (count in the 90..50 latitude band, count in the -50..-90 latitude band)
    """
    threshold = 317 #315.5 # K
    counts = []
    # both slices run from the higher to the lower latitude
    # NOTE(review): presumably relies on the latitude coordinate being stored
    # north-to-south -- confirm for the files being validated
    for band in (slice(90, 50), slice(-50, -90)):
        polar = ds[var].sel(latitude=band, drop=True)
        counts.append(polar.where(polar > threshold).count())
    return tuple(counts)

def test_negative_values(ds, var):
    """
    test for presence of negative values. valid for DTR or precip 
    """
    # this is not set to 0 to deal with floating point error 
    assert ds[var].where(ds[var] < -0.001).count() == 0, "there are negative values!"

def test_maximum_precip(ds, var):
    """
    test that max precip is reasonable 
    """
    threshold = 2.0 # max observed is 1.825m --> maximum occurs between 0.5-0.8
    return ds[var].where(ds[var] > threshold).count()

def validate_era5_variable(spec):
    """
    validate one ERA-5 hourly or daily file.
    valid for hourly `t2m` or `pr` OR daily tasmin, tasmax, DTR, pr

    spec: (filepath, timestep, var) -- path of the netcdf file, 'hourly' or
    'daily', and the name of the variable inside the file

    returns None when every check passes. otherwise returns a description of
    the first problem found, always ending with the filepath:
      ("NaNs!", filepath)
      ("wrong number of timesteps", filepath)
      ("invalid temperature range", filepath)
      [count, filepath]                 values below the low / above the high
                                        temperature threshold, or above the
                                        maximum precip threshold
      [count_NH, count_SH, filepath]    high temperatures in the polar bands
      ("negative values", filepath)
      ("invalid DTR range", filepath)
      ("unknown file format", filepath) the file could not be read (OSError)

    raises FileNotFoundError if the file does not exist
    raises ValueError if `var` is not one of the supported variables

    every failed check is returned rather than raised, so a single bad file
    does not surface as an exception when the results of many jobs are gathered
    """

    filepath, timestep, var = spec
    print(spec)

    # first check to be sure file exists
    if not os.path.isfile(filepath):
        raise FileNotFoundError("%s was not created" %filepath)

    # now validate
    temperature_vars = {"t2m", "tasmin", "tasmax"}

    try:
        with xr.open_dataset(filepath) as ds:

            # validation checks for all variables
            try:
                test_for_nans(ds, var)
            except AssertionError:
                return ("NaNs!", filepath)
            try:
                test_timesteps(ds, timestep)
            except AssertionError:
                return ("wrong number of timesteps", filepath)

            if var in temperature_vars:
                # temperature specific checks
                try:
                    test_temp_range(ds, var)
                except AssertionError:
                    return ("invalid temperature range", filepath)
                occurrances_low = test_low_temp_range(ds, var)
                if occurrances_low > 0:
                    return [occurrances_low, filepath]
                occurrances_high = test_high_temp_range(ds, var)
                if occurrances_high > 0:
                    return [occurrances_high, filepath]
                occur_NH, occur_SH = test_polar_high_temp(ds, var)
                # compare both counts explicitly instead of relying on the
                # truthiness of the first one
                if occur_NH > 0 or occur_SH > 0:
                    return [occur_NH, occur_SH, filepath]
            elif var == "tp" or var == 'pr':
                # precip specific checks
                try:
                    test_negative_values(ds, var)
                except AssertionError:
                    return ("negative values", filepath)

                max_occurrances = test_maximum_precip(ds, var)
                if max_occurrances > 0:
                    return [max_occurrances, filepath]
            elif var == "dtr":
                # DTR specific checks
                try:
                    test_dtr_range(ds, var)
                except AssertionError:
                    return ("invalid DTR range", filepath)
                try:
                    test_negative_values(ds, var)
                except AssertionError:
                    return ("negative values", filepath)
            else:
                raise ValueError("this variable is not supported in the current validation routines")
    except OSError:
        return ("unknown file format", filepath)

In [None]:
def generate_JOBS_files(variable, start_date, end_date, timestep):
    """
    build the list of validation jobs for one variable and time range

    variable: variable to validate, e.g. 't2m', 'pr', 'tasmin', 'tasmax', 'dtr'
    start_date, end_date: first and last day of the range (inclusive), in any
        form accepted by pandas.date_range
    timestep: 'hourly' -> one file per day in the range
              'daily'  -> one file per calendar year touched by the range

    returns a list of (filepath, timestep, variable name inside the file) tuples
    raises ValueError if `timestep` is neither 'hourly' nor 'daily'
    """
    if timestep not in ('hourly', 'daily'):
        raise ValueError(
            "unsupported timestep {!r}: expected 'hourly' or 'daily'".format(timestep)
        )

    # hourly t2m and pr use different names for the directory, the file name
    # and the variable stored in the file; everything else uses one name
    if variable == 't2m' and timestep == 'hourly':
        direc_var, filename_var, file_var = 'tas', 't2m', 't2m'
    elif variable == 'pr' and timestep == 'hourly':
        direc_var, filename_var, file_var = 'pr', 'total_precip', 'tp'
    else:
        direc_var = filename_var = file_var = variable

    # make list of daily datetime indices, this includes leap years
    dt_index_full = pd.date_range(start=start_date, end=end_date, freq='D')

    if timestep == 'hourly':
        directory = '/gcs/impactlab-data/climate/source_data/ERA-5/{}/{}/netcdf/F320/'.format(direc_var, timestep)
        # one file per day, named <filename_var>_<YYYY>_<MM>_<DD>.nc
        daily_files = ['{}_{}.nc'.format(filename_var, stamp)
                       for stamp in dt_index_full.strftime('%Y_%m_%d')]
    else:
        directory = '/gcs/impactlab-data/climate/source_data/ERA-5/{}/daily/netcdf/v1.2'.format(direc_var)
        # one file per year, named <filename_var>_daily_<YYYY>-<YYYY>.nc
        daily_files = ['{}_daily_{}-{}.nc'.format(filename_var, year, year)
                       for year in dt_index_full.year.unique()]

    return [(os.path.join(directory, daily_file), timestep, file_var)
            for daily_file in daily_files]

## Starting validation ##

This validation script works for the following hourly variables: 

hourly_vars = `t2m`, `pr`

And daily variables:

daily_vars = `tasmin`, `tasmax`, `dtr`, `pr`

NOTE: to switch between daily and hourly validation, update the `timestep` setting below. Also update `var` to the variable you want to validate, and `era_start` and `era_end` to cover the range of hourly or daily files you want to validate. 

In [None]:
# time period for validation (dates are written month-day-year)
# NOTE: the triple-quoted string below is never assigned, so it only keeps an
# alternate date range around for reference
'''era_start = '12-17-1995'
era_end = '01-15-2015'
'''
era_start = '01-01-1995'
era_end = '12-31-2000'
# variable to validate and the time resolution of its files ('hourly' or 'daily')
var = 'tasmin'
timestep = 'daily'

# one (filepath, timestep, variable) job per file to validate
JOBS = generate_JOBS_files(var, era_start, era_end, timestep)

In [None]:
# drop the futures left over from a previous run, if there are any.
# on a fresh kernel the name does not exist yet, so a missing name is ignored
try:
    del futures_validation
except NameError:
    pass

In [None]:
%%time
# submit one validate_era5_variable call per entry in JOBS through the client
futures_validation = client.map(validate_era5_variable, JOBS)
# show the progress of the submitted futures
dd.progress(futures_validation)

In [None]:
# gathers output from workers: one entry per job, holding whatever
# validate_era5_variable returned (None when it reported no problem)
results = client.gather(futures_validation)

In [None]:
# show the gathered results as this cell's output
results

In [None]:
# count the jobs that returned something other than None,
# i.e. the files for which a validation function 'failed'
if results is not None:
    print(len([outcome for outcome in results if outcome is not None]))

**Only execute the cell below if the count printed above is NOT 0. It is not needed otherwise.**

In [None]:
# Print every flagged result, i.e. each job output that is a value and not None
if results is not None:
    # True where the job returned something, False where it returned None
    list_results = [outcome is not None for outcome in results]

    # positions of the flagged jobs within results
    res = [position for position, flagged in enumerate(list_results) if flagged]

    for i in res:
        print(results[i])