**Step 2: Perform validation on newly downloaded hourly or newly created daily data.**

In [1]:
%matplotlib inline
import xarray as xr
import numpy as np 
import matplotlib.pyplot as plt
import os
import pandas as pd
from datetime import datetime, timedelta, date

import dask
import dask.array as dda
import dask.distributed as dd

# rhodium-specific kubernetes cluster configuration
import rhg_compute_tools.kubernetes as rhgk

In [2]:
# Obtain a Dask client and its cluster from the rhodium Kubernetes helper.
client, cluster = rhgk.get_big_cluster()
# Scale the cluster to 30 (presumably 30 workers -- TODO confirm against the cluster API).
cluster.scale(30)

In [3]:
# Display the client summary (scheduler address, dashboard link, worker/core/memory counts).
client

0,1
Client  Scheduler: gateway://traefik-impactlab-hub-dask-gateway.impactlab-hub:80/impactlab-hub.50d31a404f5242f181d3707a5e9af77e  Dashboard: /services/dask-gateway/clusters/impactlab-hub.50d31a404f5242f181d3707a5e9af77e/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [118]:
# NOTE(review): this cell was executed last (In [118]) even though it sits here.
# Run it only after validation has finished -- executing the notebook top to bottom
# would close the cluster before the cells below submit work through `client`.
cluster.close()

Functions for validation

In [4]:
# temperature validation functions

def test_for_nans(ds, var):
    # no nans
    assert ds[var].isnull().sum() == 0, "there are nans!"

def test_timesteps(ds, timestep):
    if timestep == 'hourly':
        assert (len(ds.time) == 24), "there are not 24 hours in this file!"
    elif timestep == 'daily':
        assert (len(ds.time) == 365 or len(ds.time) == 366), "there are not 365 or 366 days in this file!"
    
def test_temp_range(ds, var):
    # make sure temp values are in a valid range
    # asserts if statement below is false
    assert (ds[var].min() > 150) or (ds[var].max() < 350), "temperature values are invalid" 

def test_low_temp_range(ds, var):
    threshold = 180
    location = ds[var].where(ds[var] < threshold)
    num = np.count_nonzero(~np.isnan(location))
    return num

def test_high_temp_range(ds, var):
    threshold = 330
    location = ds[var].where(ds[var] > threshold)
    num = np.count_nonzero(~np.isnan(location))
    return num

def test_polar_high_temp(ds, var):
    threshold = 317 #315.5
    loc_NH = ds[var].sel(latitude=slice(90,50)).where(ds[var].sel(latitude=slice(90,50)) > threshold)
    num_NH = np.count_nonzero(~np.isnan(loc_NH))
    
    loc_SH = ds[var].sel(latitude=slice(-50,-90)).where(ds[var].sel(latitude=slice(-50,-90)) > threshold)
    num_SH = np.count_nonzero(~np.isnan(loc_SH))
    return num_NH, num_SH

def validate_era5_temp(spec):
    '''
    Validate one ERA-5 hourly or daily temperature file.
    Works for hourly `t2m` or saved daily tas, tmin, tmax.

    spec: tuple of (filepath, timestep, var).

    Hard checks (raise on failure): the file exists, the variable has no
    NaNs, the temperature range is plausible, and the number of timesteps
    matches `timestep`.

    Soft checks (reported through the return value; the first one that
    flags anything wins):
      - [count, filepath] when values fall below the low-temperature threshold
      - [count, filepath] when values exceed the high-temperature threshold
      - [count_NH, count_SH, filepath] when high-latitude values are too warm

    Returns None when nothing was flagged.
    Raises FileNotFoundError when `filepath` does not exist, and
    AssertionError from the hard checks.
    '''
    filepath, timestep, var = spec
    print(spec)

    # first check to be sure file exists
    if not os.path.isfile(filepath):
        raise FileNotFoundError("%s was not created" %filepath)

    # now validate: test for nans, temperature range is not absurd,
    # and correct num of timesteps
    with xr.open_dataset(filepath) as ds: # if grib: , engine='cfgrib'
        test_for_nans(ds, var)
        test_temp_range(ds, var)
        test_timesteps(ds, timestep)

        occurrences_low = test_low_temp_range(ds, var)
        if occurrences_low > 0:
            return [occurrences_low, filepath]

        occurrences_high = test_high_temp_range(ds, var)
        if occurrences_high > 0:
            return [occurrences_high, filepath]

        occur_NH, occur_SH = test_polar_high_temp(ds, var)
        # Compare both counts explicitly: `occur_NH or occur_SH > 0` parsed as
        # `occur_NH or (occur_SH > 0)` and only worked through truthiness.
        if occur_NH > 0 or occur_SH > 0:
            return [occur_NH, occur_SH, filepath]

    return None

In [95]:
# precip validation functions

def test_for_nans(ds, var):
    # no nans
    assert ds[var].isnull().sum() == 0, "there are nans!"
    
def test_timesteps(ds, timestep):
    if timestep == 'hourly':
        assert (len(ds.time) == 24), "there are not 24 hours in this file!"
    elif timestep == 'daily':
        assert (len(ds.time) == 365 or len(ds.time) == 366), "there are not 365 or 366 days in this file!"

def test_negative_precip(ds, var):
    threshold = 0.0
    location = ds[var].where(ds[var] < threshold)
    num = np.count_nonzero(~np.isnan(location))
    return num

def test_maximum_precip(ds, var):
    threshold = 2.0 # max observed is 1.825m --> maximum occurs between 0.5-0.8
    location = ds[var].where(ds[var] > threshold)
    num = np.count_nonzero(~np.isnan(location))
    return num
    
def validate_era5_precip(spec):
    '''
    Validate one ERA-5 precipitation file.

    spec: tuple of (filepath, timestep, var).

    Raises FileNotFoundError when the file is missing, and AssertionError when
    the NaN or timestep check fails. Returns [count, filepath] for the first
    of the negative-value / above-maximum checks that flags anything;
    otherwise falls through and returns None.
    '''
    filepath, timestep, var = spec

    # bail out early when the file was never written
    file_missing = not os.path.isfile(filepath)
    if file_missing:
        raise FileNotFoundError("%s was not created" %filepath)

    with xr.open_dataset(filepath) as ds:
        # hard checks: these raise on failure
        test_for_nans(ds, var)
        test_timesteps(ds, timestep)

        # soft checks, in order: negative values first, then implausibly large ones
        for count_flagged in (test_negative_precip, test_maximum_precip):
            flagged = count_flagged(ds, var)
            if flagged > 0:
                return [flagged, filepath]

In [96]:
def generate_JOBS_files(variable, directory, timestep):
    '''
    Build one validation job spec per day of the configured period.

    Reads the notebook-level `dt_index_years`, `dt_index_months` and
    `dt_index_days` sequences to form file names of the shape
    `<variable>_<year>_<month>_<day>.nc` inside `directory`.

    Returns a list of (filepath, timestep, variable) tuples. The file name
    keeps the `variable` passed in, while the tuple carries 'tp' in place of
    'total_precip'.
    '''
    date_parts = zip(dt_index_years, dt_index_months, dt_index_days)
    filepaths = [
        os.path.join(directory, '%s_%s_%s_%s.nc' % (variable, year, month, day))
        for year, month, day in date_parts
    ]

    # name of the variable inside the file differs from the file-name prefix for precip
    dataset_variable = 'tp' if variable == 'total_precip' else variable

    return [(path, timestep, dataset_variable) for path in filepaths]

Identifying time period for validation

In [97]:
# Period to validate, as month-day-year strings.
era_start = '01-01-2000'
era_end = '12-31-2000'

# make list of daily datetime indices, this includes leap years
dt_index_full = pd.date_range(start=era_start, end=era_end, freq='D')

# zero-padded year/month/day strings used to build the per-day file names
dt_index_years = dt_index_full.year.astype(str)
dt_index_months = dt_index_full.month.map("{:02}".format)
dt_index_days = dt_index_full.day.map("{:02}".format)
daynum = dt_index_full.dayofyear

# "HH:MM" labels for the 24 hours of a day ('00:00' ... '23:00').
# Built directly rather than through pd.date_range(..., freq='H'): the
# uppercase 'H' alias is deprecated in recent pandas and emits a warning.
hours = ["{:02d}:00".format(hour) for hour in range(24)]

Starting validation execution

In [107]:
# var[0] is the variable name passed to generate_JOBS_files (file-name prefix / job spec);
# var[1] is the variable component of the data directory path below.
var = ['t2m', 'tas'] # ['total_precip', 'precip']
# timestep of the files being validated; also the second component of the directory path
time_step = 'hourly' # 'daily'
directory = '/gcs/impactlab-data/climate/source_data/ERA-5/{}/{}/netcdf/F320/'.format(var[1], time_step)

In [108]:
# One (filepath, timestep, variable) job spec per day of the configured period.
JOBS = generate_JOBS_files(var[0], directory, time_step)

In [109]:
# Spot-check the first job spec before submitting anything.
JOBS[0]

('/gcs/impactlab-data/climate/source_data/ERA-5/tas/hourly/netcdf/F320/t2m_2000_01_01.nc',
 'hourly',
 't2m')

In [110]:
# Pick the validation routine that matches the variable being checked.
if var[0] == 't2m':
    validate_func = validate_era5_temp
elif var[0] == 'total_precip':
    validate_func = validate_era5_precip
else:
    # Fail here instead of leaving validate_func undefined, or stale from an
    # earlier run of this cell, which would otherwise surface later.
    raise ValueError("no validation function defined for variable %r" % (var[0],))

In [111]:
%%time
# Submit one validation task per job spec and show a progress display for the futures.
# NOTE(review): the few milliseconds reported below suggest %%time covers only task
# submission, not the validation work itself -- confirm before reading it as a runtime.
futures_validation = client.map(validate_func, JOBS)
dd.progress(futures_validation)

CPU times: user 5.44 ms, sys: 75 µs, total: 5.52 ms
Wall time: 5.02 ms


VBox()

In [112]:
# gathers output from workers: one entry per job spec, in the order of JOBS;
# an entry is None when the validation function returned nothing (no flags raised)
results = client.gather(futures_validation)

In [113]:
# Display the raw per-file results (one entry per job spec).
results

In [116]:
# Count the flagged files, i.e. entries of `results` that are something other than None.
if results is not None:
    print(len([entry for entry in results if entry is not None]))

**Only execute the cell below if the count above is NOT 0. It is not needed otherwise.**

In [117]:
# Print every flagged result, i.e. each entry of `results` that is not None.
if results is not None:
    # True where the validation function returned something for that job
    list_results = [x is not None for x in results]

    # positions within `results` (same order as JOBS) of the flagged entries
    res = [i for i, val in enumerate(list_results) if val]

    # each flagged entry holds the count(s) and the file path returned by the validation function
    for i in res:
        print(results[i])

**Once validation is complete, create a yearlong file of daily values of precip, tmax, tmin, tas and DTR using `create_era5_yearlong_daily_files.ipynb`.**