**Step 2: Perform validation on the newly downloaded data. Then calculate the daily tmin, tmax, tas and DTR and save them as yearly files. This is an update to Diana's 'era5_download_and_aggregate_to_daily.ipynb' notebook.**

In [2]:
%matplotlib inline
import xarray as xr
import numpy as np 
import matplotlib.pyplot as plt
import os
import pandas as pd
from datetime import datetime, timedelta, date

import dask
import dask.array as dda
import dask.distributed as dd

# rhodium-specific kubernetes cluster configuration
import rhg_compute_tools.kubernetes as rhgk

In [3]:
# start a dask cluster through the rhodium kubernetes helper, then scale it to 30 workers
client, cluster = rhgk.get_big_cluster()
cluster.scale(30)

In [4]:
# display the client summary (scheduler address, dashboard link, cluster size)
client

0,1
Client  Scheduler: gateway://traefik-impactlab-hub-dask-gateway.impactlab-hub:80/impactlab-hub.41889301468f4c31815ad4a642702641  Dashboard: /services/dask-gateway/clusters/impactlab-hub.41889301468f4c31815ad4a642702641/status,Cluster  Workers: 30  Cores: 30  Memory: 724.78 GB


In [12]:
# NOTE(review): this shuts the cluster down -- run it only once all of the work below is done.
# The cell sits above the cells that call `client.map`, so running the notebook
# top to bottom would close the cluster before those cells execute.
cluster.close()

**Validation of ERA-5 daily files**

In [29]:
# first and last day of the ERA-5 record to validate (month-day-year)
era_start = '01-01-1979' # 01-01-1979
era_end = '12-31-2020'   # 12-31-2020

# daily datetime index over the whole period; leap days are included
dt_index_full = pd.date_range(start=era_start, end=era_end, freq='D')

# year / zero-padded month / zero-padded day as strings, used to build the per-day file names
dt_index_years = dt_index_full.year.astype(str)
dt_index_months = dt_index_full.month.map("{:02}".format)
dt_index_days = dt_index_full.day.map("{:02}".format)
daynum = dt_index_full.dayofyear

# the 24 hours of a day as "HH:MM" strings ("00:00" ... "23:00") for the retrieval function.
# Built directly instead of from a throwaway date_range with freq='H': that alias is
# deprecated in recent pandas releases, and the resulting strings are the same.
hours = ["{:02d}:00".format(hr) for hr in range(24)]

In [92]:
def test_for_nans(ds, var):
    # no nans
    assert ds[var].isnull().sum() == 0, "there are nans!"

def test_temp_range(ds, var):
    # make sure temp values are in a valid range
    # asserts if statement below is false
    assert (ds[var].min() > 150) or (ds[var].max() < 350), "temperature values are invalid" 

def test_low_temp_range(ds, var):
    threshold = 180
    location = ds[var].where(ds[var] < threshold)
    num = np.count_nonzero(~np.isnan(location))
    return num

def test_high_temp_range(ds, var):
    threshold = 330
    location = ds[var].where(ds[var] > threshold)
    num = np.count_nonzero(~np.isnan(location))
    return num

def test_polar_high_temp(ds, var):
    threshold = 317 #315.5
    loc_NH = ds[var].sel(latitude=slice(90,50)).where(ds[var].sel(latitude=slice(90,50)) > threshold)
    num_NH = np.count_nonzero(~np.isnan(loc_NH))
    
    loc_SH = ds[var].sel(latitude=slice(-50,-90)).where(ds[var].sel(latitude=slice(-50,-90)) > threshold)
    num_SH = np.count_nonzero(~np.isnan(loc_SH))
    return num_NH, num_SH

def validate_era5_temp(spec):
    '''
    run the validation checks on one ERA-5 temperature file.
    works for hourly `t2m` or saved daily tas, tmin, tmax

    spec(tuple): (filepath, timestep, var); `timestep` is unpacked but not used by any check

    returns None when nothing is flagged, otherwise the first flag that is hit:
      [count, filepath]               values below the low-temperature threshold
      [count, filepath]               values above the high-temperature threshold
      [count_NH, count_SH, filepath]  polar values above the polar threshold

    raises FileNotFoundError if `filepath` is not an existing file, and
    AssertionError from the nan / overall-range checks
    '''
    filepath, timestep, var = spec
    print(spec)

    # the file has to exist before anything else can be checked
    if not os.path.isfile(filepath):
        raise FileNotFoundError("%s was not created" %filepath)

    with xr.open_dataset(filepath) as ds: # if grib: , engine='cfgrib'
        # hard failures: these assert and stop the task
        test_for_nans(ds, var)
        test_temp_range(ds, var)

        # soft flags: report how many suspicious values were found, low check first
        for count_outliers in (test_low_temp_range, test_high_temp_range):
            occurrances = count_outliers(ds, var)
            if occurrances > 0:
                return [occurrances, filepath]

        occur_NH, occur_SH = test_polar_high_temp(ds, var)
        if occur_NH > 0 or occur_SH > 0:
            return [occur_NH, occur_SH, filepath]

    return None

In [93]:
# directory holding one netcdf file of hourly t2m per day
hourly_dir = '/gcs/impactlab-data/climate/source_data/ERA-5/hourly/netcdf_files'

# one file name per day of the validation period
daily_files = [
    't2m_%s_%s_%s.nc' %parts
    for parts in zip(dt_index_years, dt_index_months, dt_index_days)
]
daily_filepaths = [os.path.join(hourly_dir, name) for name in daily_files]

# job spec handed to `validate_era5_temp`: (filepath, timestep, var)
JOBS_validation = [(path, 'hourly', 't2m') for path in daily_filepaths]

In [94]:
# spot-check one entry of the job list
JOBS_validation[100]

('/gcs/impactlab-data/climate/source_data/ERA-5/hourly/grib_files/t2m_1979_04_11.grib',
 'hourly',
 't2m')

In [95]:
%%time
# submit one validation task per job spec to the cluster and show a progress bar
futures_validation = client.map(validate_era5_temp, JOBS_validation)
dd.progress(futures_validation)

CPU times: user 1.8 s, sys: 116 ms, total: 1.91 s
Wall time: 1.84 s


VBox()

In [None]:
# gather each task's return value from the workers:
# None when nothing was flagged, otherwise a list describing the flag
results = client.gather(futures_validation)

In [None]:
# count the flagged files, i.e. the tasks that returned something other than None
print(sum(x is not None for x in results))

In [None]:
# boolean mask over `results`: True where a task returned a flag instead of None
# (only worth running when the count above is > 0)
list_results = [outcome is not None for outcome in results]

# positions of the flagged tasks within `results`
res = [position for position, flagged in enumerate(list_results) if flagged]

# show what each flagged task reported
for i in res:
    print(results[i])

**Once validation is complete, create one file per year of daily values for each of tmax, tmin, tas and DTR**

In [79]:
def calc_daily_era5_average(spec):
    '''
    calculate the mean of an ERA-5 temperature variable over the `time` dimension of one file

    spec(tuple): (filepath, timestep, variable); `timestep` is not used

    returns the DataArray of `variable` averaged over `time`
    '''
    filepath, _timestep, var = spec
    with xr.open_dataset(filepath) as ds:
        hourly = ds[var]
        return hourly.mean('time')

def calc_daily_era5_tmax(spec):
    '''
    calculate the maximum of an ERA-5 temperature variable over the `time` dimension of one file

    spec(tuple): (filepath, timestep, variable); `timestep` is not used

    returns the DataArray of `variable` reduced to its maximum over `time`
    '''
    filepath, _timestep, var = spec
    with xr.open_dataset(filepath) as ds:
        hourly = ds[var]
        return hourly.max('time')

def calc_daily_era5_tmin(spec):
    '''
    calculate the minimum of an ERA-5 temperature variable over the `time` dimension of one file

    spec(tuple): (filepath, timestep, variable); `timestep` is not used

    returns the DataArray of `variable` reduced to its minimum over `time`
    '''
    filepath, _timestep, var = spec
    with xr.open_dataset(filepath) as ds:
        hourly = ds[var]
        return hourly.min('time')
    
def calc_daily_dinural_temp_range(spec):
    '''
    calculate the diurnal temperature range (DTR) of one file:
    maximum over `time` minus minimum over `time`

    spec(tuple): (filepath, timestep, variable); `timestep` is not used

    returns the DataArray of max - min of `variable` over `time`
    '''
    filepath, _timestep, var = spec
    with xr.open_dataset(filepath) as ds:
        warmest = ds[var].max('time')
        coldest = ds[var].min('time')
        return warmest - coldest

In [72]:
def create_jobs(year):
    '''
    build the per-day job specs for one calendar year.

    year(int or str): calendar year, formatted into the start and end dates

    returns [JOBS, dt_index_full]:
      JOBS holds one (filepath, 'hourly', 't2m') tuple per day of the year,
      dt_index_full is the matching daily DatetimeIndex
    '''
    hourly_dir = '/gcs/impactlab-data/climate/source_data/ERA-5/hourly/netcdf_files' # update path

    # every day from Jan 1 to Dec 31 of `year`; leap days are included
    dt_index_full = pd.date_range(
        start='01-01-{}'.format(year), end='12-31-{}'.format(year), freq='D'
    )

    JOBS = []
    for stamp in dt_index_full:
        # month and day are zero-padded to two digits in the file name
        month = "{:02}".format(stamp.month)
        day = "{:02}".format(stamp.day)
        daily_file = 't2m_%s_%s_%s.nc' %(stamp.year, month, day)
        JOBS.append((os.path.join(hourly_dir, daily_file), 'hourly', 't2m'))
    return [JOBS, dt_index_full]

def save_yearlong_dailydata_file(directory, year, ds, var):
    '''
    save file of daily data for one variable for one year

    directory(str): output directory; created if it does not exist yet
    year(int or str): year used in the file name (it is only formatted with %s)
    ds: the data to store; it is wrapped as the data variable `var` of a new Dataset
    var(str): variable name, used for the data variable and the file name

    writes `<var>_daily_<year>-<year>.nc` into `directory`
    '''
    today = str(date.today())
    daily_file = xr.Dataset( {var: ds},
                           attrs={
        'author': 'Meredith Fish',
        'contact': 'meredith.fish@rutgers.edu',
        'project': ('impactlab-rhg/climate/source-data/ERA-5'),
        'source': ('impactlab-rhg/climate/downscaled/ERA-5/hourly'),
        'created': today})
    filename = '%s_daily_%s-%s.nc' %(var, year, year)
    # the output directory is built per variable by the caller, so it may not exist
    # the first time a variable is processed; create it rather than failing on write
    if directory:
        os.makedirs(directory, exist_ok=True)
    daily_file.to_netcdf(os.path.join(directory, filename))

In [75]:
def execute_daily_file_creation(year, variable):
    '''
    use `create_jobs` to create the JOBS files.
    calculate the metric based on variable of interest.
    save netcdf file.

    year(int or str): year to process
    variable(str): one of 'dtr', 'tas', 'tasmax', 'tasmin'

    raises ValueError if `variable` is not one of the supported names
    '''
    # pick the per-file reduction first, so an unsupported variable fails right away
    # with a clear message instead of leaving `calc` unbound further down
    if variable == 'dtr':
        calc = calc_daily_dinural_temp_range
    elif variable == 'tas':
        calc = calc_daily_era5_average
    elif variable == 'tasmax':
        calc = calc_daily_era5_tmax
    elif variable == 'tasmin':
        calc = calc_daily_era5_tmin
    else:
        raise ValueError(
            "unsupported variable %r: expected one of 'dtr', 'tas', 'tasmax', 'tasmin'"
            % (variable,)
        )

    [JOBS, dt_index_full] = create_jobs(year)

    save_directory = '/gcs/impactlab-data/climate/source_data/ERA-5/{}/daily/netcdf/'.format(variable)

    # one task per daily file; results come back in job order
    futures = client.map(calc, JOBS)
    da_list = client.gather(futures)

    # concatenate DataArrays in list
    da_year = xr.concat(da_list, dim='time')
    # add datetime index
    da_year['time'] = dt_index_full

    # save file
    save_yearlong_dailydata_file(save_directory, year, da_year, variable)

In [76]:
# build the yearly DTR file for each year from 1995 through 2014
# (np.arange excludes the end value, so 2015 is not processed)
for i_yr in np.arange(1995,2015):
    print(i_yr)
    execute_daily_file_creation(i_yr, 'dtr')

1996


In [77]:
# list the yearly DTR files written so far
! ls /gcs/impactlab-data/climate/source_data/ERA-5/dtr/daily/netcdf/

dtr_daily_1995-1995.nc	dtr_daily_1996-1996.nc
