**Step 2: Perform validation on newly downloaded data. Then calculate the daily tmin, tmax and tas (daily-average 2-m temperature) and save them as yearly files. This is an update to Diana's 'era5_download_and_aggregate_to_daily.ipynb' notebook.**

In [1]:
%matplotlib inline
import xarray as xr
import numpy as np 
import matplotlib.pyplot as plt
import os
import pandas as pd
from datetime import datetime, timedelta, date

import dask
import dask.array as dda
import dask.distributed as dd

# rhodium-specific kubernetes cluster configuration
import rhg_compute_tools.kubernetes as rhgk

In [2]:
# start a Dask cluster through the rhodium kubernetes helper and ask for 30 workers
client, cluster = rhgk.get_standard_cluster()
cluster.scale(30)

In [3]:
# display the client summary (scheduler address, dashboard link, worker count)
client

0,1
Client  Scheduler: gateway://traefik-impactlab-hub-dask-gateway.impactlab-hub:80/impactlab-hub.3a13a5f1a3b54706a7222adec9224b8f  Dashboard: /services/dask-gateway/clusters/impactlab-hub.3a13a5f1a3b54706a7222adec9224b8f/status,Cluster  Workers: 30  Cores: 30  Memory: 362.39 GB


In [7]:
# NOTE(review): this shuts the cluster down, yet the cell sits above every
# `client.map` call and its execution count is out of document order.
# Run it only after all computations below have finished.
cluster.close()

In [4]:
# first and last day (inclusive) of the ERA-5 period handled by this notebook
era_start = '01-01-1979' # 1979 
era_end = '12-31-2020'   # 2020

# one timestamp per calendar day of the period; leap days are included
dt_index_full = pd.date_range(start=era_start, end=era_end, freq='D')

# zero-padded string components, used to build the per-day file names
dt_index_years = dt_index_full.year.astype(str)
dt_index_months = dt_index_full.month.map("{:02}".format)
dt_index_days = dt_index_full.day.map("{:02}".format)
daynum = dt_index_full.dayofyear

# the 24 hour labels "00:00" ... "23:00" for the retrieval function.
# Built directly rather than via pd.date_range(..., freq='H'): the upper-case
# 'H' alias is deprecated in recent pandas releases.
hours = ["%02d:00" % hr for hr in range(24)]

**Validation of ERA-5 daily files**

In [None]:
def test_for_nans(ds):
    # no nans
    assert ds['t2m'].isnull().sum() == 0, "there are nans!"

def test_temp_range(ds, var):
    # make sure temp values are in a valid range
    assert (ds[var].min() < 150.15) or (ds[var].max() > 350.15), "temperature values are invalid" # 0 to 50 C

def test_low_temp_range(ds, var):
    threshold = 180
    location = ds[var].where(ds[var] < threshold)
    num = np.count_nonzero(~np.isnan(location))
    return num

def test_high_temp_range(ds, var):
    threshold = 330
    location = ds[var].where(ds[var] > threshold)
    num = np.count_nonzero(~np.isnan(location))
    return num

def test_polar_high_temp(ds, var):
    threshold = 317 #315.5
    loc_NH = ds[var].sel(latitude=slice(90,50)).where(ds[var].sel(latitude=slice(90,50)) > threshold)
    num_NH = np.count_nonzero(~np.isnan(loc_NH))
    
    loc_SH = ds[var].sel(latitude=slice(-50,-90)).where(ds[var].sel(latitude=slice(-50,-90)) > threshold)
    num_SH = np.count_nonzero(~np.isnan(loc_SH))
    return num_NH, num_SH

def validate_era5_temp(spec):
    '''
    validate one ERA-5 hourly or daily temperature file.
    works for hourly `t2m` or saved daily tas, tmin, tmax

    spec: tuple (filepath, timestep, var); `timestep` is accepted for
        compatibility with the job tuples but is not used by any check
    returns: None when nothing is flagged, otherwise a list describing the
        FIRST flagged check only:
        [count, filepath] for the low or the high range check, or
        [count_NH, count_SH, filepath] for the polar check
    raises FileNotFoundError if the file does not exist, AssertionError if
        the file has missing values or fails the temperature range check
    '''
    filepath, timestep, var = spec

    # first check to be sure file exists
    if not os.path.isfile(filepath):
        raise FileNotFoundError("%s was not created" %filepath)

    # now validate: no nans, the requested variable exists,
    # and the temperature range is not absurd
    with xr.open_dataset(filepath) as ds:
        # check `var` itself so daily tas/tmin/tmax files are covered too,
        # not only the hourly 't2m' variable
        n_missing = int(ds[var].isnull().sum())
        assert n_missing == 0, "there are nans!"
        test_temp_range(ds, var)

        occurrances_low = test_low_temp_range(ds, var)
        if occurrances_low > 0:
            return [occurrances_low, filepath]

        occurrances_high = test_high_temp_range(ds, var)
        if occurrances_high > 0:
            return [occurrances_high, filepath]

        occur_NH, occur_SH = test_polar_high_temp(ds, var)
        # compare both counts explicitly instead of relying on `or` precedence
        if occur_NH > 0 or occur_SH > 0:
            return [occur_NH, occur_SH, filepath]

    return None

In [None]:
hourly_dir = '/gcs/impactlab-data/climate/source_data/ERA-5/hourly'

# one file name per calendar day: t2m_<year>_<month>_<day>.nc
daily_files = ['t2m_%s_%s_%s.nc' % ymd
               for ymd in zip(dt_index_years, dt_index_months, dt_index_days)]

# full path of every daily file, then one (path, timestep, variable) job per file
daily_filepaths = [os.path.join(hourly_dir, name) for name in daily_files]
JOBS_validation = [(path, 'hourly', 't2m') for path in daily_filepaths]

In [None]:
%%time
# submit one validation task per daily file to the cluster and show a progress bar
futures_validation = client.map(validate_era5_temp, JOBS_validation)
dd.progress(futures_validation)

In [None]:
# gathers output from workers
results = client.gather(futures_validation)

In [None]:
# test number of not None instances in results i.e. where a function 'failed'
print(sum(x is not None for x in results))

In [None]:
# Saves the output of above -- identifying where a value (and not None) is located -- only do if above is > 0
list_results = [x is not None for x in results]

# worker index for flagged output (if above = True)
res = [i for i, val in enumerate(list_results) if val]

for i in res:
    print(results[i])

**Once validation is complete, create daily files of tmax, tmin and tas**

In [5]:
def calc_daily_era5_average(spec):
    '''
    daily mean of ERA-5 `t2m`: average over the `time` dimension of one file

    spec: tuple (filepath, timestep); `timestep` is unpacked but not used
    returns the result of `.mean('time')` on the file's `t2m` variable
    '''
    filepath, _timestep = spec
    with xr.open_dataset(filepath) as hourly:
        daily_mean = hourly['t2m'].mean('time')
        return daily_mean

def calc_daily_era5_tmax(spec):
    '''
    daily maximum of ERA-5 `t2m`: maximum over the `time` dimension of one file

    spec: tuple (filepath, timestep); `timestep` is unpacked but not used
    returns the result of `.max('time')` on the file's `t2m` variable
    '''
    filepath, _timestep = spec
    with xr.open_dataset(filepath) as hourly:
        daily_max = hourly['t2m'].max('time')
        return daily_max

def calc_daily_era5_tmin(spec):
    '''
    daily minimum of ERA-5 `t2m`: minimum over the `time` dimension of one file

    spec: tuple (filepath, timestep); `timestep` is unpacked but not used
    returns the result of `.min('time')` on the file's `t2m` variable
    '''
    filepath, _timestep = spec
    with xr.open_dataset(filepath) as hourly:
        daily_min = hourly['t2m'].min('time')
        return daily_min

In [6]:
hourly_dir = '/gcs/impactlab-data/climate/source_data/ERA-5/hourly' # add /v5.1 for 2000-2006

# one file name per calendar day: t2m_<year>_<month>_<day>.nc
daily_files = ['t2m_%s_%s_%s.nc' % ymd
               for ymd in zip(dt_index_years, dt_index_months, dt_index_days)]

# full path of every daily file, then one (path, timestep) job per file
daily_filepaths = [os.path.join(hourly_dir, name) for name in daily_files]
JOBS = [(path, 'hourly') for path in daily_filepaths] # , 't2m'

In [17]:
%%time
# tas: submit one daily-mean task per file to the cluster and show a progress bar
futures_tas = client.map(calc_daily_era5_average, JOBS)
dd.progress(futures_tas)

CPU times: user 37.7 ms, sys: 6.15 ms, total: 43.8 ms
Wall time: 42.2 ms


VBox()

In [18]:
# collect the per-day results from the workers, in the same order as JOBS
tas_da_list = client.gather(futures_tas)

In [19]:
# concatenate DataArrays in list 
tas_year = xr.concat(tas_da_list, dim='time')

# add datetime index 
tas_year['time'] = dt_index_full

**Split daily averages into yearly files and save**

In [22]:
def save_yearlong_dailydata_file(directory, year, ds, var):
    '''
    write one year of daily data for a single variable to a netCDF file

    directory(str): folder the file is written to
    year: year label, used twice in the file name `<var>_daily_<year>-<year>.nc`
    ds: the data for that variable; it is wrapped in a new Dataset under `var`
    var(str): variable name, used as the Dataset key and in the file name

    The Dataset carries fixed author/contact/project/source attributes plus
    a `created` attribute holding today's date.
    '''
    metadata = {
        'author': 'Meredith Fish',
        'contact': 'meredith.fish@rutgers.edu',
        'project': 'impactlab-rhg/climate/source-data/ERA-5',
        'source': 'impactlab-rhg/climate/downscaled/ERA-5/hourly',
        'created': str(date.today()),
    }
    out_path = os.path.join(directory, '%s_daily_%s-%s.nc' % (var, year, year))
    xr.Dataset({var: ds}, attrs=metadata).to_netcdf(out_path)

In [23]:
# save tas daily file 
directory = '/gcs/impactlab-data/climate/source_data/ERA-5/day/tas/v1.1'

for i_yr in np.arange(1979,2021): # ALWAYS CHECK
    tas_per_yr = tas_year.sel(time=slice('%s-01-01' %str(i_yr),'%s-12-31' %str(i_yr)))
    save_yearlong_dailydata_file(directory, i_yr, tas_per_yr, 'tas') #year

**Repeat for Tmax -- recreate JOBS file**

In [47]:
%%time
# tmax: submit one daily-maximum task per file to the cluster and show a progress bar
futures_tmax = client.map(calc_daily_era5_tmax, JOBS)
dd.progress(futures_tmax)

CPU times: user 41 ms, sys: 16.1 ms, total: 57.1 ms
Wall time: 45.9 ms


VBox()

In [48]:
# collect the per-day results from the workers, in the same order as JOBS
tmax_da_list = client.gather(futures_tmax)

In [49]:
# concatenate DataArrays in list 
tmax_year = xr.concat(tmax_da_list, dim='time')

# add datetime index 
tmax_year['time'] = dt_index_full

In [51]:
# save tmax daily file 
directory = '/gcs/impactlab-data/climate/source_data/ERA-5/day/tmax/v1.1'

for i_yr in np.arange(1979,2021):
    tmax_per_yr = tmax_year.sel(time=slice('%s-01-01' %str(i_yr),'%s-12-31' %str(i_yr)))
    save_yearlong_dailydata_file(directory, i_yr, tmax_per_yr, 'tmax') #year

**Repeat for Tmin -- recreate JOBS file**

In [34]:
%%time
# tmin: submit one daily-minimum task per file to the cluster and show a progress bar
futures_tmin = client.map(calc_daily_era5_tmin, JOBS)
dd.progress(futures_tmin)

CPU times: user 32.5 ms, sys: 1.37 ms, total: 33.8 ms
Wall time: 32.2 ms


VBox()

In [35]:
# collect the per-day results from the workers, in the same order as JOBS
tmin_da_list = client.gather(futures_tmin)

In [36]:
# concatenate DataArrays in list 
tmin_year = xr.concat(tmin_da_list, dim='time')

In [37]:
# add datetime index 
tmin_year['time'] = dt_index_full

In [38]:
# save tmin daily file 
directory = '/gcs/impactlab-data/climate/source_data/ERA-5/day/tmin/v1.1'

for i_yr in np.arange(2019,2021):
    tmin_per_yr = tmin_year.sel(time=slice('%s-01-01' %str(i_yr),'%s-12-31' %str(i_yr)))
    save_yearlong_dailydata_file(directory, i_yr, tmin_per_yr, 'tmin') #year

--- Check newly created files ---

In [54]:
# list the yearly tas files written to the output directory
! ls /gcs/impactlab-data/climate/source_data/ERA-5/day/tas/v1.1/

tas_daily_1979-1979.nc	tas_daily_1993-1993.nc	tas_daily_2007-2007.nc
tas_daily_1980-1980.nc	tas_daily_1994-1994.nc	tas_daily_2008-2008.nc
tas_daily_1981-1981.nc	tas_daily_1995-1995.nc	tas_daily_2009-2009.nc
tas_daily_1982-1982.nc	tas_daily_1996-1996.nc	tas_daily_2010-2010.nc
tas_daily_1983-1983.nc	tas_daily_1997-1997.nc	tas_daily_2011-2011.nc
tas_daily_1984-1984.nc	tas_daily_1998-1998.nc	tas_daily_2012-2012.nc
tas_daily_1985-1985.nc	tas_daily_1999-1999.nc	tas_daily_2013-2013.nc
tas_daily_1986-1986.nc	tas_daily_2000-2000.nc	tas_daily_2014-2014.nc
tas_daily_1987-1987.nc	tas_daily_2001-2001.nc	tas_daily_2015-2015.nc
tas_daily_1988-1988.nc	tas_daily_2002-2002.nc	tas_daily_2016-2016.nc
tas_daily_1989-1989.nc	tas_daily_2003-2003.nc	tas_daily_2017-2017.nc
tas_daily_1990-1990.nc	tas_daily_2004-2004.nc	tas_daily_2018-2018.nc
tas_daily_1991-1991.nc	tas_daily_2005-2005.nc	tas_daily_2019-2019.nc
tas_daily_1992-1992.nc	tas_daily_2006-2006.nc	tas_daily_2020-2020.nc
