# ERA5 sanity check
Checks that all forcing data in the merged ERA5 files are:
- Within user-specified ranges;
- Not missing;
- Not NaN.

Also checks the time dimension in each file to verify that the time steps are:
- Not NaN;
- Consecutive;
- Equidistant.


In [1]:
# Modules
from pathlib import Path
from datetime import datetime
import netCDF4 as nc4
import numpy as np
import pandas as pd
import os
import sys

#### User settings

In [10]:
# Folder that holds the merged ERA5 forcing files to be checked
path_to_data = Path('C:/Globus endpoint/summaWorkflow_data/domain_BowAtBanff/forcing/2_merged_data')

In [11]:
# The log is written to a 'sanity_checks' sub-folder of the data folder
log_file = 'log_sanity_checks.txt'
log_folder = path_to_data.joinpath('sanity_checks')

In [3]:
# File pattern: names are built as <file_base><YYYY><MM><file_end>
file_base, file_end = 'ERA5_merged_', '.nc'

In [4]:
# First and last year to check: every month from January of years[0]
# up to and including December of years[1] is looked for.
years = [1979, 1979]

In [5]:
# Feasible [min, max] range per forcing variable. Values below min or above
# max are counted by the checks; float('inf') means there is no upper bound.
ranges = dict(
    pptrate=[0, float('inf')],
    airpres=[0, float('inf')],
    airtemp=[173, 373],
    spechum=[0, float('inf')],
    SWRadAtm=[0, float('inf')],
    LWRadAtm=[0, float('inf')],
    windspd=[0, float('inf')],
)

#### Standard settings

In [6]:
# Define the variables we want to check.
# This is an ordered list rather than a set: the checks loop over it and write
# one log section per variable, and a set of strings can iterate in a different
# order on every interpreter start, which would shuffle the log between runs.
var_names = ['time','pptrate','airpres','airtemp','spechum','SWRadAtm','LWRadAtm','windspd']

#### Do the checks

In [13]:
# Create the log folder, including any missing parent folders;
# nothing happens if it already exists.
log_folder.mkdir(exist_ok=True, parents=True)

In [14]:
# Results are collected column-wise: one independent, initially empty list per
# reported quantity, to which one entry is appended for every checked file.
report = {
    column: []
    for column in (
        'file name',
        'data type',
        'data unit',
        'num NaNs',
        'num missing',
        'num < min',
        'num > max',
    )
}

In [27]:
# Run all sanity checks and write the results to the log file.
# The log is opened in a `with` block so that it is flushed and closed even
# when a check fails part-way through (e.g. on a netCDF file that cannot be read).
with open(log_folder / log_file,'w') as logFile:

    # Row layouts, defined once so column headers and result rows always line up
    fmt_time = "{:20} {:10} {:35} {:10} {:15} {:15}\n"
    fmt_data = "{:20} {:10} {:35} {:10} {:12} {:10} {:10}\n"

    # log start
    logFile.write('Opened for writing on ' + str(datetime.now()) + '\n')

    # Loop over variables first, so that reports are aggregated per variable for easy comparison
    for var in var_names:

        # Print where we are
        logFile.write('\n')
        logFile.write('Now checking variable // ' + var + ' // \n')
        if var == 'time':
            logFile.write(fmt_time.format('file_name','data_type','data_unit','num_NaNs','consecutive?','equidistant?'))
        else:
            logFile.write(fmt_data.format('file_name','data_type','data_unit','num_NaNs','num_missing','num_<_min','num_>_max'))

        # Loop over all files (year & month)
        for year in range(years[0],years[1]+1):
            for month in range(1,13):

                # Specify the file name
                file_name = (file_base + str(year) + str(month).zfill(2) + file_end)
                file_full = path_to_data / file_name

                # Months without a merged file are skipped without a log entry
                if not os.path.isfile(file_full):
                    continue

                # Open netcdf file for specific year and month
                with nc4.Dataset(file_full) as src:

                    # Extract the variable into a plain numpy array
                    nc_var = src[var]
                    dat = np.array(nc_var[:])

                    # Get basic information
                    chk_isnan = np.isnan(dat).sum()

                    # Get the information that depends on attributes;
                    # an attribute that is absent is reported as 'n/a'
                    chk_type = getattr(nc_var, 'dtype', 'n/a')
                    chk_units = getattr(nc_var, 'units', 'n/a')

                    # Count values equal to the declared missing value. If the
                    # attribute is absent (AttributeError) or cannot be compared
                    # with the data, the count is reported as 'n/a'. Only these
                    # errors are caught so that e.g. a keyboard interrupt still
                    # stops the run.
                    try:
                        chk_missv = nc_var.missing_value
                        chk_missn = (dat == chk_missv).sum()
                    except (AttributeError, TypeError, ValueError):
                        chk_missv = chk_missn = 'n/a'

                    # Count how often the data goes beyond the defined 'sane' ranges
                    if var in ranges:
                        chk_min, chk_max = ranges[var]
                        chk_under = (dat < chk_min).sum()
                        chk_over = (dat > chk_max).sum()
                    else:
                        chk_under = 'n/a'
                        chk_over = 'n/a'

                    # Check if time values are consecutive and equidistant
                    if var == 'time':
                        # consecutive: the values are already in non-decreasing order
                        chk_cons = bool(np.all(np.sort(dat) == dat))
                        # equidistant: every step equals exactly 1 in the file's own time units
                        # NOTE(review): this presumes a step of one time unit (e.g. hourly data
                        # stored as hours); other constant steps are reported as False - confirm.
                        chk_equid = bool(np.all(np.diff(dat) == 1))

                    # update the dictionary
                    report['file name'].append(file_name)
                    report['data type'].append(chk_type)
                    report['data unit'].append(chk_units)
                    report['num NaNs'].append(chk_isnan)
                    report['num missing'].append(chk_missn)
                    report['num < min'].append(chk_under)
                    report['num > max'].append(chk_over)

                    # print to file
                    if var == 'time':
                        logFile.write(fmt_time.format(str(file_name),
                                                      str(chk_type),
                                                      str(chk_units),
                                                      str(chk_isnan),
                                                      str(chk_cons),
                                                      str(chk_equid)))
                    else:
                        logFile.write(fmt_data.format(str(file_name),
                                                      str(chk_type),
                                                      str(chk_units),
                                                      str(chk_isnan),
                                                      str(chk_missn),
                                                      str(chk_under),
                                                      str(chk_over)))

    # log end
    logFile.write('\n')
    logFile.write('Finished on ' + str(datetime.now()) + '\n')