In [34]:
import h5netcdf

In [1]:
# libraries
import os
import joblib
from osgeo import gdal
import pandas as pd
import numpy as np
import xarray as xr
import glob
import re
import rioxarray
import dask.array
from dask.distributed import Client, progress

In [2]:
client = Client(n_workers=4, threads_per_worker=2)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: /proxy/8787/status,

0,1
Dashboard: /proxy/8787/status,Workers: 4
Total threads: 8,Total memory: 480.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:41817,Workers: 4
Dashboard: /proxy/8787/status,Total threads: 8
Started: Just now,Total memory: 480.00 GiB

0,1
Comm: tcp://127.0.0.1:41257,Total threads: 2
Dashboard: /proxy/41165/status,Memory: 120.00 GiB
Nanny: tcp://127.0.0.1:44711,
Local directory: /scratch-local/qiahan.4267242/dask-scratch-space/worker-ew_g3r31,Local directory: /scratch-local/qiahan.4267242/dask-scratch-space/worker-ew_g3r31

0,1
Comm: tcp://127.0.0.1:39253,Total threads: 2
Dashboard: /proxy/41933/status,Memory: 120.00 GiB
Nanny: tcp://127.0.0.1:42085,
Local directory: /scratch-local/qiahan.4267242/dask-scratch-space/worker-_m7kl5bh,Local directory: /scratch-local/qiahan.4267242/dask-scratch-space/worker-_m7kl5bh

0,1
Comm: tcp://127.0.0.1:39191,Total threads: 2
Dashboard: /proxy/45423/status,Memory: 120.00 GiB
Nanny: tcp://127.0.0.1:34545,
Local directory: /scratch-local/qiahan.4267242/dask-scratch-space/worker-4ue96ir7,Local directory: /scratch-local/qiahan.4267242/dask-scratch-space/worker-4ue96ir7

0,1
Comm: tcp://127.0.0.1:33399,Total threads: 2
Dashboard: /proxy/44477/status,Memory: 120.00 GiB
Nanny: tcp://127.0.0.1:44025,
Local directory: /scratch-local/qiahan.4267242/dask-scratch-space/worker-__19oipa,Local directory: /scratch-local/qiahan.4267242/dask-scratch-space/worker-__19oipa


In [3]:
workingPath = "/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/"

In [4]:
# function for getting directory of input data
def get_directories_with_number_and_ending(directory_path, ending):
    directories = []
    if ending:
        pattern = re.compile(r'^\d+.*{}$'.format(re.escape(ending)))  # Match directory names with number at the beginning and specific ending
    else:
        pattern = re.compile(r'^\d+$')  # Match directory names with only numbers

    for entry in os.scandir(directory_path):
        if entry.is_dir():
            directory_name = entry.name
            if pattern.match(directory_name):
                directories.append(directory_name)

    return directories

In [5]:
### define the path of input data, set the reference file for spatial resample
inputData = workingPath+"1input_data/"
# get all the filefolders named as year
year_list = get_directories_with_number_and_ending(workingPath+"1input_data/", "global")
# the input data from year[0] to year[..], based on the process id in sbatch script
year = year_list[4-1]
print(year)

2015global


In [6]:
import geopandas as gpd
from shapely.geometry import box

shapefile_path = "/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/Emulator/input_data/EuropeBoundary.shp"
gdf = gpd.read_file(shapefile_path)
bbox = gdf.total_bounds
bbox

array([-31.28903052,  34.93055094,  68.93136141,  81.85192337])

In [7]:
!ncinfo $inputData/$year/era5land/era5-land_total_precipitation_2015-01.nc

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF3_64BIT_OFFSET data model, file format NETCDF3):
    Conventions: CF-1.6
    history: 2023-06-20 04:05:16 GMT by grib_to_netcdf-2.25.1: /opt/ecmwf/mars-client/bin/grib_to_netcdf.bin -S param -o /cache/data3/adaptor.mars.internal-1687233692.3057544-27640-11-33b28015-7bc4-44ed-b4b9-99dcd6803491.nc /cache/tmp/33b28015-7bc4-44ed-b4b9-99dcd6803491-adaptor.mars.internal-1687233611.317501-27640-18-tmp.grib
    dimensions(sizes): longitude(3600), latitude(1801), time(744)
    variables(dimensions): float32 longitude(longitude), float32 latitude(latitude), int32 time(time), int16 tp(time, latitude, longitude)
    groups: 


In [9]:
%%time
### 0) read era5land data
# select Europe data of ERA5Land and save it as netcdf
all1 = xr.open_mfdataset(inputData+year+"/era5land/era5-land*.nc", parallel=True, chunks={"time":-1,"latitude":250,"longitude":250})
all1['longitude'] = (all1['longitude'] + 180) % 360 - 180
all1 = all1.sortby('longitude')

all1 = all1.sel(
    latitude=slice(bbox[3], bbox[1]),  
    longitude=slice(bbox[0], bbox[2])  
)

CPU times: user 498 ms, sys: 29.6 ms, total: 528 ms
Wall time: 989 ms


In [16]:
all1.chunks

Frozen({'time': (744, 672, 744, 720, 744, 720, 744, 744, 720, 744, 720, 744), 'latitude': (28, 110, 110, 110, 110, 1), 'longitude': (12, 110, 110, 80, 110, 110, 110, 110, 110, 110, 30)})

In [10]:
all_resample = all1.resample(time="1800S").interpolate('linear')

In [11]:
%%time
# calculate the Rin and Rli difference for every hour
Rin = all1['ssrd'].diff("time")/3600  #xr.concat([all1['ssrd'].isel(time=0),all1['ssrd']], dim="time")
Rin[0::24] = all1['ssrd'][1::24]/3600 # assign the original values in t01
# Rin.mean(dim=['longitude','latitude']).compute()[48]
Rli = all1['strd'].diff("time")/3600  
Rli[0::24] = all1['strd'][1::24]/3600
# Rli.mean(dim=['longitude','latitude']).plot()
Rin = Rin.resample(time="1800S").interpolate('linear')
Rli = Rli.resample(time="1800S").interpolate('linear')


CPU times: user 2.17 s, sys: 39.3 ms, total: 2.21 s
Wall time: 2.21 s


In [12]:
all_resample['ssrd'] = Rin
all_resample['strd'] = Rli


    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]


In [13]:
all_resample.chunks

Frozen({'time': (17519,), 'latitude': (168, 250, 51), 'longitude': (212, 100, 250, 250, 190)})

In [9]:
all1.nbytes/2**30

122.686006244272

In [14]:
all2 = all_resample.chunk({"time":500, "latitude":250, "longitude":250})

In [15]:
all2.chunks

Frozen({'time': (500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 19), 'latitude': (250, 219), 'longitude': (250, 250, 250, 250, 2)})

In [16]:
%%time
# all2.to_netcdf('/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2015global/era5land/era5land2015_10km1027.nc',engine='h5netcdf') #, format='NETCDF4'
all2.to_zarr('/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2015global/era5land/era5land2015_10km1028.zarr',mode='w')


CPU times: user 3min 23s, sys: 14 s, total: 3min 37s
Wall time: 19min 14s


<xarray.backends.zarr.ZarrStore at 0x153b30973350>

# test is the exported netcdf complete

In [13]:
lat1 = 40
lat2 = 60
lon1 = 2
lon2 = 22

In [20]:
test = xr.open_zarr(inputData+year+"/era5land/era5land2015_10km1027.zarr") 
# test = test.sel(
#     latitude=slice(lat2,lat1),  
#     longitude=slice(lon1, lon2)  
# )

In [24]:
test['sp']

Unnamed: 0,Array,Chunk
Bytes,15.34 GiB,119.21 MiB
Shape,"(8760, 469, 1002)","(500, 250, 250)"
Dask graph,180 chunks in 2 graph layers,180 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 15.34 GiB 119.21 MiB Shape (8760, 469, 1002) (500, 250, 250) Dask graph 180 chunks in 2 graph layers Data type float32 numpy.ndarray",1002  469  8760,

Unnamed: 0,Array,Chunk
Bytes,15.34 GiB,119.21 MiB
Shape,"(8760, 469, 1002)","(500, 250, 250)"
Dask graph,180 chunks in 2 graph layers,180 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [30]:
test['u10'].mean(dim={"longitude","latitude"}).isnull().sum().compute()

In [22]:
a = np.nanmean(test['u10'], axis=(1,2)) #xarray
nan_mask = np.isnan(a)

# use np.sum() to count the number of True, which is NaN
nan_count = np.sum(nan_mask)

In [23]:
nan_count

0