## Preparing data

In [1]:
import xarray as xr
import geopandas as gpd
from dask.distributed import Client, LocalCluster
from datetime import datetime, timedelta
from functools import partial
from PyStemmusScope import variable_conversion as vc
from rasterio.warp import reproject, Resampling
import numpy as np
import glob
import dask.array as da
import pandas as pd
from dask_jobqueue import SLURMCluster

In [2]:
import argparse

# 设置命令行参数
parser = argparse.ArgumentParser()
parser.add_argument('--year', type=int, required=True, help='Year for the data')
args = parser.parse_args()

year = args.year

# 使用 year 和 month 执行你的逻辑
print(f"Processing data for {year}", flush=True)


usage: ipykernel_launcher.py [-h] --year YEAR
ipykernel_launcher.py: error: the following arguments are required: --year


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [2]:
year = 2001

In [3]:
# parent_in_path = f"/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data"
parent_in_path = f"/projects/0/einf2480/era5land2000_2009_2020"
data_paths = {
            "era5land": f"{parent_in_path}/{year}/*.nc",
            "lai": f"{parent_in_path}/{year}global/lai_v2/*.nc",
            "ssm": f"{parent_in_path}/{year}global/ssm/GlobalGSSM11km2014_20240214.tif",
            "co2": f"{parent_in_path}/{year}global/co2/CAMS_CO2_2003-2020.nc",
            "landcover": f"{parent_in_path}/landcover/ESACCI-LC-L4-LCCS-Map-300m-P1Y-2013-v2.0.7cds.nc",
            "vcmax": f"{parent_in_path}/Vcmax/TROPOMI_Vmax_Tg_mean.tif",
            "canopyheight": f"{parent_in_path}/canopy_height/canopy_height_11kmGlobal20240215.tif",
            }

In [4]:
def era5_preprocess(ds):    
    # Convert the longitude coordinates from [0, 360] to [-180, 180]
    if 'valid_time' in ds.dims:
        ds = ds.rename({'valid_time': 'time'})
        
    ds = ds.assign_coords(
        latitude=ds.latitude.astype('float32'),
        longitude=ds.longitude.astype('float32')
    )
    ds = ds.assign_coords(longitude=(((ds.longitude + 180) % 360) - 180))
    return ds

def fix_coords(ds):
    if 'band' in ds.dims:
        ds = ds.rename_dims({'band': 'time'})
        ds = ds.rename_vars({'band': 'time'})

    if 'x' in ds.dims and 'y' in ds.dims:
        ds = ds.rename_dims({'x': 'longitude', 'y': 'latitude'})
        ds = ds.rename_vars({'x': 'longitude', 'y': 'latitude'})
        
    elif 'lon' in ds.dims and 'lat' in ds.dims:
        ds = ds.rename_dims({'lon': 'longitude', 'lat': 'latitude'})
        ds = ds.rename_vars({'lon': 'longitude', 'lat': 'latitude'})
    return ds

In [5]:
cluster = SLURMCluster(
    name='dask-worker',
    cores=16,
    processes=16,
    queue='fat',
    memory='120GiB',
    local_directory='$TMPDIR',
    walltime='4:00:00'
)
cluster.scale(jobs=4)
client = Client(cluster)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 36163 instead


0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: /proxy/36163/status,

0,1
Dashboard: /proxy/36163/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://145.136.63.38:35165,Workers: 0
Dashboard: /proxy/36163/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [7]:
%%time
era5land = xr.open_mfdataset(data_paths['era5land'], preprocess=era5_preprocess, chunks={'longitude': 250, 'latitude': 250})
# era5land = era5land.chunk({'time': 750})
era5land = era5land.sortby(['longitude', 'latitude'])
era5land = era5land.chunk(
    time=750,
    longitude=250, 
    latitude=250
)
# # # svae to zarr
out_path = f"{parent_in_path}/{'era5land'}_{year}.zarr"
era5land.to_zarr(out_path, mode='w')



CPU times: user 9min 24s, sys: 26.3 s, total: 9min 50s
Wall time: 35min 11s


<xarray.backends.zarr.ZarrStore at 0x15367caa1580>

In [8]:
client.shutdown()

In [17]:
test2011 = xr.open_zarr(out_path)