# Calculate monthly means from hourly data

In [5]:
%matplotlib inline
from dask.distributed import Client
import dask
import xarray as xr
import time
import os

# Spawn a `dask` cluster

In [2]:
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

# Resources requested for each SLURM job: `ncpu` cores, split into `ncpu`
# worker processes, sharing the requested memory for at most 10 minutes.
ncpu = 24
cluster = SLURMCluster(
    queue="shas-testing",  # Or use shas
    project='ucb164_summit1',
    walltime="00:10:00",
    memory="116.16GB",
    cores=ncpu,
    processes=ncpu,
)

# Submit a single job to the queue.
cluster.scale(jobs=1)

# Connect this notebook session to the cluster's scheduler.
client = Client(cluster)

In [9]:
# Display the client summary (scheduler address, dashboard link, and the
# current worker / core / memory counts).
client

0,1
Client  Scheduler: tcp://10.225.5.217:39306  Dashboard: http://10.225.5.217:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
concurrent.futures._base.CancelledError


# Calculate monthly means as an embarrassingly parallel problem using `dask`.

In [4]:
# Input directory (one file per entry is processed) and output directory.
src_directory = "/scratch/summit/erke2265/MERRA2/"
tgt_directory = "/scratch/summit/erke2265/MERRA2_monthly/"

# Start from a clean target directory. The cleanup is derived from
# `tgt_directory` (rather than a second hard-coded path) so the two can never
# drift apart, and it tolerates a missing or already-empty directory.
os.makedirs(tgt_directory, exist_ok=True)
for stale_name in os.listdir(tgt_directory):
    stale_path = os.path.join(tgt_directory, stale_name)
    # Like a shell `rm dir/*`: leave dot-files and sub-directories alone.
    if not stale_name.startswith(".") and os.path.isfile(stale_path):
        os.remove(stale_path)

def calc_monthly_mean(src_directory, tgt_directory, filename):
    """Compute the mean per calendar month of one file and save it.

    The file ``filename`` is read from ``src_directory``, sorted along its
    ``time`` coordinate, grouped by ``time.month`` and averaged. The result is
    written with ``to_netcdf`` into ``tgt_directory`` under the same file name
    with ``"hourly"`` replaced by ``"monthly"``.

    Parameters
    ----------
    src_directory : str
        Directory that contains ``filename``.
    tgt_directory : str
        Directory the output file is written to.
    filename : str
        Name of the input file (anything ``xr.open_dataset`` can read).

    Returns
    -------
    None
        The function is called for its side effect of writing the output file.

    Notes
    -----
    Grouping uses ``time.month`` only, so if a file spans several years the
    same calendar month of different years would be pooled together
    (NOTE(review): presumably each file covers at most one year -- confirm).
    """
    print(filename)
    # os.path.join works whether or not the directories end with a separator.
    src_path = os.path.join(src_directory, filename)
    tgt_path = os.path.join(tgt_directory, filename.replace("hourly", "monthly"))

    # The context manager closes the underlying file even if the computation
    # or the write fails; otherwise every processed file would leave an open
    # handle behind in the worker process.
    with xr.open_dataset(src_path) as ds:
        ds_sorted = ds.sortby('time', ascending=True)
        monthly_mean = ds_sorted.groupby('time.month').mean()
        monthly_mean.to_netcdf(path=tgt_path)

# Wrap the worker function once; each call below only records a lazy task,
# nothing is executed yet.
lazy_monthly_mean = dask.delayed(calc_monthly_mean)

# One independent task per entry in the source directory, keyed by file name.
results_interim = {}
for filename in os.listdir(src_directory):
    task = lazy_monthly_mean(src_directory, tgt_directory, filename)
    results_interim[filename] = task

# A single delayed node that takes the whole dict of tasks as its argument,
# so computing it triggers every per-file task.
results_scheduled = dask.delayed(list)(results_interim)
(results,) = dask.compute(results_scheduled)

In [8]:
# Finished with the computation: shut down through the client.
# NOTE(review): per the dask.distributed docs this stops the connected
# scheduler and workers, not just this client -- confirm for your version.
client.shutdown()