# Dask jobqueue example for JUWELS at JSC
This notebook covers the following aspects, i.e. how to
* add the JUWELS-specific Dask jobqueue configuration
* get an overview of the available JUWELS compute node resources
* specify the batch queue and the project budget name
* open, scale and close a default jobqueue cluster
* do an example calculation on larger-than-memory data

In [1]:
import dask, dask_jobqueue, os
import dask.distributed as dask_distributed

## Load jobqueue configuration defaults

In [2]:
# Look up further Dask configuration files in the local (current working) directory.
additional_config = dask.config.collect(paths=["."])

# Merge them into the live global configuration. With priority="new" the freshly
# collected values take precedence over settings that are already present.
# The trailing semicolon keeps the notebook from echoing the call's return value.
dask.config.update(
    dask.config.config,
    additional_config,
    priority="new",
);

In [3]:
# Display the jobqueue settings stored under the 'juwels-jobqueue-config' key
dask.config.get('jobqueue.juwels-jobqueue-config')

{'cores': 96,
 'memory': '90000M',
 'processes': 1,
 'local-directory': '/tmp',
 'death-timeout': 60,
 'extra': ['--host ${SLURMD_NODENAME}.ib.juwels.fzj.de'],
 'interface': None,
 'shebang': '#!/usr/bin/env bash',
 'walltime': '00:15:00',
 'log-directory': 'dask_jobqueue_logs',
 'name': 'dask-worker',
 'queue': None,
 'project': None,
 'job-cpu': None,
 'job-mem': None,
 'job-extra': [],
 'env-extra': []}

## Set up jobqueue cluster ...

In [4]:
!sinfo -t idle --format="%9P %.5a %.5D %.5t" # get overview on available resources per queue: idle nodes only, columns are partition, availability, node count, state

PARTITION AVAIL NODES STATE
batch*       up   152  idle
devel        up    17  idle
mem192       up     9  idle
esm          up     5  idle
large      down   152  idle
gpus         up    17  idle
develgpus    up     5  idle
maint        up   196  idle


In [5]:
# Keyword arguments for the SLURM-backed cluster, gathered in one place.
cluster_settings = dict(
    # named section of the Dask jobqueue configuration that supplies the defaults
    config_name='juwels-jobqueue-config',
    # budget name associated with the project (appears as "#SBATCH -A" in the job script)
    project='esmtst',
    # batch queue, chosen by available resources (appears as "#SBATCH -p" in the job script)
    queue='esm',
    # globally visible network location of the scheduler, which runs locally on this node
    scheduler_options={"host": os.environ['HOSTNAME']},
)

# Creating the cluster starts the scheduler only; no batch job is submitted
# until the cluster is scaled.
jobqueue_cluster = dask_jobqueue.SLURMCluster(**cluster_settings)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 36816 instead
  http_address["port"], self.http_server.port


In [6]:
# Show the SLURM batch script that the cluster generates for a worker job
print(jobqueue_cluster.job_script())

#!/usr/bin/env bash

#SBATCH -J dask-worker
#SBATCH -e dask_jobqueue_logs/dask-worker-%J.err
#SBATCH -o dask_jobqueue_logs/dask-worker-%J.out
#SBATCH -p esm
#SBATCH -A esmtst
#SBATCH -n 1
#SBATCH --cpus-per-task=96
#SBATCH --mem=84G
#SBATCH -t 00:15:00

/p/home/jusers/rath1/juwels/PROJECT_training2005/2020-08_dask_intro/miniconda3/envs/py3_dask/bin/python -m distributed.cli.dask_worker tcp://10.11.159.191:43724 --nthreads 96 --memory-limit 90.00GB --name name --nanny --death-timeout 60 --local-directory /tmp --host ${SLURMD_NODENAME}.ib.juwels.fzj.de



## ... and the client process

In [7]:
# Connect a client to the scheduler of the jobqueue cluster
client = dask_distributed.Client(jobqueue_cluster)

## Start jobqueue workers

In [8]:
# Request one SLURM batch job; the loaded configuration sets 'processes': 1,
# i.e. one worker process per job
jobqueue_cluster.scale(jobs=1)

In [9]:
!squeue -u {os.environ["USER"]} # list the current user's batch jobs; ST code CF means the job has nodes allocated but is still being configured

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           2524520       esm dask-wor    rath1 CF       0:00      1 jwc00n003


In [10]:
# Display the client summary (scheduler address, dashboard link, connected workers);
# it reports 0 workers for as long as the batch job has not started its worker yet
client

0,1
Client  Scheduler: tcp://10.11.159.191:43724  Dashboard: http://10.11.159.191:36816/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


## Do a calculation on larger-than-memory data

In [11]:
import dask.array as da

In [12]:
# Lazily define about 292 GB of uniformly distributed random numbers in [0, 1),
# which is more than the 90000M of memory configured for the worker.
# Dimension sizes are written as integers: float sizes such as 1e4 only work
# through implicit coercion inside Dask and are rejected by NumPy itself.
fake_data = da.random.uniform(
    0, 1,
    size=(365, 10000, 10000),
    chunks=(365, 500, 500),  # problem specific chunking: 400 chunks of 730 MB each
)
fake_data

Unnamed: 0,Array,Chunk
Bytes,292.00 GB,730.00 MB
Shape,"(365, 10000, 10000)","(365, 500, 500)"
Count,400 Tasks,400 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 292.00 GB 730.00 MB Shape (365, 10000, 10000) (365, 500, 500) Count 400 Tasks 400 Chunks Type float64 numpy.ndarray",10000  10000  365,

Unnamed: 0,Array,Chunk
Bytes,292.00 GB,730.00 MB
Shape,"(365, 10000, 10000)","(365, 500, 500)"
Count,400 Tasks,400 Chunks
Type,float64,numpy.ndarray


In [13]:
import time

In [14]:
# Block until the requested worker has registered with the scheduler, so that
# time spent waiting in the batch queue is not counted as computation time.
client.wait_for_workers(n_workers=1)

# perf_counter is a monotonic clock, which makes it the appropriate choice for
# measuring an elapsed interval (time.time() can jump if the system clock is adjusted).
start_time = time.perf_counter()
# Mean along the first (length-365) axis; the result is computed on the cluster,
# gathered to the client and then discarded, since only the duration matters here.
fake_data.mean(axis=0).compute()
elapsed = time.perf_counter() - start_time
print('elapse time ',elapsed,' in seconds')

elapse time  14.90658164024353  in seconds


## Close jobqueue cluster and client process

In [15]:
!squeue -u {os.environ["USER"]} # list the current user's batch jobs; ST code R means the worker job is running

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           2524520       esm dask-wor    rath1  R       0:20      1 jwc00n003


In [16]:
# Disconnect the client first, while the scheduler it talks to is still reachable,
# and only then shut down the cluster, which ends the scheduler and the worker job.
client.close()
jobqueue_cluster.close()

In [17]:
!squeue -u {os.environ["USER"]} # list the current user's batch jobs; ST code CG means the worker job is completing after the cluster was closed

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           2524520       esm dask-wor    rath1 CG       0:20      1 jwc00n003


## Conda environment

In [18]:
!conda list --explicit # record the exact package URLs of the active environment so that it can be recreated

# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: linux-64
@EXPLICIT
https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2020.6.20-hecda079_0.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.34-hc38a660_9.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-7.5.0-hdf63c60_15.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-9.3.0-hdf63c60_15.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libgomp-9.3.0-h24d8f2e_15.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-1_gnu.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-9.3.0-h24d8f2e_15.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h516909a_2.tar.bz2
https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.16.1-h516909a_0.tar.bz2
https://conda.