# Setting up a Ray cluster with SmartSim

## 1. Start the cluster
We set up a SmartSim experiment, which will handle the launch of the Ray cluster.

First we import the relevant modules and request a Slurm allocation of `1 + NUM_WORKERS` nodes.

In [1]:
from smartsim import Experiment, slurm
from smartsim.ray import RayCluster

# Number of Ray workers; the allocation below asks for one node more than this.
NUM_WORKERS = 3

# Request a Slurm allocation of NUM_WORKERS + 1 nodes (and as many tasks)
# with a 12-hour time limit on the "spider" partition.
alloc = slurm.get_allocation(
    nodes=NUM_WORKERS + 1,
    time="12:00:00",
    options={
        "ntasks": str(NUM_WORKERS + 1),
        "partition": "spider",
        "C": "V100",
    },
)

13:49:52 osprey.us.cray.com SmartSim[120927] INFO Allocation successful with Job ID: 242491


In [2]:
# The experiment object is what generates, starts and stops the cluster.
exp = Experiment("ray-cluster", launcher='slurm')

# Describe the Ray cluster. It is handed the allocation obtained earlier and
# is not launched as a batch job (batch=False).
cluster = RayCluster(
    name="ray-cluster",
    run_args={"time": "06:00:00"},
    path='',
    launcher='slurm',
    workers=NUM_WORKERS,
    alloc=alloc,
    batch=False,
    ray_num_cpus=8,
)

# Only applies to batch launches: shell lines assigned as the preamble of the
# head's batch settings (environment activation and resource limits).
if cluster.batch:
    cluster.head_model.batch_settings._preamble = [
        # "module load ccm",
        "source ~/.bashrc",
        "conda activate smartsim",
        "ulimit -s unlimited",
        "ulimit -u unlimited",
        "ulimit -n unlimited",
        "ulimit -c unlimited",
        "ulimit -a",
    ]

# Equivalent preamble for the workers, currently disabled:
# if NUM_WORKERS:
#     cluster.worker_model.batch_settings._preamble = ["source ~/.bashrc", "conda activate smartsim"]

# Generate the cluster through the experiment, overwriting earlier output.
exp.generate(cluster, overwrite=True)

13:49:55 osprey.us.cray.com SmartSim[120927] INFO Working in previously created experiment


In [3]:
# Launch the Ray cluster through the experiment.
# NOTE(review): block=False presumably returns control to the notebook instead
# of waiting for the cluster to finish, and summary=False presumably skips the
# pre-launch summary — confirm against the SmartSim documentation.
exp.start(cluster, block=False, summary=False)

13:50:05 osprey.us.cray.com SmartSim[120927] INFO Ray cluster launched on nodes: ['spider-0002', 'spider-0003', 'spider-0004', 'spider-0005']


## 2. Start the Ray driver script

In [4]:
# Start the ppo_tune.py template as a Ray job on the cluster.
# NOTE(review): this is an absolute, user-specific path — point it at the
# templates directory of your own SmartSim checkout before running.
cluster.start_ray_job('/lus/scratch/arigazzi/smartsim-dev/SmartSim/tutorials/05_starting_ray/templates/ppo_tune.py')

In [30]:
# Start the ppo_train.py template as a Ray job on the cluster.
# NOTE(review): this is an absolute, user-specific path — point it at the
# templates directory of your own SmartSim checkout before running.
cluster.start_ray_job('/lus/scratch/arigazzi/smartsim-dev/SmartSim/tutorials/05_starting_ray/templates/ppo_train.py')

In [5]:
# Start the mnist_pytorch_trainable.py template as a Ray job on the cluster.
# NOTE(review): this is an absolute, user-specific path — point it at the
# templates directory of your own SmartSim checkout before running.
cluster.start_ray_job('/lus/scratch/arigazzi/smartsim-dev/SmartSim/tutorials/05_starting_ray/templates/mnist_pytorch_trainable.py')

## 3. Stop cluster and release allocation

In [5]:
# Stop the Ray cluster (its workers and head) through the experiment.
exp.stop(cluster)

13:49:30 osprey.us.cray.com SmartSim[119244] INFO Stopping model workers with job name workers-CBCC1IKDXXGX
13:49:32 osprey.us.cray.com SmartSim[119244] INFO Stopping model head with job name head-CBCC1A59C9F1


In [6]:
# Hand the Slurm allocation back, but only if one was actually obtained.
if alloc:
    slurm.release_allocation(alloc)

13:49:36 osprey.us.cray.com SmartSim[119244] INFO Releasing allocation: 242490
13:49:36 osprey.us.cray.com SmartSim[119244] INFO Successfully freed allocation 242490


In [6]:
!squeue%clear

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
            242403     bdw18  Chpl-ep  chapelu  R       6:04     16 prod-[0001-0016]
            242404     bdw18 head-CBC arigazzi  R       2:57      1 prod-0017
            242394     clx28 sstsim.x visharma  R      40:31     32 prod-[0065-0096]


In [7]:
# Cancel a leftover head job by its Slurm job ID. 242404 is the ID of the
# "head-CBC" job in the queue listing recorded above; replace it with the ID
# reported by squeue in your own session.
!scancel 242404


11:26:17 osprey.us.cray.com SmartSim[75449] INFO head(242404): Failed
